forked from nhsx/stm-survey-text
-
Notifications
You must be signed in to change notification settings - Fork 0
/
wordnet.R
104 lines (74 loc) · 2.64 KB
/
wordnet.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#### Using WordNet #####
#' Explores hypernyms, hyponyms and synonyms through the wordnet package.

## Installation
# 1. Download and install a Java runtime (JRE) and development kit (JDK).
# 2. install.packages  (the original note does not name the package here;
#    presumably rJava - TODO confirm)
# 3. Sys.setenv(JAVA_HOME="C:/Program Files/Java/jdk-18/")
# 4. library(rJava)
# 5. install.packages("wordnet")
library(wordnet)

# Initialise the WordNet dictionary.
# If WNHOME has not been set yet, set it as shown below (see the README for
# more detail).
# NOTE(review): presumably WNHOME has to be set before initDict() is called
# for it to be picked up - confirm against the package documentation.
# Sys.setenv(WNHOME = "C:/Program Files (x86)/WordNet/2.1")
initDict()

# Show the dictionary in use and the filter types that getTermFilter() accepts.
getDict()
getFilterTypes()

# Pointer symbols used with getRelatedSynsets() further down:
#   !  antonym
#   @  hypernym
#   &  similar (adjectives only)
#   ~  hyponym
## Getting synonyms
# Option a: use the lower-level calls - build a term filter, fetch the
# matching index term(s), then ask for the synonyms of the first one.
filter <- getTermFilter("ExactMatchFilter", "company", TRUE)
terms <- getIndexTerms("NOUN", 1, filter)
getSynonyms(terms[[1]])

# Option b: the synonyms() helper, one word at a time.
syn <- synonyms("spouse", "NOUN")

# Option b for a list of words.
# The lookups are collected in `results`, named by the query word, so they can
# be reused after printing. Previously `results` was created as an empty list
# and never filled, so the synonyms were printed and then lost.
teststring <- c("staff", "spouse", "cancer")
results <- lapply(
  setNames(nm = teststring),
  function(word) synonyms(word, "NOUN")
)
for (word in teststring) {
  print(results[[word]])
}
## Similar words - these lookups often produce empty lists.
# Approach: go up from the word to its hypernyms ("@"), then come back down
# to the hyponyms ("~") of one of those hypernyms.
filter <- getTermFilter("ExactMatchFilter", "spouse", TRUE)
terms <- getIndexTerms("NOUN", 1, filter)
synsets <- getSynsets(terms[[1]])

# Hypernyms of the first synset found for "spouse".
related <- getRelatedSynsets(synsets[[1]], "@")
sapply(related, getWord)

# Hyponyms of the second hypernym in that result.
related2 <- getRelatedSynsets(related[[2]], "~")
sapply(related2, getWord)
similar <- unlist(sapply(related2, getWord))

# Print every similar word that also appears in stmdata$vocab.
# NOTE(review): `stmdata` is not created in this script; it has to exist in
# the session already - confirm where it is built.
for (word in similar) {
  if (!is.element(word, stmdata$vocab)) {
    next
  }
  print(word)
}
# Use hypernyms to get the overarching name for terms.
# Group terms accordingly - or replace terms with their hypernyms in the text.
# Examples:
# "health centre" --> institution, establishment --> organisation
# "hospital" --> medical building --> building, edifice
# "doctor" --> medical practitioner --> health professional
# "asthma" --> respiratory disease --> disease
filter <- getTermFilter("WildCardFilter", "asthma", TRUE)
terms <- getIndexTerms("NOUN", 1, filter)
synsets <- getSynsets(terms[[1]])

# Level 1: hypernyms of the first synset found for "asthma".
related <- getRelatedSynsets(synsets[[1]], "@")
sapply(related, getWord)

# Level 2: hypernyms of the first level-1 synset.
related2 <- getRelatedSynsets(related[[1]], "@")
sapply(related2, getWord)

# Level 3: hypernyms of the first level-2 synset. This previously started
# from related[[1]] again, which just repeated level 2 instead of climbing.
related3 <- getRelatedSynsets(related2[[1]], "@")
sapply(related3, getWord)
## Hyponyms
# Look up "medical practitioner" and collect the words of its hyponym synsets.
filter <- getTermFilter("ExactMatchFilter", "medical practitioner", TRUE)
terms <- getIndexTerms("NOUN", 1, filter)
synsets <- getSynsets(terms[[1]])
# Pointer symbols: ! antonym, @ hypernym, & similar, ~ hyponym (used here).
related <- getRelatedSynsets(synsets[[1]], "~")

# Flat vector holding the words of every hyponym synset.
hypon <- unlist(lapply(related, function(synset) getWord(synset)))

# These lists can be very long, and a second level of hyponyms may be needed
# to match the vocabulary used in the text.
# Print the hyponyms that appear in stmdata$vocab.
# NOTE(review): `stmdata` is not created in this script; it has to exist in
# the session already - confirm where it is built.
for (el in hypon) {
  found <- el %in% stmdata$vocab
  if (found) {
    print(el)
  }
}