### Topic Modeling for the Goethe prose and drama corpus ###
### cf http://www.matthewjockers.net/materials/dh-2014-introduction-to-text-analysis-and-topic-modeling-with-r/ ###
setwd("~/Documents/Entsagung bei Goethe/Topic Modeling") # adjust to your own working directory
# Prelims
inputDir <- "Korpus Topic Modeling"
files.v <- dir(path = inputDir, pattern = "\\.txt$")
chunk.size <- 1000 # with percentage=FALSE: words per chunk; with the default percentage=TRUE: number of chunks per text
# Text Chunking Function: splits a text either into chunk.size equal parts
# (percentage=TRUE) or into chunks of chunk.size words each (percentage=FALSE)
makeFlexTextChunks <- function(inputDir, file.name, chunk.size=1000, percentage=TRUE){
  text.file.path <- file.path(inputDir, file.name)
  text.lines.v <- scan(text.file.path, what="character", sep="\n")
  novel.v <- paste(text.lines.v, collapse=" ")
  novel.lower.v <- tolower(novel.v)
  novel.lower.l <- strsplit(novel.lower.v, "\\W")
  novel.word.v <- unlist(novel.lower.l)
  not.blanks.v <- which(novel.word.v != "")
  novel.word.v <- novel.word.v[not.blanks.v]
  x <- seq_along(novel.word.v)
  if(percentage){
    # cut the text into chunk.size equally sized chunks
    max.length <- length(novel.word.v)/chunk.size
    chunks.l <- split(novel.word.v, ceiling(x/max.length))
  } else {
    # cut the text into chunks of chunk.size words each
    chunks.l <- split(novel.word.v, ceiling(x/chunk.size))
    # if the last chunk is at most half a chunk long, merge it into the previous one
    if(length(chunks.l[[length(chunks.l)]]) <= chunk.size/2){
      chunks.l[[length(chunks.l)-1]] <-
        c(chunks.l[[length(chunks.l)-1]], chunks.l[[length(chunks.l)]])
      chunks.l[[length(chunks.l)]] <- NULL
    }
  }
  chunks.l <- lapply(chunks.l, paste, collapse=" ")
  chunks.df <- do.call(rbind, chunks.l)
  return(chunks.df)
}
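# Optional check of the two chunking modes on the first corpus file (a sketch,
# assuming at least one .txt file was found in inputDir): with percentage=TRUE
# the text is cut into chunk.size equal parts, with percentage=FALSE into
# parts of chunk.size words each.
pct.chunks.m <- makeFlexTextChunks(inputDir, files.v[1], chunk.size=10, percentage=TRUE)
fix.chunks.m <- makeFlexTextChunks(inputDir, files.v[1], chunk.size=1000, percentage=FALSE)
nrow(pct.chunks.m) # ~10 chunks
nrow(fix.chunks.m) # depends on the length of the text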
# Loop for chunking each text in the corpus directory
topic.l <- list()
for(i in seq_along(files.v)){
  chunk.m <- makeFlexTextChunks(inputDir, files.v[i], chunk.size)
  textname <- gsub("\\..*", "", files.v[i])
  segments.m <- cbind(paste(textname, 1:nrow(chunk.m), sep="_"), chunk.m)
  topic.l[[textname]] <- segments.m
}
topic.m <- do.call(rbind, topic.l)
# Convert the matrix to a data frame to prepare it for mallet
documents <- as.data.frame(topic.m, stringsAsFactors=F)
colnames(documents) <- c("id", "text")
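# Optional sanity check before handing the corpus to mallet:
str(documents)  # two character columns, "id" and "text"
nrow(documents) # total number of chunks across all texts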
# Load and run Mallet
library(rJava)
library(mallet)
mallet.instances <- mallet.import(documents$id, documents$text,
                                  "Stopwordlist_Goethe_Jan.csv", preserve.case=FALSE,
                                  token.regexp="[\\p{L}']+") # insert your own stopword list file here
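# Note: mallet reads the stoplist as a plain text file with one word per
# line; the .csv extension here is only part of this file's name.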
# Create a topic trainer object.
topic.model <- MalletLDA(num.topics=60)
topic.model$loadDocuments(mallet.instances)
vocabulary <- topic.model$getVocabulary()
# examine some of the vocabulary
vocabulary[1:50]
vocabulary[5000:5050]
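# The overall vocabulary size is also worth checking; an unexpectedly large
# value can indicate tokenization problems (e.g. OCR noise):
length(vocabulary)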
word.freqs <- mallet.word.freqs(topic.model)
# examine some of the word frequencies:
head(word.freqs)
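# The frequency table pairs each vocabulary item with its corpus-wide and
# per-document counts; words that occur in almost every chunk are good
# candidates for extending the stopword list.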
topic.model$setAlphaOptimization(40, 80) # optimize hyperparameters every 40 iterations, after 80 burn-in iterations
topic.model$train(4000) # run 4000 sampling iterations
topic.words.m <- mallet.topic.words(topic.model, smoothed=TRUE, normalized=TRUE)
# how big is the resulting matrix?
dim(topic.words.m)
# set the column names to make the matrix easier to read:
colnames(topic.words.m) <- vocabulary
# examine a specific topic
topic.num <- 8 # the topic id you wish to examine
num.top.words <- 10 # the number of top words in the topic you want to examine
mallet.top.words(topic.model, topic.words.m[topic.num,], num.top.words)
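# For a quick overview of all 60 topics, the same call can be looped over
# every row of topic.words.m, printing the ten strongest words per topic:
for(i in 1:60){
  top.words.df <- mallet.top.words(topic.model, topic.words.m[i,], 10)
  cat("Topic", i, ":", paste(top.words.df$words, collapse=" "), "\n")
}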
# Visualize topics as word clouds
# be sure you have installed the wordcloud package
library(RColorBrewer)
library(wordcloud)
topic.num <- 1
num.top.words <- 100
topic.top.words <- mallet.top.words(topic.model, topic.words.m[topic.num,], num.top.words)
wordcloud(topic.top.words$words, topic.top.words$weights, scale=c(4, .8), rot.per=0, random.order=F)
num.topics <- 60
num.top.words <- 25
for(i in 1:num.topics){
  topic.top.words <- mallet.top.words(topic.model, topic.words.m[i,], num.top.words)
  wordcloud(topic.top.words$words, topic.top.words$weights, scale=c(4, .8), rot.per=0, random.order=F)
}
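# Run interactively, the loop above draws each cloud over the previous one;
# calling devAskNewPage(TRUE) before the loop makes R pause between plots.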
# save all word clouds in one pdf file
library(RColorBrewer)
library(wordcloud)
pdf(file="Goetheclouds4.pdf")
num.topics <- 60
num.top.words <- 25
for(i in 1:num.topics){
  topic.top.words <- mallet.top.words(topic.model, topic.words.m[i,], num.top.words)
  # wordcloud draws via base graphics, so no print() wrapper is needed inside the loop
  wordcloud(topic.top.words$words, topic.top.words$weights, scale=c(4, .8), rot.per=0, random.order=F)
}
dev.off()
# topic proportions per document
doc.topics.m <- mallet.doc.topics(topic.model, smoothed=T, normalized=T)
doc.topics.m # rows = documents (segments); columns = topics; read the values as the proportion of each topic in a document
dim(doc.topics.m) ## [1] 56534 60 -> the 60 Goethe texts were chunked into 56534 segments
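# Example: the topic mixture of a single chunk. Each row of doc.topics.m is
# a probability distribution over the 60 topics, so its values sum to ~1.
doc.topics.m[1, ]
sum(doc.topics.m[1, ])        # approximately 1
which.max(doc.topics.m[1, ])  # the dominant topic of the first chunk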
# topics in whole documents (so far we only have per-segment values)
file.ids.v <- documents[,1] # the IDs of the chunked documents are stored in the documents data frame
head(file.ids.v)
file.id.l <- strsplit(file.ids.v, "_")
file.chunk.id.l <- lapply(file.id.l, rbind)
file.chunk.id.m <- do.call(rbind, file.chunk.id.l) # column 1 tells us which row of doc.topics.m belongs to which document
head(file.chunk.id.m)
# topic average for each topic in each document
doc.topics.df <- as.data.frame(doc.topics.m) # convert to a data frame so the document labels can be attached
doc.topics.df <- cbind(file.chunk.id.m[,1], doc.topics.df)
doc.topic.means.df <- aggregate(doc.topics.df[, 2:ncol(doc.topics.df)], list(doc.topics.df[,1]), mean)
doc.topic.means.df # one row per text; column 1 = the file name (named Group.1 by aggregate); the remaining columns hold the mean proportion of each topic
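# Example: rank the texts by their mean share of a single topic, here topic
# 45 (as.data.frame named the topic columns V1..V60, so topic 45 is "V45"):
ranking.df <- doc.topic.means.df[order(doc.topic.means.df$V45, decreasing=TRUE),
                                 c("Group.1", "V45")]
head(ranking.df) # the texts in which topic 45 is most prominent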
barplot(doc.topic.means.df[, "V45"], names.arg=1:60) # mean proportion of topic 45 (column V45) in each of the 60 texts
filename <- as.character(doc.topic.means.df[59, "Group.1"]) # file name of the text in row 59 -> "1809-Wahlverwandtschaften"
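# Follow-up sketch: plot the complete topic profile of that text, labelled
# with its file name (row 59, dropping the Group.1 column):
barplot(unlist(doc.topic.means.df[59, -1]), names.arg=1:60,
        main=filename, xlab="topic", ylab="mean proportion")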