-
Notifications
You must be signed in to change notification settings - Fork 0
/
document_analyzer.py
113 lines (86 loc) · 2.97 KB
/
document_analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import math
import re
from collections import Counter

import html2text
import requests
# TF RELATED
def process_document(URL):
    """Download the page at ``URL`` and return its text as UTF-8 bytes.

    The response body is treated as HTML, converted to text with
    ``html2text`` and then UTF-8 encoded.

    Args:
        URL: Address of the document to fetch.

    Returns:
        bytes: The converted document text, UTF-8 encoded.

    Any exception raised by ``requests`` while fetching propagates to the
    caller; nothing is caught here.
    """
    # The context manager closes the session (and its pooled connections)
    # when the request is done, instead of leaking one session per call.
    with requests.Session() as session:
        # Send a browser-like User-Agent. update() adds it on top of the
        # session's default headers rather than replacing all of them.
        session.headers.update({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:53.0) Gecko/20100101 Firefox/53.0'})
        # get document in html form
        response = session.get(URL)
        raw_html = response.text
    # convert html to text, then encode so the result is bytes
    return html2text.html2text(raw_html).encode('utf-8')
def generate_term_count_dict(document):
    """Count how often each term occurs in ``document``.

    A term is a maximal run of ASCII letters, lower-cased. Runs shorter than
    two characters are ignored.

    Args:
        document: Text to scan, either bytes (as returned by
            ``process_document``) or ``str``.

    Returns:
        dict: Maps each term to its number of occurrences. Keys have the
        same type as the input (bytes keys for bytes input, ``str`` keys for
        ``str`` input).
    """
    # A regex pattern must have the same type as the text it scans, so pick
    # the str pattern for str input and the bytes pattern otherwise.
    pattern = "[a-zA-Z]+" if isinstance(document, str) else b"[a-zA-Z]+"
    counts = Counter(
        term.lower()
        for term in re.findall(pattern, document)
        if len(term) >= 2
    )
    # Hand back a plain dict, which is what callers have always received.
    return dict(counts)
def sum_values_in_dict(dict):
    """Return the sum of all values stored in ``dict``.

    Args:
        dict: Mapping whose values are numbers. The name shadows the builtin
            type but is kept so existing keyword callers keep working.

    Returns:
        The total of the values; ``0`` for an empty mapping.
    """
    # Call the builtin sum() on the values instead of rebinding the name
    # ``sum`` to a hand-maintained accumulator.
    return sum(dict.values())
def calculate_tf(term_count, num_of_terms):
    """Return the term frequency ``term_count / num_of_terms``.

    Args:
        term_count: Number of occurrences of one term in a document.
        num_of_terms: Total number of term occurrences in that document.

    Returns:
        float: The ratio of the two counts, or ``0.0`` when
        ``num_of_terms`` is zero.
    """
    if num_of_terms == 0:
        # A document with no terms cannot contain the term at all, so report
        # a frequency of zero instead of raising ZeroDivisionError.
        return 0.0
    # Coerce the divisor here, as calculate_idf does, so the result is a true
    # ratio even when a caller passes two ints.
    return term_count / float(num_of_terms)
# TF is per document: each word has its own term frequency within each document
def generate_term_tf_dict(term_count_dict):
    """Convert one document's term counts into term frequencies.

    Args:
        term_count_dict: Maps each term to its occurrence count in a single
            document.

    Returns:
        dict: Maps each term to ``calculate_tf(count, total)``, where
        ``total`` is the sum of all counts converted to ``float``.
    """
    # The total is the same for every term, so compute it once up front.
    total_terms = float(sum_values_in_dict(term_count_dict))
    return {
        term: calculate_tf(count, total_terms)
        for term, count in term_count_dict.items()
    }
# IDF RELATED
def generate_set_of_all_terms(URL_termcountdict_dict):
    """Collect every distinct term that occurs in any document.

    Args:
        URL_termcountdict_dict: Maps each URL to that document's
            term -> count dictionary.

    Returns:
        list: Each distinct term exactly once, in the order it is first
        encountered while iterating the documents. Despite the function
        name, the result is a list, not a ``set``.
    """
    # Membership is tested against a set, so each check is O(1) rather than a
    # scan of the growing result list. The list keeps first-seen order.
    seen = set()
    ordered_terms = []
    for term_count_dict in URL_termcountdict_dict.values():
        for term in term_count_dict:
            if term not in seen:
                seen.add(term)
                ordered_terms.append(term)
    return ordered_terms
def calculate_idf(num_of_docs_containing_term, num_of_docs):
    """Return the inverse document frequency of a term.

    Args:
        num_of_docs_containing_term: Number of documents the term occurs in.
        num_of_docs: Total number of documents.

    Returns:
        float: Natural logarithm of
        ``num_of_docs / num_of_docs_containing_term``.
    """
    # float() on the divisor forces a true (non-integer) division.
    docs_per_containing_doc = num_of_docs / float(num_of_docs_containing_term)
    return math.log(docs_per_containing_doc)
# IDF is corpus-wide: each word has a single inverse document frequency shared by all documents
def generate_term_idf_dict(terms_set, URL_termcountdict_dict):
    """Compute the inverse document frequency of every term.

    Args:
        terms_set: Iterable of the terms to score.
        URL_termcountdict_dict: Maps each URL to that document's
            term -> count dictionary.

    Returns:
        dict: Maps each term to ``calculate_idf(df, n)``, where ``df`` is the
        number of documents whose dictionary contains the term and ``n`` is
        the number of documents.
    """
    total_docs = len(URL_termcountdict_dict)
    idf_by_term = {}
    for term in terms_set:
        # Document frequency: how many per-document dictionaries hold the term.
        docs_with_term = sum(
            1
            for counts in URL_termcountdict_dict.values()
            if term in counts
        )
        idf_by_term[term] = calculate_idf(docs_with_term, total_docs)
    return idf_by_term
def get_tfidf_scores(URLs):
    """Fetch every URL and score each term by its summed TF-IDF.

    For each document the term counts and term frequencies are computed.
    The IDF of every term is then derived from the whole collection, and a
    term's final score is the sum of ``tf * idf`` over all documents that
    contain it.

    Args:
        URLs: Iterable of document URLs. A URL that appears more than once is
            fetched and scored only once, which matches how documents are
            counted for the IDF.

    Returns:
        dict: Maps each term to its summed TF-IDF score.

    Side effects:
        Performs one download per distinct URL (via ``process_document``)
        and prints the resulting dictionary before returning it.
    """
    # save count of each term for each URL
    URL_termcountdict_dict = {}
    # save tf of each term for each URL
    URL_termtfdict_dict = {}
    for URL in URLs:
        if URL in URL_termcountdict_dict:
            # Already processed: skip the duplicate so it is neither
            # downloaded again nor added to the scores a second time.
            continue
        doc = process_document(URL)
        term_count_dict = generate_term_count_dict(doc)
        URL_termcountdict_dict[URL] = term_count_dict
        URL_termtfdict_dict[URL] = generate_term_tf_dict(term_count_dict)

    terms_set = generate_set_of_all_terms(URL_termcountdict_dict)
    term_idf_dict = generate_term_idf_dict(terms_set, URL_termcountdict_dict)

    term_tfidf_dict = {}
    # Walk the collected per-document data instead of iterating ``URLs`` a
    # second time, so one-shot iterables (e.g. generators) work as well.
    for term_tf_dict in URL_termtfdict_dict.values():
        # Each document's tf dict holds exactly the terms it contains.
        for term, tf in term_tf_dict.items():
            term_tfidf_dict[term] = (
                term_tfidf_dict.get(term, 0.0) + tf * term_idf_dict[term]
            )
    # Existing behaviour kept: the scores are printed as well as returned.
    print(term_tfidf_dict)
    return term_tfidf_dict