# Testing_bangla.py
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the NLTK 'punkt' tokenizer models; fetch them once if missing.
nltk.download('punkt', quiet=True)
# Read the original document (o.txt) and the suspicious document (s.txt),
# joining their lines into single strings.
with open("o.txt", "r", encoding="utf8") as f:
    orig = f.read().replace("\n", " ")
with open("s.txt", "r", encoding="utf8") as f2:
    plag = f2.read().replace("\n", " ")
# Word tokenisation
tokens_o = word_tokenize(orig)
tokens_p = word_tokenize(plag)

# Stop word and punctuation removal
# stop_words = ['তাই','সুতরাং','তথাপি','অধিকন্তু','অতএব','পরিশেষে','যদিও','অবশেষে','এখন','তবে','হওয়া','যেমন','তেমন','হইতে','হতে','থেকে','চেয়ে']
stop_words = []
punctuations = ['"', '.', '(', ')', ',', ';', ':', "''", '``']
# punctuations = []
# Note: the Bangla danda '।' and '?' are not filtered out here, so the custom
# sentence tokenizer below can still split on them.
tokens_o = [w for w in tokens_o if w not in stop_words and w not in punctuations]
tokens_p = [w for w in tokens_p if w not in stop_words and w not in punctuations]
print(tokens_o)
print(tokens_p)
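# Illustrative example (not from the input files): a token list like
# ['শিক্ষা', ',', 'জীবন', '।'] filters to ['শিক্ষা', 'জীবন', '।'], so the comma is
# dropped while the sentence delimiter '।' survives.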
# Trigram similarity measures: build word trigrams for each document and count
# how many trigrams of the suspicious text also occur in the original.
trigrams_o = []
for i in range(len(tokens_o) - 2):
    t = (tokens_o[i], tokens_o[i + 1], tokens_o[i + 2])
    trigrams_o.append(t)

s = 0  # number of suspicious trigrams that match a trigram of the original
trigrams_p = []
for i in range(len(tokens_p) - 2):
    t = (tokens_p[i], tokens_p[i + 1], tokens_p[i + 2])
    trigrams_p.append(t)
    if t in trigrams_o:
        s += 1
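# Worked example (illustrative): for tokens_o = ['a', 'b', 'c', 'd'] the
# trigrams are ('a','b','c') and ('b','c','d'); if tokens_p = ['a', 'b', 'c'],
# its single trigram ('a','b','c') matches, giving s == 1 with
# len(trigrams_o) == 2 and len(trigrams_p) == 1.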
print('Document plagiarism ->')

# Jaccard coefficient = |S(o) ∩ S(p)| / |S(o) ∪ S(p)|.
# The union size is len(trigrams_o) + len(trigrams_p) - s, so the matched
# count s must be subtracted from the denominator.
J = s / (len(trigrams_o) + len(trigrams_p) - s)
print('By using Jaccard Coefficient Method:', J * 100, '%')
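# Continuing the worked example above: J = 1 / (2 + 1 - 1) = 0.5, i.e. 50%.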
# Containment measure = |S(o) ∩ S(p)| / |S(p)|: the fraction of the suspicious
# document's trigrams that are contained in the original.
C = s / len(trigrams_p)
print('By using Containment Measure Method:', C * 100, '%')
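# In the worked example, C = 1 / 1 = 1.0, i.e. 100%.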
# Longest common subsequence: standard dynamic-programming algorithm.
# Returns the length of the LCS of two sentences, compared word by word.
def lcs(l1, l2):
    s1 = word_tokenize(l1)
    s2 = word_tokenize(l2)
    # dp[i][j] stores the LCS length of s2[:i] and s1[:j]
    dp = [[0] * (len(s1) + 1) for _ in range(len(s2) + 1)]
    for i in range(1, len(s2) + 1):
        for j in range(1, len(s1) + 1):
            if s2[i - 1] == s1[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return dp[len(s2)][len(s1)]
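# Illustrative check: lcs('the cat sat', 'the cat ran') == 2, since
# ['the', 'cat'] is the longest word sequence common to both in order.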
# Sentence tokenisation (custom approach): rebuild sentences from the filtered
# word tokens, ending a sentence at the Bangla danda '।' or at '?'.
def sentence_tokenize(word_tokens):
    words = ''
    sentence = []
    for word in word_tokens:
        if '।' in word or '?' in word:
            words = words + ' ' + word
            sentence.append(words)
            words = ''
        else:
            if len(words) == 0:
                words = word
            else:
                words = words + ' ' + word
    return sentence
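# Illustrative example: sentence_tokenize(['আমি', 'ভালো', '।', 'তুমি', '?'])
# returns ['আমি ভালো ।', 'তুমি ?']; any tokens after the final delimiter
# would be dropped.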
sent_o = sentence_tokenize(tokens_o)
sent_p = sentence_tokenize(tokens_p)
print(sent_o)
print(sent_p)
# For each sentence of the suspicious text, take the maximum LCS length
# against any sentence of the original, and accumulate these maxima.
max_lcs = 0
sum_lcs = 0
for i in sent_p:
    for j in sent_o:
        l = lcs(i, j)
        max_lcs = max(max_lcs, l)
    sum_lcs += max_lcs
    max_lcs = 0

# Normalise by the total number of word tokens in the suspicious text.
score = sum_lcs / len(tokens_p)
print('By using Longest Common Subsequence:', score * 100, '%')
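# Sanity note: for identical input files the Jaccard and containment measures
# both report 100%, and the LCS score is close to 100% (only tokens after the
# last sentence delimiter are excluded from the sentence-level comparison).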