saritha5 committed on
Commit 1d990cf · 1 Parent(s): c5f0a6a

Create app.py

Files changed (1)
  1. app.py +188 -0
app.py ADDED
@@ -0,0 +1,188 @@
import os
import sys
import random
import statistics
import numpy as np
import matplotlib.pyplot as plt
import threading
import time
import queue
import streamlit as st
sys.path.append(os.path.abspath("../lib"))
sys.path.append(os.path.abspath("../supv"))
sys.path.append(os.path.abspath("../text"))
from util import *
from sampler import *
from tnn import *
from txproc import *

emailDoms = ["yahoo.com", "gmail.com", "hotmail.com", "aol.com"]

st.title("Duplicate Records Prediction")

def printNgramVec(ngv):
    """
    print ngram vector
    """
    print("ngram vector")
    for i in range(len(ngv)):
        if ngv[i] > 0:
            print("{} {}".format(i, ngv[i]))

def createNegMatch(tdata, ri):
    """
    create negative match by randomly selecting another record
    """
    nri = randomInt(0, len(tdata)-1)
    while nri == ri:
        nri = randomInt(0, len(tdata)-1)
    return tdata[nri]

def createNgramCreator():
    """ create ngram creator """
    cng = CharNGram(["lcc", "ucc", "dig"], 3, True)
    spc = ["@", "#", "_", "-", "."]
    cng.addSpChar(spc)
    cng.setWsRepl("$")
    cng.finalize()
    return cng

def getSim(rec, incOutput=True):
    """ get rec pair similarity """
    #print(rec)
    sim = list()
    for i in range(6):
        #print("field " + str(i))
        if i == 3:
            s = levenshteinSimilarity(rec[i], rec[i+6])
        else:
            ngv1 = cng.toMgramCount(rec[i])
            ngv2 = cng.toMgramCount(rec[i+6])
            #printNgramVec(ngv1)
            #printNgramVec(ngv2)
            s = cosineSimilarity(ngv1, ngv2)
        sim.append(s)
    ss = toStrFromList(sim, 6)
    srec = ss + "," + rec[-1] if incOutput else ss
    return srec

class SimThread(threading.Thread):
    """ multi threaded similarity calculation """

    def __init__(self, tName, cng, qu, incOutput, outQu, outQuSize):
        """ initialize """
        threading.Thread.__init__(self)
        self.tName = tName
        self.cng = cng
        self.qu = qu
        self.incOutput = incOutput
        self.outQu = outQu
        self.outQuSize = outQuSize

    def run(self):
        """ execution """
        while not exitFlag:
            rec = dequeue(self.qu, workQuLock)
            if rec is not None:
                srec = getSim(rec, self.incOutput)
                if self.outQu is None:
                    print(srec)
                else:
                    enqueue(srec, self.outQu, outQuLock, self.outQuSize)

def createThreads(nworker, cng, workQu, incOutput, outQu, outQuSize):
    """ create worker threads """
    threadList = list(map(lambda i : "Thread-" + str(i+1), range(nworker)))
    threads = list()
    for tName in threadList:
        thread = SimThread(tName, cng, workQu, incOutput, outQu, outQuSize)
        thread.start()
        threads.append(thread)
    return threads


def enqueue(rec, qu, quLock, qSize):
    """ enqueue record """
    queued = False
    while not queued:
        quLock.acquire()
        if qu.qsize() < qSize - 1:
            qu.put(rec)
            queued = True
        quLock.release()
        time.sleep(1)

def dequeue(qu, quLock):
    """ dequeue record """
    rec = None
    quLock.acquire()
    if not qu.empty():
        rec = qu.get()
    quLock.release()

    return rec

test_file = 'pers_new_dup.txt'
exist_file = 'pers_exist.txt'
prop_file = 'tnn_disamb.properties'

def predict_main(test_file, exist_file, prop_file):
    """ predict with neural network model """
    #multi threading related state, shared with the worker threads and getSim()
    #through module scope
    global workQuLock, outQuLock, exitFlag, cng
    workQuLock = threading.Lock()
    outQuLock = threading.Lock()
    exitFlag = False

    newFilePath = test_file
    existFilePath = exist_file
    nworker = 1
    prFile = prop_file

    regr = FeedForwardNetwork(prFile)
    regr.buildModel()
    cng = createNgramCreator()

    #create threads
    qSize = 100
    workQu = queue.Queue(qSize)
    outQu = queue.Queue(qSize)
    threads = createThreads(nworker, cng, workQu, False, outQu, qSize)

    for nrec in fileRecGen(newFilePath):
        srecs = list()
        ecount = 0
        y_pred = []
        #print("processing ", nrec)
        for erec in fileRecGen(existFilePath):
            rec = nrec.copy()
            rec.extend(erec)
            #print(rec)

            enqueue(rec, workQu, workQuLock, qSize)
            srec = dequeue(outQu, outQuLock)
            if srec is not None:
                srecs.append(strToFloatArray(srec))
            ecount += 1

        #wait till work queue is drained
        while not workQu.empty():
            pass

        #drain out queue
        while len(srecs) < ecount:
            srec = dequeue(outQu, outQuLock)
            if srec is not None:
                srecs.append(strToFloatArray(srec))
        #predict
        simMax = 0
        sims = FeedForwardNetwork.predict(regr, srecs)
        sims = sims.reshape(sims.shape[0])
        y_pred.append(max(sims))
        #print("{} {:.3f}".format(nrec, y_pred))
        print(nrec, max(y_pred))

    #signal the worker threads to exit
    exitFlag = True

predict_main(test_file, exist_file, prop_file)

st.header("End")
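
Usage note: app.py is a Streamlit script, so it is started with Streamlit's CLI (streamlit run app.py). It expects pers_new_dup.txt, pers_exist.txt and tnn_disamb.properties in the working directory, and the util, sampler, tnn and txproc modules on the ../lib, ../supv and ../text path entries it appends to sys.path. The sketch below is a hypothetical variant, not part of the committed code: it replaces the final predict_main(...) call in app.py and lets the user override the three file names from the UI before running the prediction. The widget labels and defaults are assumptions.

    # hypothetical replacement for the trailing predict_main(...) call in app.py;
    # assumes it runs inside app.py where predict_main is already defined
    test_file = st.text_input("New records file", "pers_new_dup.txt")
    exist_file = st.text_input("Existing records file", "pers_exist.txt")
    prop_file = st.text_input("TNN properties file", "tnn_disamb.properties")

    # run the duplicate prediction only when the user asks for it
    if st.button("Run duplicate prediction"):
        predict_main(test_file, exist_file, prop_file)
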