Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import queue
import random
import statistics
import sys
import threading
import time

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st

sys.path.append(os.path.abspath("../lib"))
sys.path.append(os.path.abspath("../supv"))
sys.path.append(os.path.abspath("../text"))
from util import *
from sampler import *
from tnn import *
from txproc import *
|
17 |
+
|
18 |
+
# candidate email domains
# NOTE(review): appears unused in the visible part of this file — confirm
emailDoms = ["yahoo.com", "gmail.com", "hotmail.com", "aol.com"]

# NOTE(review): `st` is used here but `import streamlit as st` is missing
# from the import block — this raises NameError at startup; confirm/fix.
st.title("Duplicate Records Prediction")
|
21 |
+
|
22 |
+
def printNgramVec(ngv):
	"""
	Print the non-zero entries of an ngram count vector.

	ngv : sequence of ngram counts; positive entries are printed as
	    one "index count" pair per line, after a header line.
	"""
	print("ngram vector")
	# enumerate instead of range(len(...)) indexing
	for i, cnt in enumerate(ngv):
		if cnt > 0:
			print("{} {}".format(i, cnt))
30 |
+
|
31 |
+
def createNegMatch(tdata, ri):
	"""
	Create a negative (non-matching) sample by randomly selecting a
	record other than the one at index ri.

	tdata : list of records
	ri : index of the record to exclude
	Returns the randomly chosen record.
	Raises ValueError when tdata has fewer than 2 records — the
	original retry loop would otherwise never terminate.
	"""
	if len(tdata) < 2:
		raise ValueError("need at least 2 records to create a negative match")
	nri = randomInt(0, len(tdata)-1)
	# retry until we land on a different record
	while nri == ri:
		nri = randomInt(0, len(tdata)-1)
	return tdata[nri]
|
39 |
+
|
40 |
+
def createNgramCreator():
	"""
	Build and finalize the character n-gram creator used for
	field similarity (lower case, upper case and digit classes,
	n-gram size 3, plus common email/name special characters).
	"""
	creator = CharNGram(["lcc", "ucc", "dig"], 3, True)
	creator.addSpChar(["@", "#", "_", "-", "."])
	creator.setWsRepl("$")
	creator.finalize()
	return creator
|
48 |
+
|
49 |
+
def getSim(rec, incOutput=True, ngramCreator=None):
	"""
	Get record-pair similarity as a comma separated string.

	rec : two 6-field records concatenated (fields 0-5 vs 6-11),
	    optionally followed by an output label as the last element
	incOutput : when True, append the record's last element (label)
	ngramCreator : n-gram creator to use; defaults to the module-level
	    `cng` for backward compatibility. NOTE(review): in this file a
	    `cng` is only ever assigned as a local inside predict_main, so
	    the global fallback can raise NameError — callers should pass
	    one explicitly.
	"""
	nc = ngramCreator if ngramCreator is not None else cng
	sim = list()
	for i in range(6):
		if i == 3:
			# field 3 uses edit-distance based similarity
			s = levenshteinSimilarity(rec[i], rec[i+6])
		else:
			# other fields: cosine similarity of char n-gram counts
			ngv1 = nc.toMgramCount(rec[i])
			ngv2 = nc.toMgramCount(rec[i+6])
			s = cosineSimilarity(ngv1, ngv2)
		sim.append(s)
	ss = toStrFromList(sim, 6)
	srec = ss + "," + rec[-1] if incOutput else ss
	return srec
|
67 |
+
|
68 |
+
|
69 |
+
class SimThread (threading.Thread):
	"""
	Worker thread for multi threaded similarity calculation.

	Pulls record pairs from an input queue, scores them with getSim()
	and either prints the result or pushes it to an output queue.

	NOTE(review): run() reads module-level names exitFlag, workQuLock
	and outQuLock (and getSim's global `cng`); in this file those are
	only assigned inside predict_main as locals, so the thread body
	looks unable to resolve them — confirm intended scoping.
	"""

	def __init__(self, tName, cng, qu, incOutput, outQu, outQuSize):
		"""
		initialize

		tName : thread name
		cng : character n-gram creator (stored; see note on run())
		qu : input work queue of record pairs
		incOutput : when True, include the label in the similarity record
		outQu : output queue for results; None means print to stdout
		outQuSize : capacity bound used when enqueuing results
		"""
		threading.Thread.__init__(self)
		self.tName = tName
		self.cng = cng
		self.qu = qu
		self.incOutput = incOutput
		self.outQu = outQu
		self.outQuSize = outQuSize

	def run(self):
		""" execution loop: process records until exitFlag is set """
		while not exitFlag:
			rec = dequeue(self.qu, workQuLock)
			if rec is not None:
				srec = getSim(rec, self.incOutput)
				# bug fix: original tested the global `outQu` instead of
				# this instance's own output queue
				if self.outQu is None:
					print(srec)
				else:
					enqueue(srec, self.outQu, outQuLock, self.outQuSize)
|
92 |
+
|
93 |
+
def createThreads(nworker, cng, workQu, incOutput, outQu, outQuSize):
	"""
	Create and start nworker similarity worker threads.

	Threads are named "Thread-1" .. "Thread-n".
	Returns the list of started SimThread instances.
	"""
	threads = list()
	for i in range(nworker):
		worker = SimThread("Thread-" + str(i + 1), cng, workQu, incOutput, outQu, outQuSize)
		worker.start()
		threads.append(worker)
	return threads
|
102 |
+
|
103 |
+
|
104 |
+
def enqueue(rec, qu, quLock, qSize):
	"""
	Enqueue a record, polling once per second while the queue is full.

	rec : record to enqueue
	qu : target queue
	quLock : lock guarding the queue
	qSize : capacity bound; the queue is treated as full at qSize - 1
	    entries (preserves the original's one-slot headroom)
	"""
	queued = False
	while not queued:
		with quLock:
			if qu.qsize() < qSize - 1:
				qu.put(rec)
				queued = True
		if not queued:
			# bug fix: the original slept one second after EVERY attempt,
			# including successful ones, throttling each enqueue to ~1/sec;
			# only sleep while waiting for space
			time.sleep(1)
|
114 |
+
|
115 |
+
def dequeue(qu, quLock):
	"""
	Pop one record from the queue while holding the lock.

	Returns the record, or None when the queue is empty.
	"""
	with quLock:
		return qu.get() if not qu.empty() else None
|
124 |
+
|
125 |
+
# new records to check for duplicates
test_file = 'pers_new_dup.txt'
# existing records to compare against
exist_file = 'pers_exist.txt'
# neural network model configuration
prop_file = 'tnn_disamb.properties'
|
128 |
+
|
129 |
+
def predict_main(test_file, exist_file, prop_file):
	#multi threading related
	# NOTE(review): these locks and the exit flag are LOCAL to this
	# function, but SimThread.run() reads module-level workQuLock /
	# outQuLock / exitFlag — as written the worker threads would hit
	# NameError; confirm intended scoping.
	workQuLock = threading.Lock()
	outQuLock = threading.Lock()
	exitFlag = False

	""" predict with neural network model """
	newFilePath = test_file
	existFilePath = exist_file
	nworker = 1
	prFile = prop_file

	# build model and the shared n-gram creator
	# NOTE(review): `cng` is local here, but getSim() reads a global
	# `cng` — confirm the workers can actually resolve it.
	regr = FeedForwardNetwork(prFile)
	regr.buildModel()
	cng = createNgramCreator()

	#create threads
	qSize = 100
	workQu = queue.Queue(qSize)
	outQu = queue.Queue(qSize)
	threads = createThreads(nworker, cng, workQu, False, outQu, qSize)

	# pair every new record with every existing record and keep the
	# maximum predicted similarity per new record
	for nrec in fileRecGen(newFilePath):
		srecs = list()
		ecount = 0
		y_pred = []
		#print("processing ", nrec)
		for erec in fileRecGen(existFilePath):
			# concatenate the two records into one 12-field pair
			rec = nrec.copy()
			rec.extend(erec)
			#print(rec)

			enqueue(rec, workQu, workQuLock, qSize)
			srec = dequeue(outQu, outQuLock)
			if srec is not None:
				srecs.append(strToFloatArray(srec))
			ecount += 1

		#wait til workq queue is drained
		# NOTE(review): busy-wait — burns CPU until workers empty the queue
		while not workQu.empty():
			pass

		#drain out queue
		while len(srecs) < ecount:
			srec = dequeue(outQu, outQuLock)
			if srec is not None:
				srecs.append(strToFloatArray(srec))
		#predict
		# NOTE(review): simMax is assigned but never used
		simMax = 0
		sims = FeedForwardNetwork.predict(regr, srecs)
		sims = sims.reshape(sims.shape[0])
		y_pred.append(max(sims))
		#print("{} {:.3f}".format(nrec, y_pred))
		print(nrec, max(y_pred))

	# exitFlag = True
	# NOTE(review): the exit flag is never set, so worker threads never
	# terminate — confirm whether daemon threads or setting the flag
	# after the loop was intended.
|
185 |
+
|
186 |
+
# run prediction at import time (module-level side effect — the Streamlit
# page blocks until this returns)
predict_main(test_file, exist_file, prop_file)

# NOTE(review): `st` is never imported in this file — requires
# `import streamlit as st` at the top; as-is this raises NameError.
st.header("End")
|