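# Page header captured from the Hugging Face file view (not part of the program);
# kept as comments so the module can be parsed:
# saritha5's picture | Update app.py | 757e41f | raw | history blame | 5.14 kB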
import os
import sys
import random
import statistics
import numpy as np
import matplotlib.pyplot as plt
import threading
import time
import queue
sys.path.append(os.path.abspath("../lib"))
sys.path.append(os.path.abspath("../supv"))
sys.path.append(os.path.abspath("../text"))
from util import *
from sampler import *
from tnn import *
from txproc import *
import streamlit as st
# e-mail domain names; not referenced anywhere else in the visible part of this file
emailDoms = ["yahoo.com", "gmail.com", "hotmail.com", "aol.com"]
# page title of the Streamlit app, rendered before any prediction work starts
st.title("Duplicate Records Prediction")
def printNgramVec(ngv):
    """
    print the non zero entries of an ngram vector

    Writes the header line "ngram vector" and then one "<index> <value>" line
    for every position whose value is greater than zero.
    """
    print("ngram vector")
    for pos, count in enumerate(ngv):
        if not count > 0:
            # zero (or negative) entries are not shown
            continue
        print("{} {}".format(pos, count))
def createNegMatch(tdata, ri):
    """
    create negative match by randomly selecting another record

    tdata : indexable collection of records
    ri : index of the record that must not be selected

    Returns tdata[nri] for a randomly drawn index nri different from ri.
    Raises ValueError when tdata holds a single record and ri is its index,
    because no other record exists to draw.
    """
    if len(tdata) == 1 and ri == 0:
        # index 0 is the only candidate, so the redraw loop below could never
        # terminate (assumes randomInt(0, 0) can only yield 0 - TODO confirm)
        raise ValueError("need at least two records to create a negative match")

    lastIndex = len(tdata) - 1
    nri = randomInt(0, lastIndex)
    # redraw until the index differs from the record being matched
    while nri == ri:
        nri = randomInt(0, lastIndex)
    return tdata[nri]
def createNgramCreator():
    """
    build the character ngram creator

    Constructs CharNGram(["lcc", "ucc", "dig"], 3, True), registers the special
    characters @ # _ - . through addSpChar, passes "$" to setWsRepl, calls
    finalize and returns the configured object.
    """
    creator = CharNGram(["lcc", "ucc", "dig"], 3, True)
    creator.addSpChar(["@", "#", "_", "-", "."])
    creator.setWsRepl("$")
    creator.finalize()
    return creator
def getSim(rec, incOutput=True):
    """
    get rec pair similarity

    rec is indexed as two six field records laid end to end: field i is
    compared with field i + 6. Field 3 is scored with levenshteinSimilarity,
    every other field with cosineSimilarity over the vectors returned by
    toMgramCount of the module level cng object. The six scores are turned
    into a string by toStrFromList(scores, 6); when incOutput is true, ","
    and the last element of rec are appended.
    """
    scores = []
    for pos in range(6):
        left, right = rec[pos], rec[pos + 6]
        if pos != 3:
            leftVec = cng.toMgramCount(left)
            rightVec = cng.toMgramCount(right)
            score = cosineSimilarity(leftVec, rightVec)
        else:
            score = levenshteinSimilarity(left, right)
        scores.append(score)

    simStr = toStrFromList(scores, 6)
    if incOutput:
        return simStr + "," + rec[-1]
    return simStr
class SimThread(threading.Thread):
    """
    multi threaded similarity calculation

    Worker that repeatedly takes a record pair from its work queue, computes
    the similarity string with getSim and either prints it (no output queue)
    or places it on the output queue.

    run() reads exitFlag, workQuLock and outQuLock from module scope; they
    must exist there before start() is called.
    """

    def __init__(self, tName, cng, qu, incOutput, outQu, outQuSize):
        """
        initialize

        tName : name label stored on the worker
        cng : ngram creator, stored on the instance
        qu : work queue polled for record pairs
        incOutput : forwarded to getSim as its incOutput argument
        outQu : queue receiving similarity strings, or None to print them
        outQuSize : size limit passed to enqueue for outQu
        """
        super().__init__()
        self.tName = tName
        self.cng = cng
        self.qu = qu
        self.incOutput = incOutput
        self.outQu = outQu
        self.outQuSize = outQuSize

    def run(self):
        """ execution: poll the work queue until the module level exitFlag is set """
        while not exitFlag:
            rec = dequeue(self.qu, workQuLock)
            if rec is None:
                # nothing to do yet; yield briefly instead of spinning at full speed
                time.sleep(0.01)
                continue
            srec = getSim(rec, self.incOutput)
            # use the queue handed to this worker, not a same-named global
            if self.outQu is None:
                print(srec)
            else:
                enqueue(srec, self.outQu, outQuLock, self.outQuSize)
def createThreads(nworker, cng, workQu, incOutput, outQu, outQuSize):
    """
    create worker threads

    Builds nworker SimThread objects named "Thread-1" .. "Thread-<nworker>",
    starts each one right after it is created and returns them as a list.
    """
    workers = []
    for num in range(1, nworker + 1):
        worker = SimThread("Thread-" + str(num), cng, workQu, incOutput, outQu, outQuSize)
        worker.start()
        workers.append(worker)
    return workers
def enqueue(rec, qu, quLock, qSize, retryDelay=1):
    """
    enqueue record

    rec : item to put on the queue
    qu : target queue
    quLock : lock guarding access to qu
    qSize : size limit; the item is only put while qu.qsize() < qSize - 1
    retryDelay : seconds to wait before re-checking a full queue

    Returns once the item has been queued. The wait happens only when the
    queue is full, so a successful put is not followed by a sleep.
    """
    while True:
        # the context manager releases the lock even if put() raises
        with quLock:
            if qu.qsize() < qSize - 1:
                qu.put(rec)
                return
        # queue is at its limit: back off outside the lock so consumers can drain it
        time.sleep(retryDelay)
def dequeue(qu, quLock):
    """
    dequeue record

    qu : queue to read from
    quLock : lock guarding access to qu

    Returns the next item, or None when the queue is empty at the time of the
    check.
    """
    # the context manager guarantees the lock is released on every exit path
    with quLock:
        if qu.empty():
            return None
        return qu.get()
# default inputs handed to predict_main at the bottom of this script
# file with the new records to be checked
test_file = 'pers_new_dup.txt'
# file with the existing records each new record is paired with
exist_file = 'pers_exist.txt'
# properties file passed to the FeedForwardNetwork constructor
prop_file = 'tnn_disamb.properties'
def predict_main(test_file, exist_file, prop_file):
    """
    predict with neural network model

    test_file : path of the file with new records
    exist_file : path of the file with existing records
    prop_file : properties file passed to FeedForwardNetwork

    Every record of test_file is paired with every record of exist_file. The
    pairs are scored by SimThread workers, the similarity vectors are fed to
    the network and the new record is printed together with the highest
    predicted value. Returns None.
    """
    # SimThread.run and getSim look these names up at module scope; created as
    # plain locals they are invisible to the workers (NameError) and setting
    # exitFlag at the end would never stop them
    global workQuLock, outQuLock, exitFlag, cng, outQu

    #multi threading related
    workQuLock = threading.Lock()
    outQuLock = threading.Lock()
    exitFlag = False

    nworker = 1
    regr = FeedForwardNetwork(prop_file)
    regr.buildModel()
    cng = createNgramCreator()

    #create threads
    qSize = 100
    workQu = queue.Queue(qSize)
    outQu = queue.Queue(qSize)
    threads = createThreads(nworker, cng, workQu, False, outQu, qSize)

    try:
        for nrec in fileRecGen(test_file):
            srecs = list()
            ecount = 0
            for erec in fileRecGen(exist_file):
                rec = nrec.copy()
                rec.extend(erec)

                enqueue(rec, workQu, workQuLock, qSize)
                # pick up a finished result, if any, so the out queue keeps moving
                srec = dequeue(outQu, outQuLock)
                if srec is not None:
                    srecs.append(strToFloatArray(srec))
                ecount += 1

            #wait til workq queue is drained
            while not workQu.empty():
                time.sleep(0.01)

            #drain out queue
            while len(srecs) < ecount:
                srec = dequeue(outQu, outQuLock)
                if srec is not None:
                    srecs.append(strToFloatArray(srec))
                else:
                    # worker still busy with the last pairs
                    time.sleep(0.01)

            #predict
            sims = FeedForwardNetwork.predict(regr, srecs)
            # flatten to one value per pair (presumably predict returns an (n, 1) array - TODO confirm)
            sims = sims.reshape(sims.shape[0])
            print(nrec, max(sims))
    finally:
        # always signal the workers, also when an error interrupts the loop
        exitFlag = True
        for worker in threads:
            # bounded wait so a worker stuck on a full out queue cannot hang the caller
            worker.join(timeout=5)
# run the prediction with the default file names defined above; results are written with print()
predict_main(test_file,exist_file,prop_file)
# header rendered by Streamlit once predict_main has returned
st.header("End")