# saritha5's picture
# Update app.py
# d03c834
import os
import sys
import random
import statistics
import numpy as np
import matplotlib.pyplot as plt
import threading
import time
import queue
sys.path.append(os.path.abspath("../lib"))
sys.path.append(os.path.abspath("../supv"))
sys.path.append(os.path.abspath("../text"))
from util import *
from sampler import *
from tnn import *
from txproc import *
import streamlit as st
# Common e-mail domains; not referenced anywhere else in the visible code.
emailDoms = ["yahoo.com", "gmail.com", "hotmail.com", "aol.com"]
# Page title rendered by Streamlit each time the script runs.
st.title("Duplicate Records Prediction")
def printNgramVec(ngv):
    """
    Print the non-zero entries of an ngram count vector.

    Writes an "ngram vector" heading, then one "index count" line for every
    position of ngv whose value is greater than zero.
    """
    print("ngram vector")
    for pos, cnt in enumerate(ngv):
        if cnt > 0:
            print("{} {}".format(pos, cnt))
def createNegMatch(tdata, ri):
    """
    Create a negative match by randomly selecting another record.

    Args:
        tdata: indexable collection of records.
        ri: index of the record that must not be selected.

    Returns:
        A record of tdata whose index differs from ri.

    Raises:
        ValueError: tdata is empty, or its only record is the one at ri, so
            there is no other record to pick.
    """
    lastIdx = len(tdata) - 1
    # Without this guard a single-record data set would keep drawing index 0
    # forever, because the retry loop below could never find a different index.
    if lastIdx < 0 or (lastIdx == 0 and ri == 0):
        raise ValueError("tdata has no record other than index {}".format(ri))

    nri = randomInt(0, lastIdx)
    while nri == ri:
        # drew the excluded record, try again
        nri = randomInt(0, lastIdx)
    return tdata[nri]
def createNgramCreator():
    """
    Build and return the configured character ngram creator.

    The CharNGram is constructed with the "lcc", "ucc" and "dig" character
    classes, given a list of extra special characters and "$" as whitespace
    replacement, then finalized before being returned.
    """
    ngramGen = CharNGram(["lcc", "ucc", "dig"], 3, True)
    ngramGen.addSpChar(["@", "#", "_", "-", "."])
    ngramGen.setWsRepl("$")
    ngramGen.finalize()
    return ngramGen
def getSim(rec, incOutput=True, cng=None):
    """
    Get the field-wise similarity of a record pair.

    rec holds two records back to back: field i of the first record
    (i = 0..5) is compared with field i + 6. Field 3 is compared with
    levenshteinSimilarity; every other field is converted to ngram counts
    with cng.toMgramCount and compared with cosineSimilarity.

    Args:
        rec: sequence holding the fields of both records; when incOutput is
            true its last element is appended to the result.
        incOutput: whether to append "," + rec[-1] to the similarity string.
        cng: ngram creator exposing toMgramCount(). When omitted, the
            module-level name ``cng`` is used, which keeps the original
            two-argument call working.

    Returns:
        The string produced by toStrFromList(sim, 6) for the six
        similarities, optionally followed by "," and rec[-1].

    Raises:
        NameError: no cng was passed and no module-level ``cng`` exists.
    """
    if cng is None:
        # The original code relied on a global that is only ever created as
        # a local elsewhere in this file, so look it up defensively.
        cng = globals().get("cng")
        if cng is None:
            raise NameError("name 'cng' is not defined")

    sim = list()
    for i in range(6):
        first, second = rec[i], rec[i + 6]
        if i == 3:
            s = levenshteinSimilarity(first, second)
        else:
            ngv1 = cng.toMgramCount(first)
            ngv2 = cng.toMgramCount(second)
            s = cosineSimilarity(ngv1, ngv2)
        sim.append(s)

    ss = toStrFromList(sim, 6)
    return ss + "," + rec[-1] if incOutput else ss


class SimThread(threading.Thread):
    """
    Worker thread for multi threaded similarity calculation.

    Each worker takes record pairs from its work queue, computes their
    similarity with getSim and either prints the result (no output queue) or
    puts it on the output queue. The worker keeps running until stop() is
    called or a module-level ``exitFlag`` becomes true.
    """

    # seconds a worker blocks on a queue before re-checking the stop signals
    pollSec = 0.1

    def __init__(self, tName, cng, qu, incOutput, outQu, outQuSize):
        """
        Initialize the worker.

        Args:
            tName: worker name.
            cng: ngram creator handed to getSim.
            qu: queue the record pairs are read from.
            incOutput: forwarded to getSim.
            outQu: queue receiving the similarity strings, or None to print.
            outQuSize: nominal size of outQu; kept for callers, the bound
                actually applied is the queue's own maxsize.
        """
        threading.Thread.__init__(self)
        # daemon: an idle worker must not keep the interpreter alive
        self.daemon = True
        self.tName = tName
        self.cng = cng
        self.qu = qu
        self.incOutput = incOutput
        self.outQu = outQu
        self.outQuSize = outQuSize
        self._stopEvent = threading.Event()

    def stop(self):
        """ ask the worker to finish after its current record """
        self._stopEvent.set()

    def _shouldExit(self):
        """ true once stop() was called or the module-level exitFlag is set """
        return self._stopEvent.is_set() or bool(globals().get("exitFlag", False))

    def run(self):
        """ execution: process records until asked to stop """
        while not self._shouldExit():
            # queue.Queue does its own locking, so no external lock is needed;
            # the timeout lets the loop notice a stop request while idle
            try:
                rec = self.qu.get(timeout=self.pollSec)
            except queue.Empty:
                continue

            srec = getSim(rec, self.incOutput, self.cng)
            if self.outQu is None:
                print(srec)
                continue

            # retry while the output queue is full, unless asked to stop
            while not self._shouldExit():
                try:
                    self.outQu.put(srec, timeout=self.pollSec)
                    break
                except queue.Full:
                    continue
def createThreads(nworker, cng, workQu, incOutput, outQu, outQuSize):
    """
    Create and start the worker threads.

    Builds nworker SimThread instances named "Thread-1" .. "Thread-<nworker>",
    starts each one as soon as it is created and returns them in creation
    order.
    """
    workers = []
    for num in range(1, nworker + 1):
        worker = SimThread("Thread-" + str(num), cng, workQu, incOutput, outQu, outQuSize)
        worker.start()
        workers.append(worker)
    return workers
def enqueue(rec, qu, quLock, qSize):
    """
    Enqueue a record, waiting while the queue is at capacity.

    Args:
        rec: record to add.
        qu: queue receiving the record.
        quLock: lock held around the size check and the put.
        qSize: nominal capacity; the record is added only while
            qu.qsize() < qSize - 1.
    """
    while True:
        # the context manager releases the lock even if qsize()/put() raises
        with quLock:
            if qu.qsize() < qSize - 1:
                qu.put(rec)
                return
        # Queue is at capacity: back off before retrying. Sleeping only here,
        # and outside the lock, avoids the one second pause the previous
        # version added after every successful enqueue.
        time.sleep(1)
def dequeue(qu, quLock):
    """
    Dequeue a record without blocking.

    Args:
        qu: queue to read from.
        quLock: lock held while the queue is accessed.

    Returns:
        The next record, or None when the queue is empty.
    """
    # `with` guarantees the lock is released on every path; get_nowait()
    # replaces the empty()-then-get() pair, which could block while holding
    # the lock if another consumer emptied the queue in between.
    with quLock:
        try:
            return qu.get_nowait()
        except queue.Empty:
            return None
# Default inputs passed to predict_main() at the bottom of the script
# (relative paths).
# File with the new records to be checked.
test_file = 'pers_new_dup.txt'
# File with the existing records each new record is compared against.
exist_file = 'pers_exist.txt'
# Properties file handed to FeedForwardNetwork.
prop_file = 'tnn_disamb.properties'
def predict_main(test_file, exist_file, prop_file):
    """
    Predict with the neural network model how well new records match
    existing ones.

    Every record of test_file is paired with every record of exist_file. The
    pairs are handed to the worker threads for similarity calculation, the
    model scores all pairs of a new record, and the highest score is printed
    and collected.

    Args:
        test_file: path of the file with the new records.
        exist_file: path of the file with the existing records.
        prop_file: properties file used to build the FeedForwardNetwork.

    Returns:
        List of (new record, best score) tuples, one for each new record that
        had at least one existing record to compare with.

    Raises:
        RuntimeError: all worker threads have exited while records are still
            waiting to be processed (previously this hung forever).
    """
    # The stop flag is published at module level so that code written against
    # a global exitFlag can see it.
    global exitFlag

    # multi threading related
    workQuLock = threading.Lock()
    outQuLock = threading.Lock()
    exitFlag = False
    nworker = 1

    regr = FeedForwardNetwork(prop_file)
    regr.buildModel()
    cng = createNgramCreator()

    # create threads
    qSize = 100
    workQu = queue.Queue(qSize)
    outQu = queue.Queue(qSize)
    threads = createThreads(nworker, cng, workQu, False, outQu, qSize)

    def _collect(srecs):
        """ move one finished similarity record into srecs, if available """
        srec = dequeue(outQu, outQuLock)
        if srec is None:
            return False
        srecs.append(strToFloatArray(srec))
        return True

    def _checkWorkers():
        """ fail fast instead of waiting on workers that no longer run """
        if not any(t.is_alive() for t in threads):
            raise RuntimeError("similarity worker threads are not running")

    results = []
    try:
        for nrec in fileRecGen(test_file):
            srecs = list()
            ecount = 0
            for erec in fileRecGen(exist_file):
                rec = nrec.copy()
                rec.extend(erec)
                _checkWorkers()
                enqueue(rec, workQu, workQuLock, qSize)
                # opportunistically pick up a finished result to keep the
                # output queue short
                _collect(srecs)
                ecount += 1

            # Wait until every queued pair has produced a result. This also
            # covers the work queue being drained, and sleeping between polls
            # leaves the CPU to the workers instead of busy-waiting.
            while len(srecs) < ecount:
                if not _collect(srecs):
                    _checkWorkers()
                    time.sleep(0.01)

            if not srecs:
                # no existing records, nothing to score for this record
                continue

            # predict
            sims = FeedForwardNetwork.predict(regr, srecs)
            sims = sims.reshape(sims.shape[0])
            best = max(sims)
            print(nrec, best)
            results.append((nrec, best))
    finally:
        # signal the workers to stop and give them a moment to finish
        exitFlag = True
        for t in threads:
            t.join(1)
    return results


# Run the prediction and show one line per new record; st.text is used so
# that characters in the records are not interpreted as markdown.
for newRec, score in predict_main(test_file, exist_file, prop_file):
    st.text("{} {:.3f}".format(newRec, score))
st.header("End")