import os
import sys
import random
import statistics
import numpy as np
import matplotlib.pyplot as plt
import threading
import time
import queue
sys.path.append(os.path.abspath("../lib"))
sys.path.append(os.path.abspath("../supv"))
sys.path.append(os.path.abspath("../text"))
from util import *
from sampler import *
from tnn import *
from txproc import *
import streamlit as st

emailDoms = ["yahoo.com", "gmail.com", "hotmail.com", "aol.com"]

#multi threading state, shared by the worker threads and the queue helpers
workQuLock = threading.Lock()
outQuLock = threading.Lock()
exitFlag = False

st.title("Duplicate Records Prediction")

def printNgramVec(ngv):
    """ print non zero elements of an ngram vector """
    print("ngram vector")
    for i, cnt in enumerate(ngv):
        if cnt > 0:
            print("{} {}".format(i, cnt))

def createNegMatch(tdata, ri):
    """ create negative match by randomly selecting another record """
    nri = randomInt(0, len(tdata) - 1)
    while nri == ri:
        nri = randomInt(0, len(tdata) - 1)
    return tdata[nri]

def createNgramCreator():
    """ create character ngram generator """
    cng = CharNGram(["lcc", "ucc", "dig"], 3, True)
    spc = ["@", "#", "_", "-", "."]
    cng.addSpChar(spc)
    cng.setWsRepl("$")
    cng.finalize()
    return cng

def getSim(rec, cng, incOutput=True):
    """ get field wise similarity of a record pair; the ngram generator is
    passed in explicitly so worker threads do not depend on a global """
    sim = list()
    for i in range(6):
        if i == 3:
            #field 3 is compared with edit distance based similarity
            s = levenshteinSimilarity(rec[i], rec[i + 6])
        else:
            #other fields are compared with cosine similarity of ngram count vectors
            ngv1 = cng.toMgramCount(rec[i])
            ngv2 = cng.toMgramCount(rec[i + 6])
            s = cosineSimilarity(ngv1, ngv2)
        sim.append(s)
    ss = toStrFromList(sim, 6)
    srec = ss + "," + rec[-1] if incOutput else ss
    return srec

class SimThread(threading.Thread):
    """ multi threaded similarity calculation """

    def __init__(self, tName, cng, qu, incOutput, outQu, outQuSize):
        """ initialize """
        threading.Thread.__init__(self)
        self.tName = tName
        self.cng = cng
        self.qu = qu
        self.incOutput = incOutput
        self.outQu = outQu
        self.outQuSize = outQuSize

    def run(self):
        """ execution: process record pairs until the module level exit flag is set """
        while not exitFlag:
            rec = dequeue(self.qu, workQuLock)
            if rec is None:
                time.sleep(0.1)
            else:
                srec = getSim(rec, self.cng, self.incOutput)
                if self.outQu is None:
                    print(srec)
                else:
                    enqueue(srec, self.outQu, outQuLock, self.outQuSize)

def createThreads(nworker, cng, workQu, incOutput, outQu, outQuSize):
    """ create worker threads """
    threadList = list(map(lambda i: "Thread-" + str(i + 1), range(nworker)))
    threads = list()
    for tName in threadList:
        thread = SimThread(tName, cng, workQu, incOutput, outQu, outQuSize)
        thread.start()
        threads.append(thread)
    return threads

def enqueue(rec, qu, quLock, qSize):
    """ enqueue record, backing off while the queue is full """
    queued = False
    while not queued:
        with quLock:
            if qu.qsize() < qSize - 1:
                qu.put(rec)
                queued = True
        if not queued:
            time.sleep(0.1)

def dequeue(qu, quLock):
    """ dequeue record, returning None if the queue is empty """
    rec = None
    with quLock:
        if not qu.empty():
            rec = qu.get()
    return rec

test_file = 'pers_new_dup.txt'
exist_file = 'pers_exist.txt'
prop_file = 'tnn_disamb.properties'

def predict_main(test_file, exist_file, prop_file):
    """ predict duplicate match score for each new record with neural network model """
    global exitFlag
    newFilePath = test_file
    existFilePath = exist_file
    nworker = 1
    prFile = prop_file

    regr = FeedForwardNetwork(prFile)
    regr.buildModel()
    cng = createNgramCreator()

    #create worker threads with bounded work and output queues
    qSize = 100
    workQu = queue.Queue(qSize)
    outQu = queue.Queue(qSize)
    threads = createThreads(nworker, cng, workQu, False, outQu, qSize)

    for nrec in fileRecGen(newFilePath):
        srecs = list()
        ecount = 0
        #pair the new record with every existing record
        for erec in fileRecGen(existFilePath):
            rec = nrec.copy()
            rec.extend(erec)
            enqueue(rec, workQu, workQuLock, qSize)
            srec = dequeue(outQu, outQuLock)
            if srec is not None:
                srecs.append(strToFloatArray(srec))
            #count every pair so the output queue can be drained completely below
            ecount += 1

        #wait till work queue is drained
        while not workQu.empty():
            time.sleep(0.1)

        #drain output queue
        while len(srecs) < ecount:
            srec = dequeue(outQu, outQuLock)
            if srec is not None:
                srecs.append(strToFloatArray(srec))

        #predict; the maximum pairwise similarity is the duplicate score for the new record
        sims = FeedForwardNetwork.predict(regr, srecs)
        sims = sims.reshape(sims.shape[0])
        maxSim = max(sims)
        st.write("{}  max similarity {:.3f}".format(",".join(nrec), maxSim))

    #signal worker threads to exit and wait for them
    exitFlag = True
    for thread in threads:
        thread.join()
    return "Prediction complete"

st.header(predict_main(test_file, exist_file, prop_file))
st.header("End")
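
# Illustrative sketch, not called by the app: a self contained approximation of
# what CharNGram.toMgramCount() followed by cosineSimilarity() compute above,
# namely cosine similarity between character trigram count vectors. This is an
# assumption about the library's behavior made for clarity; the real avenir
# implementation also handles alphabet classes, special characters and
# whitespace replacement, which are omitted here.
import math
from collections import Counter

def charNgramCosineDemo(s1, s2, n=3):
    """ cosine similarity between character ngram count vectors of two strings """
    c1 = Counter(s1[i:i + n] for i in range(len(s1) - n + 1))
    c2 = Counter(s2[i:i + n] for i in range(len(s2) - n + 1))
    #dot product over shared ngrams, normalized by the two vector lengths
    dot = sum(c1[g] * c2[g] for g in c1.keys() & c2.keys())
    norm = math.sqrt(sum(v * v for v in c1.values())) * math.sqrt(sum(v * v for v in c2.values()))
    return dot / norm if norm > 0 else 0.0

#example: near duplicate names score high
#charNgramCosineDemo("jonathan smith", "jonathon smith") returns 0.75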