# NOTE(review): notebook-export artifact lines, commented out so the file parses:
# Spaces:
# Runtime error
# Runtime error
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[20]: | |
import os | |
import sys | |
import random | |
import statistics | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import threading | |
import time | |
import queue | |
sys.path.append(os.path.abspath("../lib")) | |
sys.path.append(os.path.abspath("../supv")) | |
sys.path.append(os.path.abspath("../text")) | |
from util import * | |
from sampler import * | |
from tnn import * | |
from txproc import * | |
emailDoms = ["yahoo.com", "gmail.com", "hotmail.com", "aol.com"] | |
# In[21]: | |
def mutStr(st):
    """Replace one randomly chosen character of st with a random one of the same class.

    Digits are replaced from the dig pool, upper case from ucc, anything else
    from lcc (pools and random helpers come from the util module).
    """
    size = len(st)
    pos = randomInt(0, size - 1)
    ch = st[pos]
    if ch.isdigit():
        pool = dig
    elif ch.isupper():
        pool = ucc
    else:
        pool = lcc
    repl = selectRandomFromList(pool)
    if size > 1:
        return st[:pos] + repl + st[pos + 1:]
    return repl
# In[23]: | |
def createPosMatch(rec, fi):
    """
    Create a positive (near-duplicate) match of rec by perturbing field fi.

    Field indexes: 0 name, 1 address, 2 city, 3 state, 4 zip, 5 email.
    Returns a mutated copy; rec itself is left untouched.
    """
    mrec = rec.copy()
    tokens = mrec[fi].split()
    ntok = len(tokens)
    if fi == 0:
        # name: either insert a random middle initial or mutate the last name
        if isEventSampled(50):
            newVal = tokens[0] + " " + selectRandomFromList(ucc) + " " + tokens[1]
        else:
            tokens[1] = mutStr(tokens[1])
            newVal = tokens[0] + " " + tokens[1]
    elif fi == 1:
        # address: half the time try to expand a street-type abbreviation;
        # otherwise (or when the suffix is unknown) mutate one of the first two tokens
        changed = False
        if isEventSampled(50):
            suffixes = {"Rd": "Road", "Ave": "Avenue", "St": "Street", "Dr": "Drive"}
            last = tokens[-1]
            if last in suffixes:
                tokens[-1] = suffixes[last]
                changed = True
        if not changed:
            ti = randomInt(0, 1)
            tokens[ti] = mutStr(tokens[ti])
        newVal = " ".join(tokens)
    elif fi == 2:
        # city: mutate one randomly chosen token
        ti = randomInt(0, ntok - 1) if ntok > 1 else 0
        tokens[ti] = mutStr(tokens[ti])
        newVal = " ".join(tokens) if ntok > 1 else tokens[0]
    elif fi == 3:
        # state code
        tokens[0] = mutStr(tokens[0])
        newVal = tokens[0]
    elif fi == 4:
        # zip code
        tokens[0] = mutStr(tokens[0])
        newVal = tokens[0]
    elif fi == 5:
        # email: usually mutate one char, otherwise generate a fresh address
        if isEventSampled(60):
            tokens[0] = mutStr(tokens[0])
            newVal = tokens[0]
        else:
            newVal = genLowCaseID(randomInt(4, 10)) + "@" + selectRandomFromList(emailDoms)
    mrec[fi] = newVal
    return mrec
# In[24]: | |
def printNgramVec(ngv):
    """Print the non-zero entries of an ngram count vector, one 'index count' per line."""
    print("ngram vector")
    for idx, cnt in enumerate(ngv):
        if cnt > 0:
            print("{} {}".format(idx, cnt))
# In[25]: | |
def createNegMatch(tdata, ri):
    """Pick a record other than the one at index ri to serve as a negative match."""
    hi = len(tdata) - 1
    oi = randomInt(0, hi)
    # re-draw until we land on a different record
    while oi == ri:
        oi = randomInt(0, hi)
    return tdata[oi]
# In[26]: | |
def createNgramCreator():
    """Build and finalize the character trigram generator used for field similarity."""
    gen = CharNGram(["lcc", "ucc", "dig"], 3, True)
    # allow email/address punctuation and map whitespace to a sentinel char
    gen.addSpChar(["@", "#", "_", "-", "."])
    gen.setWsRepl("$")
    gen.finalize()
    return gen
# In[27]: | |
def getSim(rec, incOutput=True):
    """Compute per-field similarities for a concatenated record pair.

    rec holds two 6-field records side by side (field i pairs with i+6).
    Field 3 (state) uses Levenshtein similarity; every other field uses
    cosine similarity over ngram counts from the module-level cng.
    Returns a comma separated string; rec's last element (the label) is
    appended when incOutput is True.
    """
    sims = []
    for fi in range(6):
        if fi == 3:
            sims.append(levenshteinSimilarity(rec[fi], rec[fi + 6]))
        else:
            v1 = cng.toMgramCount(rec[fi])
            v2 = cng.toMgramCount(rec[fi + 6])
            sims.append(cosineSimilarity(v1, v2))
    joined = toStrFromList(sims, 6)
    return joined + "," + rec[-1] if incOutput else joined
# In[28]: | |
class SimThread (threading.Thread):
    """Worker thread for parallel record-pair similarity scoring.

    Pulls concatenated record pairs from the work queue, scores them with
    getSim and either prints the result or pushes it onto the output queue.
    Relies on the module-level exitFlag, workQuLock and outQuLock.
    """

    def __init__(self, tName, cng, qu, incOutput, outQu, outQuSize):
        """Initialize.

        tName: thread name, cng: ngram generator, qu: input work queue,
        incOutput: pass-through flag for getSim, outQu: optional output queue,
        outQuSize: capacity bound used when enqueuing results.
        """
        threading.Thread.__init__(self)
        self.tName = tName
        self.cng = cng
        self.qu = qu
        self.incOutput = incOutput
        self.outQu = outQu
        self.outQuSize = outQuSize

    def run(self):
        """Consume records until the module-level exitFlag is set."""
        while not exitFlag:
            rec = dequeue(self.qu, workQuLock)
            if rec is not None:
                srec = getSim(rec, self.incOutput)
                # bug fix: route on this thread's own output queue (self.outQu),
                # not the module-level name the original referenced
                if self.outQu is None:
                    print(srec)
                else:
                    enqueue(srec, self.outQu, outQuLock, self.outQuSize)
def createThreads(nworker, cng, workQu, incOutput, outQu, outQuSize):
    """Start nworker SimThread workers sharing the given queues; return them."""
    threads = []
    # fall back to a fresh output queue when the caller did not supply one
    if outQu is None:
        outQu = queue.Queue(outQuSize)
    for wi in range(nworker):
        th = SimThread("Thread-" + str(wi + 1), cng, workQu, incOutput, outQu, outQuSize)
        th.start()
        threads.append(th)
    return threads
def enqueue(rec, qu, quLock, qSize):
    """Put rec on qu, polling with a 1s back-off while the queue is near capacity.

    The queue is treated as full once it holds qSize - 1 entries; quLock
    serializes the size check with the put so the bound is not overshot.
    """
    while True:
        quLock.acquire()
        try:
            if qu.qsize() < qSize - 1:
                qu.put(rec)
                return
        finally:
            # release even if qsize/put raises
            quLock.release()
        # bug fix: only sleep on the full-queue retry path; the original slept
        # unconditionally, adding a 1s delay to every successful enqueue
        time.sleep(1)
def dequeue(qu, quLock):
    """Pop and return one record from qu under quLock, or None when the queue is empty."""
    quLock.acquire()
    try:
        return qu.get() if not qu.empty() else None
    finally:
        quLock.release()
# In[30]: | |
# Command-line driver: generates record-linkage training data, computes
# field-pair similarities (serial or threaded) and trains/runs a neural model.
if __name__ == "__main__":
    # operation selector: gen | genad | gendup | genpn | sim | msim |
    # nnTrain | nnPred | nnPred_withthread
    op = sys.argv[1]
    #multi threading related
    workQuLock = threading.Lock()
    outQuLock = threading.Lock()
    exitFlag = False
    if op == "gen":
        fp= open(r'pers.txt', 'w', encoding='utf-8')
        """ generate data from from source file"""
        srcFilePath = sys.argv[2]
        i = 0
        x=""
        # skip the header row (i == 0) and project the columns of interest
        for rec in fileRecGen(srcFilePath, ","):
            if i > 0:
                nrec = list()
                fname = rec[0]
                lname = rec[1]
                nrec.append(fname + " " + lname)
                '''
                nrec.append(rec[-9][1:-1])
                nrec.append(rec[-8][1:-1])
                nrec.append(rec[-6][1:-1])
                '''
                # address, city, state, zip, email columns counted from the end
                nrec.append(rec[-9])
                nrec.append(rec[-8])
                nrec.append(rec[-6])
                z = rec[-5]
                nrec.append(z)
                nrec.append(rec[-2])
                print(",".join(nrec))
                x= ",".join(nrec)
                print(x)
                fp.write(x)
                fp.write("\n")
            i += 1
        fp.close()
    if op == "genad":
        """ generate additional data by swapping name and address with another random record"""
        srcFilePath = sys.argv[2]
        nrec = int(sys.argv[3])
        tdata = getFileLines(srcFilePath)
        x=""
        fp= open(r'pers_new.txt', 'w', encoding='utf-8')
        for _ in range(nrec):
            r1 = selectRandomFromList(tdata)
            #print(",".join(r1))
            r2 = selectRandomFromList(tdata)
            # re-draw until the two records carry different names
            while r1[0] == r2[0]:
                r1 = selectRandomFromList(tdata)
                r2 = selectRandomFromList(tdata)
            nm = r2[0]
            r1[0] = nm
            r1[1] = r2[1]
            # rebuild the email local part from the new first name, keep r1's domain
            email = nm.split()[0].lower() + "@" + r1[5].split("@")[1]
            r1[5] = email
            print(",".join(r1))
            x= ",".join(r1)
            print(x)
            fp.write(x)
            fp.write("\n")
        fp.close()
    if op == "gendup":
        fp= open(r'pers_new_dup.txt', 'w', encoding='utf-8')
        """ replace some records in first file with records from another file"""
        srcFilePath = sys.argv[2]
        dupFilePath = sys.argv[3]
        ndup = int(sys.argv[4])
        tdata = getFileLines(srcFilePath, None)
        percen = 10
        tdataSec = list()
        # keep sampling with a growing percentage until at least ndup records arrive
        while len(tdataSec) < ndup:
            tdataSec = getFileSampleLines(dupFilePath, percen)
            print("----------", tdataSec)
            percen = int(percen * ndup / len(tdataSec) + 2)
        tdataSec = selectRandomSubListFromList(tdataSec, ndup)
        drecs = list()
        # mutate one field (sometimes two) of each sampled record
        for rec in tdataSec:
            fi = randomInt(0, 5)
            mrec = createPosMatch(rec, fi)
            if isEventSampled(30):
                fi = randomInt(0, 5)
                mrec = createPosMatch(mrec, fi)
            drecs.append(",".join(mrec))
        # overwrite random positions of tdata with the mutated records
        setListRandomFromList(tdata, drecs)
        for r in tdata:
            print(r)
            #x= ",".join(nrec)
            fp.write(r)
            fp.write("\n")
        fp.close()
    elif op == "genpn":
        fp= open(r'ppers1.txt', 'w', encoding='utf-8')
        """ generate pos pos and pos neg paire """
        srcFilePath = sys.argv[2]
        # negatives are drawn from srcFilePath unless a separate file is given
        tdata = getFileLines(srcFilePath, None) if len(sys.argv) == 3 else getFileLines(sys.argv[3], None)
        ri = 0
        for rec in fileRecGen(srcFilePath, "\n"):
            # two positive pairs per record (label 1)
            for _ in range(2):
                fi = randomInt(0, 5)
                rec_copy = rec[0].split(",")
                mrec = createPosMatch(rec_copy, fi)
                if isEventSampled(30):
                    fi = randomInt(0, 5)
                    mrec = createPosMatch(mrec, fi)
                #print(",".join(rec) + "," + ",".join(mrec) + "," + "1")
                x= ",".join(rec)+ "," + ",".join(mrec) + "," + "1"
                print(x)
                fp.write(x)
                fp.write("\n")
            # two negative pairs per record (label 0)
            for _ in range(2):
                mrec = createNegMatch(tdata, ri)
                #print(",".join(rec) + "," + mrec + "," + "0")
                x= ",".join(rec)+ "," + mrec + "," + "0"
                fp.write(x)
                fp.write("\n")
            ri += 1
        fp.close()
    elif op == "sim":
        fp= open(r'spers_tr.txt', 'w', encoding='utf-8')
        """ create field pair similarity """
        srcFilePath = sys.argv[2]
        # module-level cng is read by getSim
        cng = CharNGram(["lcc", "ucc", "dig"], 3, True)
        spc = ["@", "#", "_", "-", "."]
        cng.addSpChar(spc)
        cng.setWsRepl("$")
        cng.finalize()
        c = 0
        x = ""
        for rec in fileRecGen(srcFilePath, ","):
            #print(",".join(rec))
            srec = getSim(rec)
            #print(srec)
            c += 1
            # NOTE(review): srec is already a string, so this joins its
            # characters with commas; the file receives the unmangled srec below
            x= ",".join(srec)
            print(x)
            fp.write(srec)
            fp.write("\n")
        fp.close()
    elif op == "msim":
        """ create field pair similarity in parallel"""
        srcFilePath = sys.argv[2]
        nworker = int(sys.argv[3])
        cng = createNgramCreator()
        c = 0
        #create threads
        qSize = 100
        outQu = queue.Queue(qSize)
        workQu = queue.Queue(qSize)
        #threads = createThreads(nworker, cng, workQu, True, None, None)
        threads = createThreads(nworker, cng, workQu, True, outQu, qSize)
        # print(rec, workQu, qSize)
        for rec in fileRecGen(srcFilePath, ","):
            enqueue(rec, workQu, workQuLock, qSize)
        # wrap up: busy-wait until the workers drain the queue, then signal exit
        while not workQu.empty():
            pass
        exitFlag = True
        for t in threads:
            t.join()
    elif op == "nnTrain":
        """ train neural network model """
        prFile = sys.argv[2]
        regr = FeedForwardNetwork(prFile)
        regr.buildModel()
        FeedForwardNetwork.batchTrain(regr)
    elif op == "nnPred":
        """ predict with neural network model """
        newFilePath = sys.argv[2]
        existFilePath = sys.argv[3]
        nworker = int(sys.argv[4])
        prFile = sys.argv[5]
        regr = FeedForwardNetwork(prFile)
        regr.buildModel()
        cng = createNgramCreator()
        #create threads
        qSize = 100
        workQu = queue.Queue(qSize)
        outQu = queue.Queue(qSize)
        threads = createThreads(nworker, cng, workQu, False, outQu, qSize)
        # score every new record against every existing record
        for nrec in fileRecGen(newFilePath):
            srecs = list()
            ecount = 0
            y_pred = []
            # print("processing ", nrec)
            for erec in fileRecGen(existFilePath):
                rec = nrec.copy()
                rec.extend(erec)
                # print(rec)
                enqueue(rec, workQu, workQuLock, qSize)
                # opportunistically pull any finished similarity record
                srec = dequeue(outQu, outQuLock)
                if srec is not None:
                    srecs.append(strToFloatArray(srec))
                ecount += 1
            # print("srecs1", srecs)
            #wait til workq queue is drained
            while not workQu.empty():
                pass
            #drain out queue
            while len(srecs) < ecount:
                srec = dequeue(outQu, outQuLock)
                if srec is not None:
                    srecs.append(strToFloatArray(srec))
                time.sleep(1)
            # print("srecs--------------", srecs)
            #predict
            simMax = 0
            sims = FeedForwardNetwork.predict(regr, srecs)
            #sims = FeedForwardNetwork.predict(nrec, srecs)
            sims = sims.reshape(sims.shape[0])
            # NOTE(review): y_pred is reset per nrec, so max(y_pred) below is
            # just this record's best similarity
            y_pred.append(max(sims))
            #print("{} {:.3f}".format(nrec, y_pred))
            print(nrec, max(y_pred))
        # NOTE(review): workers are never joined here; exitFlag stays False
        #exitFlag = True
        #for t in threads:
        # t.join()
    elif op == "nnPred_withthread":
        """ predict with neural network model """
        newFilePath = sys.argv[2]
        existFilePath = sys.argv[3]
        nworker = int(sys.argv[4])
        prFile = sys.argv[5]
        regr = FeedForwardNetwork(prFile)
        regr.buildModel()
        cng = createNgramCreator()
        #create threads
        qSize = 100
        workQu = queue.Queue(qSize)
        outQu = queue.Queue(qSize)
        threads = createThreads(nworker, cng, workQu, False, outQu, qSize)
        for nrec in fileRecGen(newFilePath):
            srecs = list()
            ecount = 0
            print("processing ", nrec)
            for erec in fileRecGen(existFilePath):
                rec = nrec.copy()
                rec.extend(erec)
                print(rec)
                enqueue(rec, workQu, workQuLock, qSize)
                srec = dequeue(outQu, outQuLock)
                if srec is not None:
                    srecs.append(strToFloatArray(srec))
                ecount += 1
            #wait til workq queue is drained
            while not workQu.empty():
                pass
            #drain out queue
            while len(srecs) < ecount:
                srec = dequeue(outQu, outQuLock)
                if srec is not None:
                    srecs.append(strToFloatArray(srec))
                time.sleep(1)
            #predict
            simMax = 0
            sims = FeedForwardNetwork.predict(regr, srecs)
            sims = sims.reshape(sims.shape[0])
            print("{} {:.3f}".format(nrec, max(sims)))
        # shut the workers down once all records are scored
        exitFlag = True
        for t in threads:
            t.join()
# In[ ]: