# %% [markdown]
# # Record matching: field similarities scored by a feed-forward network
#
# For every record in a "new" file, pair it with every record of an "existing"
# file, turn each pair into a vector of per-field similarities (computed by a
# pool of worker threads) and print the highest score the network predicts for
# that new record.

# %%
import os
import sys
import random
import statistics
import numpy as np
import matplotlib.pyplot as plt
import threading
import time
import queue
sys.path.append(os.path.abspath("../lib"))
sys.path.append(os.path.abspath("../supv"))
sys.path.append(os.path.abspath("../text"))
from util import *
from sampler import *
from tnn import *
from txproc import *

emailDoms = ["yahoo.com", "gmail.com", "hotmail.com", "aol.com"]

# %%
# number of fields in one record; a pair is laid out as rec[0:6] + rec[6:12]
NUM_FIELDS = 6
# index of the field compared by edit distance instead of character n-grams
# (holds 'RI' in the sample output, so presumably the state code - confirm)
LEVENSHTEIN_FIELD = 3
# capacity used for both the work queue and the output queue
QUEUE_SIZE = 100
# seconds to wait between polls of a full or empty queue
POLL_INTERVAL = 0.01

# state shared with the worker threads; SimThread.run reads these as globals
workQuLock = threading.Lock()
outQuLock = threading.Lock()
exitFlag = False

# %%
def printNgramVec(ngv):
    """
    print the non zero entries of an ngram count vector as "index count"

    Parameters
        ngv : indexable sequence of ngram counts
    """
    print("ngram vector")
    for i, count in enumerate(ngv):
        if count > 0:
            print("{} {}".format(i, count))

# %%
def createNegMatch(tdata, ri):
    """
    create negative match by randomly selecting another record

    Parameters
        tdata : list of records
        ri : index of the record that must not be selected

    Returns a record of tdata whose index differs from ri.
    Raises ValueError when tdata holds no record other than the one at ri,
    because the retry loop below could never terminate in that case.
    """
    size = len(tdata)
    if size == 0 or (size == 1 and ri == 0):
        raise ValueError("need at least one record other than index {}".format(ri))

    nri = randomInt(0, size - 1)
    while nri == ri:
        nri = randomInt(0, size - 1)
    return tdata[nri]

# %%
def createNgramCreator():
    """
    create the character ngram creator used for all text fields

    Returns a finalized CharNGram built over the "lcc", "ucc" and "dig"
    character classes plus a few special characters, with white space
    replaced by "$".
    """
    cng = CharNGram(["lcc", "ucc", "dig"], 3, True)
    spc = ["@", "#", "_", "-", "."]
    cng.addSpChar(spc)
    cng.setWsRepl("$")
    cng.finalize()
    return cng

# %%
def getSim(rec, incOutput=True, ngramCreator=None):
    """
    get rec pair similarity

    Parameters
        rec : record pair; fields 0..5 belong to the first record and fields
              6..11 to the second one. When incOutput is True the last element
              is appended to the result as is.
        incOutput : True if rec[-1] should be appended to the similarities
        ngramCreator : ngram creator to use; when omitted the module level
              name cng is used, which is what the original code always did

    Returns the per field similarities as one delimited string.
    """
    if ngramCreator is None:
        # backward compatible fallback, raises NameError if cng was never set
        ngramCreator = cng

    sim = list()
    for i in range(NUM_FIELDS):
        first, second = rec[i], rec[i + NUM_FIELDS]
        if i == LEVENSHTEIN_FIELD:
            s = levenshteinSimilarity(first, second)
        else:
            ngv1 = ngramCreator.toMgramCount(first)
            ngv2 = ngramCreator.toMgramCount(second)
            s = cosineSimilarity(ngv1, ngv2)
        sim.append(s)

    # second argument is presumably the float precision - confirm in util
    ss = toStrFromList(sim, 6)
    return ss + "," + rec[-1] if incOutput else ss

# %%
def enqueue(rec, qu, quLock, qSize, pollInterval=POLL_INTERVAL):
    """
    enqueue record, waiting while the queue is full

    The original slept a full second after every call, including successful
    ones; now the caller only sleeps, briefly, while there is no room.

    Parameters
        rec : record to queue
        qu : queue.Queue to put the record on
        quLock : lock guarding qu
        qSize : queue capacity; one slot is always kept free
        pollInterval : seconds to sleep between attempts while full
    """
    while True:
        # with releases the lock even if put raises
        with quLock:
            if qu.qsize() < qSize - 1:
                qu.put(rec)
                return
        time.sleep(pollInterval)


def dequeue(qu, quLock):
    """
    dequeue record without blocking

    Parameters
        qu : queue.Queue to read from
        quLock : lock guarding qu

    Returns the next record, or None when the queue is empty.
    """
    with quLock:
        try:
            return qu.get_nowait()
        except queue.Empty:
            return None


class SimThread (threading.Thread):
    """ multi threaded similarity calculation """

    def __init__(self, tName, cng, qu, incOutput, outQu, outQuSize):
        """
        initialize

        Parameters
            tName : thread name
            cng : ngram creator handed to getSim
            qu : work queue holding record pairs
            incOutput : passed through to getSim
            outQu : queue receiving similarity strings, None to print instead
            outQuSize : capacity of outQu
        """
        threading.Thread.__init__(self)
        self.tName = tName
        self.cng = cng
        self.qu = qu
        self.incOutput = incOutput
        self.outQu = outQu
        self.outQuSize = outQuSize

    def run(self):
        """ execution, loops until the module level exitFlag is set """
        while not exitFlag:
            rec = dequeue(self.qu, workQuLock)
            if rec is None:
                # nothing to do, yield instead of spinning on the lock
                time.sleep(POLL_INTERVAL)
                continue

            srec = getSim(rec, self.incOutput, self.cng)
            # the original tested the global outQu here, not this thread's queue
            if self.outQu is None:
                print(srec)
            else:
                enqueue(srec, self.outQu, outQuLock, self.outQuSize)


def createThreads(nworker, cng, workQu, incOutput, outQu, outQuSize):
    """
    create and start worker threads

    Parameters
        nworker : number of worker threads
        cng : ngram creator shared by the workers
        workQu : queue the workers read record pairs from
        incOutput : passed through to getSim
        outQu : queue the workers write results to, or None
        outQuSize : capacity of outQu

    Returns the list of started threads, named Thread-1 .. Thread-nworker.
    """
    threads = list()
    for i in range(nworker):
        thread = SimThread("Thread-" + str(i + 1), cng, workQu, incOutput, outQu, outQuSize)
        thread.start()
        threads.append(thread)
    return threads

# %%
def predict_main(newFilePath=None, existFilePath=None, nworker=None, prFile=None):
    """
    predict with neural network model

    For each new record prints the record and the highest score predicted over
    all pairings with the existing records.

    Parameters
        newFilePath : file with the records to match
        existFilePath : file with the existing records
        nworker : number of similarity worker threads
        prFile : properties file for FeedForwardNetwork

    Any argument left as None is taken from sys.argv[1:5] in the order above.
    Inside a Jupyter kernel sys.argv holds the kernel launcher arguments, so
    pass the values explicitly there.

    Raises ValueError when an argument is missing and sys.argv is too short.
    """
    global exitFlag

    args = [newFilePath, existFilePath, nworker, prFile]
    if any(a is None for a in args):
        if len(sys.argv) < 5:
            raise ValueError("usage: predict_main(newFilePath, existFilePath, nworker, prFile)")
        args = [sys.argv[i + 1] if a is None else a for i, a in enumerate(args)]
    newFilePath, existFilePath, nworker, prFile = args
    nworker = int(nworker)

    regr = FeedForwardNetwork(prFile)
    regr.buildModel()
    ngramCreator = createNgramCreator()

    # create threads; reset the flag so the cell can be run again
    exitFlag = False
    workQu = queue.Queue(QUEUE_SIZE)
    outQu = queue.Queue(QUEUE_SIZE)
    threads = createThreads(nworker, ngramCreator, workQu, False, outQu, QUEUE_SIZE)

    try:
        for nrec in fileRecGen(newFilePath):
            srecs = list()
            ecount = 0
            for erec in fileRecGen(existFilePath):
                rec = nrec.copy()
                rec.extend(erec)

                enqueue(rec, workQu, workQuLock, QUEUE_SIZE)
                ecount += 1
                # pick up a finished result, if any, so the out queue keeps moving
                srec = dequeue(outQu, outQuLock)
                if srec is not None:
                    srecs.append(strToFloatArray(srec))

            # wait til work queue is drained
            while not workQu.empty():
                time.sleep(POLL_INTERVAL)

            # drain out queue until every queued pair has produced a result
            while len(srecs) < ecount:
                srec = dequeue(outQu, outQuLock)
                if srec is None:
                    time.sleep(POLL_INTERVAL)
                else:
                    srecs.append(strToFloatArray(srec))

            if not srecs:
                # no existing records, so there is nothing to score
                continue

            # predict; result order does not matter since only the max is used
            sims = FeedForwardNetwork.predict(regr, srecs)
            sims = sims.reshape(sims.shape[0])
            print(nrec, max(sims))
    finally:
        # always stop the workers, also on error or keyboard interrupt
        exitFlag = True
        for thread in threads:
            thread.join(timeout=1.0)

# %%
# pass the four arguments explicitly when running inside Jupyter
predict_main()
}, { "cell_type": "code", "execution_count": null, "id": "8fa85a13", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }