Spaces:

ThirdEyeData
/

Duplicate_Records_Prediction

Runtime error

File size: 10,603 Bytes

c5f0a6a

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1e3fc800",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import random\n",
    "import statistics \n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt \n",
    "import threading\n",
    "import time\n",
    "import queue\n",
    "sys.path.append(os.path.abspath(\"../lib\"))\n",
    "sys.path.append(os.path.abspath(\"../supv\"))\n",
    "sys.path.append(os.path.abspath(\"../text\"))\n",
    "from util import *\n",
    "from sampler import *\n",
    "from tnn import *\n",
    "from txproc import *\n",
    "\n",
    "emailDoms = [\"yahoo.com\", \"gmail.com\", \"hotmail.com\", \"aol.com\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e9195f68",
   "metadata": {},
   "outputs": [],
   "source": [
    "def printNgramVec(ngv):\n",
    "    \"\"\"\n",
    "    print ngram vector\n",
    "    \"\"\"\n",
    "    print(\"ngram vector\")\n",
    "    for i in range(len(ngv)):\n",
    "        if ngv[i] > 0:\n",
    "            print(\"{} {}\".format(i, ngv[i]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a78fc5c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def createNegMatch(tdata, ri):\n",
    "    \"\"\"\n",
    "    create negative match by randomly selecting another record\n",
    "    \"\"\"\n",
    "    nri = randomInt(0, len(tdata)-1)\n",
    "    while nri == ri:\n",
    "        nri = randomInt(0, len(tdata)-1)\n",
    "    return tdata[nri]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3645fe7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def createNgramCreator():\n",
    "    \"\"\" create ngram creator \"\"\"\n",
    "    cng = CharNGram([\"lcc\", \"ucc\", \"dig\"], 3, True)\n",
    "    spc = [\"@\", \"#\", \"_\", \"-\", \".\"]\n",
    "    cng.addSpChar(spc)\n",
    "    cng.setWsRepl(\"$\")\n",
    "    cng.finalize()\n",
    "    return cng"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f153bac5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def getSim(rec, incOutput=True):\n",
    "    \"\"\" get rec pair similarity \"\"\"\n",
    "    #print(rec)\n",
    "    sim = list()\n",
    "    for i in range(6):\n",
    "        #print(\"field \" + str(i))\n",
    "        if i == 3:\n",
    "            s = levenshteinSimilarity(rec[i],rec[i+6])\n",
    "        else:\n",
    "            ngv1 = cng.toMgramCount(rec[i])\n",
    "            ngv2 = cng.toMgramCount(rec[i+6])\n",
    "            #printNgramVec(ngv1)\n",
    "            #printNgramVec(ngv2)\n",
    "            s = cosineSimilarity(ngv1, ngv2)\n",
    "        sim.append(s)\n",
    "    ss = toStrFromList(sim, 6)\n",
    "    srec = ss + \",\" + rec[-1] if incOutput else ss\n",
    "    return srec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9e9ef369",
   "metadata": {},
   "outputs": [],
   "source": [
    "class SimThread (threading.Thread):\n",
    "    \"\"\" multi threaded similarity calculation \"\"\"\n",
    "\n",
    "    def __init__(self, tName, cng, qu, incOutput, outQu, outQuSize):\n",
    "        \"\"\" initialize \"\"\"\n",
    "        threading.Thread.__init__(self)\n",
    "        self.tName = tName\n",
    "        self.cng = cng\n",
    "        self.qu = qu\n",
    "        self.incOutput = incOutput\n",
    "        self.outQu = outQu\n",
    "        self.outQuSize = outQuSize\n",
    "\n",
    "    def run(self):\n",
    "        \"\"\" exeution \"\"\"\n",
    "        while not exitFlag:\n",
    "            rec = dequeue(self.qu, workQuLock)\n",
    "            if rec is not None:\n",
    "                srec = getSim(rec, self.incOutput)\n",
    "                if outQu is None:\n",
    "                    print(srec)\n",
    "                else:\n",
    "                    enqueue(srec, self.outQu, outQuLock, self.outQuSize)\n",
    "\n",
    "def createThreads(nworker, cng, workQu, incOutput, outQu, outQuSize):\n",
    "    \"\"\"create worker threads \"\"\"\n",
    "    threadList = list(map(lambda i : \"Thread-\" + str(i+1), range(nworker)))\n",
    "    threads = list()\n",
    "    for tName in threadList:\n",
    "        thread = SimThread(tName, cng, workQu, incOutput, outQu, outQuSize)\n",
    "        thread.start()\n",
    "        threads.append(thread)\n",
    "    return threads\n",
    "\n",
    "\n",
    "def enqueue(rec, qu, quLock, qSize): \n",
    "    \"\"\" enqueue record \"\"\"\n",
    "    queued = False\n",
    "    while not queued:\n",
    "        quLock.acquire()\n",
    "        if qu.qsize() < qSize - 1:\n",
    "            qu.put(rec)\n",
    "            queued = True\n",
    "        quLock.release()\n",
    "        time.sleep(1)\n",
    "\n",
    "def dequeue(qu, quLock): \n",
    "    \"\"\" dequeue record \"\"\"\n",
    "    rec = None\n",
    "    quLock.acquire()\n",
    "    if not qu.empty():\n",
    "        rec = qu.get()\n",
    "    quLock.release()\n",
    "\n",
    "    return rec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "8248426b",
   "metadata": {},
   "outputs": [],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    #multi threading related\n",
    "    workQuLock = threading.Lock()\n",
    "    outQuLock = threading.Lock()\n",
    "    exitFlag = False\n",
    "\n",
    "    \"\"\" predict with neural network model \"\"\"\n",
    "    newFilePath = sys.argv[1]\n",
    "    existFilePath = sys.argv[2]\n",
    "    nworker = int(sys.argv[3])\n",
    "    prFile = sys.argv[4]\n",
    "    \n",
    "    regr = FeedForwardNetwork(prFile)\n",
    "    regr.buildModel()\n",
    "    cng = createNgramCreator()\n",
    "    \n",
    "    #create threads\n",
    "    qSize = 100\n",
    "    workQu = queue.Queue(qSize)\n",
    "    outQu = queue.Queue(qSize)\n",
    "    threads = createThreads(nworker, cng, workQu, False, outQu, qSize)\n",
    "    \n",
    "    for nrec in fileRecGen(newFilePath):\n",
    "        srecs = list()\n",
    "        ecount = 0\n",
    "        y_pred = []\n",
    "        #print(\"processing \", nrec)\n",
    "        for erec in fileRecGen(existFilePath):\n",
    "            rec = nrec.copy()\n",
    "            rec.extend(erec)\n",
    "            #print(rec)\n",
    "            \n",
    "            enqueue(rec, workQu, workQuLock, qSize)\n",
    "            srec = dequeue(outQu, outQuLock)\n",
    "            if srec is not None:\n",
    "                srecs.append(strToFloatArray(srec))\n",
    "            ecount += 1\n",
    "\n",
    "            #wait til workq queue is drained\n",
    "            while not workQu.empty():\n",
    "                pass\n",
    "\n",
    "            #drain out queue\n",
    "            while len(srecs) < ecount:\n",
    "                srec = dequeue(outQu, outQuLock)\n",
    "                if srec is not None:\n",
    "                    srecs.append(strToFloatArray(srec))\n",
    "            #predict        \n",
    "            simMax = 0\n",
    "            sims = FeedForwardNetwork.predict(regr, srecs)\n",
    "            sims = sims.reshape(sims.shape[0])\n",
    "            y_pred.append(max(sims))\n",
    "            #print(\"{}  {:.3f}\".format(nrec, y_pred))\n",
    "        print(nrec, max(y_pred))\n",
    "\n",
    "#         exitFlag = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "62187449",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "self.config <mlutil.Configuration object at 0x00000205D0157F10>\n",
      "..restoring model checkpoint\n",
      "['Lawrence Lorens', '9 Hpy', 'Providence', 'RI', '2906', '[email protected]'] 0.9111754\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Input \u001b[1;32mIn [34]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mpredict_main\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "Input \u001b[1;32mIn [33]\u001b[0m, in \u001b[0;36mpredict_main\u001b[1;34m()\u001b[0m\n\u001b[0;32m     48\u001b[0m \u001b[38;5;66;03m#drain out queue\u001b[39;00m\n\u001b[0;32m     49\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(srecs) \u001b[38;5;241m<\u001b[39m ecount:\n\u001b[1;32m---> 50\u001b[0m     srec \u001b[38;5;241m=\u001b[39m \u001b[43mdequeue\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutQu\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutQuLock\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     51\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m srec \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m     52\u001b[0m         srecs\u001b[38;5;241m.\u001b[39mappend(strToFloatArray(srec))\n",
      "Input \u001b[1;32mIn [8]\u001b[0m, in \u001b[0;36mdequeue\u001b[1;34m(qu, quLock)\u001b[0m\n\u001b[0;32m     49\u001b[0m rec \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m     50\u001b[0m quLock\u001b[38;5;241m.\u001b[39macquire()\n\u001b[1;32m---> 51\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mqu\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mempty\u001b[49m():\n\u001b[0;32m     52\u001b[0m     rec \u001b[38;5;241m=\u001b[39m qu\u001b[38;5;241m.\u001b[39mget()\n\u001b[0;32m     53\u001b[0m quLock\u001b[38;5;241m.\u001b[39mrelease()\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "predict_main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fa85a13",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}