Spaces:

ThirdEyeData
/

Duplicate_Records_Prediction

Runtime error

File size: 73,212 Bytes

fc22863

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21cb09bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from random import randint\n",
    "import random\n",
    "import time\n",
    "import uuid\n",
    "from datetime import datetime\n",
    "import math\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import logging\n",
    "import logging.handlers\n",
    "import pickle\n",
    "from contextlib import contextmanager\n",
    "\n",
    "tokens = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\",\"H\",\"I\",\"J\",\"K\",\"L\",\"M\",\n",
    "    \"N\",\"O\",\"P\",\"Q\",\"R\",\"S\",\"T\",\"U\",\"V\",\"W\",\"X\",\"Y\",\"Z\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\"]\n",
    "numTokens = tokens[:10]\n",
    "alphaTokens = tokens[10:36]\n",
    "loCaseChars = [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\",\"l\",\"m\",\"n\",\"o\",\n",
    "\"p\",\"q\",\"r\",\"s\",\"t\",\"u\",\"v\",\"w\",\"x\",\"y\",\"z\"]\n",
    "\n",
    "typeInt = \"int\"\n",
    "typeFloat = \"float\"\n",
    "typeString = \"string\"\n",
    "\n",
    "secInMinute = 60\n",
    "secInHour = 60 * 60\n",
    "secInDay = 24 * secInHour\n",
    "secInWeek = 7 * secInDay\n",
    "secInYear = 365 * secInDay\n",
    "secInMonth = secInYear / 12\n",
    "\n",
    "minInHour = 60\n",
    "minInDay = 24 * minInHour\n",
    "\n",
    "ftPerYard = 3\n",
    "ftPerMile = ftPerYard * 1760\n",
    "\n",
    "\n",
    "def genID(size):\n",
    "    \"\"\"\n",
    "    generates ID\n",
    "\n",
    "    Parameters\n",
    "        size : size of ID\n",
    "    \"\"\"\n",
    "    id = \"\"\n",
    "    for i in range(size):\n",
    "        id = id + selectRandomFromList(tokens)\n",
    "    return id\n",
    "\n",
    "def genIdList(numId, idSize):\n",
    "    \"\"\"\n",
    "    generate list of IDs\n",
    "\n",
    "    Parameters:\n",
    "        numId: number of Ids\n",
    "        idSize: ID size\n",
    "    \"\"\"\n",
    "    iDs = []\n",
    "    for i in range(numId):\n",
    "        iDs.append(genID(idSize))\n",
    "    return iDs\n",
    "\n",
    "def genNumID(size):\n",
    "    \"\"\"\n",
    "    generates ID consisting of digits onl\n",
    "\n",
    "    Parameters\n",
    "        size : size of ID\n",
    "    \"\"\"\n",
    "    id = \"\"\n",
    "    for i in range(size):\n",
    "        id = id + selectRandomFromList(numTokens)\n",
    "    return id\n",
    "\n",
    "def genLowCaseID(size):\n",
    "    \"\"\"\n",
    "    generates ID consisting of lower case chars\n",
    "\n",
    "    Parameters\n",
    "        size : size of ID\n",
    "    \"\"\"\n",
    "    id = \"\"\n",
    "    for i in range(size):\n",
    "        id = id + selectRandomFromList(loCaseChars)\n",
    "    return id\n",
    "\n",
    "def genNumIdList(numId, idSize):\n",
    "    \"\"\"\n",
    "    generate list of numeric IDs\n",
    "\n",
    "    Parameters:\n",
    "        numId: number of Ids\n",
    "        idSize: ID size\n",
    "    \"\"\"\n",
    "    iDs = []\n",
    "    for i in range(numId):\n",
    "        iDs.append(genNumID(idSize))\n",
    "    return iDs\n",
    "\n",
    "def genNameInitial():\n",
    "    \"\"\"\n",
    "    generate name initial\n",
    "    \"\"\"\n",
    "    return selectRandomFromList(alphaTokens) + selectRandomFromList(alphaTokens)\n",
    "\n",
    "def genPhoneNum(arCode):\n",
    "    \"\"\"\n",
    "    generates phone number\n",
    "\n",
    "    Parameters\n",
    "        arCode: area code\n",
    "    \"\"\"\n",
    "    phNum = genNumID(7)\n",
    "    return arCode + str(phNum)\n",
    "\n",
    "def selectRandomFromList(ldata):\n",
    "    \"\"\"\n",
    "    select an element randomly from a lis\n",
    "\n",
    "    Parameters\n",
    "        ldata : list data\n",
    "    \"\"\"\n",
    "    return ldata[randint(0, len(ldata)-1)]\n",
    "\n",
    "def selectOtherRandomFromList(ldata, cval):\n",
    "    \"\"\"\n",
    "    select an element randomly from a list excluding the given one\n",
    "\n",
    "    Parameters\n",
    "        ldata : list data\n",
    "        cval : value to be excluded\n",
    "    \"\"\"\n",
    "    nval = selectRandomFromList(ldata)\n",
    "    while nval == cval:\n",
    "        nval = selectRandomFromList(ldata)\n",
    "    return nval\n",
    "\n",
    "def selectRandomSubListFromList(ldata, num):\n",
    "    \"\"\"\n",
    "    generates random sublist from a list without replacemment\n",
    "\n",
    "    Parameters\n",
    "        ldata : list data\n",
    "        num : output list size\n",
    "    \"\"\"\n",
    "    assertLesser(num, len(ldata), \"size of sublist to be sampled greater than or equal to main list\")\n",
    "    i = randint(0, len(ldata)-1)\n",
    "    sel = ldata[i]\n",
    "    selSet = {i}\n",
    "    selList = [sel]\n",
    "    while (len(selSet) < num):\n",
    "        i = randint(0, len(ldata)-1)\n",
    "        if (i not in selSet):\n",
    "            sel = ldata[i]\n",
    "            selSet.add(i)\n",
    "            selList.append(sel)\n",
    "    return selList\n",
    "\n",
    "def selectRandomSubListFromListWithRepl(ldata, num):\n",
    "    \"\"\"\n",
    "    generates random sublist from a list with replacemment\n",
    "\n",
    "    Parameters\n",
    "        ldata : list data\n",
    "        num : output list size\n",
    "    \"\"\"\n",
    "    return list(map(lambda i : selectRandomFromList(ldata), range(num)))\n",
    "\n",
    "def selectRandomFromDict(ddata):\n",
    "    \"\"\"\n",
    "    select an element randomly from a dictionary\n",
    "\n",
    "    Parameters\n",
    "        ddata : dictionary data\n",
    "    \"\"\"\n",
    "    dkeys = list(ddata.keys())\n",
    "    dk = selectRandomFromList(dkeys)\n",
    "    el = (dk, ddata[dk])\n",
    "    return el\n",
    "\n",
    "def setListRandomFromList(ldata, ldataRepl):\n",
    "    \"\"\"\n",
    "    sets some elents in the first list randomly with elements from the second list\n",
    "\n",
    "    Parameters\n",
    "        ldata : list data\n",
    "        ldataRepl : list with replacement data\n",
    "    \"\"\"\n",
    "    l = len(ldata)\n",
    "    selSet = set()\n",
    "    for d in ldataRepl:\n",
    "        i = randint(0, l-1)\n",
    "        while i in selSet:\n",
    "            i = randint(0, l-1)\n",
    "        ldata[i] = d\n",
    "        selSet.add(i)\n",
    "\n",
    "def genIpAddress():\n",
    "    \"\"\"\n",
    "    generates IP address\n",
    "    \"\"\"\n",
    "    i1 = randint(0,256)\n",
    "    i2 = randint(0,256)\n",
    "    i3 = randint(0,256)\n",
    "    i4 = randint(0,256)\n",
    "    ip = \"%d.%d.%d.%d\" %(i1,i2,i3,i4)\n",
    "    return ip\n",
    "\n",
    "def curTimeMs():\n",
    "    \"\"\"\n",
    "    current time in ms\n",
    "    \"\"\"\n",
    "    return int((datetime.utcnow() - datetime(1970,1,1)).total_seconds() * 1000)\n",
    "\n",
    "def secDegPolyFit(x1, y1, x2, y2, x3, y3):\n",
    "    \"\"\"\n",
    "    second deg polynomial \t\n",
    "\n",
    "    Parameters\n",
    "        x1 : 1st point x\n",
    "        y1 : 1st point y\n",
    "        x2 : 2nd point x\n",
    "        y2 : 2nd point y\n",
    "        x3 : 3rd point x\n",
    "        y3 : 3rd point y\n",
    "    \"\"\"\n",
    "    t = (y1 - y2) / (x1 - x2)\n",
    "    a = t - (y2 - y3) / (x2 - x3)\n",
    "    a = a / (x1 - x3)\n",
    "    b = t - a * (x1 + x2)\n",
    "    c = y1 - a * x1 * x1 - b * x1\n",
    "    return (a, b, c)\n",
    "\n",
    "def range_limit(val, minv, maxv):\n",
    "    \"\"\"\n",
    "    range limit a value\n",
    "\n",
    "    Parameters\n",
    "        val : data value\n",
    "        minv : minimum\n",
    "        maxv : maximum\n",
    "    \"\"\"\n",
    "    if (val < minv):\n",
    "        val = minv\n",
    "    elif (val > maxv):\n",
    "        val = maxv\n",
    "    return val\n",
    "\n",
    "def isInRange(val, minv, maxv):\n",
    "    \"\"\"\n",
    "    checks if within range\n",
    "\n",
    "    Parameters\n",
    "        val : data value\n",
    "        minv : minimum\n",
    "        maxv : maximum\n",
    "    \"\"\"\n",
    "    return val >= minv and val <= maxv\n",
    "\n",
    "def stripFileLines(filePath, offset):\n",
    "    \"\"\"\n",
    "    strips number of chars from both ends\n",
    "\n",
    "    Parameters\n",
    "        filePath : file path\n",
    "        offset : offset from both ends of  line \n",
    "    \"\"\"\n",
    "    fp = open(filePath, \"r\")\n",
    "    for line in fp:\n",
    "        stripped = line[offset:len(line) - 1 - offset]\n",
    "        print (stripped)\n",
    "    fp.close()\n",
    "\n",
    "def genLatLong(lat1, long1, lat2, long2):\n",
    "    \"\"\"\n",
    "    generate lat log within limits\n",
    "\n",
    "    Parameters\n",
    "        lat1 : lat of 1st point\n",
    "        long1 : long of 1st point\n",
    "        lat2 : lat of 2nd point\n",
    "        long2 : long of 2nd point\n",
    "    \"\"\"\n",
    "    lat = lat1 + (lat2 - lat1) * random.random()\n",
    "    longg = long1 + (long2 - long1) * random.random()\n",
    "    return (lat, longg)\n",
    "\n",
    "def geoDistance(lat1, long1, lat2, long2):\n",
    "    \"\"\"\n",
    "    find geo distance in ft\n",
    "\n",
    "    Parameters\n",
    "        lat1 : lat of 1st point\n",
    "        long1 : long of 1st point\n",
    "        lat2 : lat of 2nd point\n",
    "        long2 : long of 2nd point\n",
    "    \"\"\"\n",
    "    latDiff = math.radians(lat1 - lat2)\n",
    "    longDiff = math.radians(long1 - long2)\n",
    "    l1 = math.sin(latDiff/2.0)\n",
    "    l2 = math.sin(longDiff/2.0)\n",
    "    l3 = math.cos(math.radians(lat1))\n",
    "    l4 = math.cos(math.radians(lat2))\n",
    "    a = l1 * l1 + l3 * l4 * l2 * l2\n",
    "    l5 = math.sqrt(a)\n",
    "    l6 = math.sqrt(1.0 - a)\n",
    "    c = 2.0 * math.atan2(l5, l6)\n",
    "    r = 6371008.8 * 3.280840\n",
    "    return c * r\n",
    "\n",
    "def minLimit(val, limit):\n",
    "    \"\"\"\n",
    "    min limit\n",
    "    Parameters\n",
    "    \"\"\"\n",
    "    if (val < limit):\n",
    "        val = limit\n",
    "    return val;\n",
    "\n",
    "def maxLimit(val, limit):\n",
    "    \"\"\"\n",
    "    max limit\n",
    "    Parameters\n",
    "    \"\"\"\n",
    "    if (val > limit):\n",
    "        val = limit\n",
    "    return val;\n",
    "\n",
    "def rangeSample(val, minLim, maxLim):\n",
    "    \"\"\"\n",
    "    if out side range sample within range\n",
    "\n",
    "    Parameters\n",
    "        val : value\n",
    "        minLim : minimum\n",
    "        maxLim : maximum\n",
    "    \"\"\"\n",
    "    if val < minLim or val > maxLim:\n",
    "        val = randint(minLim, maxLim)\n",
    "    return val\n",
    "\n",
    "def genRandomIntListWithinRange(size, minLim, maxLim):\n",
    "    \"\"\"\n",
    "    random unique list of integers within range\n",
    "\n",
    "    Parameters\n",
    "        size : size of returned list\n",
    "        minLim : minimum\n",
    "        maxLim : maximum\n",
    "    \"\"\"\n",
    "    values = set()\n",
    "    for i in range(size):\n",
    "        val = randint(minLim, maxLim)\n",
    "        while val not in values:\n",
    "            values.add(val)\n",
    "    return list(values)\n",
    "\n",
    "def preturbScalar(value, vrange):\n",
    "    \"\"\"\n",
    "    preturbs a mutiplicative value within range\n",
    "\n",
    "    Parameters\n",
    "        value : data value\n",
    "        vrange : value delta  fraction\n",
    "    \"\"\"\n",
    "    scale = 1.0 - vrange + 2 * vrange * random.random() \n",
    "    return value * scale\n",
    "\n",
    "def preturbScalarAbs(value, vrange):\n",
    "    \"\"\"\n",
    "    preturbs an absolute value within range\n",
    "\n",
    "    Parameters\n",
    "        value : data value\n",
    "        vrange : value delta  absolute\n",
    "    \"\"\"\n",
    "    delta = - vrange + 2.0 * vrange * random.random() \n",
    "    return value + delta\n",
    "\n",
    "def preturbVector(values, vrange):\n",
    "    \"\"\"\n",
    "    preturbs a list within range\n",
    "\n",
    "    Parameters\n",
    "        values : list data\n",
    "        vrange : value delta  fraction\n",
    "    \"\"\"\n",
    "    nValues = list(map(lambda va: preturbScalar(va, vrange), values))\n",
    "    return nValues\n",
    "\n",
    "def randomShiftVector(values, smin, smax):\n",
    "    \"\"\"\n",
    "    shifts  a list by a random quanity with a range\n",
    "\n",
    "    Parameters\n",
    "        values : list data\n",
    "        smin : samplinf minimum\n",
    "        smax : sampling maximum\n",
    "    \"\"\"\n",
    "    shift = np.random.uniform(smin, smax)\n",
    "    return list(map(lambda va: va + shift, values))\n",
    "\n",
    "def floatRange(beg, end, incr):\n",
    "    \"\"\"\n",
    "    generates float range\n",
    "\n",
    "    Parameters\n",
    "        beg :range begin\n",
    "        end: range end\n",
    "        incr : range increment\n",
    "    \"\"\"\n",
    "    return list(np.arange(beg, end, incr))\n",
    "\n",
    "def shuffle(values, *numShuffles):\n",
    "    \"\"\"\n",
    "    in place shuffling with swap of pairs\n",
    "\n",
    "    Parameters\n",
    "        values : list data\n",
    "        numShuffles : parameter list for number of shuffles\n",
    "    \"\"\"\n",
    "    size = len(values)\n",
    "    if len(numShuffles) == 0:\n",
    "        numShuffle = int(size / 2)\n",
    "    elif len(numShuffles) == 1:\n",
    "        numShuffle = numShuffles[0]\n",
    "    else:\n",
    "        numShuffle = randint(numShuffles[0], numShuffles[1])\n",
    "    print(\"numShuffle {}\".format(numShuffle))\n",
    "    for i in range(numShuffle):\n",
    "        first = random.randint(0, size - 1)\n",
    "        second = random.randint(0, size - 1)\n",
    "        while first == second:\n",
    "            second = random.randint(0, size - 1)\n",
    "        tmp = values[first]\n",
    "        values[first] = values[second]\n",
    "        values[second] = tmp\n",
    "\n",
    "\n",
    "def splitList(itms, numGr):\n",
    "    \"\"\"\n",
    "    splits a list into sub lists of approximately equal size, with items in sublists randomly chod=sen\n",
    "\n",
    "    Parameters\n",
    "        itms ; list of values\t\t\n",
    "        numGr : no of groups\n",
    "    \"\"\"\n",
    "    tcount = len(itms)\n",
    "    cItems = list(itms)\n",
    "    sz = int(len(cItems) / numGr)\n",
    "    groups = list()\n",
    "    count = 0\n",
    "    for i in range(numGr):\n",
    "        if (i == numGr - 1):\n",
    "            csz = tcount - count\n",
    "        else:\n",
    "            csz = sz + randint(-2, 2)\n",
    "            count += csz\n",
    "        gr = list()\n",
    "        for  j in range(csz):\n",
    "            it = selectRandomFromList(cItems)\n",
    "            gr.append(it)\n",
    "            cItems.remove(it)\n",
    "        groups.append(gr)\n",
    "    return groups\n",
    "\n",
    "def multVector(values, vrange):\n",
    "    \"\"\"\n",
    "    multiplies a list within value  range\n",
    "\n",
    "    Parameters\n",
    "        values : list of values\n",
    "        vrange : fraction of vaue to be used to update\n",
    "    \"\"\"\n",
    "    scale = 1.0 - vrange + 2 * vrange * random.random()\n",
    "    nValues = list(map(lambda va: va * scale, values))\n",
    "    return nValues\n",
    "\n",
    "def weightedAverage(values, weights):\n",
    "    \"\"\"\n",
    "    calculates weighted average\n",
    "\n",
    "    Parameters\n",
    "        values : list of values\n",
    "        weights : list of weights\n",
    "    \"\"\"\t\t\n",
    "    assert len(values) == len(weights), \"values and weights should be same size\"\n",
    "    vw = zip(values, weights)\n",
    "    wva = list(map(lambda e : e[0] * e[1], vw))\n",
    "    #wa = sum(x * y for x, y in vw) / sum(weights)\n",
    "    wav = sum(wva) / sum(weights)\n",
    "    return wav\n",
    "\n",
    "def extractFields(line, delim, keepIndices):\n",
    "    \"\"\"\n",
    "    breaks a line into fields and keeps only specified fileds and returns new line\n",
    "\n",
    "    Parameters\n",
    "        line ; deli separated string\n",
    "        delim : delemeter\n",
    "        keepIndices : list of indexes to fields to be retained\n",
    "    \"\"\"\n",
    "    items = line.split(delim)\n",
    "    newLine = []\n",
    "    for i in keepIndices:\n",
    "        newLine.append(line[i])\n",
    "    return delim.join(newLine)\n",
    "\n",
    "def remFields(line, delim, remIndices):\n",
    "    \"\"\"\n",
    "    removes fields from delim separated string\n",
    "\n",
    "    Parameters\n",
    "        line ; delemeter separated string\n",
    "        delim : delemeter\n",
    "        remIndices : list of indexes to fields to be removed\n",
    "    \"\"\"\n",
    "    items = line.split(delim)\n",
    "    newLine = []\n",
    "    for i in range(len(items)):\n",
    "        if not arrayContains(remIndices, i):\n",
    "            newLine.append(line[i])\n",
    "    return delim.join(newLine)\n",
    "\n",
    "def extractList(data, indices):\n",
    "    \"\"\"\n",
    "    extracts list from another list, given indices\n",
    "\n",
    "    Parameters\n",
    "        remIndices : list data\n",
    "        indices : list of indexes to fields to be retained\n",
    "    \"\"\"\n",
    "    if areAllFieldsIncluded(data, indices):\n",
    "        exList = data.copy()\n",
    "        #print(\"all indices\")\n",
    "    else:\n",
    "        exList = list()\n",
    "        le = len(data)\n",
    "        for i in indices:\n",
    "            assert i < le , \"index {} out of bound {}\".format(i, le)\n",
    "            exList.append(data[i])\n",
    "\n",
    "    return exList\n",
    "\n",
    "def arrayContains(arr, item):\n",
    "    \"\"\"\n",
    "    checks if array contains an item \n",
    "\n",
    "    Parameters\n",
    "        arr : list data\n",
    "        item : item to search\n",
    "    \"\"\"\n",
    "    contains = True\n",
    "    try:\n",
    "        arr.index(item)\n",
    "    except ValueError:\n",
    "        contains = False\n",
    "    return contains\n",
    "\n",
    "def strToIntArray(line, delim=\",\"):\n",
    "    \"\"\"\n",
    "    int array from delim separated string\n",
    "\n",
    "    Parameters\n",
    "        line ; delemeter separated string\n",
    "    \"\"\"\n",
    "    arr = line.split(delim)\n",
    "    return [int(a) for a in arr]\n",
    "\n",
    "def strToFloatArray(line, delim=\",\"):\n",
    "    \"\"\"\n",
    "    float array from delim separated string\n",
    "\n",
    "    Parameters\n",
    "        line ; delemeter separated string\n",
    "    \"\"\"\n",
    "    arr = line.split(delim)\n",
    "    return [float(a) for a in arr]\n",
    "\n",
    "def strListOrRangeToIntArray(line):\n",
    "    \"\"\"\n",
    "    int array from delim separated string or range\n",
    "\n",
    "    Parameters\n",
    "        line ; delemeter separated string\n",
    "    \"\"\"\n",
    "    varr = line.split(\",\")\n",
    "    if (len(varr) > 1):\n",
    "        iarr =  list(map(lambda v: int(v), varr))\n",
    "    else:\n",
    "        vrange = line.split(\":\")\n",
    "        if (len(vrange) == 2):\n",
    "            lo = int(vrange[0])\n",
    "            hi = int(vrange[1])\n",
    "            iarr = list(range(lo, hi+1))\n",
    "        else:\n",
    "            iarr = [int(line)]\n",
    "    return iarr\n",
    "\n",
    "def toStr(val, precision):\n",
    "    \"\"\"\n",
    "    converts any type to string\t\n",
    "\n",
    "    Parameters\n",
    "        val : value\n",
    "        precision ; precision for float value\n",
    "    \"\"\"\n",
    "    if type(val) == float or type(val) == np.float64 or type(val) == np.float32:\n",
    "        format = \"%\" + \".%df\" %(precision)\n",
    "        sVal = format %(val)\n",
    "    else:\n",
    "        sVal = str(val)\n",
    "    return sVal\n",
    "\n",
    "def toStrFromList(values, precision, delim=\",\"):\n",
    "    \"\"\"\n",
    "    converts list of any type to delim separated string\n",
    "\n",
    "    Parameters\n",
    "        values : list data\n",
    "        precision ; precision for float value\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    sValues = list(map(lambda v: toStr(v, precision), values))\n",
    "    return delim.join(sValues)\n",
    "\n",
    "def toIntList(values):\n",
    "    \"\"\"\n",
    "    convert to int list\n",
    "\n",
    "    Parameters\n",
    "        values : list data\n",
    "    \"\"\"\n",
    "    return list(map(lambda va: int(va), values))\n",
    "\n",
    "def toFloatList(values):\n",
    "    \"\"\"\n",
    "    convert to float list\n",
    "\n",
    "    Parameters\n",
    "        values : list data\n",
    "    \"\"\"\n",
    "    return list(map(lambda va: float(va), values))\n",
    "\n",
    "def toStrList(values, precision=None):\n",
    "    \"\"\"\n",
    "    convert to string list\n",
    "\n",
    "    Parameters\n",
    "        values : list data\n",
    "        precision ; precision for float value\n",
    "    \"\"\"\n",
    "    return list(map(lambda va: toStr(va, precision), values))\n",
    "\n",
    "def toIntFromBoolean(value):\n",
    "    \"\"\"\n",
    "    convert to int\n",
    "\n",
    "    Parameters\n",
    "        value : boolean value\n",
    "    \"\"\"\n",
    "    ival = 1 if value else 0\n",
    "    return ival\n",
    "\n",
    "def typedValue(val, dtype=None):\n",
    "    \"\"\"\n",
    "    return typed value given string, discovers data type if not specified\n",
    "\n",
    "    Parameters\n",
    "        val : value\n",
    "        dtype : data type\n",
    "    \"\"\"\n",
    "    tVal = None\n",
    "\n",
    "    if dtype is not None:\n",
    "        if dtype == \"num\":\n",
    "            dtype = \"int\" if dtype.find(\".\") == -1 else \"float\"\n",
    "\n",
    "        if dtype == \"int\":\n",
    "            tVal = int(val)\n",
    "        elif dtype == \"float\":\n",
    "            tVal = float(val)\n",
    "        elif dtype == \"bool\":\n",
    "            tVal = bool(val)\n",
    "        else:\n",
    "            tVal = val\n",
    "    else:\n",
    "        if type(val) == str:\n",
    "            lVal = val.lower()\n",
    "\n",
    "            #int\n",
    "            done = True\n",
    "            try:\n",
    "                tVal = int(val)\n",
    "            except ValueError:\n",
    "                done = False\n",
    "\n",
    "            #float\n",
    "            if not done:\n",
    "                done = True\n",
    "                try:\n",
    "                    tVal = float(val)\n",
    "                except ValueError:\n",
    "                    done = False\n",
    "\n",
    "            #boolean\n",
    "            if not done:\n",
    "                done = True\n",
    "                if lVal == \"true\":\n",
    "                    tVal = True\n",
    "                elif lVal == \"false\":\n",
    "                    tVal = False\n",
    "                else:\n",
    "                    done = False\n",
    "            #None\t\t\n",
    "            if not done:\n",
    "                if lVal == \"none\":\n",
    "                    tVal = None\n",
    "                else:\n",
    "                    tVal = val\n",
    "        else:\n",
    "            tVal = val\n",
    "\n",
    "    return tVal\n",
    "\n",
    "def getAllFiles(dirPath):\n",
    "    \"\"\"\n",
    "    get all files recursively\n",
    "\n",
    "    Parameters\n",
    "        dirPath : directory path\n",
    "    \"\"\"\n",
    "    filePaths = []\n",
    "    for (thisDir, subDirs, fileNames) in os.walk(dirPath):\n",
    "        for fileName in fileNames:\n",
    "            filePaths.append(os.path.join(thisDir, fileName))\n",
    "    filePaths.sort()\n",
    "    return filePaths\n",
    "\n",
    "def getFileContent(fpath, verbose=False):\n",
    "    \"\"\"\n",
    "    get file contents in directory\n",
    "\n",
    "    Parameters\n",
    "        fpath ; directory path\n",
    "        verbose : verbosity flag\n",
    "    \"\"\"\n",
    "    # dcument list\n",
    "    docComplete  = []\n",
    "    filePaths = getAllFiles(fpath)\n",
    "\n",
    "    # read files\n",
    "    for filePath in filePaths:\n",
    "        if verbose:\n",
    "            print(\"next file \" + filePath)\n",
    "        with open(filePath, 'r') as contentFile:\n",
    "            content = contentFile.read()\n",
    "            docComplete.append(content)\n",
    "    return (docComplete, filePaths)\n",
    "\n",
    "def getOneFileContent(fpath):\n",
    "    \"\"\"\n",
    "    get one file contents\n",
    "\n",
    "    Parameters\n",
    "        fpath : file path\n",
    "    \"\"\"\n",
    "    with open(fpath, 'r') as contentFile:\n",
    "        docStr = contentFile.read()\n",
    "    return docStr\n",
    "\n",
    "def getFileLines(dirPath, delim=\",\"):\n",
    "    \"\"\"\n",
    "    get lines from a file\n",
    "\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    lines = list()\n",
    "    for li in fileRecGen(dirPath, delim):\n",
    "        lines.append(li)\n",
    "    return lines\n",
    "\n",
    "def getFileSampleLines(dirPath, percen, delim=\",\"):\n",
    "    \"\"\"\n",
    "    get sampled lines from a file\n",
    "\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        percen : sampling percentage\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    lines = list()\n",
    "    for li in fileRecGen(dirPath, delim):\n",
    "        if randint(0, 100) < percen:\n",
    "            lines.append(li)\n",
    "    return lines\n",
    "\n",
    "def getFileColumnAsString(dirPath, index, delim=\",\"):\n",
    "    \"\"\"\n",
    "    get string column from a file\n",
    "\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        index : index\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    fields = list()\n",
    "    for rec in fileRecGen(dirPath, delim):\n",
    "        fields.append(rec[index])\n",
    "    #print(fields)\t\n",
    "    return fields\n",
    "\n",
    "def getFileColumnsAsString(dirPath, indexes, delim=\",\"):\n",
    "    \"\"\"\n",
    "    get multiple string columns from a file\n",
    "\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        indexes : indexes of columns\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    nindex = len(indexes)\n",
    "    columns = list(map(lambda i : list(), range(nindex)))\n",
    "    for rec in fileRecGen(dirPath, delim):\n",
    "        for i in range(nindex):\n",
    "            columns[i].append(rec[indexes[i]])\n",
    "    return columns\n",
    "\n",
    "def getFileColumnAsFloat(dirPath, index, delim=\",\"):\n",
    "    \"\"\"\n",
    "    get float fileds from a file\n",
    "\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        index : index\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    #print(\"{}  {}\".format(dirPath, index))\n",
    "    fields = getFileColumnAsString(dirPath, index, delim)\n",
    "    return list(map(lambda v:float(v), fields))\n",
    "\n",
    "def getFileColumnAsInt(dirPath, index, delim=\",\"):\n",
    "    \"\"\"\n",
    "    get float fileds from a file\n",
    "\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        index : index\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    fields = getFileColumnAsString(dirPath, index, delim)\n",
    "    return list(map(lambda v:int(v), fields))\n",
    "\n",
    "def getFileAsIntMatrix(dirPath, columns, delim=\",\"):\n",
    "    \"\"\"\n",
    "    extracts int matrix from csv file given column indices with each row being  concatenation of \n",
    "    extracted column values row size = num of columns\n",
    "\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        columns : indexes of columns\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    mat = list()\n",
    "    for rec in  fileSelFieldsRecGen(dirPath, columns, delim):\n",
    "        mat.append(asIntList(rec))\n",
    "    return mat\n",
    "\n",
    "def getFileAsFloatMatrix(dirPath, columns, delim=\",\"):\n",
    "    \"\"\"\n",
    "    extracts float matrix from csv file given column indices with each row being concatenation of  \n",
    "    extracted column values row size = num of columns\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        columns : indexes of columns\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    mat = list()\n",
    "    for rec in  fileSelFieldsRecGen(dirPath, columns, delim):\n",
    "        mat.append(asFloatList(rec))\n",
    "    return mat\n",
    "\n",
    "def getFileAsFloatColumn(dirPath):\n",
    "    \"\"\"\n",
    "    grt float list from a file with one float per row\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "    \"\"\"\n",
    "    flist = list()\n",
    "    for rec in fileRecGen(dirPath, None):\n",
    "        flist.append(float(rec))\n",
    "    return flist\n",
    "\n",
    "def getFileAsFiltFloatMatrix(dirPath, filt, columns, delim=\",\"):\n",
    "    \"\"\"\n",
    "    extracts float matrix from csv file given row filter and column indices with each row being \n",
    "    concatenation of  extracted column values row size = num of columns\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        columns : indexes of columns\n",
    "        filt : row filter lambda\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    mat = list()\n",
    "    for rec in  fileFiltSelFieldsRecGen(dirPath, filt, columns, delim):\n",
    "        mat.append(asFloatList(rec))\n",
    "    return mat\n",
    "\n",
    "def getFileAsTypedRecords(dirPath, types, delim=\",\"):\n",
    "    \"\"\"\n",
    "    extracts typed records from csv file with each row being concatenation of  \n",
    "    extracted column values \n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        types : data types\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    (dtypes, cvalues) = extractTypesFromString(types)\n",
    "    tdata = list()\n",
    "    for rec in  fileRecGen(dirPath, delim):\n",
    "        trec = list()\n",
    "        for index, value in enumerate(rec):\n",
    "            value = __convToTyped(index, value, dtypes)\n",
    "            trec.append(value)\n",
    "        tdata.append(trec)\n",
    "    return tdata\n",
    "\n",
    "\n",
    "def getFileColsAsTypedRecords(dirPath, columns, types, delim=\",\"):\n",
    "    \"\"\"\n",
    "    extracts typed records from csv file given column indices with each row being concatenation of\n",
    "    extracted column values\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        columns : column indexes as int list or delimiter separated string\n",
    "        types : encoded data types, looked up by the column index in the file\n",
    "        delim : delimiter\n",
    "    \"\"\"\n",
    "    # fileSelFieldsRecGen accepts a string of indexes, but the indexes are also needed below\n",
    "    # for the type look up, so a string has to be parsed here as well\n",
    "    if type(columns) == str:\n",
    "        columns = strToIntArray(columns, delim)\n",
    "    (dtypes, _) = extractTypesFromString(types)\n",
    "    tdata = list()\n",
    "    for rec in fileSelFieldsRecGen(dirPath, columns, delim):\n",
    "        # position in the extracted record maps back to the column index in the file\n",
    "        trec = [__convToTyped(columns[indx], value, dtypes) for indx, value in enumerate(rec)]\n",
    "        tdata.append(trec)\n",
    "    return tdata\n",
    "\n",
    "def getFileColumnsMinMax(dirPath, columns, dtype, delim=\",\"):\n",
    "    \"\"\"\n",
    "    extracts numeric matrix from csv file given column indices. For each column returns\n",
    "    a tuple of min, max and range (max - min)\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        columns : column indexes\n",
    "        dtype : data type, applied to all selected columns\n",
    "        delim : delimiter\n",
    "    \"\"\"\n",
    "    # same type for every selected column, encoded as index:type items\n",
    "    dtypes = \",\".join(map(lambda c : str(c) + \":\" + dtype, columns))\n",
    "\n",
    "    tdata = getFileColsAsTypedRecords(dirPath, columns, dtypes, delim)\n",
    "    ncola = len(tdata[0])\n",
    "    ncole = len(columns)\n",
    "    assertEqual(ncola, ncole, \"actual no of columns different from expected\")\n",
    "\n",
    "    minMax = list()\n",
    "    for ci in range(ncole):\n",
    "        # min and max are taken from the data itself. Seeding the max with sys.float_info.min,\n",
    "        # which is the smallest positive float, gave a wrong max for columns holding only\n",
    "        # negative or zero values\n",
    "        colValues = [r[ci] for r in tdata]\n",
    "        vmin = min(colValues)\n",
    "        vmax = max(colValues)\n",
    "        minMax.append((vmin, vmax, vmax - vmin))\n",
    "\n",
    "    return minMax\n",
    "\n",
    "\n",
    "def getRecAsTypedRecord(rec, types, delim=None):\n",
    "    \"\"\"\n",
    "    converts one record to a typed record\n",
    "    Parameters\n",
    "        rec : delimiter separated string or list of strings\n",
    "        types : encoded field data types\n",
    "        delim : delimiter, when provided rec is split with it first\n",
    "    \"\"\"\n",
    "    fields = rec.split(delim) if delim is not None else rec\n",
    "    dtypes = extractTypesFromString(types)[0]\n",
    "    return [__convToTyped(pos, field, dtypes) for pos, field in enumerate(fields)]\n",
    "\n",
    "def __convToTyped(index, value, dtypes):\n",
    "    \"\"\"\n",
    "    convert to typed value \n",
    "    Parameters\n",
    "        index : index in type list\n",
    "        value : data value\n",
    "        dtypes : data type list\n",
    "    \"\"\"\n",
    "    #print(index, value)\n",
    "    dtype = dtypes[index]\n",
    "    tvalue = value\n",
    "    if dtype == \"int\":\n",
    "        tvalue = int(value)\n",
    "    elif dtype == \"float\":\n",
    "        tvalue = float(value)\n",
    "    return tvalue\n",
    "\n",
    "\n",
    "\n",
    "def extractTypesFromString(types):\n",
    "    \"\"\"\n",
    "    extracts column data types and set values for categorical variables \n",
    "    Parameters\n",
    "        types : encoded type information\n",
    "    \"\"\"\n",
    "    ftypes = types.split(\",\")\n",
    "    dtypes = dict()\n",
    "    cvalues = dict()\n",
    "    for ftype in ftypes:\n",
    "        items = ftype.split(\":\") \n",
    "        cindex = int(items[0])\n",
    "        dtype = items[1]\n",
    "        dtypes[cindex] = dtype\n",
    "        if len(items) == 3:\n",
    "            sitems = items[2].split()\n",
    "            cvalues[cindex] = sitems\n",
    "    return (dtypes, cvalues)\n",
    "\n",
    "def getMultipleFileAsInttMatrix(dirPathWithCol,  delim=\",\"):\n",
    "    \"\"\"\n",
    "    builds an int matrix with one row per file, each row holding the values of the given\n",
    "    column of that file. All rows are truncated to the length of the shortest column\n",
    "    Parameters\n",
    "        dirPathWithCol: list of file path and column index pairs\n",
    "        delim : delimiter\n",
    "    \"\"\"\n",
    "    rows = [getFileColumnAsInt(path, col, delim) for path, col in dirPathWithCol]\n",
    "    if not rows:\n",
    "        return list()\n",
    "\n",
    "    #make all same length\n",
    "    minLen = min(len(row) for row in rows)\n",
    "    return [row[:minLen] for row in rows]\n",
    "\n",
    "def getMultipleFileAsFloatMatrix(dirPathWithCol,  delim=\",\"):\n",
    "    \"\"\"\n",
    "    builds a float matrix with one row per file, each row holding the values of the given\n",
    "    column of that file. All rows are truncated to the length of the shortest column\n",
    "    Parameters\n",
    "        dirPathWithCol: list of file path and column index pairs\n",
    "        delim : delimiter\n",
    "    \"\"\"\n",
    "    rows = [getFileColumnAsFloat(path, col, delim) for path, col in dirPathWithCol]\n",
    "    if not rows:\n",
    "        return list()\n",
    "\n",
    "    #make all same length\n",
    "    minLen = min(len(row) for row in rows)\n",
    "    return [row[:minLen] for row in rows]\n",
    "\n",
    "def writeStrListToFile(ldata, filePath, delem=\",\"):\n",
    "    \"\"\"\n",
    "    writes list of dlem separated string or list of list of string to afile\n",
    "\n",
    "    Parameters\n",
    "        ldata : list data\n",
    "        filePath : file path\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    with open(filePath, \"w\") as fh:\n",
    "        for r in ldata:\n",
    "            if type(r) == list:\n",
    "                r = delem.join(r)\n",
    "            fh.write(r + \"\\n\")\n",
    "\n",
    "def writeFloatListToFile(ldata, prec, filePath):\n",
    "    \"\"\"\n",
    "    writes a list of floats to a file, one formatted value per line\n",
    "\n",
    "    Parameters\n",
    "        ldata : list data\n",
    "        prec : precision\n",
    "        filePath : file path\n",
    "    \"\"\"\n",
    "    with open(filePath, \"w\") as fh:\n",
    "        fh.writelines(formatFloat(prec, d) + \"\\n\" for d in ldata)\n",
    "\n",
    "\n",
    "def takeFirst(elems):\n",
    "    \"\"\"\n",
    "    return fisrt item\n",
    "    Parameters\n",
    "        elems : list of data \n",
    "    \"\"\"\n",
    "    return elems[0]\n",
    "\n",
    "def takeSecond(elems):\n",
    "    \"\"\"\n",
    "    return 2nd element\n",
    "    Parameters\n",
    "        elems : list of data \n",
    "    \"\"\"\n",
    "    return elems[1]\n",
    "\n",
    "def takeThird(elems):\n",
    "    \"\"\"\n",
    "    returns 3rd element\n",
    "    Parameters\n",
    "        elems : list of data \n",
    "    \"\"\"\n",
    "    return elems[2]\n",
    "\n",
    "def addToKeyedCounter(dCounter, key, count=1):\n",
    "    \"\"\"\n",
    "    add to to keyed counter\n",
    "    Parameters\n",
    "        dCounter : dictionary of counters\n",
    "        key : dictionary key\n",
    "        count : count to add\n",
    "    \"\"\"\n",
    "    curCount = dCounter.get(key, 0)\n",
    "    dCounter[key] = curCount + count\n",
    "\n",
    "def incrKeyedCounter(dCounter, key):\n",
    "    \"\"\"\n",
    "    adds 1 to the counter of a key\n",
    "    Parameters\n",
    "        dCounter : dictionary of counters\n",
    "        key : dictionary key\n",
    "    \"\"\"\n",
    "    addToKeyedCounter(dCounter, key, count=1)\n",
    "\n",
    "def appendKeyedList(dList, key, elem):\n",
    "    \"\"\"\n",
    "    keyed list\n",
    "    Parameters\n",
    "        dList : dictionary of lists\n",
    "        key : dictionary key\n",
    "        elem : value to append\n",
    "    \"\"\"\n",
    "    curList = dList.get(key, [])\n",
    "    curList.append(elem)\n",
    "    dList[key] = curList\n",
    "\n",
    "def isNumber(st):\n",
    "    \"\"\"\n",
    "    Returns True is string is a number\n",
    "    Parameters\n",
    "        st : string value\n",
    "    \"\"\"\n",
    "    return st.replace('.','',1).isdigit()\n",
    "\n",
    "def removeNan(values):\n",
    "    \"\"\"\n",
    "    removes nan from list\n",
    "    Parameters\n",
    "        values : list data\n",
    "    \"\"\"\n",
    "    return list(filter(lambda v: not math.isnan(v), values))\n",
    "\n",
    "def fileRecGen(filePath, delim = \",\"):\n",
    "    \"\"\"\n",
    "    file record generator\n",
    "    Parameters\n",
    "        filePath ; file path\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    with open(filePath, \"r\") as fp:\n",
    "        for line in fp:\t\n",
    "            line = line[:-1]\n",
    "            if delim is not None:\n",
    "                line = line.split(delim)\n",
    "            yield line\n",
    "\n",
    "def fileSelFieldsRecGen(dirPath, columns, delim=\",\"):\n",
    "    \"\"\"\n",
    "    file record generator yielding only the fields at the given column indices\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        columns : column indexes as int array or as string, a string being parsed\n",
    "        with strToIntArray using the delimiter\n",
    "        delim : delimiter\n",
    "    \"\"\"\n",
    "    colIndexes = strToIntArray(columns, delim) if type(columns) == str else columns\n",
    "    for rec in fileRecGen(dirPath, delim):\n",
    "        yield extractList(rec, colIndexes)\n",
    "\n",
    "def fileFiltRecGen(filePath, filt, delim = \",\"):\n",
    "    \"\"\"\n",
    "    file record generator with  row filter applied\n",
    "    Parameters\n",
    "        filePath ; file path\n",
    "        filt : row filter\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    with open(filePath, \"r\") as fp:\n",
    "        for line in fp:\t\n",
    "            line = line[:-1]\n",
    "            if delim is not None:\n",
    "                line = line.split(delim)\n",
    "            if filt(line):\n",
    "                yield line\n",
    "\n",
    "def fileFiltSelFieldsRecGen(filePath, filt, columns, delim = \",\"):\n",
    "    \"\"\"\n",
    "    file record generator with row and column filter applied. The row filter is called\n",
    "    with the whole record, the selected fields of accepted records are yielded\n",
    "    Parameters\n",
    "        filePath : file path\n",
    "        filt : row filter\n",
    "        columns : column indexes as int array or as string, a string being parsed\n",
    "        with strToIntArray using the delimiter\n",
    "        delim : delimiter\n",
    "    \"\"\"\n",
    "    # only a string needs parsing, an int array is used as is (same as fileSelFieldsRecGen)\n",
    "    if type(columns) == str:\n",
    "        columns = strToIntArray(columns, delim)\n",
    "    with open(filePath, \"r\") as fp:\n",
    "        for line in fp:\n",
    "            # strip only the line terminator. Slicing off the last character truncated\n",
    "            # the final record of a file not ending with a new line\n",
    "            line = line.rstrip(\"\\n\")\n",
    "            if delim is not None:\n",
    "                line = line.split(delim)\n",
    "            if filt(line):\n",
    "                selected = extractList(line, columns)\n",
    "                yield selected\n",
    "\n",
    "def fileTypedRecGen(filePath, ftypes, delim = \",\"):\n",
    "    \"\"\"\n",
    "    file typed record generator. Fields listed in ftypes are converted in place,\n",
    "    all other fields stay strings\n",
    "    Parameters\n",
    "        filePath : file path\n",
    "        ftypes : flat list of alternating column index and data type (int or float)\n",
    "        delim : delimiter\n",
    "    \"\"\"\n",
    "    with open(filePath, \"r\") as fp:\n",
    "        for line in fp:\n",
    "            # strip only the line terminator. Slicing off the last character truncated\n",
    "            # the final record of a file not ending with a new line\n",
    "            line = line.rstrip(\"\\n\")\n",
    "            line = line.split(delim)\n",
    "            # ftypes is consumed pair wise : column index followed by its data type\n",
    "            for i in range(0, len(ftypes), 2):\n",
    "                ci = ftypes[i]\n",
    "                dtype = ftypes[i+1]\n",
    "                assertLesser(ci, len(line), \"index out of bound\")\n",
    "                if dtype == \"int\":\n",
    "                    line[ci] = int(line[ci])\n",
    "                elif dtype == \"float\":\n",
    "                    line[ci] = float(line[ci])\n",
    "                else:\n",
    "                    exitWithMsg(\"invalid data type\")\n",
    "            yield line\n",
    "\n",
    "def fileMutatedFieldsRecGen(dirPath, mutator, delim=\",\"):\n",
    "    \"\"\"\n",
    "    file record generator yielding the result of the mutator applied to each record\n",
    "    Parameters\n",
    "        dirPath : file path\n",
    "        mutator : row field mutator, called with one record\n",
    "        delim : delimiter\n",
    "    \"\"\"\n",
    "    for rec in fileRecGen(dirPath, delim):\n",
    "        yield mutator(rec)\n",
    "\n",
    "def tableSelFieldsFilter(tdata, columns):\n",
    "    \"\"\"\n",
    "    gets tabular data for selected columns. When the selection covers all columns\n",
    "    in order, the input table itself is returned and not a copy\n",
    "    Parameters\n",
    "        tdata : tabular data\n",
    "        columns : column indexes\n",
    "    \"\"\"\n",
    "    # the first row decides whether any column extraction is needed at all\n",
    "    if areAllFieldsIncluded(tdata[0], columns):\n",
    "        return tdata\n",
    "    return [extractList(rec, columns) for rec in tdata]\n",
    "\n",
    "\n",
    "def areAllFieldsIncluded(ldata, columns):\n",
    "    \"\"\"\n",
    "    return True id all indexes are in the columns\n",
    "    Parameters\n",
    "        ldata : list data\n",
    "        columns : column indexes\n",
    "    \"\"\"\n",
    "    return list(range(len(ldata))) == columns\n",
    "\n",
    "def asIntList(items):\n",
    "    \"\"\"\n",
    "    returns int list\n",
    "    Parameters\n",
    "        items : list data\n",
    "    \"\"\"\n",
    "    return [int(i) for i in items]\n",
    "\n",
    "def asFloatList(items):\n",
    "    \"\"\"\n",
    "    returns float list\n",
    "    Parameters\n",
    "        items : list data\n",
    "    \"\"\"\n",
    "    return [float(i) for i in items]\n",
    "\n",
    "def pastTime(interval, unit):\n",
    "    \"\"\"\n",
    "    returns a tuple of current time and the time that lies the given interval\n",
    "    in the past, both as time stamps in sec\n",
    "    Parameters\n",
    "        interval : time interval\n",
    "        unit: time unit, d for day, h for hour or m for minute\n",
    "    \"\"\"\n",
    "    curTime = int(time.time())\n",
    "    if unit == \"d\":\n",
    "        secInUnit = secInDay\n",
    "    elif unit == \"h\":\n",
    "        secInUnit = secInHour\n",
    "    elif unit == \"m\":\n",
    "        secInUnit = secInMinute\n",
    "    else:\n",
    "        raise ValueError(\"invalid time unit \" + unit)\n",
    "    return (curTime, curTime - interval * secInUnit)\n",
    "\n",
    "def minuteAlign(ts):\n",
    "    \"\"\"\n",
    "    time stamp truncated to a whole number of minutes\n",
    "    Parameters\n",
    "        ts : time stamp in sec\n",
    "    \"\"\"\n",
    "    minutes = int(ts / secInMinute)\n",
    "    return minutes * secInMinute\n",
    "\n",
    "def multMinuteAlign(ts, min):\n",
    "    \"\"\"\n",
    "    time stamp truncated to a whole number of intervals of the given number of minutes\n",
    "    Parameters\n",
    "        ts : time stamp in sec\n",
    "        min : minute value\n",
    "    \"\"\"\n",
    "    secInIntv = secInMinute * min\n",
    "    numIntv = int(ts / secInIntv)\n",
    "    return numIntv * secInIntv\n",
    "\n",
    "def hourAlign(ts):\n",
    "    \"\"\"\n",
    "    time stamp truncated to a whole number of hours\n",
    "    Parameters\n",
    "        ts : time stamp in sec\n",
    "    \"\"\"\n",
    "    hours = int(ts / secInHour)\n",
    "    return hours * secInHour\n",
    "\n",
    "def hourOfDayAlign(ts, hour):\n",
    "    \"\"\"\n",
    "    time stamp of the given hour within the day that contains ts\n",
    "    Parameters\n",
    "        ts : time stamp in sec\n",
    "        hour : hour of day\n",
    "    \"\"\"\n",
    "    hours = 24 * int(ts / secInDay) + hour\n",
    "    return hours * secInHour\n",
    "\n",
    "def dayAlign(ts):\n",
    "    \"\"\"\n",
    "    time stamp truncated to a whole number of days\n",
    "    Parameters\n",
    "        ts : time stamp in sec\n",
    "    \"\"\"\n",
    "    days = int(ts / secInDay)\n",
    "    return days * secInDay\n",
    "\n",
    "def timeAlign(ts, unit):\n",
    "    \"\"\"\n",
    "    aligns a time stamp to the boundary of the given time unit\n",
    "    Parameters\n",
    "        ts : time stamp in sec\n",
    "        unit : unit of time, s for sec (no change), m for minute, h for hour or d for day\n",
    "    \"\"\"\n",
    "    if unit == \"s\":\n",
    "        return ts\n",
    "    if unit == \"m\":\n",
    "        return minuteAlign(ts)\n",
    "    if unit == \"h\":\n",
    "        return hourAlign(ts)\n",
    "    if unit == \"d\":\n",
    "        return dayAlign(ts)\n",
    "    raise ValueError(\"invalid time unit\")\n",
    "\n",
    "def monthOfYear(ts):\n",
    "    \"\"\"\n",
    "    month of year as 0 based index, computed with years of 365 days split into\n",
    "    12 months of equal length\n",
    "    Parameters\n",
    "        ts : time stamp in sec\n",
    "    \"\"\"\n",
    "    secIntoYear = ts % secInYear\n",
    "    return int(secIntoYear / secInMonth)\n",
    "\n",
    "def dayOfWeek(ts):\n",
    "    \"\"\"\n",
    "    day of week as 0 based index, with index 0 being the week day of time stamp 0\n",
    "    Parameters\n",
    "        ts : time stamp in sec\n",
    "    \"\"\"\n",
    "    secIntoWeek = ts % secInWeek\n",
    "    return int(secIntoWeek / secInDay)\n",
    "\n",
    "def hourOfDay(ts):\n",
    "    \"\"\"\n",
    "    hour of day as 0 based index\n",
    "    Parameters\n",
    "        ts : time stamp in sec\n",
    "    \"\"\"\n",
    "    secIntoDay = ts % secInDay\n",
    "    return int(secIntoDay / secInHour)\n",
    "\n",
    "def processCmdLineArgs(expectedTypes, usage):\n",
    "    \"\"\"\n",
    "    converts command line args to typed values. Exits with status 1 when the number\n",
    "    of args differs from the number of expected types or when a conversion fails\n",
    "    Parameters\n",
    "        expectedTypes : expected data types of arguments\n",
    "        usage : usage message string\n",
    "    \"\"\"\n",
    "    # first item of sys.argv is the script name\n",
    "    if len(sys.argv) - 1 != len(expectedTypes):\n",
    "        print (\"expected number of command line arguments not found\")\n",
    "        print (usage)\n",
    "        sys.exit(1)\n",
    "\n",
    "    args = []\n",
    "    try:\n",
    "        for expType, argVal in zip(expectedTypes, sys.argv[1:]):\n",
    "            # an arg with a type other than these three is skipped\n",
    "            if expType == typeInt:\n",
    "                args.append(int(argVal))\n",
    "            elif expType == typeFloat:\n",
    "                args.append(float(argVal))\n",
    "            elif expType == typeString:\n",
    "                args.append(argVal)\n",
    "    except ValueError:\n",
    "        print (\"expected number of command line arguments found but there is type mis match\")\n",
    "        sys.exit(1)\n",
    "    return args\n",
    "\n",
    "def mutateString(val, numMutate, ctype):\n",
    "    \"\"\"\n",
    "    mutates a string by replacing the characters at distinct random positions\n",
    "    with random characters. Raises ValueError for an unknown character type or when\n",
    "    more mutations are requested than the string has characters\n",
    "    Parameters\n",
    "        val : string value\n",
    "        numMutate : num of mutations\n",
    "        ctype : type of character to mutate with (alpha, num or any)\n",
    "    \"\"\"\n",
    "    if ctype == \"alpha\":\n",
    "        chars = alphaTokens\n",
    "    elif ctype == \"num\":\n",
    "        chars = numTokens\n",
    "    elif ctype == \"any\":\n",
    "        chars = tokens\n",
    "    else:\n",
    "        # an unknown type used to fail later with an unbound local variable\n",
    "        raise ValueError(\"invalid character type \" + str(ctype))\n",
    "    if numMutate > len(val):\n",
    "        # every position is mutated at most once, so the loop below could never finish\n",
    "        raise ValueError(\"num of mutations can not exceed the string length\")\n",
    "\n",
    "    mutations = set()\n",
    "    count = 0\n",
    "    while count < numMutate:\n",
    "        j = randint(0, len(val)-1)\n",
    "        # positions already mutated are skipped and drawn again\n",
    "        if j not in mutations:\n",
    "            ch = selectRandomFromList(chars)\n",
    "            val = val[:j] + ch + val[j+1:]\n",
    "            mutations.add(j)\n",
    "            count += 1\n",
    "    return val\n",
    "\n",
    "def mutateList(values, numMutate, vmin, vmax):\n",
    "    \"\"\"\n",
    "    mutate list multiple times\n",
    "    Parameters\n",
    "        values : list value\n",
    "        numMutate : num of mutations\n",
    "        vmin : minimum of value range\n",
    "        vmax : maximum of value range\n",
    "    \"\"\"\n",
    "    mutations = set()\n",
    "    count = 0\n",
    "    while count < numMutate:\n",
    "        j = randint(0, len(values)-1)\n",
    "        if j not in mutations:\n",
    "            values[j] = np.random.uniform(vmin, vmax)\n",
    "            count += 1\n",
    "    return values\n",
    "\n",
    "\n",
    "def swap(values, first, second):\n",
    "    \"\"\"\n",
    "    swap two elements\n",
    "    Parameters\n",
    "        values : list value\n",
    "        first : first swap position\n",
    "        second : second swap position\n",
    "    \"\"\"\n",
    "    t = values[first]\n",
    "    values[first] = values[second]\n",
    "    values[second] = t\n",
    "\n",
    "def swapBetweenLists(values1, values2):\n",
    "    \"\"\"\n",
    "    swap two elements between 2 lists\n",
    "    Parameters\n",
    "        values1 : first list of values\n",
    "        values2 : second list of values\n",
    "    \"\"\"\n",
    "    p1 = randint(0, len(values1)-1)\n",
    "    p2 = randint(0, len(values2)-1)\n",
    "    tmp = values1[p1]\n",
    "    values1[p1] = values2[p2]\n",
    "    values2[p2] = tmp\n",
    "\n",
    "def safeAppend(values, value):\n",
    "    \"\"\"\n",
    "    append only if not None\n",
    "    Parameters\n",
    "        values : list value\n",
    "        value : value to append\n",
    "    \"\"\"\n",
    "    if value is not None:\n",
    "        values.append(value)\n",
    "\n",
    "def getAllIndex(ldata, fldata):\n",
    "    \"\"\"\n",
    "    get ALL indexes of list elements\n",
    "    Parameters\n",
    "        ldata : list data to find index in\n",
    "        fldata : list data for values for index look up\n",
    "    \"\"\"\n",
    "    return list(map(lambda e : fldata.index(e), ldata))\n",
    "\n",
    "def findIntersection(lOne, lTwo):\n",
    "    \"\"\"\n",
    "    find intersection elements between 2 lists\n",
    "    Parameters\n",
    "        lOne : first list of data\n",
    "        lTwo : second list of data\n",
    "    \"\"\"\n",
    "    sOne = set(lOne)\n",
    "    sTwo = set(lTwo)\n",
    "    sInt = sOne.intersection(sTwo)\n",
    "    return list(sInt)\n",
    "\n",
    "def isIntvOverlapped(rOne, rTwo):\n",
    "    \"\"\"\n",
    "    checks overlap between 2 intervals\n",
    "    Parameters\n",
    "        rOne : first interval boundaries\n",
    "        rTwo : second interval boundaries\n",
    "    \"\"\"\n",
    "    clear = rOne[1] <=  rTwo[0] or rOne[0] >=  rTwo[1] \n",
    "    return not clear\n",
    "\n",
    "def isIntvLess(rOne, rTwo):\n",
    "    \"\"\"\n",
    "    checks if first iterval is less than second\n",
    "    Parameters\n",
    "        rOne : first interval boundaries\n",
    "        rTwo : second interval boundaries\n",
    "    \"\"\"\n",
    "    less = rOne[1] <=  rTwo[0] \n",
    "    return less\n",
    "\n",
    "def findRank(e, values):\n",
    "    \"\"\"\n",
    "    find rank of value in a list\n",
    "    Parameters\n",
    "        e : value to compare with\n",
    "        values : list data\n",
    "    \"\"\"\n",
    "    count =  1\n",
    "    for ve in values:\n",
    "        if ve < e:\n",
    "            count += 1\n",
    "    return count\n",
    "\n",
    "def findRanks(toBeRanked, values):\n",
    "    \"\"\"\n",
    "    finds the rank in a list for each value of another list\n",
    "    Parameters\n",
    "        toBeRanked : list of values for which ranks are found\n",
    "        values : list in which rank is found\n",
    "    \"\"\"\n",
    "    return [findRank(e, values) for e in toBeRanked]\n",
    "\n",
    "def formatFloat(prec, value, label = None):\n",
    "    \"\"\"\n",
    "    formats a float with optional label\n",
    "    Parameters\n",
    "        prec : precision\n",
    "        value : data value\n",
    "        label : label for data\n",
    "    \"\"\"\n",
    "    st = (label + \" \") if label else \"\"\n",
    "    formatter = \"{:.\" + str(prec) + \"f}\" \n",
    "    return st + formatter.format(value)\n",
    "\n",
    "def formatAny(value, label = None):\n",
    "    \"\"\"\n",
    "    formats any obkect with optional label\n",
    "    Parameters\n",
    "        value : data value\n",
    "        label : label for data\n",
    "    \"\"\"\n",
    "    st = (label + \" \") if label else \"\"\n",
    "    return st + str(value)\n",
    "\n",
    "def printList(values):\n",
    "    \"\"\"\n",
    "    pretty print list\n",
    "    Parameters\n",
    "        values : list of values\n",
    "    \"\"\"\n",
    "    for v in values:\n",
    "        print(v)\n",
    "\n",
    "def printMap(values, klab, vlab, precision, offset=16):\n",
    "    \"\"\"\n",
    "    prints a dictionary as two columns with a header line, the key column being\n",
    "    left justified to the offset\n",
    "    Parameters\n",
    "        values : dictionary of values\n",
    "        klab : label for key\n",
    "        vlab : label for value\n",
    "        precision : precision\n",
    "        offset : left justify offset\n",
    "    \"\"\"\n",
    "    print(klab.ljust(offset, \" \") + vlab)\n",
    "    for k, v in values.items():\n",
    "        print(toStr(k, precision).ljust(offset, \" \") + toStr(v, precision))\n",
    "\n",
    "def printPairList(values, lab1, lab2, precision, offset=16):\n",
    "    \"\"\"\n",
    "    prints a list of pairs as two columns with a header line, the first column\n",
    "    being left justified to the offset\n",
    "    Parameters\n",
    "        values : list of pairs\n",
    "        lab1 : first label\n",
    "        lab2 : second label\n",
    "        precision : precision\n",
    "        offset : left justify offset\n",
    "    \"\"\"\n",
    "    print(lab1.ljust(offset, \" \") + lab2)\n",
    "    for (v1, v2) in values:\n",
    "        print(toStr(v1, precision).ljust(offset, \" \") + toStr(v2, precision))\n",
    "\n",
    "def createMap(*values):\n",
    "    \"\"\"\n",
    "    create disctionary with results\n",
    "    Parameters\n",
    "        values : sequence of key value pairs\n",
    "    \"\"\"\n",
    "    result = dict()\n",
    "    for i in range(0, len(values), 2):\n",
    "        result[values[i]] = values[i+1]\n",
    "    return result\n",
    "\n",
    "def getColMinMax(table, col):\n",
    "    \"\"\"\n",
    "    return min, max values of a column\n",
    "    Parameters\n",
    "        table : tabular data\n",
    "        col : column index\n",
    "    \"\"\"\n",
    "    vmin = None\n",
    "    vmax = None\n",
    "    for rec in table:\n",
    "        value = rec[col]\n",
    "        if vmin is None:\n",
    "            vmin = value\n",
    "            vmax = value\n",
    "        else:\n",
    "            if value < vmin:\n",
    "                vmin = value\n",
    "            elif value > vmax:\n",
    "                vmax = value\n",
    "    return (vmin, vmax, vmax - vmin)\n",
    "\n",
    "def createLogger(name, logFilePath, logLevName):\n",
    "    \"\"\"\n",
    "    creates logger\n",
    "    Parameters\n",
    "        name : logger name\n",
    "        logFilePath : log file path\n",
    "        logLevName : log level\n",
    "    \"\"\"\n",
    "    logger = logging.getLogger(name)\n",
    "    fHandler = logging.handlers.RotatingFileHandler(logFilePath, maxBytes=1048576, backupCount=4)\n",
    "    logLev = logLevName.lower()\n",
    "    if logLev == \"debug\":\n",
    "        logLevel = logging.DEBUG\n",
    "    elif logLev == \"info\":\n",
    "        logLevel = logging.INFO\n",
    "    elif logLev == \"warning\":\n",
    "        logLevel = logging.WARNING\n",
    "    elif logLev == \"error\":\n",
    "        logLevel = logging.ERROR\n",
    "    elif logLev == \"critical\":\n",
    "        logLevel = logging.CRITICAL\n",
    "    else:\n",
    "        raise ValueError(\"invalid log level name \" + logLevelName)\n",
    "    fHandler.setLevel(logLevel)\n",
    "    fFormat = logging.Formatter(\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\")\n",
    "    fHandler.setFormatter(fFormat)\n",
    "    logger.addHandler(fHandler)\n",
    "    logger.setLevel(logLevel)\n",
    "    return logger\n",
    "\n",
    "@contextmanager\n",
    "def suppressStdout():\n",
    "    \"\"\"\n",
    "    suppress stdout\n",
    "    Parameters\n",
    "    \"\"\"\n",
    "    with open(os.devnull, \"w\") as devnull:\n",
    "        oldStdout = sys.stdout\n",
    "        sys.stdout = devnull\n",
    "        try:  \n",
    "            yield\n",
    "        finally:\n",
    "            sys.stdout = oldStdout\n",
    "\n",
    "def exitWithMsg(msg):\n",
    "    \"\"\"\n",
    "    print message and exit\n",
    "    Parameters\n",
    "        msg : message\n",
    "    \"\"\"\n",
    "    print(msg + \" -- quitting\")\n",
    "    sys.exit(0)\n",
    "\n",
    "def drawLine(data, yscale=None):\n",
    "    \"\"\"\n",
    "    line plot\n",
    "    Parameters\n",
    "        data : list data\n",
    "        yscale : y axis scale, upper bound of the y axis ticks. No ticks are set when\n",
    "        it is not provided\n",
    "    \"\"\"\n",
    "    plt.plot(data)\n",
    "    if yscale:\n",
    "        # a tenth of the scale, rounded down to a multiple of 10\n",
    "        step = int(yscale / 10)\n",
    "        roundedStep = int(step / 10) * 10\n",
    "        # rounding gives 0 for a scale below 100, which range() rejects. In that case\n",
    "        # the unrounded step is used, but never less than 1\n",
    "        step = roundedStep if roundedStep > 0 else max(step, 1)\n",
    "        plt.yticks(range(0, yscale, step))\n",
    "    plt.show()\n",
    "\n",
    "def drawPlot(x, y, xlabel, ylabel):\n",
    "    \"\"\"\n",
    "    shows a line plot of y against x with labeled axes\n",
    "    Parameters\n",
    "        x : x values\n",
    "        y : y values\n",
    "        xlabel : x axis label\n",
    "        ylabel : y axis label\n",
    "    \"\"\"\n",
    "    plt.plot(x, y)\n",
    "    for setLabel, axisLabel in ((plt.xlabel, xlabel), (plt.ylabel, ylabel)):\n",
    "        setLabel(axisLabel)\n",
    "    plt.show()\n",
    "\n",
    "def drawPairPlot(x, y1, y2, xlabel,ylabel, y1label, y2label):\n",
    "    \"\"\"\n",
    "    shows two line plots over the same x values, with labeled axes and a legend\n",
    "    Parameters\n",
    "        x : x values\n",
    "        y1 : first y values\n",
    "        y2 : second y values\n",
    "        xlabel : x label\n",
    "        ylabel : y label\n",
    "        y1label : first plot label\n",
    "        y2label : second plot label\n",
    "    \"\"\"\n",
    "    for yvalues, plotLabel in ((y1, y1label), (y2, y2label)):\n",
    "        plt.plot(x, yvalues, label = plotLabel)\n",
    "    plt.xlabel(xlabel)\n",
    "    plt.ylabel(ylabel)\n",
    "    plt.legend()\n",
    "    plt.show()\n",
    "\n",
    "def drawHist(ldata, myTitle, myXlabel, myYlabel, nbins=10):\n",
    "    \"\"\"\n",
    "    shows a density normalized histogram with title and labeled axes\n",
    "    Parameters\n",
    "        ldata : list data\n",
    "        myTitle : title\n",
    "        myXlabel : x label\n",
    "        myYlabel : y label\n",
    "        nbins : num of bins\n",
    "    \"\"\"\n",
    "    plt.hist(ldata, bins=nbins, density=True)\n",
    "    plt.xlabel(myXlabel)\n",
    "    plt.ylabel(myYlabel)\n",
    "    plt.title(myTitle)\n",
    "    plt.show()\n",
    "\n",
    "def saveObject(obj, filePath):\n",
    "    \"\"\"\n",
    "    saves an object\n",
    "    Parameters\n",
    "        obj : object\n",
    "        filePath : file path for saved object\n",
    "    \"\"\"\n",
    "    with open(filePath, \"wb\") as outfile:\n",
    "        pickle.dump(obj,outfile)\n",
    "\n",
    "def restoreObject(filePath):\n",
    "    \"\"\"\n",
    "    restores an object\n",
    "    Parameters\n",
    "        filePath : file path to restore object from\n",
    "    \"\"\"\n",
    "    with open(filePath, \"rb\") as infile:\n",
    "        obj = pickle.load(infile)\n",
    "    return obj\n",
    "\n",
    "def isNumeric(data):\n",
    "    \"\"\"\n",
    "    true if all elements int or float\n",
    "    Parameters\n",
    "        data : numeric data list\n",
    "    \"\"\"\n",
    "    if type(data) == list or type(data) == np.ndarray:\n",
    "        col = pd.Series(data)\n",
    "    else:\n",
    "        col = data\n",
    "    return col.dtype == np.int32 or col.dtype == np.int64 or col.dtype == np.float32 or col.dtype == np.float64\n",
    "\n",
    "def isInteger(data):\n",
    "    \"\"\"\n",
    "    true if all elements int \n",
    "    Parameters\n",
    "        data : numeric data list\n",
    "    \"\"\"\n",
    "    if type(data) == list or type(data) == np.ndarray:\n",
    "        col = pd.Series(data)\n",
    "    else:\n",
    "        col = data\n",
    "    return col.dtype == np.int32 or col.dtype == np.int64\n",
    "\n",
    "def isFloat(data):\n",
    "    \"\"\"\n",
    "    true if all elements  float\n",
    "    Parameters\n",
    "        data : numeric data list\n",
    "    \"\"\"\n",
    "    if type(data) == list or type(data) == np.ndarray:\n",
    "        col = pd.Series(data)\n",
    "    else:\n",
    "        col = data\n",
    "    return col.dtype == np.float32 or col.dtype == np.float64\n",
    "\n",
    "def isBinary(data):\n",
    "    \"\"\"\n",
    "    true if all elements either 0 or 1\n",
    "    Parameters\n",
    "        data : binary data\n",
    "    \"\"\"\n",
    "    re = next((d for d in data if not (type(d) == int and (d == 0 or d == 1))), None)\n",
    "    return (re is None)\n",
    "\n",
    "def isCategorical(data):\n",
    "    \"\"\"\n",
    "    true if all elements int or string\n",
    "    Parameters\n",
    "        data : data value\n",
    "    \"\"\"\n",
    "    re = next((d for d in data if not (type(d) == int or type(d) == str)), None)\n",
    "    return (re is None)\n",
    "\n",
    "def assertEqual(value, veq, msg):\n",
    "    \"\"\"\n",
    "    assert equal to\n",
    "    Parameters\n",
    "        value : value\n",
    "        veq : value to be equated with\n",
    "        msg : error msg\n",
    "    \"\"\"\n",
    "    assert value == veq , msg\n",
    "\n",
    "def assertGreater(value, vmin, msg):\n",
    "    \"\"\"\n",
    "    assert greater than \n",
    "    Parameters\n",
    "        value : value\n",
    "        vmin : minimum value\n",
    "        msg : error msg\n",
    "    \"\"\"\n",
    "    assert value > vmin , msg\n",
    "\n",
    "def assertGreaterEqual(value, vmin, msg):\n",
    "    \"\"\"\n",
    "    assert greater than \n",
    "    Parameters\n",
    "        value : value\n",
    "        vmin : minimum value\n",
    "        msg : error msg\n",
    "    \"\"\"\n",
    "    assert value >= vmin , msg\n",
    "\n",
    "def assertLesser(value, vmax, msg):\n",
    "    \"\"\"\n",
    "    assert less than\n",
    "    Parameters\n",
    "        value : value\n",
    "        vmax : maximum value\n",
    "        msg : error msg\n",
    "    \"\"\"\n",
    "    assert value < vmax , msg\n",
    "\n",
    "def assertLesserEqual(value, vmax, msg):\n",
    "    \"\"\"\n",
    "    assert less than\n",
    "    Parameters\n",
    "        value : value\n",
    "        vmax : maximum value\n",
    "        msg : error msg\n",
    "    \"\"\"\n",
    "    assert value <= vmax , msg\n",
    "\n",
    "def assertWithinRange(value, vmin, vmax, msg):\n",
    "    \"\"\"\n",
    "    assert within range\n",
    "    Parameters\n",
    "        value : value\n",
    "        vmin : minimum value\n",
    "        vmax : maximum value\n",
    "        msg : error msg\n",
    "    \"\"\"\n",
    "    assert value >= vmin and value <= vmax, msg\n",
    "\n",
    "def assertInList(value, values, msg):\n",
    "    \"\"\"\n",
    "    assert contains in a list\n",
    "    Parameters\n",
    "        value ; balue to check for inclusion\n",
    "        values : list data\n",
    "        msg : error msg\n",
    "    \"\"\"\n",
    "    assert value in values, msg\n",
    "\n",
    "def maxListDist(l1, l2):\n",
    "    \"\"\"\n",
    "    maximum list element difference between 2 lists\n",
    "    Parameters\n",
    "        l1 : first list data\n",
    "        l2 : second list data\n",
    "    \"\"\"\n",
    "    dist = max(list(map(lambda v : abs(v[0] - v[1]), zip(l1, l2))))\t\n",
    "    return dist\n",
    "\n",
    "def fileLineCount(fPath):\n",
    "    \"\"\" \n",
    "    number of lines ina file \n",
    "    Parameters\n",
    "        fPath : file path\n",
    "    \"\"\"\n",
    "    with open(fPath) as f:\n",
    "        for i, li in enumerate(f):\n",
    "            pass\n",
    "    return (i + 1)\n",
    "\n",
    "def getAlphaNumCharCount(sdata):\n",
    "    \"\"\" \n",
    "    number of alphabetic and numeric charcters in a string \n",
    "    Parameters\n",
    "        sdata : string data\n",
    "    \"\"\"\n",
    "    acount = 0\n",
    "    ncount = 0\n",
    "    scount = 0\n",
    "    ocount = 0\n",
    "    assertEqual(type(sdata), str, \"input must be string\")\n",
    "    for c in sdata:\n",
    "        if c.isnumeric():\n",
    "            ncount += 1\n",
    "        elif c.isalpha():\n",
    "            acount += 1\n",
    "        elif c.isspace():\n",
    "            scount += 1\n",
    "        else:\n",
    "            ocount += 1\n",
    "    r = (acount, ncount, ocount)\n",
    "    return r\n",
    "\n",
    "class StepFunction:\n",
    "    \"\"\"\n",
    "    step function\n",
    "    Parameters\n",
    "    \"\"\"\n",
    "    def __init__(self,  *values):\n",
    "        \"\"\"\n",
    "        initilizer\n",
    "\n",
    "        Parameters\n",
    "            values : list of tuples, wich each tuple containing 2 x values and corresponding y value\n",
    "        \"\"\"\n",
    "        self.points = values\n",
    "\n",
    "    def find(self, x):\n",
    "        \"\"\"\n",
    "        finds step function value\n",
    "\n",
    "        Parameters\n",
    "            x : x value\n",
    "        \"\"\"\n",
    "        found = False\n",
    "        y = 0\n",
    "        for p in self.points:\n",
    "            if (x >= p[0] and x < p[1]):\n",
    "                y = p[2]\n",
    "                found = True\n",
    "                break\n",
    "\n",
    "        if not found:\n",
    "            l = len(self.points)\n",
    "            if (x < self.points[0][0]):\n",
    "                y = self.points[0][2]\n",
    "            elif (x > self.points[l-1][1]):\n",
    "                y = self.points[l-1][2]\n",
    "        return y\n",
    "\n",
    "\n",
    "class DummyVarGenerator:\n",
    "    \"\"\"\n",
    "    dummy variable generator for categorical variable\n",
    "    \"\"\"\n",
    "    def __init__(self,  rowSize, catValues, trueVal, falseVal, delim=None):\n",
    "        \"\"\"\n",
    "        initilizer\n",
    "\n",
    "        Parameters\n",
    "            rowSize : row size\n",
    "            catValues : dictionary with field index as key and list of categorical values as value\n",
    "            trueVal : true value, typically \"1\"\n",
    "            falseval : false value , typically \"0\"\n",
    "            delim : field delemeter\n",
    "        \"\"\"\n",
    "        self.rowSize = rowSize\n",
    "        self.catValues = catValues\n",
    "        numCatVar = len(catValues)\n",
    "        colCount = 0\n",
    "        for v in self.catValues.values():\n",
    "            colCount += len(v)\n",
    "        self.newRowSize = rowSize - numCatVar + colCount\n",
    "        #print (\"new row size {}\".format(self.newRowSize))\n",
    "        self.trueVal = trueVal\n",
    "        self.falseVal = falseVal\n",
    "        self.delim = delim\n",
    "\n",
    "    def processRow(self, row):\n",
    "        \"\"\"\n",
    "        encodes categorical variables, returning as delemeter separate dstring or list\n",
    "\n",
    "        Parameters\n",
    "            row : row either delemeter separated string or list\n",
    "        \"\"\"\n",
    "        if self.delim is not None:\n",
    "            rowArr = row.split(self.delim)\n",
    "            msg = \"row does not have expected number of columns found \" + str(len(rowArr)) + \" expected \" + str(self.rowSize)\n",
    "            assert len(rowArr) == self.rowSize, msg\n",
    "        else:\n",
    "            rowArr = row\n",
    "\n",
    "        newRowArr = []\n",
    "        for i in range(len(rowArr)):\n",
    "            curVal = rowArr[i]\n",
    "            if (i in self.catValues):\n",
    "                values = self.catValues[i]\n",
    "                for val in values:\n",
    "                    if val == curVal:\n",
    "                        newVal = self.trueVal\n",
    "                    else:\n",
    "                        newVal = self.falseVal\n",
    "                    newRowArr.append(newVal)\n",
    "            else:\n",
    "                newRowArr.append(curVal)\n",
    "        assert len(newRowArr) == self.newRowSize, \"invalid new row size \" + str(len(newRowArr)) + \" expected \" + str(self.newRowSize)\n",
    "        encRow = self.delim.join(newRowArr) if self.delim is not None else newRowArr\n",
    "        return encRow\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}