{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6a1f0b3c",
   "metadata": {},
   "source": [
    "# ML utilities\n",
    "\n",
    "Helper library kept in a notebook: typed configuration, synthetic classification data, distance and similarity measures, vector / sequence / matrix helpers, probability distribution mutation, distribution shift and rolling statistics.\n",
    "\n",
    "Run the cells top to bottom. The import cell needs `jprops`, `Levenshtein`, `scikit-learn` and the `util` and `sampler` modules to be importable by the kernel."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d05ce02",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import numpy as np\n",
    "from sklearn import preprocessing\n",
    "from sklearn import metrics\n",
    "from sklearn.datasets import make_blobs\n",
    "from sklearn.datasets import make_classification\n",
    "import random\n",
    "from math import *\n",
    "from decimal import Decimal\n",
    "import statistics\n",
    "import jprops\n",
    "from Levenshtein import distance as ld\n",
    "from util import *\n",
    "from sampler import *"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c4e7d21",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7a3f5e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Configuration:\n",
    "    \"\"\"\n",
    "    Configuration management. Supports default value, mandatory value and typed value.\n",
    "\n",
    "    Raw values are kept as read from the properties file. The value 'none' (any case)\n",
    "    marks a parameter as not set and the value '_' asks for the default registered in\n",
    "    defValues. Every typed getter returns a tuple (value, isDefault).\n",
    "    \"\"\"\n",
    "    def __init__(self, configFile, defValues, verbose=False):\n",
    "        \"\"\"\n",
    "        initializer\n",
    "\n",
    "        Parameters\n",
    "            configFile : config file path\n",
    "            defValues : dictionary of default values, each a tuple (value, errorMsg) where a\n",
    "                not None errorMsg makes the parameter mandatory\n",
    "            verbose : verbosity flag\n",
    "        \"\"\"\n",
    "        with open(configFile) as fp:\n",
    "            self.configs = dict(jprops.iter_properties(fp))\n",
    "        self.defValues = defValues\n",
    "        self.verbose = verbose\n",
    "\n",
    "    def override(self, configFile):\n",
    "        \"\"\"\n",
    "        over ride configuration from file\n",
    "\n",
    "        Parameters\n",
    "            configFile : override config file path\n",
    "        \"\"\"\n",
    "        with open(configFile) as fp:\n",
    "            self.configs.update(jprops.iter_properties(fp))\n",
    "\n",
    "    def setParam(self, name, value):\n",
    "        \"\"\"\n",
    "        override individual configuration\n",
    "\n",
    "        Parameters\n",
    "            name : config param name\n",
    "            value : config param value\n",
    "        \"\"\"\n",
    "        self.configs[name] = value\n",
    "\n",
    "    def _getTypedConfig(self, name, convert, fmt=\"{}\"):\n",
    "        \"\"\"\n",
    "        resolves a param through the not set, default and explicit value cases\n",
    "\n",
    "        Parameters\n",
    "            name : config param name\n",
    "            convert : function turning the raw config value into the typed value\n",
    "            fmt : format used for the typed value in verbose output\n",
    "        \"\"\"\n",
    "        if self.isNone(name):\n",
    "            val = (None, False)\n",
    "        elif self.isDefault(name):\n",
    "            val = (self.handleDefault(name), True)\n",
    "        else:\n",
    "            val = (convert(self.configs[name]), False)\n",
    "        if self.verbose:\n",
    "            try:\n",
    "                typedStr = fmt.format(val[0])\n",
    "            except (TypeError, ValueError):\n",
    "                # None or a default of another type does not fit a numeric format\n",
    "                typedStr = str(val[0])\n",
    "            print(\"{} {} {}\".format(name, self.configs[name], typedStr))\n",
    "        return val\n",
    "\n",
    "    def getStringConfig(self, name):\n",
    "        \"\"\"\n",
    "        get string param\n",
    "\n",
    "        Parameters\n",
    "            name : config param name\n",
    "        \"\"\"\n",
    "        return self._getTypedConfig(name, lambda v : v)\n",
    "\n",
    "    def getIntConfig(self, name):\n",
    "        \"\"\"\n",
    "        get int param\n",
    "\n",
    "        Parameters\n",
    "            name : config param name\n",
    "        \"\"\"\n",
    "        return self._getTypedConfig(name, int)\n",
    "\n",
    "    def getFloatConfig(self, name):\n",
    "        \"\"\"\n",
    "        get float param\n",
    "\n",
    "        Parameters\n",
    "            name : config param name\n",
    "        \"\"\"\n",
    "        return self._getTypedConfig(name, float, \"{:06.3f}\")\n",
    "\n",
    "    def getBooleanConfig(self, name):\n",
    "        \"\"\"\n",
    "        get boolean param, only the text 'true' (any case) is True\n",
    "\n",
    "        Parameters\n",
    "            name : config param name\n",
    "        \"\"\"\n",
    "        # str() so that a bool stored through setParam is handled as well\n",
    "        return self._getTypedConfig(name, lambda v : str(v).lower() == \"true\")\n",
    "\n",
    "    def getIntListConfig(self, name, delim=\",\"):\n",
    "        \"\"\"\n",
    "        get int list param, the raw value is parsed by strListOrRangeToIntArray\n",
    "\n",
    "        Parameters\n",
    "            name : config param name\n",
    "            delim : delemeter, accepted for symmetry with the other list getters but not used\n",
    "        \"\"\"\n",
    "        return self._getTypedConfig(name, strListOrRangeToIntArray)\n",
    "\n",
    "    def getFloatListConfig(self, name, delim=\",\"):\n",
    "        \"\"\"\n",
    "        get float list param\n",
    "\n",
    "        Parameters\n",
    "            name : config param name\n",
    "            delim : delemeter\n",
    "        \"\"\"\n",
    "        return self._getTypedConfig(name, lambda v : strToFloatArray(v, delim))\n",
    "\n",
    "    def getStringListConfig(self, name, delim=\",\"):\n",
    "        \"\"\"\n",
    "        get string list param\n",
    "\n",
    "        Parameters\n",
    "            name : config param name\n",
    "            delim : delemeter\n",
    "        \"\"\"\n",
    "        return self._getTypedConfig(name, lambda v : v.split(delim))\n",
    "\n",
    "    def handleDefault(self, name):\n",
    "        \"\"\"\n",
    "        handles default, raises ValueError with the registered message for a mandatory param\n",
    "\n",
    "        Parameters\n",
    "            name : config param name\n",
    "        \"\"\"\n",
    "        dVal = self.defValues[name]\n",
    "        if dVal[1] is not None:\n",
    "            raise ValueError(dVal[1])\n",
    "        return dVal[0]\n",
    "\n",
    "    def isNone(self, name):\n",
    "        \"\"\"\n",
    "        true is value is None\n",
    "\n",
    "        Parameters\n",
    "            name : config param name\n",
    "        \"\"\"\n",
    "        val = self.configs[name]\n",
    "        # values stored through setParam are not necessarily strings\n",
    "        return isinstance(val, str) and val.lower() == \"none\"\n",
    "\n",
    "    def isDefault(self, name):\n",
    "        \"\"\"\n",
    "        true if the value is default\n",
    "\n",
    "        Parameters\n",
    "            name : config param name\n",
    "        \"\"\"\n",
    "        return self.configs[name] == \"_\"\n",
    "\n",
    "    def _eitherOrConfig(self, getter, firstName, secondName):\n",
    "        \"\"\"\n",
    "        returns (first, second) where exactly one of the two is set and the other is None\n",
    "\n",
    "        Parameters\n",
    "            getter : typed getter used for the param that is set\n",
    "            firstName : first parameter name\n",
    "            secondName : second parameter name\n",
    "        \"\"\"\n",
    "        firstSet = not self.isNone(firstName)\n",
    "        secondSet = not self.isNone(secondName)\n",
    "        if firstSet and secondSet:\n",
    "            raise ValueError(\"only one of the two parameters should be set and not both \" + firstName + \" \" + secondName)\n",
    "        if not firstSet and not secondSet:\n",
    "            raise ValueError(\"at least one of the two parameters should be set \" + firstName + \" \" + secondName)\n",
    "        if firstSet:\n",
    "            return (getter(firstName)[0], None)\n",
    "        return (None, getter(secondName)[0])\n",
    "\n",
    "    def eitherOrStringConfig(self, firstName, secondName):\n",
    "        \"\"\"\n",
    "        returns one of two string parameters\n",
    "\n",
    "        Parameters\n",
    "            firstName : first parameter name\n",
    "            secondName : second parameter name\n",
    "        \"\"\"\n",
    "        return self._eitherOrConfig(self.getStringConfig, firstName, secondName)\n",
    "\n",
    "    def eitherOrIntConfig(self, firstName, secondName):\n",
    "        \"\"\"\n",
    "        returns one of two int parameters\n",
    "\n",
    "        Parameters\n",
    "            firstName : first parameter name\n",
    "            secondName : second parameter name\n",
    "        \"\"\"\n",
    "        return self._eitherOrConfig(self.getIntConfig, firstName, secondName)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e8d1c56",
   "metadata": {},
   "source": [
    "## Synthetic data generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f02b9a47",
   "metadata": {},
   "outputs": [],
   "source": [
    "class CatLabelGenerator:\n",
    "    \"\"\"\n",
    "    label generator for categorical variables\n",
    "    \"\"\"\n",
    "    def __init__(self, catValues, delim):\n",
    "        \"\"\"\n",
    "        initilizers, fits one label encoder per categorical column\n",
    "\n",
    "        Parameters\n",
    "            catValues : dictionary of categorical values keyed by column index\n",
    "            delim : delemeter\n",
    "        \"\"\"\n",
    "        self.catValues = catValues\n",
    "        self.delim = delim\n",
    "        self.encoders = {}\n",
    "        for k, values in self.catValues.items():\n",
    "            le = preprocessing.LabelEncoder()\n",
    "            le.fit(values)\n",
    "            self.encoders[k] = le\n",
    "\n",
    "    def processRow(self, row):\n",
    "        \"\"\"\n",
    "        encode row categorical values, other fields are passed through\n",
    "\n",
    "        Parameters\n",
    "            row : data row as delemeter separated string\n",
    "        \"\"\"\n",
    "        rowArr = row.split(self.delim)\n",
    "        for i, curVal in enumerate(rowArr):\n",
    "            if i in self.catValues:\n",
    "                assert curVal in self.catValues[i], \"categorival value invalid\"\n",
    "                rowArr[i] = str(self.encoders[i].transform([curVal])[0])\n",
    "        return self.delim.join(rowArr)\n",
    "\n",
    "    def getOrigLabels(self, indx):\n",
    "        \"\"\"\n",
    "        get original labels\n",
    "\n",
    "        Parameters\n",
    "            indx : column index\n",
    "        \"\"\"\n",
    "        return self.encoders[indx].classes_\n",
    "\n",
    "\n",
    "class SupvLearningDataGenerator:\n",
    "    \"\"\"\n",
    "    data generator for supervised learning\n",
    "    \"\"\"\n",
    "    def __init__(self, configFile):\n",
    "        \"\"\"\n",
    "        initilizers\n",
    "\n",
    "        Parameters\n",
    "            configFile : config file path\n",
    "        \"\"\"\n",
    "        defValues = {\n",
    "            \"common.num.samp\" : (100, None),\n",
    "            \"common.num.feat\" : (5, None),\n",
    "            \"common.feat.trans\" : (None, None),\n",
    "            \"common.feat.types\" : (None, \"missing feature types\"),\n",
    "            \"common.cat.feat.distr\" : (None, None),\n",
    "            \"common.output.precision\" : (3, None),\n",
    "            \"common.error\" : (0.01, None),\n",
    "            \"class.gen.technique\" : (\"blob\", None),\n",
    "            \"class.num.feat.informative\" : (2, None),\n",
    "            \"class.num.feat.redundant\" : (2, None),\n",
    "            \"class.num.feat.repeated\" : (0, None),\n",
    "            \"class.num.feat.cat\" : (0, None),\n",
    "            \"class.num.class\" : (2, None),\n",
    "        }\n",
    "        self.config = Configuration(configFile, defValues)\n",
    "\n",
    "    @staticmethod\n",
    "    def _buildFeatTrans(ftrans, nfeat):\n",
    "        \"\"\"\n",
    "        builds the per feature (shift, scale) dictionary\n",
    "\n",
    "        Parameters\n",
    "            ftrans : flat list shift0, scale0, shift1, scale1, ... or None\n",
    "            nfeat : number of numerical features\n",
    "        \"\"\"\n",
    "        # a feature without configured transform is left unchanged\n",
    "        feTrans = {j : (0.0, 1.0) for j in range(nfeat)}\n",
    "        if ftrans:\n",
    "            if len(ftrans) % 2 != 0:\n",
    "                raise ValueError(\"feature transform needs a shift and a scale value for each feature\")\n",
    "            for i in range(0, len(ftrans), 2):\n",
    "                feTrans[i // 2] = (ftrans[i], ftrans[i+1])\n",
    "        return feTrans\n",
    "\n",
    "    def genClassifierData(self):\n",
    "        \"\"\"\n",
    "        generates classifier data, yields one comma separated record per sample with the\n",
    "        numerical features first, then the categorical features and the class value last\n",
    "        \"\"\"\n",
    "        nsamp = self.config.getIntConfig(\"common.num.samp\")[0]\n",
    "        nfeat = self.config.getIntConfig(\"common.num.feat\")[0]\n",
    "        nclass = self.config.getIntConfig(\"class.num.class\")[0]\n",
    "\n",
    "        #transform with shift and scale\n",
    "        ftrans = self.config.getFloatListConfig(\"common.feat.trans\")[0]\n",
    "        feTrans = self._buildFeatTrans(ftrans, nfeat)\n",
    "\n",
    "        ftypes = self.config.getStringListConfig(\"common.feat.types\")[0]\n",
    "        if ftypes is None:\n",
    "            raise ValueError(\"missing feature types\")\n",
    "\n",
    "        # categorical feature distribution, each item is featIndex:classValue:val1:prob1:val2:prob2...\n",
    "        feCatDist = dict()\n",
    "        fcatdl = self.config.getStringListConfig(\"common.cat.feat.distr\")[0]\n",
    "        for fcatds in (fcatdl or []):\n",
    "            fcatd = fcatds.split(\":\")\n",
    "            key = (int(fcatd[0]), int(fcatd[1]))\t\t#feature index and class value\n",
    "            dist = [(fcatd[i], float(fcatd[i+1])) for i in range(2, len(fcatd), 2)]\n",
    "            feCatDist[key] = CategoricalRejectSampler(*dist)\n",
    "\n",
    "        genTechnique = self.config.getStringConfig(\"class.gen.technique\")[0]\n",
    "        error = self.config.getFloatConfig(\"common.error\")[0]\n",
    "        if genTechnique == \"blob\":\n",
    "            features, claz = make_blobs(n_samples=nsamp, centers=nclass, n_features=nfeat)\n",
    "            #shift and scale\n",
    "            for j in range(nfeat):\n",
    "                (shift, scale) = feTrans[j]\n",
    "                features[:,j] = (features[:,j] + shift) * scale\n",
    "            # label noise, a label is replaced by a random class with probability error\n",
    "            claz = np.array([random.randint(0, nclass-1) if random.random() < error else c for c in claz])\n",
    "        elif genTechnique == \"classify\":\n",
    "            nfeatInfo = self.config.getIntConfig(\"class.num.feat.informative\")[0]\n",
    "            nfeatRed = self.config.getIntConfig(\"class.num.feat.redundant\")[0]\n",
    "            nfeatRep = self.config.getIntConfig(\"class.num.feat.repeated\")[0]\n",
    "            shifts = [feTrans[i][0] for i in range(nfeat)]\n",
    "            scales = [feTrans[i][1] for i in range(nfeat)]\n",
    "            features, claz = make_classification(n_samples=nsamp, n_features=nfeat, n_informative=nfeatInfo, n_redundant=nfeatRed,\n",
    "                n_repeated=nfeatRep, n_classes=nclass, flip_y=error, shift=shifts, scale=scales)\n",
    "        else:\n",
    "            raise ValueError(\"invalid genaration technique\")\n",
    "\n",
    "        # add categorical features and format\n",
    "        nCatFeat = self.config.getIntConfig(\"class.num.feat.cat\")[0]\n",
    "        prec = self.config.getIntConfig(\"common.output.precision\")[0]\n",
    "        for f, c in zip(features, claz):\n",
    "            fields = [self.numFeToStr(f[i], ftypes[i], prec) for i in range(nfeat)]\n",
    "            fields.extend(self.catFe(i, c, ftypes[i], feCatDist) for i in range(nfeat, nfeat + nCatFeat))\n",
    "            fields.append(str(c))\n",
    "            yield \",\".join(fields)\n",
    "\n",
    "    def numFeToStr(self, fv, ft, prec):\n",
    "        \"\"\"\n",
    "        nummeric feature value to string\n",
    "\n",
    "        Parameters\n",
    "            fv : field value\n",
    "            ft : field data type\n",
    "            prec : precision\n",
    "        \"\"\"\n",
    "        if ft == \"float\":\n",
    "            return formatFloat(prec, fv)\n",
    "        if ft == \"int\":\n",
    "            return str(int(fv))\n",
    "        raise ValueError(\"invalid type expecting float or int\")\n",
    "\n",
    "    def catFe(self, i, cv, ft, feCatDist):\n",
    "        \"\"\"\n",
    "        generate categorical feature\n",
    "\n",
    "        Parameters\n",
    "            i : col index\n",
    "            cv : class value\n",
    "            ft : field data type\n",
    "            feCatDist : cat value distribution keyed by (col index, class value)\n",
    "        \"\"\"\n",
    "        if ft != \"cat\":\n",
    "            raise ValueError(\"invalid type expecting categorical\")\n",
    "        return feCatDist[(i, cv)].sample()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4d7c2e19",
   "metadata": {},
   "source": [
    "## Data loading and sampling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81e6b0d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def loadDataFile(file, delim, cols, colIndices):\n",
    "    \"\"\"\n",
    "    loads delim separated file and extracts columns, returns (all loaded data, extracted columns)\n",
    "\n",
    "    Parameters\n",
    "        file : file path\n",
    "        delim : delemeter\n",
    "        cols : columns to use from file\n",
    "        colIndices : columns to extract, as indexes into the loaded columns\n",
    "    \"\"\"\n",
    "    data = np.loadtxt(file, delimiter=delim, usecols=cols)\n",
    "    return (data, data[:,colIndices])\n",
    "\n",
    "def loadFeatDataFile(file, delim, cols):\n",
    "    \"\"\"\n",
    "    loads the given columns of a delim separated file\n",
    "\n",
    "    Parameters\n",
    "        file : file path\n",
    "        delim : delemeter\n",
    "        cols : columns to use from file\n",
    "    \"\"\"\n",
    "    return np.loadtxt(file, delimiter=delim, usecols=cols)\n",
    "\n",
    "def extrColumns(arr, columns):\n",
    "    \"\"\"\n",
    "    extracts columns\n",
    "\n",
    "    Parameters\n",
    "        arr : 2D array\n",
    "        columns : columns\n",
    "    \"\"\"\n",
    "    return arr[:, columns]\n",
    "\n",
    "def subSample(featData, clsData, subSampleRate, withReplacement):\n",
    "    \"\"\"\n",
    "    subsample feature and class label data, the same rows are taken from both\n",
    "\n",
    "    Parameters\n",
    "        featData : 2D array of feature data\n",
    "        clsData : arrray of class labels\n",
    "        subSampleRate : fraction to be sampled\n",
    "        withReplacement : true if sampling with replacement\n",
    "    \"\"\"\n",
    "    nrow = featData.shape[0]\n",
    "    sampledIndx = np.random.choice(nrow, int(nrow * subSampleRate), replace=withReplacement)\n",
    "    return (featData[sampledIndx], clsData[sampledIndx])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c5f94a72",
   "metadata": {},
   "source": [
    "## Distance and similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27b8e3f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def euclideanDistance(x,y):\n",
    "    \"\"\"\n",
    "    euclidean distance\n",
    "\n",
    "    Parameters\n",
    "        x : first vector\n",
    "        y : second vector\n",
    "    \"\"\"\n",
    "    return sqrt(sum(pow(a-b, 2) for a, b in zip(x, y)))\n",
    "\n",
    "def squareRooted(x):\n",
    "    \"\"\"\n",
    "    square root of sum square, rounded to 3 decimal places\n",
    "\n",
    "    Parameters\n",
    "        x : data vector\n",
    "    \"\"\"\n",
    "    return round(sqrt(sum(a*a for a in x)), 3)\n",
    "\n",
    "def cosineSimilarity(x,y):\n",
    "    \"\"\"\n",
    "    cosine similarity, rounded to 3 decimal places\n",
    "\n",
    "    Parameters\n",
    "        x : first vector\n",
    "        y : second vector\n",
    "    \"\"\"\n",
    "    numerator = sum(a*b for a,b in zip(x,y))\n",
    "    denominator = squareRooted(x) * squareRooted(y)\n",
    "    return round(numerator / float(denominator), 3)\n",
    "\n",
    "def cosineDistance(x,y):\n",
    "    \"\"\"\n",
    "    cosine distance\n",
    "\n",
    "    Parameters\n",
    "        x : first vector\n",
    "        y : second vector\n",
    "    \"\"\"\n",
    "    return 1.0 - cosineSimilarity(x,y)\n",
    "\n",
    "def manhattanDistance(x,y):\n",
    "    \"\"\"\n",
    "    manhattan distance\n",
    "\n",
    "    Parameters\n",
    "        x : first vector\n",
    "        y : second vector\n",
    "    \"\"\"\n",
    "    return sum(abs(a-b) for a,b in zip(x,y))\n",
    "\n",
    "def nthRoot(value, nRoot):\n",
    "    \"\"\"\n",
    "    nth root, computed with Decimal and rounded to 3 decimal places\n",
    "\n",
    "    Parameters\n",
    "        value : data value\n",
    "        nRoot : root\n",
    "    \"\"\"\n",
    "    rootValue = 1/float(nRoot)\n",
    "    return round(Decimal(value) ** Decimal(rootValue), 3)\n",
    "\n",
    "def minkowskiDistance(x,y,pValue):\n",
    "    \"\"\"\n",
    "    minkowski distance\n",
    "\n",
    "    Parameters\n",
    "        x : first vector\n",
    "        y : second vector\n",
    "        pValue : power factor\n",
    "    \"\"\"\n",
    "    return nthRoot(sum(pow(abs(a-b),pValue) for a,b in zip(x, y)), pValue)\n",
    "\n",
    "def jaccardSimilarityX(x,y):\n",
    "    \"\"\"\n",
    "    jaccard similarity\n",
    "\n",
    "    Parameters\n",
    "        x : first vector\n",
    "        y : second vector\n",
    "    \"\"\"\n",
    "    sx = set(x)\n",
    "    sy = set(y)\n",
    "    return len(sx & sy) / float(len(sx | sy))\n",
    "\n",
    "def jaccardSimilarity(x,y,wx=1.0,wy=1.0):\n",
    "    \"\"\"\n",
    "    jaccard similarity where the elements found only in x or only in y are weighted\n",
    "\n",
    "    Parameters\n",
    "        x : first vector\n",
    "        y : second vector\n",
    "        wx : weight for x\n",
    "        wy : weight for y\n",
    "    \"\"\"\n",
    "    sx = set(x)\n",
    "    sy = set(y)\n",
    "    common = sx & sy\n",
    "    intCardinality = len(common)\n",
    "    return intCardinality / float(intCardinality + wx * len(sx - common) + wy * len(sy - common))\n",
    "\n",
    "def levenshteinSimilarity(s1, s2):\n",
    "    \"\"\"\n",
    "    Levenshtein similarity for strings, 1 for equal strings and 0 when the edit distance\n",
    "    reaches the length of the longer string\n",
    "\n",
    "    Parameters\n",
    "        s1 : first string\n",
    "        s2 : second string\n",
    "    \"\"\"\n",
    "    assert isinstance(s1, str) and isinstance(s2, str), \"Levenshtein similarity is for string only\"\n",
    "    l = max(len(s1),len(s2))\n",
    "    if l == 0:\n",
    "        # two empty strings are identical, also avoids division by zero\n",
    "        return 1.0\n",
    "    return 1.0 - min(ld(s1,s2)/l, 1.0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1a06d95",
   "metadata": {},
   "source": [
    "## Vector and sequence helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58c3f7b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def norm(values, po=2):\n",
    "    \"\"\"\n",
    "    divides each value by the po norm of the list\n",
    "\n",
    "    Parameters\n",
    "        values : list of values\n",
    "        po : power\n",
    "    \"\"\"\n",
    "    no = sum(pow(v,po) for v in values)\n",
    "    no = pow(no,1.0/po)\n",
    "    return [v/no for v in values]\n",
    "\n",
    "def createOneHotVec(size, indx = -1):\n",
    "    \"\"\"\n",
    "    one hot vector, at a random position when no position is given\n",
    "\n",
    "    Parameters\n",
    "        size : vector size\n",
    "        indx : one hot position, random if negative\n",
    "    \"\"\"\n",
    "    vec = [0] * size\n",
    "    pos = random.randint(0, size - 1) if indx < 0 else indx\n",
    "    vec[pos] = 1\n",
    "    return vec\n",
    "\n",
    "def createAllOneHotVec(size):\n",
    "    \"\"\"\n",
    "    create all one hot vectors\n",
    "\n",
    "    Parameters\n",
    "        size : vector size and no of vectors\n",
    "    \"\"\"\n",
    "    return [[1 if j == i else 0 for j in range(size)] for i in range(size)]\n",
    "\n",
    "def blockShuffle(data, blockSize):\n",
    "    \"\"\"\n",
    "    block shuffle. Blocks are drawn at random with replacement, so a block can appear more\n",
    "    than once or not at all and the result length can differ from the input length\n",
    "\n",
    "    Parameters\n",
    "        data : list data\n",
    "        blockSize : block size\n",
    "    \"\"\"\n",
    "    numBlock = int(len(data) / blockSize)\n",
    "    if len(data) % blockSize > 0:\n",
    "        numBlock += 1\n",
    "    shuffled = list()\n",
    "    for _ in range(numBlock):\n",
    "        beg = random.randint(0, numBlock-1) * blockSize\n",
    "        # slicing clips at the end of data, which covers the shorter last block\n",
    "        shuffled.extend(data[beg:beg + blockSize])\n",
    "    return shuffled\n",
    "\n",
    "def shuffle(data, numShuffle):\n",
    "    \"\"\"\n",
    "    shuffle data in place by randonm swapping\n",
    "\n",
    "    Parameters\n",
    "        data : list data\n",
    "        numShuffle : no of pairwise swaps, half the list size if None\n",
    "    \"\"\"\n",
    "    sz = len(data)\n",
    "    if numShuffle is None:\n",
    "        numShuffle = int(sz / 2)\n",
    "    for _ in range(numShuffle):\n",
    "        fi = random.randint(0, sz -1)\n",
    "        se = random.randint(0, sz -1)\n",
    "        data[fi], data[se] = data[se], data[fi]\n",
    "\n",
    "def randomWalk(size, start, lowStep, highStep):\n",
    "    \"\"\"\n",
    "    random walk generator\n",
    "\n",
    "    Parameters\n",
    "        size : number of positions to generate\n",
    "        start : initial position\n",
    "        lowStep : step min\n",
    "        highStep : step max\n",
    "    \"\"\"\n",
    "    cur = start\n",
    "    for _ in range(size):\n",
    "        yield cur\n",
    "        cur += randomFloat(lowStep, highStep)\n",
    "\n",
    "def binaryEcodeCategorical(values, value):\n",
    "    \"\"\"\n",
    "    one hot binary encoding\n",
    "\n",
    "    Parameters\n",
    "        values : list of values\n",
    "        value : value to be replaced with 1\n",
    "    \"\"\"\n",
    "    return [1 if v == value else 0 for v in values]\n",
    "\n",
    "def createLabeledSeqFromData(inputData, tw):\n",
    "    \"\"\"\n",
    "    Creates feature, label pair from sequence data, where we have tw number of features followed by output\n",
    "\n",
    "    Parameters\n",
    "        inputData : list containing feature and label\n",
    "        tw : no of features\n",
    "    \"\"\"\n",
    "    features = list()\n",
    "    labels = list()\n",
    "    for i in range(len(inputData) - tw):\n",
    "        features.append(inputData[i:i+tw])\n",
    "        labels.append(inputData[i+tw])\n",
    "    return (features, labels)\n",
    "\n",
    "def createLabeledSeq(filePath, delim, index, tw):\n",
    "    \"\"\"\n",
    "    Creates feature, label pair from 1D sequence data in file\n",
    "\n",
    "    Parameters\n",
    "        filePath : file path\n",
    "        delim : delemeter\n",
    "        index : column index\n",
    "        tw : no of features\n",
    "    \"\"\"\n",
    "    seqData = getFileColumnAsFloat(filePath, delim, index)\n",
    "    return createLabeledSeqFromData(seqData, tw)\n",
    "\n",
    "def fromMultDimSeqToTabular(data, inpSize, seqLen):\n",
    "    \"\"\"\n",
    "    Input shape (nrow, inpSize * seqLen) output shape(nrow * seqLen, inpSize)\n",
    "\n",
    "    Parameters\n",
    "        data : 2D array\n",
    "        inpSize : each input size in sequence\n",
    "        seqLen : sequence length\n",
    "    \"\"\"\n",
    "    assert data.shape[1] == inpSize * seqLen, \"invalid input size or sequence length\"\n",
    "    return data.reshape(data.shape[0] * seqLen, inpSize)\n",
    "\n",
    "def fromTabularToMultDimSeq(data, inpSize, seqLen):\n",
    "    \"\"\"\n",
    "    Input shape (nrow * seqLen, inpSize) output shape (nrow, inpSize * seqLen)\n",
    "\n",
    "    Parameters\n",
    "        data : 2D array\n",
    "        inpSize : each input size in sequence\n",
    "        seqLen : sequence length\n",
    "    \"\"\"\n",
    "    assert data.shape[1] == inpSize, \"invalid input size\"\n",
    "    nrow = data.shape[0] // seqLen\n",
    "    return data.reshape(nrow, seqLen * inpSize)\n",
    "\n",
    "def difference(data, interval=1):\n",
    "    \"\"\"\n",
    "    takes difference in time series data\n",
    "\n",
    "    Parameters\n",
    "        data : list data\n",
    "        interval : interval for difference\n",
    "    \"\"\"\n",
    "    return [data[i] - data[i - interval] for i in range(interval, len(data))]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b9d4e6a",
   "metadata": {},
   "source": [
    "## Normalization, scaling and metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d46a1c8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalizeMatrix(data, norm, axis=1):\n",
    "    \"\"\"\n",
    "    normalizes each row (axis 1) or column (axis 0) of the matrix\n",
    "\n",
    "    Parameters\n",
    "        data : 2D data\n",
    "        norm : normalization method\n",
    "        axis : row or column\n",
    "    \"\"\"\n",
    "    return preprocessing.normalize(data, norm=norm, axis=axis)\n",
    "\n",
    "def standardizeMatrix(data, axis=0):\n",
    "    \"\"\"\n",
    "    standardizes each column of the matrix with mean and std deviation\n",
    "\n",
    "    Parameters\n",
    "        data : 2D data\n",
    "        axis : row or column\n",
    "    \"\"\"\n",
    "    return preprocessing.scale(data, axis=axis)\n",
    "\n",
    "def asNumpyArray(data):\n",
    "    \"\"\"\n",
    "    converts to numpy array\n",
    "\n",
    "    Parameters\n",
    "        data : array\n",
    "    \"\"\"\n",
    "    return np.array(data)\n",
    "\n",
    "def perfMetric(metric, yActual, yPred, clabels=None):\n",
    "    \"\"\"\n",
    "    predictive model accuracy metric\n",
    "\n",
    "    Parameters\n",
    "        metric : accuracy metric, one of rsquare, mae, mse, acc, mlAcc, prec, rec, fone, confm, clarep, bce, ce\n",
    "        yActual : actual values array\n",
    "        yPred : predicted values array\n",
    "        clabels : class labels, mandatory for ce and [0, 1] by default for bce\n",
    "    \"\"\"\n",
    "    # metrics applied directly to the predictions\n",
    "    direct = {\n",
    "        \"rsquare\" : metrics.r2_score,\n",
    "        \"mae\" : metrics.mean_absolute_error,\n",
    "        \"mse\" : metrics.mean_squared_error,\n",
    "    }\n",
    "    # metrics applied to the index of the largest value of each prediction row\n",
    "    onArgmax = {\n",
    "        \"mlAcc\" : metrics.accuracy_score,\n",
    "        \"prec\" : metrics.precision_score,\n",
    "        \"rec\" : metrics.recall_score,\n",
    "        \"fone\" : metrics.f1_score,\n",
    "        \"confm\" : metrics.confusion_matrix,\n",
    "        \"clarep\" : metrics.classification_report,\n",
    "    }\n",
    "    if metric in direct:\n",
    "        score = direct[metric](yActual, yPred)\n",
    "    elif metric == \"acc\":\n",
    "        score = metrics.accuracy_score(yActual, np.rint(yPred))\n",
    "    elif metric in onArgmax:\n",
    "        score = onArgmax[metric](yActual, np.argmax(yPred, axis=1))\n",
    "    elif metric == \"bce\":\n",
    "        if clabels is None:\n",
    "            clabels = [0, 1]\n",
    "        score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
    "    elif metric == \"ce\":\n",
    "        assert clabels is not None, \"labels must be provided\"\n",
    "        score = metrics.log_loss(yActual, yPred, labels=clabels)\n",
    "    else:\n",
    "        exitWithMsg(\"invalid prediction performance metric \" + metric)\n",
    "    return score\n",
    "\n",
    "def scaleData(data, method):\n",
    "    \"\"\"\n",
    "    scales feature data column wise\n",
    "\n",
    "    Parameters\n",
    "        data : 2D array\n",
    "        method : scaling method, minmax or zscale\n",
    "    \"\"\"\n",
    "    if method == \"minmax\":\n",
    "        return preprocessing.MinMaxScaler().fit_transform(data)\n",
    "    if method == \"zscale\":\n",
    "        return preprocessing.scale(data)\n",
    "    raise ValueError(\"invalid scaling method\")\n",
    "\n",
    "def scaleDataWithParams(data, method, scParams):\n",
    "    \"\"\"\n",
    "    scales feature data column wise with given scaling parameters\n",
    "\n",
    "    Parameters\n",
    "        data : 2D array\n",
    "        method : scaling method, only minmax is supported\n",
    "        scParams : scaling parameters\n",
    "    \"\"\"\n",
    "    if method == \"minmax\":\n",
    "        return scaleMinMaxTabData(data, scParams)\n",
    "    # zscale with given parameters is rejected like any unknown method\n",
    "    raise ValueError(\"invalid scaling method\")\n",
    "\n",
    "def scaleMinMaxTabData(tdata, minMax):\n",
    "    \"\"\"\n",
    "    for tabular scales feature data column wise using min max values for each field\n",
    "\n",
    "    Parameters\n",
    "        tdata : 2D array\n",
    "        minMax : min, max and range for each column\n",
    "    \"\"\"\n",
    "    return [[(c - minMax[i][0]) / minMax[i][2] for i, c in enumerate(r)] for r in tdata]\n",
    "\n",
    "def scaleMinMax(rdata, minMax):\n",
    "    \"\"\"\n",
    "    scales one record using min max values for each field\n",
    "\n",
    "    Parameters\n",
    "        rdata : data array\n",
    "        minMax : min, max and range for each column\n",
    "    \"\"\"\n",
    "    return [(rdata[i] - minMax[i][0]) / minMax[i][2] for i in range(len(rdata))]\n",
    "\n",
    "def harmonicNum(n):\n",
    "    \"\"\"\n",
    "    harmonic number\n",
    "\n",
    "    Parameters\n",
    "        n : number\n",
    "    \"\"\"\n",
    "    h = 0\n",
    "    for i in range(1, n+1):\n",
    "        h += 1.0 / i\n",
    "    return h\n",
    "\n",
    "def digammaFun(n):\n",
    "    \"\"\"\n",
    "    digamma function for an integer argument, harmonic number of n - 1 minus the Euler Mascheroni constant\n",
    "\n",
    "    Parameters\n",
    "        n : number\n",
    "    \"\"\"\n",
    "    #Euler Mascheroni constant\n",
    "    ec = 0.577216\n",
    "    return harmonicNum(n - 1) - ec"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "73f2b5c0",
   "metadata": {},
   "source": [
    "## Partitions and distributions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae5d908b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def getDataPartitions(tdata, types, columns = None):\n",
    "    \"\"\"\n",
    "    partitions data with the given columns and random split point defined with predicates.\n",
    "    Each partition is a flat list of column, operator, split value triplets, one triplet per\n",
    "    column. Columns that are not int, float or cat are skipped\n",
    "\n",
    "    Parameters\n",
    "        tdata : 2D array\n",
    "        types : data types\n",
    "        columns : column indexes, all columns of the first row if None\n",
    "    \"\"\"\n",
    "    (dtypes, cvalues) = extractTypesFromString(types)\n",
    "    if columns is None:\n",
    "        columns = list(range(len(tdata[0])))\n",
    "\n",
    "    partitions = None\n",
    "    for c in columns:\n",
    "        dtype = dtypes[c] if c in dtypes else None\n",
    "\n",
    "        # the two complementary predicates for this column\n",
    "        pred = list()\n",
    "        if dtype == \"int\" or dtype == \"float\":\n",
    "            (vmin, vmax) = getColMinMax(tdata, c)\n",
    "            r = vmax - vmin\n",
    "            # split point is kept away from both ends of the range\n",
    "            sp = randomFloat(vmin + .2 * r, vmax - .2 * r)\n",
    "            sp = int(sp) if dtype == \"int\" else float(\"{:.3f}\".format(sp))\n",
    "            pred.append([c, \"LT\", sp])\n",
    "            pred.append([c, \"GE\", sp])\n",
    "        elif dtype == \"cat\":\n",
    "            cv = cvalues[c]\n",
    "            card = len(cv)\n",
    "            num = 1 if card < 3 else randomInt(1, card - 1)\n",
    "            sp = \" \".join(selectRandomSubListFromList(cv, num))\n",
    "            pred.append([c, \"IN\", sp])\n",
    "            pred.append([c, \"NOTIN\", sp])\n",
    "        else:\n",
    "            # no predicate can be built for this column\n",
    "            continue\n",
    "\n",
    "        if partitions is None:\n",
    "            partitions = [p.copy() for p in pred]\n",
    "        else:\n",
    "            # every existing partition is split in two by the new column\n",
    "            partitions = [p + q for p in partitions for q in pred]\n",
    "    return partitions\n",
    "\n",
    "def genAlmostUniformDistr(size, nswap=50):\n",
    "    \"\"\"\n",
    "    generate probability distribution, uniform with some mass randomly moved around\n",
    "\n",
    "    Parameters\n",
    "        size : distr size\n",
    "        nswap : no of mass swaps\n",
    "    \"\"\"\n",
    "    un = 1.0 / size\n",
    "    return mutDistr([un] * size, 0.1 * un, nswap)\n",
    "\n",
    "def mutDistr(distr, shift, nswap=50):\n",
    "    \"\"\"\n",
    "    mutates a probability distribution in place by moving mass between random pairs of\n",
    "    elements, the total mass is unchanged\n",
    "\n",
    "    Parameters\n",
    "        distr : distribution\n",
    "        shift : max amount of shift for swap\n",
    "        nswap : no of mass swaps\n",
    "    \"\"\"\n",
    "    size = len(distr)\n",
    "    if size < 2:\n",
    "        # no pair of distinct elements to move mass between\n",
    "        return distr\n",
    "    for _ in range(nswap):\n",
    "        fi = randomInt(0, size -1)\n",
    "        si = randomInt(0, size -1)\n",
    "        while fi == si:\n",
    "            fi = randomInt(0, size -1)\n",
    "            si = randomInt(0, size -1)\n",
    "\n",
    "        # never move more than what the source element holds, so it can not go negative\n",
    "        delta = min(randomFloat(0, shift), distr[fi])\n",
    "        distr[fi] -= delta\n",
    "        distr[si] += delta\n",
    "    return distr\n",
    "\n",
    "def generateBinDistribution(size, ntrue):\n",
    "    \"\"\"\n",
    "    generate binary array with some elements set to 1\n",
    "\n",
    "    Parameters\n",
    "        size : distr size\n",
    "        ntrue : no of true values\n",
    "    \"\"\"\n",
    "    distr = [0] * size\n",
    "    for i in selectRandomSubListFromList(list(range(size)), ntrue):\n",
    "        distr[i] = 1\n",
    "    return distr\n",
    "\n",
    "def mutBinaryDistr(distr, nmut):\n",
    "    \"\"\"\n",
    "    mutate binary distribution in place by flipping randomly selected elements\n",
    "\n",
    "    Parameters\n",
    "        distr : distr\n",
    "        nmut : no of mutations\n",
    "    \"\"\"\n",
    "    for i in selectRandomSubListFromList(list(range(len(distr))), nmut):\n",
    "        distr[i] = distr[i] ^ 1\n",
    "    return distr\n",
    "\n",
    "def fileSelFieldSubSeqModifierGen(filePath, column, offset, seqLen, modifier, precision, delim=\",\"):\n",
    "    \"\"\"\n",
    "    file record generator that superimposes given data in the specified segment of a column\n",
    "\n",
    "    Parameters\n",
    "        filePath : file path\n",
    "        column : column index\n",
    "        offset : offset into column values\n",
    "        seqLen : length of subseq\n",
    "        modifier : data to be superimposed either list or a sampler object\n",
    "        precision : floating point precision\n",
    "        delim : delemeter\n",
    "    \"\"\"\n",
    "    beg = offset\n",
    "    end = beg + seqLen\n",
    "    isList = isinstance(modifier, list)\n",
    "    for i, rec in enumerate(fileRecGen(filePath, delim)):\n",
    "        if beg <= i < end:\n",
    "            va = float(rec[column])\n",
    "            va += modifier[i - beg] if isList else modifier.sample()\n",
    "            rec[column] = formatFloat(precision, va)\n",
    "        yield delim.join(rec)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c8e6f34",
   "metadata": {},
   "source": [
    "## Distribution shift and rolling statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95b07ad2",
   "metadata": {},
   "outputs": [],
   "source": [
    "class ShiftedDataGenerator:\n",
    "    \"\"\"\n",
    "    transforms data for distribution shift\n",
    "    \"\"\"\n",
    "    def __init__(self, types, tdata, addFact, multFact):\n",
    "        \"\"\"\n",
    "        initializer\n",
    "\n",
    "        Parameters\n",
    "            types : data types\n",
    "            tdata : 2D array\n",
    "            addFact : factor for data shift, as fraction of the column value range\n",
    "            multFact : factor for data scaling\n",
    "        \"\"\"\n",
    "        (self.dtypes, self.cvalues) = extractTypesFromString(types)\n",
    "\n",
    "        # value range of each numerical column\n",
    "        self.limits = dict()\n",
    "        for k,v in self.dtypes.items():\n",
    "            if v == \"int\" or v == \"float\":\n",
    "                (vmin, vmax) = getColMinMax(tdata, k)\n",
    "                self.limits[k] = vmax - vmin\n",
    "        self.addMin = - addFact / 2\n",
    "        self.addMax = addFact / 2\n",
    "        self.multMin = 1.0 - multFact / 2\n",
    "        self.multMax = 1.0 + multFact / 2\n",
    "\n",
    "    def transform(self, tdata):\n",
    "        \"\"\"\n",
    "        linear transforms data to create distribution shift with random shift and scale,\n",
    "        a categorical column selected by isEventSampled gets its values replaced\n",
    "\n",
    "        Parameters\n",
    "            tdata : 2D array\n",
    "        \"\"\"\n",
    "        transforms = dict()\n",
    "        for k,v in self.dtypes.items():\n",
    "            if v == \"int\" or v == \"float\":\n",
    "                shift = randomFloat(self.addMin, self.addMax) * self.limits[k]\n",
    "                scale = randomFloat(self.multMin, self.multMax)\n",
    "                transforms[k] = (shift, scale)\n",
    "            elif v == \"cat\":\n",
    "                transforms[k] = isEventSampled(50)\n",
    "        return self.__scaleShift(tdata, transforms, True)\n",
    "\n",
    "    def transformSpecified(self, tdata, sshift, scale):\n",
    "        \"\"\"\n",
    "        linear transforms data to create distribution shift with specified shift and scale,\n",
    "        categorical columns are left unchanged\n",
    "\n",
    "        Parameters\n",
    "            tdata : 2D array\n",
    "            sshift : shift factor, as fraction of the column value range\n",
    "            scale : scale factor\n",
    "        \"\"\"\n",
    "        transforms = dict()\n",
    "        for k,v in self.dtypes.items():\n",
    "            if v == \"int\" or v == \"float\":\n",
    "                transforms[k] = (sshift * self.limits[k], scale)\n",
    "        return self.__scaleShift(tdata, transforms)\n",
    "\n",
    "    def __scaleShift(self, tdata, transforms, replaceCat=False):\n",
    "        \"\"\"\n",
    "        shifts and scales tabular data, returns new records and leaves tdata unchanged\n",
    "\n",
    "        Parameters\n",
    "            tdata : 2D array\n",
    "            transforms : transforms to apply, (shift, scale) for a numerical column and a\n",
    "                boolean for a categorical column\n",
    "            replaceCat : if true the values of a categorical column with a true transform\n",
    "                are replaced with another value of that column\n",
    "        \"\"\"\n",
    "        ttdata = list()\n",
    "        for rec in tdata:\n",
    "            nrec = rec.copy()\n",
    "            for c in range(len(rec)):\n",
    "                dtype = self.dtypes[c] if c in self.dtypes else None\n",
    "                if dtype == \"int\" or dtype == \"float\":\n",
    "                    (shift, scale) = transforms[c]\n",
    "                    nval = shift + rec[c] * scale\n",
    "                    nrec[c] = int(nval) if dtype == \"int\" else nval\n",
    "                elif dtype == \"cat\" and replaceCat and transforms[c]:\n",
    "                    nrec[c] = selectOtherRandomFromList(self.cvalues[c], rec[c])\n",
    "            ttdata.append(nrec)\n",
    "        return ttdata\n",
    "\n",
    "\n",
    "class RollingStat(object):\n",
    "    \"\"\"\n",
    "    stats for rolling window\n",
    "    \"\"\"\n",
    "    def __init__(self, wsize):\n",
    "        \"\"\"\n",
    "        initializer\n",
    "\n",
    "        Parameters\n",
    "            wsize : window size\n",
    "        \"\"\"\n",
    "        self.window = list()\n",
    "        self.wsize = wsize\n",
    "        self.mean = None\n",
    "        self.sd = None\n",
    "\n",
    "    def add(self, value):\n",
    "        \"\"\"\n",
    "        add a value, the oldest value is dropped once the window is full\n",
    "\n",
    "        Parameters\n",
    "            value : value to add\n",
    "        \"\"\"\n",
    "        self.window.append(value)\n",
    "        if len(self.window) > self.wsize:\n",
    "            self.window.pop(0)\n",
    "\n",
    "    def getStat(self):\n",
    "        \"\"\"\n",
    "        get rolling window mean and std deviation, std deviation is 0 for a single value\n",
    "        \"\"\"\n",
    "        assertGreater(len(self.window), 0, \"window is empty\")\n",
    "        if len(self.window) == 1:\n",
    "            self.mean = self.window[0]\n",
    "            self.sd = 0\n",
    "        else:\n",
    "            self.mean = statistics.mean(self.window)\n",
    "            self.sd = statistics.stdev(self.window, xbar=self.mean)\n",
    "        return (self.mean, self.sd)\n",
    "\n",
    "    def getSize(self):\n",
    "        \"\"\"\n",
    "        return the number of values currently in the window\n",
    "        \"\"\"\n",
    "        return len(self.window)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}