{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "f4cbab42", "metadata": {}, "outputs": [], "source": [ "import sys\n", "import random \n", "import time\n", "import math\n", "import numpy as np\n", "import statistics \n", "from util import *\n", "\n", "\"\"\"\n", "histogram class\n", "\"\"\"\n", "class Histogram:\n", " def __init__(self, min, binWidth):\n", " \"\"\"\n", " initializer\n", "\n", " Parameters\n", " min : min x\n", " binWidth : bin width\n", " \"\"\"\n", " self.xmin = min\n", " self.binWidth = binWidth\n", " self.normalized = False\n", "\n", " @classmethod\n", " def createInitialized(cls, xmin, binWidth, values):\n", " \"\"\"\n", " create histogram instance with min domain, bin width and values\n", "\n", " Parameters\n", " min : min x\n", " binWidth : bin width\n", " values : y values\n", " \"\"\"\n", " instance = cls(xmin, binWidth)\n", " instance.xmax = xmin + binWidth * (len(values) - 1)\n", " instance.ymin = 0\n", " instance.bins = np.array(values)\n", " instance.fmax = 0\n", " for v in values:\n", " if (v > instance.fmax):\n", " instance.fmax = v\n", " instance.ymin = 0.0\n", " instance.ymax = instance.fmax\n", " return instance\n", "\n", " @classmethod\n", " def createWithNumBins(cls, values, numBins=20):\n", " \"\"\"\n", " create histogram instance values and no of bins\n", "\n", " Parameters\n", " values : y values\n", " numBins : no of bins\n", " \"\"\"\n", " xmin = min(values)\n", " xmax = max(values)\n", " binWidth = (xmax + .01 - (xmin - .01)) / numBins\n", " instance = cls(xmin, binWidth)\n", " instance.xmax = xmax\n", " instance.numBin = numBins\n", " instance.bins = np.zeros(instance.numBin)\n", " for v in values:\n", " instance.add(v)\n", " return instance\n", "\n", " @classmethod\n", " def createUninitialized(cls, xmin, xmax, binWidth):\n", " \"\"\"\n", " create histogram instance with no y values using domain min , max and bin width\n", "\n", " Parameters\n", " min : min x\n", " max : max x\n", " binWidth : bin width\n", " \"\"\"\n", " instance = cls(xmin, binWidth)\n", " instance.xmax = xmax\n", " instance.numBin = (xmax - xmin) / binWidth + 1\n", " instance.bins = np.zeros(instance.numBin)\n", " return instance\n", "\n", " def initialize(self):\n", " \"\"\"\n", " set y values to 0\n", " \"\"\"\n", " self.bins = np.zeros(self.numBin)\n", "\n", " def add(self, value):\n", " \"\"\"\n", " adds a value to a bin\n", "\n", " Parameters\n", " value : value\n", " \"\"\"\n", " bin = int((value - self.xmin) / self.binWidth)\n", " if (bin < 0 or bin > self.numBin - 1):\n", " print (bin)\n", " raise ValueError(\"outside histogram range\")\n", " self.bins[bin] += 1.0\n", "\n", " def normalize(self):\n", " \"\"\"\n", " normalize bin counts\n", " \"\"\"\n", " if not self.normalized:\n", " total = self.bins.sum()\n", " self.bins = np.divide(self.bins, total)\n", " self.normalized = True\n", "\n", " def cumDistr(self):\n", " \"\"\"\n", " cumulative dists\n", " \"\"\"\n", " self.normalize()\n", " self.cbins = np.cumsum(self.bins)\n", " return self.cbins\n", "\n", " def distr(self):\n", " \"\"\"\n", " distr\n", " \"\"\"\n", " self.normalize()\n", " return self.bins\n", "\n", "\n", " def percentile(self, percent):\n", " \"\"\"\n", " return value corresponding to a percentile\n", "\n", " Parameters\n", " percent : percentile value\n", " \"\"\"\n", " if self.cbins is None:\n", " raise ValueError(\"cumulative distribution is not available\")\n", "\n", " for i,cuml in enumerate(self.cbins):\n", " if percent > cuml:\n", " value = (i * self.binWidth) - (self.binWidth / 2) + \\\n", " (percent - self.cbins[i-1]) * self.binWidth / (self.cbins[i] - self.cbins[i-1]) \n", " break\n", " return value\n", "\n", " def max(self):\n", " \"\"\"\n", " return max bin value \n", " \"\"\"\n", " return self.bins.max()\n", "\n", " def value(self, x):\n", " \"\"\"\n", " return a bin value\t\n", "\n", " Parameters\n", " x : x value\n", " \"\"\"\n", " bin = int((x - self.xmin) / self.binWidth)\n", " f = self.bins[bin]\n", " return f\n", "\n", " def bin(self, x):\n", " \"\"\"\n", " return a bin index\t\n", "\n", " Parameters\n", " x : x value\n", " \"\"\"\n", " return int((x - self.xmin) / self.binWidth)\n", "\n", " def cumValue(self, x):\n", " \"\"\"\n", " return a cumulative bin value\t\n", "\n", " Parameters\n", " x : x value\n", " \"\"\"\n", " bin = int((x - self.xmin) / self.binWidth)\n", " c = self.cbins[bin]\n", " return c\n", "\n", "\n", " def getMinMax(self):\n", " \"\"\"\n", " returns x min and x max\n", " \"\"\"\n", " return (self.xmin, self.xmax)\n", "\n", " def boundedValue(self, x):\n", " \"\"\"\n", " return x bounde by min and max\t\n", "\n", " Parameters\n", " x : x value\n", " \"\"\"\n", " if x < self.xmin:\n", " x = self.xmin\n", " elif x > self.xmax:\n", " x = self.xmax\n", " return x\n", "\n", "\"\"\"\n", "categorical histogram class\n", "\"\"\"\n", "class CatHistogram:\n", " def __init__(self):\n", " \"\"\"\n", " initializer\n", " \"\"\"\n", " self.binCounts = dict()\n", " self.counts = 0\n", " self.normalized = False\n", "\n", " def add(self, value):\n", " \"\"\"\n", " adds a value to a bin\n", "\n", " Parameters\n", " x : x value\n", " \"\"\"\n", " addToKeyedCounter(self.binCounts, value)\n", " self.counts += 1\t\n", "\n", " def normalize(self):\n", " \"\"\"\n", " normalize\n", " \"\"\"\n", " if not self.normalized:\n", " self.binCounts = dict(map(lambda r : (r[0],r[1] / self.counts), self.binCounts.items()))\n", " self.normalized = True\n", "\n", " def getMode(self):\n", " \"\"\"\n", " get mode\n", " \"\"\"\n", " maxk = None\n", " maxv = 0\n", " #print(self.binCounts)\n", " for k,v in self.binCounts.items():\n", " if v > maxv:\n", " maxk = k\n", " maxv = v\n", " return (maxk, maxv)\t\n", "\n", " def getEntropy(self):\n", " \"\"\"\n", " get entropy\n", " \"\"\"\n", " self.normalize()\n", " entr = 0 \n", " #print(self.binCounts)\n", " for k,v in self.binCounts.items():\n", " entr -= v * math.log(v)\n", " return entr\n", "\n", " def getUniqueValues(self):\n", " \"\"\"\n", " get unique values\n", " \"\"\"\t\t\n", " return list(self.binCounts.keys())\n", "\n", " def getDistr(self):\n", " \"\"\"\n", " get distribution\n", " \"\"\"\t\n", " self.normalize()\t\n", " return self.binCounts.copy()\n", "\n", "class RunningStat:\n", " \"\"\"\n", " running stat class\n", " \"\"\"\n", " def __init__(self):\n", " \"\"\"\n", " initializer\t\n", " \"\"\"\n", " self.sum = 0.0\n", " self.sumSq = 0.0\n", " self.count = 0\n", "\n", " @staticmethod\n", " def create(count, sum, sumSq):\n", " \"\"\"\n", " creates iinstance\t\n", "\n", " Parameters\n", " sum : sum of values\n", " sumSq : sum of valure squared\n", " \"\"\"\n", " rs = RunningStat()\n", " rs.sum = sum\n", " rs.sumSq = sumSq\n", " rs.count = count\n", " return rs\n", "\n", " def add(self, value):\n", " \"\"\"\n", " adds new value\n", " Parameters\n", " value : value to add\n", " \"\"\"\n", " self.sum += value\n", " self.sumSq += (value * value)\n", " self.count += 1\n", "\n", " def getStat(self):\n", " \"\"\"\n", " return mean and std deviation \n", " \"\"\"\n", " mean = self.sum /self. count\n", " t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n", " sd = math.sqrt(t)\n", " re = (mean, sd)\n", " return re\n", "\n", " def addGetStat(self,value):\n", " \"\"\"\n", " calculate mean and std deviation with new value added\n", " Parameters\n", " value : value to add\n", " \"\"\"\n", " self.add(value)\n", " re = self.getStat()\n", " return re\n", "\n", " def getCount(self):\n", " \"\"\"\n", " return count\n", " \"\"\"\n", " return self.count\n", "\n", " def getState(self):\n", " \"\"\"\n", " return state\n", " \"\"\"\n", " s = (self.count, self.sum, self.sumSq)\n", " return s\n", "\n", "class SlidingWindowStat:\n", " \"\"\"\n", " sliding window stats\n", " \"\"\"\n", " def __init__(self):\n", " \"\"\"\n", " initializer\n", " \"\"\"\n", " self.sum = 0.0\n", " self.sumSq = 0.0\n", " self.count = 0\n", " self.values = None\n", "\n", " @staticmethod\n", " def create(values, sum, sumSq):\n", " \"\"\"\n", " creates iinstance\t\n", "\n", " Parameters\n", " sum : sum of values\n", " sumSq : sum of valure squared\n", " \"\"\"\n", " sws = SlidingWindowStat()\n", " sws.sum = sum\n", " sws.sumSq = sumSq\n", " self.values = values.copy()\n", " sws.count = len(self.values)\n", " return sws\n", "\n", " @staticmethod\n", " def initialize(values):\n", " \"\"\"\n", " creates iinstance\t\n", "\n", " Parameters\n", " values : list of values\n", " \"\"\"\n", " sws = SlidingWindowStat()\n", " sws.values = values.copy()\n", " for v in sws.values:\n", " sws.sum += v\n", " sws.sumSq += v * v\t\t\n", " sws.count = len(sws.values)\n", " return sws\n", "\n", " @staticmethod\n", " def createEmpty(count):\n", " \"\"\"\n", " creates iinstance\t\n", "\n", " Parameters\n", " count : count of values\n", " \"\"\"\n", " sws = SlidingWindowStat()\n", " sws.count = count\n", " sws.values = list()\n", " return sws\n", "\n", " def add(self, value):\n", " \"\"\"\n", " adds new value\n", "\n", " Parameters\n", " value : value to add\n", " \"\"\"\n", " self.values.append(value)\t\t\n", " if len(self.values) > self.count:\n", " self.sum += value - self.values[0]\n", " self.sumSq += (value * value) - (self.values[0] * self.values[0])\n", " self.values.pop(0)\n", " else:\n", " self.sum += value\n", " self.sumSq += (value * value)\n", "\n", "\n", " def getStat(self):\n", " \"\"\"\n", " calculate mean and std deviation \n", " \"\"\"\n", " mean = self.sum /self. count\n", " t = self.sumSq / (self.count - 1) - mean * mean * self.count / (self.count - 1)\n", " sd = math.sqrt(t)\n", " re = (mean, sd)\n", " return re\n", "\n", " def addGetStat(self,value):\n", " \"\"\"\n", " calculate mean and std deviation with new value added\n", " \"\"\"\n", " self.add(value)\n", " re = self.getStat()\n", " return re\n", "\n", " def getCount(self):\n", " \"\"\"\n", " return count\n", " \"\"\"\n", " return self.count\n", "\n", " def getCurSize(self):\n", " \"\"\"\n", " return count\n", " \"\"\"\n", " return len(self.values)\n", "\n", " def getState(self):\n", " \"\"\"\n", " return state\n", " \"\"\"\n", " s = (self.count, self.sum, self.sumSq)\n", " return s\n", "\n", "\n", "def basicStat(ldata):\n", " \"\"\"\n", " mean and std dev\n", " Parameters\n", " ldata : list of values\n", " \"\"\"\n", " m = statistics.mean(ldata)\n", " s = statistics.stdev(ldata, xbar=m)\n", " r = (m, s)\n", " return r\n", "\n", "def getFileColumnStat(filePath, col, delem=\",\"):\n", " \"\"\"\n", " gets stats for a file column\n", "\n", " Parameters\n", " filePath : file path\n", " col : col index\n", " delem : field delemter\n", " \"\"\"\n", " rs = RunningStat()\n", " for rec in fileRecGen(filePath, delem):\n", " va = float(rec[col])\n", " rs.add(va)\n", "\n", " return rs.getStat()\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }