Spaces:

ThirdEyeData
/

Duplicate_Records_Prediction

Runtime error

File size: 35,761 Bytes

fc22863

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3853095d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import torch\n",
    "from torch.autograd import Variable\n",
    "from torch.utils.data import Dataset, TensorDataset\n",
    "from torch.utils.data import DataLoader\n",
    "import sklearn as sk\n",
    "from sklearn.neighbors import KDTree\n",
    "import matplotlib\n",
    "import random\n",
    "import jprops\n",
    "from random import randint\n",
    "import statistics\n",
    "sys.path.append(os.path.abspath(\"../lib\"))\n",
    "from util import *\n",
    "from mlutil import *\n",
    "\n",
    "\"\"\"\n",
    "forward hook function\n",
    "\"\"\"\n",
    "# module level dictionary; only referenced by a commented out line in hookFn\n",
    "intermedOut = {}\n",
    "# latent values collected by hookFn, one entry appended per hook call\n",
    "lvalues = list()\n",
    "\n",
    "def hookFn(m, i, o):\n",
    "    \"\"\"\n",
    "    forward hook call back that records latent values\n",
    "\n",
    "    Converts the module output to a numpy array on cpu and appends its first row,\n",
    "    as a python list, to the module level list lvalues\n",
    "\n",
    "    Parameters\n",
    "        m : module the hook is attached to (not used)\n",
    "        i : input passed to the module (not used)\n",
    "        o : output produced by the module\n",
    "    \"\"\"\n",
    "    firstRow = o.data.cpu().numpy()[0]\n",
    "    lvalues.append(firstRow.tolist())\n",
    "\n",
    "def getLatValues():\n",
    "    \"\"\"\n",
    "    returns the module level list lvalues that hookFn appends to; the list object\n",
    "    itself is returned, not a copy\n",
    "    \"\"\"\n",
    "    return lvalues\n",
    "\n",
    "class FeedForwardNetwork(torch.nn.Module):\n",
    "    def __init__(self, configFile, addDefValues=None):\n",
    "        \"\"\"\n",
    "        Sets up the configuration of the network. Layers, data, loss function and\n",
    "        optimizer are not created here but in buildModel()\n",
    "\n",
    "        Parameters\n",
    "            configFile : config file path\n",
    "            addDefValues : dictionary of additional default values\n",
    "        \"\"\"\n",
    "        #initialize the torch module before any attribute is set on it\n",
    "        super(FeedForwardNetwork, self).__init__()\n",
    "\n",
    "        #defaults are (value, message) pairs; presumably the message is reported by\n",
    "        #Configuration when a value without default is missing - TODO confirm\n",
    "        #the caller's dictionary is copied so that it is not modified\n",
    "        defValues = dict() if addDefValues is None else addDefValues.copy()\n",
    "        defValues[\"common.mode\"] = (\"training\", None)\n",
    "        defValues[\"common.model.directory\"] = (\"model\", None)\n",
    "        defValues[\"common.model.file\"] = (None, None)\n",
    "        defValues[\"common.preprocessing\"] = (None, None)\n",
    "        defValues[\"common.scaling.method\"] = (\"zscale\", None)\n",
    "        defValues[\"common.scaling.minrows\"] = (50, None)\n",
    "        defValues[\"common.scaling.param.file\"] = (None, None)\n",
    "        defValues[\"common.verbose\"] = (False, None)\n",
    "        defValues[\"common.device\"] = (\"cpu\", None)\n",
    "        defValues[\"train.data.file\"] = (None, \"missing training data file\")\n",
    "        defValues[\"train.data.fields\"] = (None, \"missing training data field ordinals\")\n",
    "        defValues[\"train.data.feature.fields\"] = (None, \"missing training data feature field ordinals\")\n",
    "        defValues[\"train.data.out.fields\"] = (None, \"missing training data output field ordinals\")\n",
    "        defValues[\"train.layer.data\"] = (None, \"missing layer data\")\n",
    "        defValues[\"train.input.size\"] = (None, None)\n",
    "        defValues[\"train.output.size\"] = (None, \"missing  output size\")\n",
    "        defValues[\"train.batch.size\"] = (10, None)\n",
    "        defValues[\"train.loss.reduction\"] = (\"mean\", None)\n",
    "        defValues[\"train.num.iterations\"] = (500, None)\n",
    "        defValues[\"train.lossFn\"] = (\"mse\", None)\n",
    "        defValues[\"train.optimizer\"] = (\"sgd\", None)\n",
    "        defValues[\"train.opt.learning.rate\"] = (.0001, None)\n",
    "        defValues[\"train.opt.weight.decay\"] = (0, None)\n",
    "        defValues[\"train.opt.momentum\"] = (0, None)\n",
    "        defValues[\"train.opt.eps\"] = (1e-08, None)\n",
    "        defValues[\"train.opt.dampening\"] = (0, None)\n",
    "        defValues[\"train.opt.momentum.nesterov\"] = (False, None)\n",
    "        defValues[\"train.opt.betas\"] = ([0.9, 0.999], None)\n",
    "        defValues[\"train.opt.alpha\"] = (0.99, None)\n",
    "        defValues[\"train.save.model\"] = (False, None)\n",
    "        #batchTrain reads train.model.save, so that key needs a default as well; the\n",
    "        #differently ordered key above is kept for existing configurations\n",
    "        defValues[\"train.model.save\"] = (False, None)\n",
    "        defValues[\"train.track.error\"] = (False, None)\n",
    "        defValues[\"train.epoch.intv\"] = (5, None)\n",
    "        defValues[\"train.batch.intv\"] = (5, None)\n",
    "        defValues[\"train.print.weights\"] = (False, None)\n",
    "        defValues[\"valid.data.file\"] = (None, None)\n",
    "        defValues[\"valid.accuracy.metric\"] = (None, None)\n",
    "        defValues[\"predict.data.file\"] = (None, None)\n",
    "        defValues[\"predict.use.saved.model\"] = (True, None)\n",
    "        defValues[\"predict.output\"] = (\"binary\", None)\n",
    "        defValues[\"predict.feat.pad.size\"] = (60, None)\n",
    "        defValues[\"predict.print.output\"] = (True, None)\n",
    "        defValues[\"calibrate.num.bins\"] = (10, None)\n",
    "        defValues[\"calibrate.pred.prob.thresh\"] = (0.5, None)\n",
    "        defValues[\"calibrate.num.nearest.neighbors\"] = (10, None)\n",
    "        self.config = Configuration(configFile, defValues)\n",
    "\n",
    "    def setConfigParam(self, name, value):\n",
    "        \"\"\"\n",
    "        set config param by delegating to setParam() of the config object\n",
    "\n",
    "        Parameters\n",
    "            name : config name\n",
    "            value : config value\n",
    "        \"\"\"\n",
    "        self.config.setParam(name, value)\n",
    "\n",
    "    def getConfig(self):\n",
    "        \"\"\"\n",
    "        get config object, i.e. the object created in the constructor from the config\n",
    "        file and the default values\n",
    "        \"\"\"\n",
    "        return self.config\n",
    "\n",
    "    def setVerbose(self, verbose):\n",
    "        \"\"\"\n",
    "        sets the verbose flag; note that buildModel() assigns the same attribute from\n",
    "        the common.verbose config parameter\n",
    "\n",
    "        Parameters\n",
    "            verbose : value stored in self.verbose\n",
    "        \"\"\"\n",
    "        self.verbose = verbose\n",
    "\n",
    "    def buildModel(self):\n",
    "        \"\"\"\n",
    "        Loads configuration and builds the various pieces necessary for the model, i.e.\n",
    "        the layer stack, training and validation tensors, loss function and optimizer\n",
    "\n",
    "        Validation data is loaded only when valid.data.file is configured. Otherwise\n",
    "        validFeatData and validOutData are set to None and setValidationData() needs\n",
    "        to be called before anything that validates the model\n",
    "        \"\"\"\n",
    "        torch.manual_seed(9999)\n",
    "\n",
    "        self.verbose = self.config.getBooleanConfig(\"common.verbose\")[0]\n",
    "        numinp = self.config.getIntConfig(\"train.input.size\")[0]\n",
    "        if numinp is None:\n",
    "            #input size falls back to the number of feature fields\n",
    "            numinp = len(self.config.getIntListConfig(\"train.data.feature.fields\")[0])\n",
    "        self.outputSize = self.config.getIntConfig(\"train.output.size\")[0]\n",
    "        self.batchSize = self.config.getIntConfig(\"train.batch.size\")[0]\n",
    "        self.numIter = self.config.getIntConfig(\"train.num.iterations\")[0]\n",
    "        optimizer = self.config.getStringConfig(\"train.optimizer\")[0]\n",
    "        self.lossFnStr = self.config.getStringConfig(\"train.lossFn\")[0]\n",
    "        self.accMetric = self.config.getStringConfig(\"valid.accuracy.metric\")[0]\n",
    "        self.trackErr = self.config.getBooleanConfig(\"train.track.error\")[0]\n",
    "        self.batchIntv = self.config.getIntConfig(\"train.batch.intv\")[0]\n",
    "        self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None\n",
    "\n",
    "        #build network\n",
    "        layers = list()\n",
    "        ninp = numinp\n",
    "        trData =  self.config.getStringConfig(\"train.layer.data\")[0].split(\",\")\n",
    "        for ld in trData:\n",
    "            lde = ld.split(\":\")\n",
    "            assert len(lde) == 5, \"expecting 5 items for layer data\"\n",
    "\n",
    "            #num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction\n",
    "            nunit = int(lde[0])\n",
    "            actStr = lde[1]\n",
    "            act = FeedForwardNetwork.createActivation(actStr) if actStr != \"none\"  else None\n",
    "            bnorm = lde[2] == \"true\"\n",
    "            afterAct = lde[3] == \"true\"\n",
    "            dpr = float(lde[4])\n",
    "\n",
    "            layers.append(torch.nn.Linear(ninp, nunit))\n",
    "            if bnorm:\n",
    "                #with batch norm\n",
    "                if afterAct:\n",
    "                    safeAppend(layers, act)\n",
    "                    layers.append(torch.nn.BatchNorm1d(nunit))\n",
    "                else:\n",
    "                    layers.append(torch.nn.BatchNorm1d(nunit))\n",
    "                    safeAppend(layers, act)\n",
    "            else:\n",
    "                #without batch norm\n",
    "                safeAppend(layers, act)\n",
    "\n",
    "            if dpr > 0:\n",
    "                layers.append(torch.nn.Dropout(dpr))\n",
    "            #output of this layer is the input of the next one\n",
    "            ninp = nunit\n",
    "\n",
    "        self.layers = torch.nn.Sequential(*layers)\n",
    "\n",
    "        #resolved once, used for the data and the model below\n",
    "        self.device = FeedForwardNetwork.getDevice(self)\n",
    "\n",
    "        #training data\n",
    "        dataFile = self.config.getStringConfig(\"train.data.file\")[0]\n",
    "        (featData, outData) = FeedForwardNetwork.prepData(self, dataFile)\n",
    "        self.featData = torch.from_numpy(featData).to(self.device)\n",
    "        self.outData = torch.from_numpy(outData).to(self.device)\n",
    "\n",
    "        #validation data, optional since valid.data.file has None as default\n",
    "        dataFile = self.config.getStringConfig(\"valid.data.file\")[0]\n",
    "        if dataFile is None:\n",
    "            self.validFeatData = None\n",
    "            self.validOutData = None\n",
    "        else:\n",
    "            (featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataFile)\n",
    "            self.validFeatData = torch.from_numpy(featDataV).to(self.device)\n",
    "            #target values are not moved to the device, same as before\n",
    "            self.validOutData = torch.from_numpy(outDataV)\n",
    "\n",
    "        # loss function and optimizer\n",
    "        self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)\n",
    "        self.optimizer =  FeedForwardNetwork.createOptimizer(self, optimizer)\n",
    "\n",
    "        self.yPred  = None\n",
    "        self.restored = False\n",
    "\n",
    "        #model to device\n",
    "        self.to(self.device)\n",
    "\n",
    "    @staticmethod\n",
    "    def getDevice(model):\n",
    "        \"\"\"\n",
    "        resolves the torch device from the common.device config parameter; any value\n",
    "        other than cuda selects cpu\n",
    "\n",
    "        Parameters\n",
    "            model : torch model\n",
    "        \"\"\"\n",
    "        requested = model.config.getStringConfig(\"common.device\")[0]\n",
    "        if requested != \"cuda\":\n",
    "            device = torch.device(\"cpu\")\n",
    "        elif torch.cuda.is_available():\n",
    "            device = torch.device(\"cuda\")\n",
    "        else:\n",
    "            #cuda asked for but not present\n",
    "            exitWithMsg(\"cuda not available\")\n",
    "        return device\n",
    "\n",
    "    def setValidationData(self, dataSource, prep=True):\n",
    "        \"\"\"\n",
    "        replaces the validation data of the model\n",
    "\n",
    "        Feature data is stored as tensor on the model device, target data is stored\n",
    "        as provided, without conversion to tensor\n",
    "\n",
    "        Parameters\n",
    "            dataSource : with prep, data source str if file path or 2D array; without\n",
    "            prep, a pair of feature data and target data\n",
    "            prep : if True load and prepare\n",
    "        \"\"\"\n",
    "        if prep:\n",
    "            featDataV, outDataV = FeedForwardNetwork.prepData(self, dataSource)\n",
    "        else:\n",
    "            featDataV, outDataV = dataSource[0], dataSource[1]\n",
    "\n",
    "        self.validFeatData = torch.from_numpy(featDataV).to(self.device)\n",
    "        self.validOutData = outDataV\n",
    "\n",
    "    @staticmethod\n",
    "    def createActivation(actName):\n",
    "        \"\"\"\n",
    "        create activation module for a name; None gives None and an unknown name is\n",
    "        reported through exitWithMsg\n",
    "\n",
    "        Parameters\n",
    "            actName : activation name, one of relu, tanh, sigmoid, softmax\n",
    "        \"\"\"\n",
    "        factories = {\n",
    "            \"relu\" : torch.nn.ReLU,\n",
    "            \"tanh\" : torch.nn.Tanh,\n",
    "            \"sigmoid\" : torch.nn.Sigmoid,\n",
    "            \"softmax\" : lambda : torch.nn.Softmax(dim=1)\n",
    "        }\n",
    "        if actName is None:\n",
    "            activation = None\n",
    "        elif actName in factories:\n",
    "            #a new module instance for every call\n",
    "            activation = factories[actName]()\n",
    "        else:\n",
    "            exitWithMsg(\"invalid activation function name \" + actName)\n",
    "        return activation\n",
    "\n",
    "    @staticmethod\n",
    "    def createLossFunction(model, lossFnName):\n",
    "        \"\"\"\n",
    "        create loss function, with the reduction taken from the train.loss.reduction\n",
    "        config parameter; an unknown name is reported through exitWithMsg\n",
    "\n",
    "        Parameters\n",
    "            model : torch model\n",
    "            lossFnName : loss function name\n",
    "        \"\"\"\n",
    "        lossClasses = {\n",
    "            \"ltwo\" : torch.nn.MSELoss,\n",
    "            \"mse\" : torch.nn.MSELoss,\n",
    "            \"ce\" : torch.nn.CrossEntropyLoss,\n",
    "            \"lone\" : torch.nn.L1Loss,\n",
    "            \"mae\" : torch.nn.L1Loss,\n",
    "            \"bce\" : torch.nn.BCELoss,\n",
    "            \"bcel\" : torch.nn.BCEWithLogitsLoss,\n",
    "            \"sm\" : torch.nn.SoftMarginLoss,\n",
    "            \"mlsm\" : torch.nn.MultiLabelSoftMarginLoss\n",
    "        }\n",
    "        reduction = model.config.getStringConfig(\"train.loss.reduction\")[0]\n",
    "        if lossFnName in lossClasses:\n",
    "            lossFunc = lossClasses[lossFnName](reduction=reduction)\n",
    "        else:\n",
    "            exitWithMsg(\"invalid loss function name \" + lossFnName)\n",
    "        return lossFunc\n",
    "\n",
    "    @staticmethod\n",
    "    def createOptimizer(model, optName):\n",
    "        \"\"\"\n",
    "        create optimizer for the parameters of the model, configured from the\n",
    "        train.opt.* config parameters; an unknown name is reported through exitWithMsg\n",
    "\n",
    "        Parameters\n",
    "            model : torch model\n",
    "            optName : optimizer name, one of sgd, adam, rmsprop\n",
    "        \"\"\"\n",
    "        cfg = model.config\n",
    "        #settings shared by more than one optimizer\n",
    "        lr = cfg.getFloatConfig(\"train.opt.learning.rate\")[0]\n",
    "        decay = cfg.getFloatConfig(\"train.opt.weight.decay\")[0]\n",
    "        mom = cfg.getFloatConfig(\"train.opt.momentum\")[0]\n",
    "        epsilon = cfg.getFloatConfig(\"train.opt.eps\")[0]\n",
    "\n",
    "        if optName == \"sgd\":\n",
    "            damp = cfg.getFloatConfig(\"train.opt.dampening\")[0]\n",
    "            nesterov = cfg.getBooleanConfig(\"train.opt.momentum.nesterov\")[0]\n",
    "            optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=mom, dampening=damp,\n",
    "            weight_decay=decay, nesterov=nesterov)\n",
    "        elif optName == \"adam\":\n",
    "            betaList = cfg.getFloatListConfig(\"train.opt.betas\")[0]\n",
    "            optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(betaList[0], betaList[1]),\n",
    "            eps=epsilon, weight_decay=decay)\n",
    "        elif optName == \"rmsprop\":\n",
    "            smoothing = cfg.getFloatConfig(\"train.opt.alpha\")[0]\n",
    "            optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, alpha=smoothing, eps=epsilon,\n",
    "            weight_decay=decay, momentum=mom)\n",
    "        else:\n",
    "            exitWithMsg(\"invalid optimizer name \" + optName)\n",
    "        return optimizer\n",
    "\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"\n",
    "        forward pass; feeds the data batch through the layer stack created by\n",
    "        buildModel() and returns its output\n",
    "\n",
    "        Parameters\n",
    "            x : data batch\n",
    "        \"\"\"\n",
    "        return self.layers(x)\n",
    "\n",
    "    @staticmethod\n",
    "    def addForwardHook(model, l, cl = 0):\n",
    "        \"\"\"\n",
    "        registers hookFn as forward hook on the layer at position l. Positions count\n",
    "        the non sequential modules in registration order, descending into nested\n",
    "        sequential modules\n",
    "\n",
    "        Parameters\n",
    "            model : module whose children are searched\n",
    "            l : position of the layer to hook\n",
    "            cl : position of the first child of model, used by the recursion\n",
    "\n",
    "        Returns the position following the last module visited\n",
    "        \"\"\"\n",
    "        for name, layer in model._modules.items():\n",
    "            #If it is a sequential, don't register a hook on it\n",
    "            # but recursively register hook on all it's module children\n",
    "            print(str(cl) + \" : \" + name)\n",
    "            if isinstance(layer, torch.nn.Sequential):\n",
    "                #carry the running position out of the nested sequential, otherwise the\n",
    "                #numbering starts over for whatever follows it\n",
    "                cl = FeedForwardNetwork.addForwardHook(layer, l, cl)\n",
    "            else:\n",
    "                # it's a non sequential. Register a hook\n",
    "                if cl == l:\n",
    "                    print(\"setting hook at layer \" + str(l))\n",
    "                    layer.register_forward_hook(hookFn)\n",
    "                cl += 1\n",
    "        return cl\n",
    "\n",
    "    @staticmethod\n",
    "    def prepData(model, dataSource, includeOutFld=True):\n",
    "        \"\"\"\n",
    "        loads and prepares  data\n",
    "\n",
    "        Parameters\n",
    "            model : torch model, used for its config object\n",
    "            dataSource : data source str if file path or 2D array\n",
    "            includeOutFld : True if target field to be included\n",
    "\n",
    "        Returns a pair of feature data and target data as float32 arrays when\n",
    "        includeOutFld is True, otherwise only feature data as float32 array\n",
    "        \"\"\"\n",
    "        # field ordinals always come from the train.* parameters, also for validation\n",
    "        # and prediction data\n",
    "        fieldIndices = model.config.getIntListConfig(\"train.data.fields\")[0]\n",
    "        featFieldIndices = model.config.getIntListConfig(\"train.data.feature.fields\")[0]\n",
    "\n",
    "        #all data and feature data\n",
    "        isDataFile = isinstance(dataSource, str)\n",
    "        #without target the last configured field is dropped; assumes the target field\n",
    "        #is listed last - TODO confirm\n",
    "        selFieldIndices = fieldIndices if includeOutFld else fieldIndices[:-1]\n",
    "        if isDataFile: \n",
    "            #source is a file path, comma separated\n",
    "            (data, featData) = loadDataFile(dataSource, \",\", selFieldIndices, featFieldIndices)\n",
    "        else:\n",
    "            # tabular data\n",
    "            # NOTE(review): feature ordinals are applied to the already filtered table,\n",
    "            # verify that this matches what loadDataFile does\n",
    "            data = tableSelFieldsFilter(dataSource, selFieldIndices)\n",
    "            featData = tableSelFieldsFilter(data, featFieldIndices)\n",
    "            featData = np.array(featData)\n",
    "\n",
    "        if (model.config.getStringConfig(\"common.preprocessing\")[0] == \"scale\"):\n",
    "            scalingMethod = model.config.getStringConfig(\"common.scaling.method\")[0]\n",
    "\n",
    "            #scale only if there are enough rows\n",
    "            nrow = featData.shape[0]\n",
    "            minrows = model.config.getIntConfig(\"common.scaling.minrows\")[0]\n",
    "            if nrow > minrows:\n",
    "                #scaling parameters are derived from this data set, presumably inside scaleData\n",
    "                featData = scaleData(featData, scalingMethod)\n",
    "            else:\n",
    "                #use pre computed scaling parameters restored from file\n",
    "                spFile = model.config.getStringConfig(\"common.scaling.param.file\")[0]\n",
    "                if spFile is None:\n",
    "                    exitWithMsg(\"for small data sets pre computed scaling parameters need to provided\")\n",
    "                scParams = restoreObject(spFile)\n",
    "                featData = scaleDataWithParams(featData, scalingMethod, scParams)\n",
    "                featData = np.array(featData)\n",
    "\n",
    "        # target data\n",
    "        if includeOutFld:\n",
    "            outFieldIndices = model.config.getStringConfig(\"train.data.out.fields\")[0]\n",
    "            outFieldIndices = strToIntArray(outFieldIndices, \",\")\n",
    "            if isDataFile:\n",
    "                #data loaded from file supports numpy style column selection\n",
    "                outData = data[:,outFieldIndices]\n",
    "            else:\n",
    "                outData = tableSelFieldsFilter(data, outFieldIndices)\n",
    "                outData = np.array(outData)\n",
    "            foData = (featData.astype(np.float32), outData.astype(np.float32))\n",
    "        else:\n",
    "            foData = featData.astype(np.float32)\n",
    "        return foData\n",
    "\n",
    "    @staticmethod\n",
    "    def saveCheckpt(model):\n",
    "        \"\"\"\n",
    "        checkpoints model; the state of the model and of its optimizer are written to\n",
    "        common.model.file inside common.model.directory, which has to exist\n",
    "\n",
    "        Parameters\n",
    "            model : torch model\n",
    "        \"\"\"\n",
    "        print(\"..saving model checkpoint\")\n",
    "        cfg = model.config\n",
    "        saveDir = cfg.getStringConfig(\"common.model.directory\")[0]\n",
    "        assert os.path.exists(saveDir), \"model save directory does not exist\"\n",
    "        savePath = os.path.join(saveDir, cfg.getStringConfig(\"common.model.file\")[0])\n",
    "        torch.save({\"state_dict\": model.state_dict(), \"optim_dict\": model.optimizer.state_dict()}, savePath)\n",
    "        if model.verbose:\n",
    "            print(\"model saved\")\n",
    "\n",
    "    @staticmethod\n",
    "    def restoreCheckpt(model, loadOpt=False):\n",
    "        \"\"\"\n",
    "        restores checkpointed model from common.model.file inside common.model.directory;\n",
    "        does nothing when the model is already marked as restored\n",
    "\n",
    "        Parameters\n",
    "            model : torch model\n",
    "            loadOpt : True if optimizer to be loaded\n",
    "        \"\"\"\n",
    "        if not model.restored:\n",
    "            print(\"..restoring model checkpoint\")\n",
    "            modelDirectory = model.config.getStringConfig(\"common.model.directory\")[0]\n",
    "            modelFile = model.config.getStringConfig(\"common.model.file\")[0]\n",
    "            filepath = os.path.join(modelDirectory, modelFile)\n",
    "            assert os.path.exists(filepath), \"model save file does not exist\"\n",
    "            #tensors are mapped to the device of the model, so that a checkpoint saved on\n",
    "            #gpu can be restored on a host with cpu only\n",
    "            #NOTE torch.load unpickles the file, only restore checkpoints from a trusted source\n",
    "            checkpoint = torch.load(filepath, map_location=model.device)\n",
    "            model.load_state_dict(checkpoint[\"state_dict\"])\n",
    "            model.to(model.device)\n",
    "            if loadOpt:\n",
    "                model.optimizer.load_state_dict(checkpoint[\"optim_dict\"])\n",
    "            model.restored = True\n",
    "\n",
    "    @staticmethod\n",
    "    def processClassifOutput(yPred, config):\n",
    "        \"\"\"\n",
    "        extracts probability label 1 or label with highest probability\n",
    "\n",
    "        Parameters\n",
    "            yPred : predicted output, 2D array with one row per record\n",
    "            config : config object\n",
    "\n",
    "        Returns, depending on predict.output and train.output.size\n",
    "            prob with output size 2 : column 1 of yPred\n",
    "            prob with any other output size : list of (class index, value at that index)\n",
    "            anything else : index of the highest value for each row\n",
    "        \"\"\"\n",
    "        outType = config.getStringConfig(\"predict.output\")[0]\n",
    "        if outType == \"prob\":\n",
    "            outputSize = config.getIntConfig(\"train.output.size\")[0]\n",
    "            if outputSize == 2:\n",
    "                #return prob of pos class for binary classifier\n",
    "                yPred = yPred[:, 1]\n",
    "            else:\n",
    "                #return  class value and probability for multi classifier\n",
    "                yCl = np.argmax(yPred, axis=1)\n",
    "                #a list and not a zip object, since the result gets indexed by position,\n",
    "                #e.g. in printPrediction, and may be iterated more than once\n",
    "                yPred = [(cl, row[cl]) for row, cl in zip(yPred, yCl)]\n",
    "        else:\n",
    "            yPred = np.argmax(yPred, axis=1)\n",
    "        return yPred\n",
    "\n",
    "    @staticmethod\n",
    "    def printPrediction(yPred, config, dataSource):\n",
    "        \"\"\"\n",
    "        prints every input record, padded to predict.feat.pad.size characters, followed\n",
    "        by a tab and the prediction at the same position\n",
    "\n",
    "        Parameters\n",
    "            yPred : predicted output, indexed by record position\n",
    "            config : config object\n",
    "            dataSource : data source str if file path or 2D array\n",
    "        \"\"\"\n",
    "        padWidth = config.getIntConfig(\"predict.feat.pad.size\")[0]\n",
    "        if type(dataSource) == str:\n",
    "            #records read from file are joined as they are\n",
    "            features = (\",\".join(rec) for rec in fileRecGen(dataSource, \",\"))\n",
    "        else:\n",
    "            #in memory records are converted to strings first\n",
    "            features = (\",\".join(toStrList(rec, 6)) for rec in dataSource)\n",
    "\n",
    "        for pos, feat in enumerate(features):\n",
    "            print(feat.ljust(padWidth, \" \") + \"\\t\" + str(yPred[pos]))\n",
    "\n",
    "\n",
    "    @staticmethod\n",
    "    def allTrain(model):\n",
    "        \"\"\"\n",
    "        trains with the whole training data as one batch in every iteration and then\n",
    "        scores the model on the validation data\n",
    "\n",
    "        Parameters\n",
    "            model : torch model\n",
    "\n",
    "        Returns the performance score computed by perfMetric\n",
    "        \"\"\"\n",
    "        model.train()\n",
    "        for epoch in range(model.numIter):\n",
    "            # forward pass and loss over all training data\n",
    "            loss = model.lossFn(model(model.featData), model.outData)\n",
    "            if model.verbose and  epoch % 50 == 0:\n",
    "                print(\"epoch {}  loss {:.6f}\".format(epoch, loss.item()))\n",
    "\n",
    "            # clear old gradients, back propagate and update the weights\n",
    "            model.optimizer.zero_grad()\n",
    "            loss.backward()\n",
    "            model.optimizer.step()\n",
    "\n",
    "        #validate\n",
    "        model.eval()\n",
    "        predicted = model(model.validFeatData).data.cpu().numpy()\n",
    "        actual = model.validOutData\n",
    "        if model.verbose:\n",
    "            print(\"predicted  actual\")\n",
    "            print(np.concatenate((predicted, actual), axis = 1))\n",
    "\n",
    "        score = perfMetric(model.accMetric, actual, predicted)\n",
    "        print(formatFloat(3, score, \"perf score\"))\n",
    "        return score\n",
    "\n",
    "    @staticmethod\n",
    "    def batchTrain(model):\n",
    "        \"\"\"\n",
    "        trains with mini batches, scores the model on the validation data and, depending\n",
    "        on configuration, saves the model, plots tracked errors and prints the weights\n",
    "\n",
    "        With train.track.error set, training and validation loss are recorded for every\n",
    "        batchIntv th batch when batchIntv is greater than 0 and once per epoch when it is 0\n",
    "\n",
    "        Parameters\n",
    "            model : torch model\n",
    "\n",
    "        Returns the performance score computed by perfMetric\n",
    "        \"\"\"\n",
    "        model.restored = False\n",
    "        trainData = TensorDataset(model.featData, model.outData)\n",
    "        trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)\n",
    "        epochIntv = model.config.getIntConfig(\"train.epoch.intv\")[0]\n",
    "\n",
    "        # train mode\n",
    "        model.train()\n",
    "\n",
    "        if model.trackErr:\n",
    "            trErr = list()\n",
    "            vaErr = list()\n",
    "        #epoch\n",
    "        for t in range(model.numIter):\n",
    "            #batch\n",
    "            b = 0\n",
    "            epochLoss = 0.0\n",
    "            for xBatch, yBatch in trainDataLoader:\n",
    "\n",
    "                # Forward pass: Compute predicted y by passing x to the model\n",
    "                xBatch, yBatch = xBatch.to(model.device), yBatch.to(model.device)\n",
    "                yPred = model(xBatch)\n",
    "\n",
    "                # Compute and print loss\n",
    "                loss = model.lossFn(yPred, yBatch)\n",
    "                if model.verbose and t % epochIntv == 0:\n",
    "                    #batchIntv of 0 means epoch level reporting, so only the first batch of\n",
    "                    #the epoch is logged; this also avoids a modulo by zero\n",
    "                    logBatch = (b == 0) if model.batchIntv == 0 else (b % model.batchIntv == 0)\n",
    "                    if logBatch:\n",
    "                        print(\"epoch {}  batch {}  loss {:.6f}\".format(t, b, loss.item()))\n",
    "\n",
    "                #loss accumulated for error tracking at epoch level\n",
    "                if model.trackErr and model.batchIntv == 0:\n",
    "                    epochLoss += loss.item()\n",
    "\n",
    "                #error tracking at batch level\n",
    "                if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:\n",
    "                    trErr.append(loss.item())\n",
    "                    vloss = FeedForwardNetwork.evaluateModel(model)\n",
    "                    vaErr.append(vloss)\n",
    "\n",
    "                # Zero gradients, perform a backward pass, and update the weights.\n",
    "                model.optimizer.zero_grad()\n",
    "                loss.backward()\n",
    "                model.optimizer.step()\n",
    "                b += 1\n",
    "\n",
    "            #error tracking at epoch level, with the mean batch loss as training error\n",
    "            if model.trackErr and model.batchIntv == 0:\n",
    "                epochLoss /= len(trainDataLoader)\n",
    "                trErr.append(epochLoss)\n",
    "                vloss = FeedForwardNetwork.evaluateModel(model)\n",
    "                vaErr.append(vloss)\n",
    "\n",
    "        #validate, no gradient needed as in evaluateModel\n",
    "        model.eval()\n",
    "        with torch.no_grad():\n",
    "            yPred = model(model.validFeatData)\n",
    "        yPred = yPred.data.cpu().numpy()\n",
    "        yActual = model.validOutData\n",
    "        if model.verbose:\n",
    "            vsize = yPred.shape[0]\n",
    "            print(\"\\npredicted \\t\\t actual\")\n",
    "            for i in range(vsize):\n",
    "                print(str(yPred[i]) + \"\\t\" + str(yActual[i]))\n",
    "\n",
    "        score = perfMetric(model.accMetric, yActual, yPred)\n",
    "        print(yActual)\n",
    "        print(yPred)\n",
    "        print(formatFloat(3, score, \"perf score\"))\n",
    "\n",
    "        #save\n",
    "        modelSave = model.config.getBooleanConfig(\"train.model.save\")[0]\n",
    "        if modelSave:\n",
    "            FeedForwardNetwork.saveCheckpt(model)\n",
    "\n",
    "        if model.trackErr:\n",
    "            FeedForwardNetwork.errorPlot(model, trErr, vaErr)\n",
    "\n",
    "        if model.config.getBooleanConfig(\"train.print.weights\")[0]:\n",
    "            print(\"model weights\")\n",
    "            for param in model.parameters():\n",
    "                print(param.data)\n",
    "        return score\n",
    "\n",
    "    @staticmethod\n",
    "    def errorPlot(model, trErr, vaErr):\n",
    "        \"\"\"\n",
    "        plots training and validation error against the position in the error lists\n",
    "\n",
    "        Parameters\n",
    "            model : torch model (not used)\n",
    "            trErr : training error list\n",
    "            vaErr : validation error list\n",
    "        \"\"\"\n",
    "        captions = [\"training error\", \"validation error\"]\n",
    "        steps = np.arange(len(trErr))\n",
    "        #one line per error series over the same x values\n",
    "        for series, caption in zip((trErr, vaErr), captions):\n",
    "            plt.plot(steps, series, label = caption)\n",
    "        plt.xlabel(\"iteration\")\n",
    "        plt.ylabel(\"error\")\n",
    "        plt.legend(captions, loc='upper left')\n",
    "        plt.show()\n",
    "\n",
    "    @staticmethod\n",
    "    def modelPredict(model, dataSource = None):\n",
    "        \"\"\"\n",
    "        predicts with a restored or freshly trained model\n",
    "\n",
    "        Parameters\n",
    "            model : torch model\n",
    "            dataSource : data source, the predict.data.file config parameter is used\n",
    "            when None\n",
    "\n",
    "        Returns the model output as numpy array, processed by processClassifOutput when\n",
    "        the output size is 2 or more\n",
    "        \"\"\"\n",
    "        cfg = model.config\n",
    "\n",
    "        #train or restore model\n",
    "        if cfg.getBooleanConfig(\"predict.use.saved.model\")[0]:\n",
    "            FeedForwardNetwork.restoreCheckpt(model)\n",
    "        else:\n",
    "            FeedForwardNetwork.batchTrain(model)\n",
    "\n",
    "        #feature data only, without target field\n",
    "        if dataSource is None:\n",
    "            dataSource = cfg.getStringConfig(\"predict.data.file\")[0]\n",
    "        prepared = FeedForwardNetwork.prepData(model, dataSource, False)\n",
    "        featData = torch.from_numpy(prepared).to(model.device)\n",
    "\n",
    "        #predict\n",
    "        model.eval()\n",
    "        yPred = model(featData).data.cpu().numpy()\n",
    "\n",
    "        #classification\n",
    "        if model.outputSize >= 2:\n",
    "            yPred = FeedForwardNetwork.processClassifOutput(yPred, cfg)\n",
    "\n",
    "        # print prediction\n",
    "        if cfg.getBooleanConfig(\"predict.print.output\")[0]:\n",
    "            FeedForwardNetwork.printPrediction(yPred, cfg, dataSource)\n",
    "\n",
    "        return yPred\n",
    "\n",
    "    def predict(self, dataSource = None):\n",
    "        \"\"\"\n",
    "        predict; instance level wrapper that returns the result of modelPredict()\n",
    "        called for this model\n",
    "\n",
    "        Parameters\n",
    "            dataSource : data source, handed to modelPredict() unchanged\n",
    "        \"\"\"\n",
    "        return FeedForwardNetwork.modelPredict(self, dataSource)\n",
    "\n",
    "    @staticmethod\n",
    "    def evaluateModel(model):\n",
    "        \"\"\"\n",
    "        computes the loss on the validation data without tracking gradients; the model\n",
    "        is left in training mode afterwards\n",
    "\n",
    "        Parameters\n",
    "            model : torch model\n",
    "\n",
    "        Returns the validation loss as float\n",
    "        \"\"\"\n",
    "        model.eval()\n",
    "        with torch.no_grad():\n",
    "            yPred = model(model.validFeatData)\n",
    "            #validOutData is a cpu tensor when set by buildModel and a numpy array when set\n",
    "            #by setValidationData; the loss function needs a tensor on the device of yPred\n",
    "            yActual = torch.as_tensor(model.validOutData, device=yPred.device)\n",
    "            score = model.lossFn(yPred, yActual).item()\n",
    "        model.train()\n",
    "        return score\n",
    "\n",
    "    @staticmethod\n",
    "    def prepValidate(model, dataSource=None):\n",
    "        \"\"\"\n",
    "        prepare for validation; makes sure the model is restored or trained and\n",
    "        replaces the validation data when a data source is given\n",
    "\n",
    "        Parameters\n",
    "            model : torch model\n",
    "            dataSource : data source\n",
    "        \"\"\"\n",
    "        if not model.restored:\n",
    "            #restore from checkpoint or train, depending on configuration\n",
    "            useSavedModel = model.config.getBooleanConfig(\"predict.use.saved.model\")[0]\n",
    "            prepare = FeedForwardNetwork.restoreCheckpt if useSavedModel else FeedForwardNetwork.batchTrain\n",
    "            prepare(model)\n",
    "            #marked as restored also after training, so that this is done only once\n",
    "            model.restored = True\n",
    "\n",
    "        if dataSource is not None:\n",
    "            model.setValidationData(dataSource)\n",
    "\n",
    "    @staticmethod\n",
    "    def validateModel(model, retPred=False):\n",
    "        \"\"\"\n",
    "        model validation; predicts for the validation data, keeps the prediction in\n",
    "        model.yPred and prints the performance score\n",
    "\n",
    "        Parameters\n",
    "            model : torch model\n",
    "            retPred : if True return prediction\n",
    "\n",
    "        Returns the score, or with retPred a pair made of the list of (predicted, actual)\n",
    "        values taken from the first output column and the score\n",
    "        \"\"\"\n",
    "        model.eval()\n",
    "        predicted = model(model.validFeatData).data.cpu().numpy()\n",
    "        model.yPred = predicted\n",
    "        actual = model.validOutData\n",
    "        nrec = predicted.shape[0]\n",
    "        if model.verbose:\n",
    "            print(\"\\npredicted \\t actual\")\n",
    "            for pos in range(nrec):\n",
    "                print(\"{:.3f}\\t\\t{:.3f}\".format(predicted[pos][0], actual[pos][0]))\n",
    "\n",
    "        score = perfMetric(model.accMetric, actual, predicted)\n",
    "        print(formatFloat(3, score, \"perf score\"))\n",
    "\n",
    "        if not retPred:\n",
    "            return score\n",
    "        pairs = [(predicted[pos][0], actual[pos][0]) for pos in range(nrec)]\n",
    "        return (pairs, score)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}