Spaces:

hardiktiwari
/

tensora-autotrain

Sleeping

File size: 21,578 Bytes

33d4721

import copy
from collections import defaultdict
from dataclasses import dataclass
from functools import partial
from typing import List, Optional

import numpy as np
from sklearn import ensemble, impute, linear_model
from sklearn import metrics as skmetrics
from sklearn import naive_bayes, neighbors, pipeline, preprocessing, svm, tree
from xgboost import XGBClassifier, XGBRegressor


MARKDOWN = """
---
tags:
- autotrain
- tabular
- {task}
- tabular-{task}
datasets:
- {dataset}
---

# Model Trained Using AutoTrain

- Problem type: Tabular {task}

## Validation Metrics

{metrics}

## Best Params

{params}

## Usage

```python
import json
import joblib
import pandas as pd

model = joblib.load('model.joblib')
config = json.load(open('config.json'))

features = config['features']

# data = pd.read_csv("data.csv")
data = data[features]

predictions = model.predict(data)  # or model.predict_proba(data)

# predictions can be converted to original labels using label_encoders.pkl

```
"""

_MODELS: dict = defaultdict(dict)
_MODELS["xgboost"]["classification"] = XGBClassifier
_MODELS["xgboost"]["regression"] = XGBRegressor
_MODELS["logistic_regression"]["classification"] = linear_model.LogisticRegression
_MODELS["logistic_regression"]["regression"] = linear_model.LogisticRegression
_MODELS["random_forest"]["classification"] = ensemble.RandomForestClassifier
_MODELS["random_forest"]["regression"] = ensemble.RandomForestRegressor
_MODELS["extra_trees"]["classification"] = ensemble.ExtraTreesClassifier
_MODELS["extra_trees"]["regression"] = ensemble.ExtraTreesRegressor
_MODELS["gradient_boosting"]["classification"] = ensemble.GradientBoostingClassifier
_MODELS["gradient_boosting"]["regression"] = ensemble.GradientBoostingRegressor
_MODELS["adaboost"]["classification"] = ensemble.AdaBoostClassifier
_MODELS["adaboost"]["regression"] = ensemble.AdaBoostRegressor
_MODELS["ridge"]["classification"] = linear_model.RidgeClassifier
_MODELS["ridge"]["regression"] = linear_model.Ridge
_MODELS["svm"]["classification"] = svm.LinearSVC
_MODELS["svm"]["regression"] = svm.LinearSVR
_MODELS["decision_tree"]["classification"] = tree.DecisionTreeClassifier
_MODELS["decision_tree"]["regression"] = tree.DecisionTreeRegressor
_MODELS["lasso"]["regression"] = linear_model.Lasso
_MODELS["linear_regression"]["regression"] = linear_model.LinearRegression
_MODELS["naive_bayes"]["classification"] = naive_bayes.GaussianNB
_MODELS["knn"]["classification"] = neighbors.KNeighborsClassifier
_MODELS["knn"]["regression"] = neighbors.KNeighborsRegressor

CLASSIFICATION_TASKS = ("binary_classification", "multi_class_classification", "multi_label_classification")
REGRESSION_TASKS = ("single_column_regression", "multi_column_regression")


@dataclass
class TabularMetrics:
    """
    A class to calculate various metrics for different types of tabular tasks.

    Attributes:
    -----------
    sub_task : str
        The type of sub-task. It can be one of the following:
        - "binary_classification"
        - "multi_class_classification"
        - "single_column_regression"
        - "multi_column_regression"
        - "multi_label_classification"
    labels : Optional[List], optional
        The list of labels for multi-class classification tasks (default is None).

    Methods:
    --------
    __post_init__():
        Initializes the valid metrics based on the sub-task type.

    calculate(y_true, y_pred):
        Calculates the metrics based on the true and predicted values.

        Parameters:
        -----------
        y_true : array-like
            True labels or values.
        y_pred : array-like
            Predicted labels or values.

        Returns:
        --------
        dict
            A dictionary with metric names as keys and their calculated values as values.
    """

    sub_task: str
    labels: Optional[List] = None

    def __post_init__(self):
        if self.sub_task == "binary_classification":
            self.valid_metrics = {
                "auc": skmetrics.roc_auc_score,
                "logloss": skmetrics.log_loss,
                "f1": skmetrics.f1_score,
                "accuracy": skmetrics.accuracy_score,
                "precision": skmetrics.precision_score,
                "recall": skmetrics.recall_score,
            }
        elif self.sub_task == "multi_class_classification":
            self.valid_metrics = {
                "logloss": partial(skmetrics.log_loss, labels=self.labels),
                "accuracy": skmetrics.accuracy_score,
                "mlogloss": partial(skmetrics.log_loss, labels=self.labels),
                "f1_macro": partial(skmetrics.f1_score, average="macro", labels=self.labels),
                "f1_micro": partial(skmetrics.f1_score, average="micro", labels=self.labels),
                "f1_weighted": partial(skmetrics.f1_score, average="weighted", labels=self.labels),
                "precision_macro": partial(skmetrics.precision_score, average="macro", labels=self.labels),
                "precision_micro": partial(skmetrics.precision_score, average="micro", labels=self.labels),
                "precision_weighted": partial(skmetrics.precision_score, average="weighted", labels=self.labels),
                "recall_macro": partial(skmetrics.recall_score, average="macro", labels=self.labels),
                "recall_micro": partial(skmetrics.recall_score, average="micro", labels=self.labels),
                "recall_weighted": partial(skmetrics.recall_score, average="weighted", labels=self.labels),
            }
        elif self.sub_task in ("single_column_regression", "multi_column_regression"):
            self.valid_metrics = {
                "r2": skmetrics.r2_score,
                "mse": skmetrics.mean_squared_error,
                "mae": skmetrics.mean_absolute_error,
                "rmse": partial(skmetrics.mean_squared_error, squared=False),
                "rmsle": partial(skmetrics.mean_squared_log_error, squared=False),
            }
        elif self.sub_task == "multi_label_classification":
            self.valid_metrics = {
                "logloss": skmetrics.log_loss,
            }
        else:
            raise ValueError("Invalid problem type")

    def calculate(self, y_true, y_pred):
        metrics = {}
        for metric_name, metric_func in self.valid_metrics.items():
            if self.sub_task == "binary_classification":
                if metric_name == "auc":
                    metrics[metric_name] = metric_func(y_true, y_pred[:, 1])
                elif metric_name == "logloss":
                    metrics[metric_name] = metric_func(y_true, y_pred)
                else:
                    metrics[metric_name] = metric_func(y_true, y_pred[:, 1] >= 0.5)
            elif self.sub_task == "multi_class_classification":
                if metric_name in (
                    "accuracy",
                    "f1_macro",
                    "f1_micro",
                    "f1_weighted",
                    "precision_macro",
                    "precision_micro",
                    "precision_weighted",
                    "recall_macro",
                    "recall_micro",
                    "recall_weighted",
                ):
                    metrics[metric_name] = metric_func(y_true, np.argmax(y_pred, axis=1))
                else:
                    metrics[metric_name] = metric_func(y_true, y_pred)
            else:
                if metric_name == "rmsle":
                    temp_pred = copy.deepcopy(y_pred)
                    temp_pred = np.clip(temp_pred, 0, None)
                    metrics[metric_name] = metric_func(y_true, temp_pred)
                else:
                    metrics[metric_name] = metric_func(y_true, y_pred)
        return metrics


class TabularModel:
    """
    A class used to represent a Tabular Model for AutoTrain training.

    Attributes
    ----------
    model : str
        The name of the model to be used.
    preprocessor : object
        The preprocessor to be applied to the data.
    sub_task : str
        The sub-task type, either classification or regression.
    params : dict
        The parameters to be passed to the model.
    use_predict_proba : bool
        A flag indicating whether to use the predict_proba method.

    Methods
    -------
    _get_model():
        Retrieves the appropriate model based on the sub-task and model name.
    """

    def __init__(self, model, preprocessor, sub_task, params):
        self.model = model
        self.preprocessor = preprocessor
        self.sub_task = sub_task
        self.params = params
        self.use_predict_proba = True

        _model = self._get_model()
        if self.preprocessor is not None:
            self.pipeline = pipeline.Pipeline([("preprocessor", self.preprocessor), ("model", _model)])
        else:
            self.pipeline = pipeline.Pipeline([("model", _model)])

    def _get_model(self):
        if self.model in _MODELS:
            if self.sub_task in CLASSIFICATION_TASKS:
                if self.model in ("svm", "ridge"):
                    self.use_predict_proba = False
                return _MODELS[self.model]["classification"](**self.params)
            elif self.sub_task in REGRESSION_TASKS:
                self.use_predict_proba = False
                return _MODELS[self.model]["regression"](**self.params)
            else:
                raise ValueError("Invalid task")
        else:
            raise ValueError("Invalid model")


def get_params(trial, model, task):
    if model == "xgboost":
        params = {
            "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),
            "subsample": trial.suggest_float("subsample", 0.1, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
            "max_depth": trial.suggest_int("max_depth", 1, 9),
            "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 100, 500),
            "n_estimators": trial.suggest_categorical("n_estimators", [7000, 15000, 20000]),
            "tree_method": "hist",
            "random_state": 42,
        }

        return params

    if model == "logistic_regression":
        if task in CLASSIFICATION_TASKS:
            params = {
                "C": trial.suggest_float("C", 1e-8, 1e3, log=True),
                "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
                "solver": trial.suggest_categorical("solver", ["liblinear", "saga"]),
                "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
                "n_jobs": -1,
            }
            return params

        raise ValueError("Task not supported")

    if model == "random_forest":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 10, 10000),
            "max_depth": trial.suggest_int("max_depth", 2, 15),
            "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2", None]),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
            "n_jobs": -1,
        }
        if task in CLASSIFICATION_TASKS:
            params["criterion"] = trial.suggest_categorical("criterion", ["gini", "entropy"])
            return params
        if task in REGRESSION_TASKS:
            params["criterion"] = trial.suggest_categorical(
                "criterion", ["squared_error", "absolute_error", "poisson"]
            )
            return params
        raise ValueError("Task not supported")

    if model == "extra_trees":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 10, 10000),
            "max_depth": trial.suggest_int("max_depth", 2, 15),
            "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2", None]),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
            "n_jobs": -1,
        }
        if task in CLASSIFICATION_TASKS:
            params["criterion"] = trial.suggest_categorical("criterion", ["gini", "entropy"])
            return params
        if task in REGRESSION_TASKS:
            params["criterion"] = trial.suggest_categorical("criterion", ["squared_error", "absolute_error"])
            return params
        raise ValueError("Task not supported")

    if model == "decision_tree":
        params = {
            "max_depth": trial.suggest_int("max_depth", 1, 15),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
            "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2", None]),
            "splitter": trial.suggest_categorical("splitter", ["best", "random"]),
        }
        if task in CLASSIFICATION_TASKS:
            params["criterion"] = trial.suggest_categorical("criterion", ["gini", "entropy"])
            return params
        if task in REGRESSION_TASKS:
            params["criterion"] = trial.suggest_categorical(
                "criterion", ["squared_error", "absolute_error", "friedman_mse", "poisson"]
            )
            return params
        raise ValueError("Task not supported")

    if model == "linear_regression":
        if task in REGRESSION_TASKS:
            params = {
                "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
            }
            return params
        raise ValueError("Task not supported")

    if model == "svm":
        if task in CLASSIFICATION_TASKS:
            params = {
                "C": trial.suggest_float("C", 1e-8, 1e3, log=True),
                "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
                "penalty": "l2",
                "max_iter": trial.suggest_int("max_iter", 1000, 10000),
            }
            return params

        if task in REGRESSION_TASKS:
            params = {
                "C": trial.suggest_float("C", 1e-8, 1e3, log=True),
                "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
                "loss": trial.suggest_categorical("loss", ["epsilon_insensitive", "squared_epsilon_insensitive"]),
                "epsilon": trial.suggest_float("epsilon", 1e-8, 1e-1, log=True),
                "max_iter": trial.suggest_int("max_iter", 1000, 10000),
            }
            return params
        raise ValueError("Task not supported")

    if model == "ridge":
        params = {
            "alpha": trial.suggest_float("alpha", 1e-8, 1e3, log=True),
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
            "max_iter": trial.suggest_int("max_iter", 1000, 10000),
        }
        if task in CLASSIFICATION_TASKS:
            return params
        if task in REGRESSION_TASKS:
            return params
        raise ValueError("Task not supported")

    if model == "lasso":
        if task in REGRESSION_TASKS:
            params = {
                "alpha": trial.suggest_float("alpha", 1e-8, 1e3, log=True),
                "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
                "max_iter": trial.suggest_int("max_iter", 1000, 10000),
            }
            return params
        raise ValueError("Task not supported")

    if model == "knn":
        params = {
            "n_neighbors": trial.suggest_int("n_neighbors", 1, 25),
            "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
            "algorithm": trial.suggest_categorical("algorithm", ["ball_tree", "kd_tree", "brute"]),
            "leaf_size": trial.suggest_int("leaf_size", 1, 100),
            "p": trial.suggest_categorical("p", [1, 2]),
            "metric": trial.suggest_categorical("metric", ["minkowski", "euclidean", "manhattan"]),
        }
        if task in CLASSIFICATION_TASKS or task in REGRESSION_TASKS:
            return params
        raise ValueError("Task not supported")

    return ValueError("Invalid model")


def get_imputer(imputer_name):
    """
    Returns an imputer object based on the specified imputer name.

    Parameters:
    imputer_name (str): The name of the imputer to use. Can be one of the following:
                        - "median": Uses the median value for imputation.
                        - "mean": Uses the mean value for imputation.
                        - "most_frequent": Uses the most frequent value for imputation.
                        If None, returns None.

    Returns:
    impute.SimpleImputer or None: An instance of SimpleImputer with the specified strategy,
                                  or None if imputer_name is None.

    Raises:
    ValueError: If an invalid imputer_name is provided.
    """
    if imputer_name is None:
        return None
    if imputer_name == "median":
        return impute.SimpleImputer(strategy="median")
    if imputer_name == "mean":
        return impute.SimpleImputer(strategy="mean")
    if imputer_name == "most_frequent":
        return impute.SimpleImputer(strategy="most_frequent")
    raise ValueError("Invalid imputer")


def get_scaler(scaler_name):
    """
    Returns a scaler object based on the provided scaler name.

    Parameters:
    scaler_name (str): The name of the scaler to be returned.
                       Possible values are "standard", "minmax", "robust", and "normal".
                       If None, returns None.

    Returns:
    scaler: An instance of the corresponding scaler from sklearn.preprocessing.
            If the scaler_name is None, returns None.

    Raises:
    ValueError: If the scaler_name is not one of the expected values.
    """
    if scaler_name is None:
        return None
    if scaler_name == "standard":
        return preprocessing.StandardScaler()
    if scaler_name == "minmax":
        return preprocessing.MinMaxScaler()
    if scaler_name == "robust":
        return preprocessing.RobustScaler()
    if scaler_name == "normal":
        return preprocessing.Normalizer()
    raise ValueError("Invalid scaler")


def get_metric_direction(sub_task):
    """
    Determines the appropriate metric and its optimization direction based on the given sub-task.

    Parameters:
    sub_task (str): The type of sub-task. Must be one of the following:
                    - "binary_classification"
                    - "multi_class_classification"
                    - "single_column_regression"
                    - "multi_label_classification"
                    - "multi_column_regression"

    Returns:
    tuple: A tuple containing:
           - str: The metric to be used (e.g., "logloss", "mlogloss", "rmse").
           - str: The direction of optimization ("minimize").

    Raises:
    ValueError: If the provided sub_task is not one of the recognized types.
    """
    if sub_task == "binary_classification":
        return "logloss", "minimize"
    if sub_task == "multi_class_classification":
        return "mlogloss", "minimize"
    if sub_task == "single_column_regression":
        return "rmse", "minimize"
    if sub_task == "multi_label_classification":
        return "logloss", "minimize"
    if sub_task == "multi_column_regression":
        return "rmse", "minimize"
    raise ValueError("Invalid sub_task")


def get_categorical_columns(df):
    """
    Extracts the names of categorical columns from a DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame from which to extract categorical columns.

    Returns:
    list: A list of column names that are of categorical data type (either 'category' or 'object').
    """
    return list(df.select_dtypes(include=["category", "object"]).columns)


def get_numerical_columns(df):
    """
    Extracts and returns a list of numerical column names from a given DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame from which to extract numerical columns.

    Returns:
        list: A list of column names that have numerical data types.
    """
    return list(df.select_dtypes(include=["number"]).columns)


def create_model_card(config, sub_task, best_params, best_metrics):
    """
    Generates a markdown formatted model card with the given configuration, sub-task, best parameters, and best metrics.

    Args:
        config (object): Configuration object containing task and data path information.
        sub_task (str): The specific sub-task for which the model card is being created.
        best_params (dict): Dictionary containing the best hyperparameters for the model.
        best_metrics (dict): Dictionary containing the best performance metrics for the model.

    Returns:
        str: A string containing the formatted model card in markdown.
    """
    best_metrics = "\n".join([f"- {k}: {v}" for k, v in best_metrics.items()])
    best_params = "\n".join([f"- {k}: {v}" for k, v in best_params.items()])
    return MARKDOWN.format(
        task=config.task,
        dataset=config.data_path,
        metrics=best_metrics,
        params=best_params,
    )