# NewsClassifier

In [1]:
# Imports
import os
import gc
import time
from pathlib import Path
import json
from typing import Tuple, Dict
from warnings import filterwarnings

filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from wordcloud import WordCloud, STOPWORDS

from tqdm.auto import tqdm
from dataclasses import dataclass

import re
import nltk
from nltk.corpus import stopwords

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import RobertaTokenizer, RobertaModel

import wandb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mmanishdrw1[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
@dataclass
class Cfg:
    STOPWORDS = stopwords.words("english")
    dataset_loc = "../dataset/raw/news_dataset.csv"
    test_size = 0.2

    add_special_tokens = True
    max_len = 50
    pad_to_max_length = True
    truncation = True

    change_config = False

    dropout_pb = 0.5
    lr = 1e-4
    lr_redfactor = 0.7
    lr_redpatience = 4
    epochs = 10
    batch_size = 128

    wandb_sweep = False

In [13]:
df = pd.read_csv(Cfg.dataset_loc)
print(df["Title"][10040])
print(df["Category"][10040])

Matthew McConaughey Gives Joy Behar A Foot Massage On ‘The View’
Entertainment


## Prepare Data

In [14]:
def prepare_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Separate headlines instance and feature selection.

    Args:
        df: original dataframe.

    Returns:
        df: new dataframe with appropriate features.
        headlines_df: dataframe cintaining "headlines" category instances.
    """
    df = df[["Title", "Category"]]
    df.rename(columns={"Title": "Text"}, inplace=True)
    df, headlines_df = df[df["Category"] != "Headlines"].reset_index(drop=True), df[df["Category"] == "Headlines"].reset_index(drop=True)

    return df, headlines_df

In [15]:
def clean_text(text: str) -> str:
    """Clean text (lower, puntuations removal, blank space removal)."""
    # lower case the text
    text = text.lower()  # necessary to do before as stopwords are in lower case

    # remove stopwords
    stp_pattern = re.compile(r"\b(" + r"|".join(Cfg.STOPWORDS) + r")\b\s*")
    text = stp_pattern.sub("", text)

    # custom cleaning
    text = text.strip()  # remove space at start or end if any
    text = re.sub(" +", " ", text)  # remove extra spaces
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove characters that are not alphanumeric

    return text

In [16]:
def preprocess(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict, Dict]:
    """Preprocess the data.

    Args:
        df: Dataframe on which the preprocessing steps need to be performed.

    Returns:
        df: Preprocessed Data.
        class_to_index: class labels to indices mapping
        class_to_index: indices to class labels mapping
    """
    df, headlines_df = prepare_data(df)

    cats = df["Category"].unique().tolist()
    num_classes = len(cats)
    class_to_index = {tag: i for i, tag in enumerate(cats)}
    index_to_class = {v: k for k, v in class_to_index.items()}

    df["Text"] = df["Text"].apply(clean_text)  # clean text
    df = df[["Text", "Category"]]
    df["Category"] = df["Category"].map(class_to_index)  # label encoding
    return df, class_to_index, index_to_class

In [17]:
ds, class_to_index, index_to_class = preprocess(df)
ds

Unnamed: 0,Text,Category
0,chainlink link falters hedera hbar wobbles yet...,0
1,funds punished owning nvidia shares stunning 2...,0
2,crude oil prices stalled hedge funds sold kemp,0
3,grayscale bitcoin win still half battle,0
4,home shopping editor miss labor day deals eyeing,0
...,...,...
44142,slovakia election could echo ukraine expect,6
44143,things know nobel prizes washington post,6
44144,brief calm protests killing 2 students rock im...,6
44145,one safe france vows action bedbugs sweep paris,6


In [None]:
index_to_class

In [20]:
# Data splits
train_ds, val_ds = train_test_split(ds, test_size=Cfg.test_size, stratify=ds["Category"])

In [21]:
def prepare_input(tokenizer: RobertaTokenizer, text: str) -> Dict:
    """Tokenize and prepare the input text using the provided tokenizer.

    Args:
        tokenizer (RobertaTokenizer): The Roberta tokenizer to encode the input.
        text (str): The input text to be tokenized.

    Returns:
        inputs (dict): A dictionary containing the tokenized input with keys such as 'input_ids',
            'attention_mask', etc.
    """
    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=Cfg.add_special_tokens,
        max_length=Cfg.max_len,
        pad_to_max_length=Cfg.pad_to_max_length,
        truncation=Cfg.truncation,
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class NewsDataset(Dataset):
    def __init__(self, ds):
        self.texts = ds["Text"].values
        self.labels = ds["Category"].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(tokenizer, self.texts[item])
        labels = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, labels


def collate(inputs: Dict) -> Dict:
    """Collate and modify the input dictionary to have the same sequence length for a particular input batch.

    Args:
        inputs (dict): A dictionary containing input tensors with varying sequence lengths.

    Returns:
        modified_inputs (dict): A modified dictionary with input tensors trimmed to have the same sequence length.
    """
    max_len = int(inputs["input_ids"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:, :max_len]
    return inputs


tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

sample_input = prepare_input(tokenizer, train_ds["Text"].values[10])
sample_input

{'input_ids': tensor([    0,   462, 25744,  7188,   155,    23,   462, 11485,   112,     2,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0])}

## Model

In [22]:
class CustomModel(nn.Module):
    def __init__(self, num_classes, change_config=False, dropout_pb=0.0):
        super(CustomModel, self).__init__()
        if change_config:
            pass
        self.model = RobertaModel.from_pretrained("roberta-base")
        self.hidden_size = self.model.config.hidden_size
        self.num_classes = num_classes
        self.dropout_pb = dropout_pb
        self.dropout = torch.nn.Dropout(self.dropout_pb)
        self.fc = nn.Linear(self.hidden_size, self.num_classes)

    def forward(self, inputs):
        output = self.model(**inputs)
        z = self.dropout(output[1])
        z = self.fc(z)
        return z

    @torch.inference_mode()
    def predict(self, inputs):
        self.eval()
        z = self(inputs)
        y_pred = torch.argmax(z, dim=1).cpu().numpy()
        return y_pred

    @torch.inference_mode()
    def predict_proba(self, inputs):
        self.eval()
        z = self(inputs)
        y_probs = F.softmax(z, dim=1).cpu().numpy()
        return y_probs

    def save(self, dp):
        with open(Path(dp, "args.json"), "w") as fp:
            contents = {
                "dropout_pb": self.dropout_pb,
                "hidden_size": self.hidden_size,
                "num_classes": self.num_classes,
            }
            json.dump(contents, fp, indent=4, sort_keys=False)
        torch.save(self.state_dict(), os.path.join(dp, "model.pt"))

    @classmethod
    def load(cls, args_fp, state_dict_fp):
        with open(args_fp, "r") as fp:
            kwargs = json.load(fp=fp)
        llm = RobertaModel.from_pretrained("roberta-base")
        model = cls(llm=llm, **kwargs)
        model.load_state_dict(torch.load(state_dict_fp, map_location=torch.device("cpu")))
        return model

In [None]:
# Initialize model check
num_classes = len(ds["Category"].unique())
model = CustomModel(num_classes=num_classes, dropout_pb=Cfg.dropout_pb)
print(model.named_parameters)

## Training

In [None]:
def train_step(train_loader: DataLoader, model, num_classes: int, loss_fn, optimizer, epoch: int) -> float:
    """Train step."""
    model.train()
    loss = 0.0
    total_iterations = len(train_loader)
    desc = f"Training - Epoch {epoch+1}"
    for step, (inputs, labels) in tqdm(enumerate(train_loader), total=total_iterations, desc=desc):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()  # reset gradients
        y_pred = model(inputs)  # forward pass
        targets = F.one_hot(labels.long(), num_classes=num_classes).float()  # one-hot (for loss_fn)
        J = loss_fn(y_pred, targets)  # define loss
        J.backward()  # backward pass
        optimizer.step()  # update weights
        loss += (J.detach().item() - loss) / (step + 1)  # cumulative loss
    return loss


def eval_step(val_loader: DataLoader, model, num_classes: int, loss_fn, epoch: int) -> Tuple[float, np.ndarray, np.ndarray]:
    """Eval step."""
    model.eval()
    loss = 0.0
    total_iterations = len(val_loader)
    desc = f"Validation - Epoch {epoch+1}"
    y_trues, y_preds = [], []
    with torch.inference_mode():
        for step, (inputs, labels) in tqdm(enumerate(val_loader), total=total_iterations, desc=desc):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            labels = labels.to(device)
            y_pred = model(inputs)
            targets = F.one_hot(labels.long(), num_classes=num_classes).float()  # one-hot (for loss_fn)
            J = loss_fn(y_pred, targets).item()
            loss += (J - loss) / (step + 1)
            y_trues.extend(targets.cpu().numpy())
            y_preds.extend(torch.argmax(y_pred, dim=1).cpu().numpy())
    return loss, np.vstack(y_trues), np.vstack(y_preds)

### Sweep config

In [None]:
sweep_config = {"method": "random"}

metric = {"name": "val_loss", "goal": "minimize"}

sweep_config["metric"] = metric

parameters_dict = {
    "dropout_pb": {
        "values": [0.3, 0.4, 0.5],
    },
    "learning_rate": {
        "values": [0.0001, 0.001, 0.01],
    },
    "batch_size": {
        "values": [32, 64, 128],
    },
    "lr_reduce_factor": {
        "values": [0.5, 0.6, 0.7, 0.8],
    },
    "lr_reduce_patience": {
        "values": [2, 3, 4, 5],
    },
    "epochs": {"value": 1},
}

sweep_config["parameters"] = parameters_dict

In [None]:
# create sweep
if Cfg.wandb_sweep:
    sweep_id = wandb.sweep(sweep_config, project="NewsClassifier")

In [None]:
def train_loop(config=None):
    # ====================================================
    # loader
    # ====================================================

    if not Cfg.wandb_sweep:
        config = dict(
            batch_size=Cfg.batch_size,
            num_classes=7,
            epochs=Cfg.epochs,
            dropout_pb=Cfg.dropout_pb,
            learning_rate=Cfg.lr,
            lr_reduce_factor=Cfg.lr_redfactor,
            lr_reduce_patience=Cfg.lr_redpatience,
        )

    with wandb.init(project="NewsClassifier", config=config):
        config = wandb.config

        train_ds, val_ds = train_test_split(ds, test_size=Cfg.test_size, stratify=ds["Category"])

        train_dataset = NewsDataset(train_ds)
        valid_dataset = NewsDataset(val_ds)

        train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset, batch_size=config.batch_size, shuffle=False, num_workers=4, pin_memory=True, drop_last=False)

        # ====================================================
        # model
        # ====================================================
        num_classes = 7
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        model = CustomModel(num_classes=num_classes, dropout_pb=config.dropout_pb)
        model.to(device)

        # ====================================================
        # Training components
        # ====================================================
        criterion = nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", factor=config.lr_reduce_factor, patience=config.lr_reduce_patience
        )

        # ====================================================
        # loop
        # ====================================================
        wandb.watch(model, criterion, log="all", log_freq=10)

        min_loss = np.inf

        for epoch in range(config.epochs):
            start_time = time.time()

            # Step
            train_loss = train_step(train_loader, model, num_classes, criterion, optimizer, epoch)
            val_loss, _, _ = eval_step(valid_loader, model, num_classes, criterion, epoch)
            scheduler.step(val_loss)

            # scoring
            elapsed = time.time() - start_time
            wandb.log({"epoch": epoch + 1, "train_loss": train_loss, "val_loss": val_loss})
            print(f"Epoch {epoch+1} - avg_train_loss: {train_loss:.4f}  avg_val_loss: {val_loss:.4f}  time: {elapsed:.0f}s")

            if min_loss > val_loss:
                min_loss = val_loss
                print("Best Score : saving model.")
                os.makedirs("../artifacts", exist_ok=True)
                model.save("../artifacts")
            print(f"\nSaved Best Model Score: {min_loss:.4f}\n\n")

        wandb.save("../artifacts/model.pt")
        torch.cuda.empty_cache()
        gc.collect()

In [None]:
# Train/Tune
if not Cfg.wandb_sweep:
    train_loop()
else:
    wandb.agent(sweep_id, train_loop, count=10)

## Inference

In [34]:
model = CustomModel(num_classes=7)
model.load_state_dict(torch.load("../artifacts/model.pt", map_location=torch.device("cpu")))
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomModel(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [None]:
def test_step(test_loader: DataLoader, model, num_classes: int) -> Tuple[np.ndarray, np.ndarray]:
    """Eval step."""
    model.eval()
    y_trues, y_preds = [], []
    with torch.inference_mode():
        for step, (inputs, labels) in tqdm(enumerate(test_loader)):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            labels = labels.to(device)
            y_pred = model(inputs)
            y_trues.extend(labels.cpu().numpy())
            y_preds.extend(torch.argmax(y_pred, dim=1).cpu().numpy())
    return np.vstack(y_trues), np.vstack(y_preds)

In [None]:
test_dataset = NewsDataset(val_ds)
test_loader = DataLoader(test_dataset, batch_size=Cfg.batch_size, shuffle=False, num_workers=4, pin_memory=True, drop_last=False)

y_true, y_pred = test_step(test_loader, model, 7)

In [None]:
print(
    f'Precision: {precision_score(y_true, y_pred, average="weighted")} \n Recall: {recall_score(y_true, y_pred, average="weighted")} \n F1: {f1_score(y_true, y_pred, average="weighted")} \n Accuracy: {accuracy_score(y_true, y_pred)}'
)

## Prediction on single sample

In [None]:
val_ds

In [36]:
sample = 2
sample_input = prepare_input(tokenizer, val_ds["Text"].values[sample])

cats = df["Category"].unique().tolist()
num_classes = len(cats)
class_to_index = {tag: i for i, tag in enumerate(cats)}
index_to_class = {v: k for k, v in class_to_index.items()}

label = val_ds["Category"].values[sample]
input_ids = torch.unsqueeze(sample_input["input_ids"], 0).to(device)
attention_masks = torch.unsqueeze(sample_input["attention_mask"], 0).to(device)
test_sample = dict(input_ids=input_ids, attention_mask=attention_masks)

with torch.no_grad():
    y_pred_test_sample = model.predict_proba(test_sample)
    print(f"Ground Truth: {label}, {index_to_class[int(label)]}")
    print(f"Predicted: {np.argmax(y_pred_test_sample)}, {index_to_class[int(np.argmax(y_pred_test_sample))]}")
    print(f"Predicted Probabilities: {np.round(y_pred_test_sample, 8)[0]}")

Ground Truth: 5, Sports
Predicted: 5, Sports
Predicted Probabilities: [9.8119999e-05 1.0613000e-04 7.7200002e-06 3.2520002e-05 8.3100003e-06
 9.9973667e-01 1.0560000e-05]
