# Preparing the environment

In [None]:
!pip install datasets evaluate rouge_score --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
import warnings

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    logging,
)

CKPT_PATH = "facebook/bart-large-cnn" # Hub ID of the pretrained base checkpoint that the LoRA adapters are trained on
DATASET_PATH = "ccdv/arxiv-summarization" # Hub ID of the dataset used for fine-tuning and testing
HF_REPO_PATH = "spolivin/bart-arxiv-lora" # Hub repo the trained LoRA adapters are pushed to and later loaded from

# Silencing Python warnings and restricting transformers logging to errors only
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

# Loading and preprocessing data

In [None]:
# Loading only the first 5000 training and first 300 validation examples to keep the run small
arxiv_train, arxiv_valid = (
    load_dataset(DATASET_PATH, split=split_spec)
    for split_spec in ("train[:5000]", "validation[:300]")
)

README.md:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

train-00000-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00001-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00002-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00003-of-00015.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

train-00004-of-00015.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00005-of-00015.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

train-00006-of-00015.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00007-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00008-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00009-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00010-of-00015.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00011-of-00015.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

train-00012-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00013-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00014-of-00015.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

In [None]:
# Inspecting the training subset (column names and number of rows)
arxiv_train

Dataset({
    features: ['article', 'abstract'],
    num_rows: 5000
})

In [None]:
# Inspecting the validation subset (column names and number of rows)
arxiv_valid

Dataset({
    features: ['article', 'abstract'],
    num_rows: 300
})

In [None]:
# Loading the tokenizer that belongs to the base checkpoint (CKPT_PATH)
tokenizer = AutoTokenizer.from_pretrained(CKPT_PATH)

def tokenizing_function(examples, max_input_length=1024, max_target_length=150):
    """Tokenizes article texts (model inputs) and abstract texts (labels).

    Args:
        examples: Batch holding "article" and "abstract" entries, as supplied
            by `Dataset.map(..., batched=True)`.
        max_input_length: Articles are truncated to at most this many tokens.
        max_target_length: Abstracts are truncated to at most this many tokens.

    Returns:
        The tokenized articles with an additional "labels" entry containing
        the token ids of the abstracts.
    """
    # Tokenizing article text
    model_inputs = tokenizer(examples["article"], max_length=max_input_length, truncation=True)
    # Tokenizing abstract text (label)
    labels = tokenizer(text_target=examples["abstract"], max_length=max_target_length, truncation=True)
    # Adding label tokens as a label column to model inputs
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Applying the tokenizing function batch-wise to the training and validation subsets
tokenized_arxiv_train, tokenized_arxiv_valid = (
    split.map(tokenizing_function, batched=True)
    for split in (arxiv_train, arxiv_valid)
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

# Setting up model with LoRA

In [None]:
# Loading base model from HF
model = AutoModelForSeq2SeqLM.from_pretrained(CKPT_PATH)
# Reporting the memory footprint of the loaded model, scaled by 1e9 and labelled as GB
print(f"Model size ({CKPT_PATH}): {model.get_memory_footprint() / 1e9:.2f} GB")

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Model size (facebook/bart-large-cnn): 1.63 GB


In [None]:
# LoRA setup: rank-8 adapters (alpha 16, dropout 0.1) attached to the attention
# query and value projections only.
lora_config = LoraConfig(
    # Declaring the task lets PEFT use its sequence-to-sequence wrapper and
    # stores the task in the saved adapter config
    task_type="SEQ_2_SEQ_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
)

In [None]:
# Wrapping the base model with the adapters described by `lora_config`.
# `model` is reassigned here, so re-running this cell would wrap the already wrapped model again.
model = get_peft_model(model=model, peft_config=lora_config)
# Printing how many parameters are trainable compared to the total
model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 407,470,080 || trainable%: 0.2895


# Preparation for fine-tuning

In [None]:
# Collator that batches the tokenized examples and returns PyTorch tensors
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [None]:
# Loading the ROUGE metric via `evaluate`; it is reused for validation during training and for the final test comparison
metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Computes ROUGE metrics (scaled to 0-100) and the mean generated length.

    Args:
        eval_pred: Pair of (predictions, labels) holding token ids. Positions
            equal to -100 are treated as padding in both arrays.

    Returns:
        Dict with every ROUGE score multiplied by 100 plus "gen_len" (mean
        number of non-padding tokens per prediction), rounded to 4 decimals.
    """
    # Retrieving predictions and labels
    predictions, labels = eval_pred
    # Predictions may arrive as a tuple whose first element holds the generated ids
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a vocabulary id, so it is swapped for the pad token before decoding;
    # this also keeps such positions out of the generated-length count below
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decoding predicted and reference tokens
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computing ROUGE metrics for the batch
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    result = {key: value * 100 for key, value in result.items()}

    # Adding mean generated length (non-padding tokens per prediction)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

# Running fine-tuning

In [None]:
# Asking torch to drop its cached GPU memory before training starts
torch.cuda.empty_cache()

# Mixed-precision (fp16) training needs a CUDA device, so it is enabled only when one is available
use_fp16 = torch.cuda.is_available()

training_args = Seq2SeqTrainingArguments(
    output_dir=HF_REPO_PATH.split("/")[-1],  # local folder named after the repo part of HF_REPO_PATH
    # NOTE(review): recent transformers releases name this argument `eval_strategy` — confirm against the installed version
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    warmup_ratio=0.1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # 8 x 4 = 32 examples per optimizer step on each device
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    num_train_epochs=6,
    predict_with_generate=True,  # evaluation generates summaries so that ROUGE can be computed
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",  # key produced by `compute_metrics`
    greater_is_better=True,
    fp16=use_fp16,
    push_to_hub=False,  # adapters are pushed manually after training
    report_to="none",
    disable_tqdm=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_arxiv_train,
    eval_dataset=tokenized_arxiv_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.701424,40.9751,13.0858,22.7218,34.0096,138.1167
2,No log,2.633963,41.9854,13.917,23.4403,35.0348,138.4233
3,No log,2.608167,42.3803,14.1218,23.6333,35.3202,136.4833
4,2.857500,2.593262,42.1068,14.1376,23.7581,35.0284,137.1067
5,2.857500,2.588403,42.2979,13.9503,23.6625,35.1867,137.14


TrainOutput(global_step=936, training_loss=2.759424845377604, metrics={'train_runtime': 5295.7373, 'train_samples_per_second': 5.665, 'train_steps_per_second': 0.177, 'total_flos': 6.486527958633677e+16, 'train_loss': 2.759424845377604, 'epoch': 5.9664})

In [None]:
# Uploading the trained model (the LoRA adapter weights) to the Hub repo HF_REPO_PATH
model.push_to_hub(HF_REPO_PATH)

adapter_model.safetensors:   0%|          | 0.00/4.74M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/spolivin/bart-arxiv-lora/commit/e9af83dd8a453e5970aa277be89d1c344e0c918c', commit_message='Upload model', commit_description='', oid='e9af83dd8a453e5970aa277be89d1c344e0c918c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/spolivin/bart-arxiv-lora', endpoint='https://huggingface.co', repo_type='model', repo_id='spolivin/bart-arxiv-lora'), pr_revision=None, pr_num=None)

# Testing the fine-tuned model

In [None]:
# Reloading the tokenizer and a fresh base model, then attaching the LoRA adapters stored in the Hub repo
tokenizer = AutoTokenizer.from_pretrained(CKPT_PATH)
base_model = AutoModelForSeq2SeqLM.from_pretrained(CKPT_PATH)
lora_model = PeftModel.from_pretrained(base_model, HF_REPO_PATH)

# Separate copy of the base checkpoint without adapters, used as the comparison baseline
original_model = AutoModelForSeq2SeqLM.from_pretrained(CKPT_PATH)

adapter_config.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/4.74M [00:00<?, ?B/s]

In [None]:
# Loading the test split of the dataset; a subset of it is used for the comparison below
dataset = load_dataset(DATASET_PATH, split="test")

In [None]:
def generate_summary(model, tokenizer, text, max_length=150, max_input_length=1024, num_beams=4):
    """Generates a summary for the input text with beam search.

    Args:
        model: Model exposing `to`, `eval` and `generate`.
        tokenizer: Tokenizer used both to encode `text` and to decode the output ids.
        text: Text to summarize.
        max_length: Maximum length passed to `model.generate`.
        max_input_length: The tokenized input is truncated to this many tokens.
        num_beams: Number of beams used during generation.

    Returns:
        The decoded summary (first generated sequence) without special tokens.
    """
    # Running on GPU when one is available, otherwise on CPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Tokenizing input text and moving it to the target device
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    ).to(device)
    # Moving the model to the same device as the inputs
    model.to(device)
    # Evaluation mode switches off dropout so generation does not depend on it
    model.eval()
    # Generating output tokens
    summary_ids = model.generate(**inputs, max_length=max_length, num_beams=num_beams, early_stopping=True)

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Testing on 100 test articles
num_samples = 100
# Slicing rows first so that only `num_samples` examples are read, instead of
# reading the whole column and slicing it afterwards
test_subset = dataset[:num_samples]
articles = test_subset["article"] # test articles
ground_truth_summaries = test_subset["abstract"] # test abstracts (labels)

# Generated summaries (original model)
original_summaries = [generate_summary(original_model, tokenizer, text) for text in articles]

# Generated summaries (fine-tuned LoRA)
finetuned_summaries = [generate_summary(lora_model, tokenizer, text) for text in articles]

In [None]:
# Scoring both sets of generated summaries against the same reference abstracts
rouge_original = metric.compute(
    predictions=original_summaries,
    references=ground_truth_summaries,
    use_stemmer=True,
)
rouge_finetuned = metric.compute(
    predictions=finetuned_summaries,
    references=ground_truth_summaries,
    use_stemmer=True,
)

# Showing each heading followed by its scores on the next line
print("Original Model ROUGE Scores:", rouge_original, sep="\n")
print("\nFine-Tuned Model ROUGE Scores:", rouge_finetuned, sep="\n")

Original Model ROUGE Scores:
{'rouge1': 0.3205922848345746, 'rouge2': 0.10185477335978205, 'rougeL': 0.18819166141800198, 'rougeLsum': 0.252078246102294}

Fine-Tuned Model ROUGE Scores:
{'rouge1': 0.432548540590151, 'rouge2': 0.16073562916243178, 'rougeL': 0.24437127658137536, 'rougeLsum': 0.3709066102489379}
