In [2]:
pip install transformers datasets torch




In [7]:
from transformers import GPT2Tokenizer

def setup_tokenizer(tokenizer):
    """Give ``tokenizer`` a padding token if it does not have one yet.

    When ``tokenizer.pad_token`` is ``None`` the end-of-sequence token (and its
    id) is reused for padding. A tokenizer that already has a padding token is
    left untouched. The tokenizer is modified in place; nothing is returned.
    """
    # Nothing to do when a padding token is already configured.
    if tokenizer.pad_token is not None:
        return

    # Alias padding to end-of-sequence, both as a string and as an id.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

def load_text_dataset(file_path, tokenizer, max_length=512):
    """Load a text file as a dataset and tokenize it to a fixed length.

    Args:
        file_path: Path of the text file. It is handed to the ``datasets``
            ``'text'`` loader as the ``train`` split.
        tokenizer: Tokenizer applied to the ``text`` column. It is passed
            through ``setup_tokenizer`` first so that a padding token exists.
        max_length: Number of tokens every example is padded or truncated to.
            Defaults to 512.

    Returns:
        The result of ``dataset.map`` with the tokenizer applied in batches.
    """
    # Imported locally because no visible cell of this notebook imports
    # `load_dataset`; without this a fresh kernel fails with NameError.
    from datasets import load_dataset

    setup_tokenizer(tokenizer)

    dataset = load_dataset('text', data_files={'train': file_path}, split='train')

    def tokenize_function(examples):
        # With batched=True, `examples['text']` holds a batch of strings.
        return tokenizer(
            examples['text'],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    return dataset.map(tokenize_function, batched=True)


In [8]:
import os
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments

def main(file_path='/content/Stories.txt', output_dir='./story-generator-model'):
    """Fine-tune the pretrained ``gpt2`` checkpoint on a text file and save it.

    Args:
        file_path: Text file used as the training corpus.
        output_dir: Directory that receives the Trainer's checkpoints and,
            after training, the final model and tokenizer.
    """
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # load_text_dataset() runs setup_tokenizer() on this same tokenizer object,
    # so the padding token is set before the collator below is built.
    train_dataset = load_text_dataset(file_path, tokenizer)

    # mlm=False selects causal (next-token) language modelling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,  # Adjust epochs based on your needs
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

    # Persist the weights and the tokenizer side by side so that both can be
    # reloaded later from the same directory.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

# Entry point for this cell: calls the training main() defined above when the
# code runs as the top-level module.
if __name__ == "__main__":
    main()




Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Step,Training Loss


In [9]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

def evaluate_model(model, tokenizer, test_dataset):
    """Run a Trainer evaluation pass of ``model`` over ``test_dataset``.

    Args:
        model: Model to evaluate.
        tokenizer: Tokenizer handed to the language-modelling data collator.
        test_dataset: Tokenized dataset used as the evaluation set.

    Returns:
        Whatever ``Trainer.evaluate()`` returns for this configuration.
    """
    # The arguments object is needed only to configure evaluation.
    eval_args = TrainingArguments(
        output_dir='./results',
        per_device_eval_batch_size=4,
        logging_dir='./logs',
    )

    # Causal language-modelling collator (mlm disabled).
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    evaluator = Trainer(
        model=model,
        args=eval_args,
        data_collator=collator,
        eval_dataset=test_dataset,
    )
    return evaluator.evaluate()

def main(file_path='/content/Stories.txt', model_dir='./story-generator-model'):
    """Load a saved model and print its evaluation metrics on a text file.

    Args:
        file_path: Text file to evaluate on.
        model_dir: Directory holding the saved model and tokenizer.

    NOTE(review): the default ``file_path`` is the same file the training cell
    reads, so with the defaults the reported loss is measured on training data.
    Pass a held-out file to get a generalization estimate.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    model = GPT2LMHeadModel.from_pretrained(model_dir)

    test_dataset = load_text_dataset(file_path, tokenizer)

    eval_results = evaluate_model(model, tokenizer, test_dataset)
    print("Evaluation Results:")
    print(eval_results)

# Entry point for this cell: calls the evaluation main() defined above when the
# code runs as the top-level module.
if __name__ == "__main__":
    main()


Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Evaluation Results:
{'eval_loss': 1.936555027961731, 'eval_model_preparation_time': 0.0042, 'eval_runtime': 39.3769, 'eval_samples_per_second': 0.33, 'eval_steps_per_second': 0.102}


In [14]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

def generate_text(prompt, model, tokenizer, max_length=1000, num_return_sequences=1, temperature=0.7, repetition_penalty=1.2):
    """Sample one or more continuations of ``prompt`` from ``model``.

    Args:
        prompt: Text the generation starts from.
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer used to encode the prompt and decode the outputs.
        max_length: Forwarded to ``generate`` as ``max_length``.
        num_return_sequences: Number of samples to draw for the prompt.
        temperature: Sampling temperature forwarded to ``generate``.
        repetition_penalty: Repetition penalty forwarded to ``generate``.

    Returns:
        A list of decoded strings, one per generated sequence, with special
        tokens stripped.
    """
    # Call the tokenizer itself rather than `.encode()` so the attention mask
    # is available as well as the ids; it is passed to generate() explicitly
    # because the padding id below is the same as the end-of-sequence id.
    encoded = tokenizer(prompt, return_tensors='pt')

    # Put the inputs on the model's device so generation also works when the
    # model is not on the CPU.
    input_ids = encoded['input_ids'].to(model.device)
    attention_mask = encoded['attention_mask'].to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            top_k=50,                # Use top_k sampling to limit to top-k probabilities
            top_p=0.95,              # Use nucleus sampling to limit to top-p cumulative probability
            do_sample=True,          # Enable sampling for varied text generation
            pad_token_id=tokenizer.eos_token_id  # Pad with the end-of-sequence id
        )

    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

def main(model_dir='./story-generator-model', prompt="Once upon a time"):
    """Load a saved model and print text generated from ``prompt``.

    Args:
        model_dir: Directory holding the saved model and tokenizer. The default
            is the directory the training cell saves to, so this cell does not
            depend on a copy being made elsewhere first.
        prompt: Text the generation starts from.
    """
    # Load with the classes imported at the top of this cell. The helper
    # `load_model_and_tokenizer` is defined in a cell that comes later in the
    # notebook, so calling it here fails when the notebook is run top to bottom.
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)

    generated_texts = generate_text(prompt, model, tokenizer, max_length=1000, num_return_sequences=1, temperature=0.7, repetition_penalty=1.2)

    for index, text in enumerate(generated_texts, start=1):
        print(f"Generated Text {index}:\n{text}\n")

# Entry point for this cell: calls the generation main() defined above when the
# code runs as the top-level module.
if __name__ == "__main__":
    main()


Generated Text 1:
Once upon a time, the people of Noxus began to question their existence. This was when they discovered an ancient artifact that had been hidden in some ruins and sought out its creator for guidance on how he might fulfill his destiny as ruler of all mankind.[1]



In [13]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def save_model(model, tokenizer, output_dir):
    """Write ``model`` and then ``tokenizer`` to ``output_dir``.

    Both objects are saved through their own ``save_pretrained`` method, the
    model first. Nothing is returned.
    """
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)

def load_model_and_tokenizer(output_dir):
    """Load a GPT-2 LM-head model and its tokenizer from ``output_dir``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is loaded before the
        tokenizer.
    """
    return (
        GPT2LMHeadModel.from_pretrained(output_dir),
        GPT2Tokenizer.from_pretrained(output_dir),
    )

# Copy the saved model and tokenizer from './story-generator-model' into
# './results', then rebind the globals to the copies reloaded from there.
model, tokenizer = load_model_and_tokenizer('./story-generator-model')
save_model(model, tokenizer, './results')
model, tokenizer = load_model_and_tokenizer('./results')