import gradio as gr
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    pipeline,
)
# DialoGPT is a causal (decoder-only) model and cannot be loaded with
# AutoModelForSeq2SeqLM; a small T5 variant is a drop-in seq2seq choice.
MODEL_NAME = "google/flan-t5-small"
DATASET_NAME = "embedding-data/amazon-QA"
FINETUNED_MODEL_NAME = "MujtabaShopifyChatbot"
HF_TOKEN = "your_huggingface_token"  # only needed to push the model to the Hub

chatbot_pipe = None
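
# Utility: preview the first few rows of a raw or processed dataset so the
# column layout (query/pos vs. question/answer) is visible before training.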
def show_dataset_head(dataset, num_rows=5):
    print("Displaying dataset preview ", dataset)
    if isinstance(dataset, dict):  # covers DatasetDict, which subclasses dict
        for split in dataset.keys():
            print("Current split ", split)
            df = pd.DataFrame(dataset[split][:num_rows])
            cols = [col for col in ['query', 'pos', 'question', 'answer'] if col in df.columns]
            if cols:
                print("Dataset columns ", cols)
                print(df[cols].head(num_rows))
    else:
        # plain Dataset (e.g. after preprocessing): preview it directly
        print(pd.DataFrame(dataset[:num_rows]))
def load_and_preprocess_data():
    print("Loading dataset from ", DATASET_NAME)
    dataset = load_dataset(DATASET_NAME)
    show_dataset_head(dataset)
    df = pd.DataFrame(dataset['train'])
    if 'query' in df.columns and 'pos' in df.columns:
        df = df.rename(columns={'query': 'question', 'pos': 'answer'})
    elif 'question' not in df.columns or 'answer' not in df.columns:
        df = df.rename(columns={df.columns[0]: 'question', df.columns[1]: 'answer'})
    df = df[['question', 'answer']].dropna()
    df = df[:5000]
    # 'pos' holds a list of candidate answers; keep the first one per question
    df['answer'] = df['answer'].apply(lambda a: a[0] if isinstance(a, (list, tuple)) and len(a) > 0 else a)
    df['answer'] = df['answer'].astype(str).str.strip()
    # reset the index so from_pandas does not add an __index_level_0__ column
    processed_dataset = Dataset.from_pandas(df.reset_index(drop=True))
    show_dataset_head(processed_dataset)
    return processed_dataset.train_test_split(test_size=0.1)
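
# Step 2: tokenize with the "question: ... answer:" prompt format that the
# chatbot also uses at inference time, so training and serving stay consistent.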
def tokenize_data(dataset):
    print("Tokenizing data with model ", MODEL_NAME)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def preprocess_function(examples):
        inputs = [f"question: {q} answer:" for q in examples["question"]]
        targets = [str(a) for a in examples["answer"]]
        # Leave padding to DataCollatorForSeq2Seq, which pads labels with -100
        # so padding tokens are excluded from the loss.
        model_inputs = tokenizer(
            inputs,
            max_length=128,
            truncation=True
        )
        labels = tokenizer(
            text_target=targets,
            max_length=128,
            truncation=True
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    return dataset.map(preprocess_function, batched=True)
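
# Step 3: fine-tune the seq2seq model. DataCollatorForSeq2Seq pads each batch
# dynamically and masks label padding so it is ignored by the loss.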
def fine_tune_model(tokenized_dataset):
    print("Starting fine-tuning process")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        padding='longest',
        max_length=128,
        pad_to_multiple_of=8
    )
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=3,
        fp16=torch.cuda.is_available(),
        push_to_hub=False,
        report_to="none",
        logging_steps=100,
        save_steps=500,
        gradient_accumulation_steps=1
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=data_collator,
        tokenizer=tokenizer
    )
    trainer.train()
    print("Training completed, saving model")
    model.save_pretrained(FINETUNED_MODEL_NAME)
    tokenizer.save_pretrained(FINETUNED_MODEL_NAME)
    return model
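
# Step 4: wrap the fine-tuned checkpoint in a text2text-generation pipeline,
# on GPU when one is available, so the Gradio callback stays lightweight.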
def initialize_chatbot():
    global chatbot_pipe
    print("Initializing chatbot with model ", FINETUNED_MODEL_NAME)
    try:
        model = AutoModelForSeq2SeqLM.from_pretrained(FINETUNED_MODEL_NAME)
        tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL_NAME)
        chatbot_pipe = pipeline(
            "text2text-generation",
            model=model,
            tokenizer=tokenizer,
            device=0 if torch.cuda.is_available() else -1
        )
        print("Chatbot initialized successfully")
    except Exception as e:
        print("Error initializing chatbot ", e)
        return None
    return chatbot_pipe
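
# Gradio ChatInterface callback: receives the user message and the chat
# history, and returns the model's reply as a plain string.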
def generate_response(message, history):
    if chatbot_pipe is None:
        print("Chatbot pipeline not initialized")
        return "System error: Chatbot not ready"
    try:
        print("Generating response for query ", message)
        response = chatbot_pipe(
            f"question: {message} answer:",
            max_length=128,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )[0]['generated_text']
        # text2text pipelines return only the generated continuation; the
        # split is defensive in case the model echoes the prompt.
        final_response = response.split("answer:")[-1].strip()
        print("Generated response ", final_response)
        return final_response
    except Exception as e:
        print("Error generating response ", e)
        return "Sorry, I encountered an error processing your request"
def deploy_chatbot():
    print("Launching chatbot interface")
    demo = gr.ChatInterface(
        fn=generate_response,
        title="Mujtaba's Shopify Assistant",
        description="Ask about products, shipping, or store policies",
        examples=[
            "Will this work with iPhone 15?",
            "What's the return window?",
            "Do you ship to Lahore?"
        ],
        theme="soft",
        cache_examples=False
    )
    return demo
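
# End-to-end entry point: preprocess -> tokenize -> fine-tune -> serve.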
if __name__ == "__main__":
    # notebook_login() only works inside notebooks; in a script or Space use
    # login(token=...), and only when pushing the model to the Hub
    # (push_to_hub is False above, so authentication is optional here).
    # login(token=HF_TOKEN)
    dataset = load_and_preprocess_data()
    tokenized_data = tokenize_data(dataset)
    model = fine_tune_model(tokenized_data)
    initialize_chatbot()
    deploy_chatbot().launch()