# app.py
import gradio as gr
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    pipeline,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Load dataset (only the first 100 examples to keep the demo small and avoid timeouts)
dataset = load_dataset(
    "json",
    data_files="https://huggingface.co/datasets/bitext/Bitext-customer-support-llm-chatbot-training-dataset/resolve/main/bitext_customer_support.jsonl",
    split="train[:100]",
)

def format_example(example):
    # Build an Alpaca-style instruction/response prompt for causal-LM fine-tuning
    return {
        "text": f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
    }

dataset = dataset.map(format_example)

# Tokenizer
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # TinyLlama defines no pad token; reuse EOS

def tokenize(example):
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    tokens["labels"] = tokens["input_ids"].copy()  # causal LM: labels mirror the inputs
    return tokens

tokenized_dataset = dataset.map(tokenize, batched=True)

# QLoRA setup: load the base model in 4-bit NF4 and attach LoRA adapters
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# Training
training_args = TrainingArguments(
    output_dir="trained_model",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_dir="./logs",
    save_strategy="no",
    fp16=True,  # match the float16 compute dtype used for 4-bit quantization
    report_to="none",
    optim="paged_adamw_8bit",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)
trainer.train()

# Inference pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

def chatbot(instruction):
    prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
    response = pipe(prompt, max_new_tokens=100)[0]["generated_text"]
    return response[len(prompt):].strip()  # drop the echoed prompt, keep only the reply

gr.Interface(
    fn=chatbot,
    inputs="text",
    outputs="text",
    title="Fine-Tuned TinyLlama Bitext Chatbot",
).launch()