# app.py
import gradio as gr
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    pipeline,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Load dataset (only the first 100 examples to keep the demo small and avoid timeouts)
dataset = load_dataset(
    "json",
    data_files="https://huggingface.co/datasets/bitext/Bitext-customer-support-llm-chatbot-training-dataset/resolve/main/bitext_customer_support.jsonl",
    split="train[:100]",
)

def format_example(example):
    # Build an Alpaca-style instruction/response prompt for causal-LM fine-tuning
    return {
        "text": f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
    }

dataset = dataset.map(format_example)

# Tokenizer
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # TinyLlama defines no pad token; reuse EOS

def tokenize(example):
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    tokens["labels"] = tokens["input_ids"].copy()  # causal LM: labels mirror the inputs
    return tokens

tokenized_dataset = dataset.map(tokenize, batched=True)

# QLoRA setup: load the base model in 4-bit NF4 and attach LoRA adapters
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# Training
training_args = TrainingArguments(
    output_dir="trained_model",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_dir="./logs",
    save_strategy="no",
    fp16=True,  # match the float16 compute dtype used for 4-bit quantization
    report_to="none",
    optim="paged_adamw_8bit",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)
trainer.train()

# Inference pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

def chatbot(instruction):
    prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
    response = pipe(prompt, max_new_tokens=100)[0]["generated_text"]
    return response[len(prompt):].strip()  # drop the echoed prompt, keep only the reply

gr.Interface(
    fn=chatbot,
    inputs="text",
    outputs="text",
    title="Fine-Tuned TinyLlama Bitext Chatbot",
).launch()