mojad121 committed on
Commit d4b585a · verified · 1 Parent(s): cab3e7b

Update app.py

Files changed (1):
  1. app.py  +83 -89
app.py CHANGED
@@ -1,132 +1,120 @@
 import gradio as gr
-from transformers import (
-    AutoTokenizer,
-    AutoModelForCausalLM,
-    pipeline,
-    Trainer,
-    TrainingArguments,
-    DataCollatorForLanguageModeling
-)
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, Trainer, TrainingArguments
 from datasets import load_dataset, Dataset
 import torch
 import pandas as pd
+from transformers import DataCollatorForLanguageModeling
 from sklearn.model_selection import train_test_split
 
-
-# Configuration
 MODEL_NAME = "microsoft/DialoGPT-medium"
 DATASET_NAME = "embedding-data/Amazon-QA"
 FINETUNED_MODEL_NAME = "MujtabaShopifyChatbot"
-MAX_LENGTH = 128
-BATCH_SIZE = 8
 
 chatbot_pipe = None
-tokenizer = None
 
 def show_dataset_head(dataset, num_rows=5):
-    """Dataset preview"""
+    print("Displaying dataset preview ", dataset)
     if isinstance(dataset, dict):
         for split in dataset.keys():
+            print("Current split ", split)
             df = pd.DataFrame(dataset[split][:num_rows])
-            print(f"\n{split} split preview:")
-            print(df[['question', 'answer']].head() if 'question' in df.columns else df.head())
+            cols = [col for col in ['query', 'pos', 'question', 'answer'] if col in df.columns]
+            if cols:
+                print("Dataset columns ", cols)
 
 def load_and_preprocess_data():
-    """Data loading with cleaning"""
-    print(f"Loading {DATASET_NAME}")
+    print("Loading dataset from ", DATASET_NAME)
     try:
         dataset = load_dataset(DATASET_NAME)
         show_dataset_head(dataset)
 
         df = pd.DataFrame(dataset['train'])
 
-        # Column normalization
         if 'query' in df.columns and 'pos' in df.columns:
             df = df.rename(columns={'query': 'question', 'pos': 'answer'})
         elif 'question' not in df.columns or 'answer' not in df.columns:
-            if len(df.columns) >= 2:
-                df = df.rename(columns={df.columns[0]: 'question', df.columns[1]: 'answer'})
-            else:
-                raise ValueError("Dataset must have at least two columns for question and answer")
+            df = df.rename(columns={df.columns[0]: 'question', df.columns[1]: 'answer'})
 
-        # Cleaning
         df = df[['question', 'answer']].dropna()
-        df = df[~df['answer'].str.contains(r'\[|\^|\]', regex=True, na=False)]
-        df = df[df['answer'].str.len() > 10]
-        df = df[:10000]
+        df = df[:5000]
+
+        df['answer'] = df['answer'].astype(str).str.replace(r'\[\^|\].*', '', regex=True)
 
-        # Split
-        train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
-        return Dataset.from_pandas(train_df), Dataset.from_pandas(test_df)
+        processed_dataset = Dataset.from_pandas(df)
+        show_dataset_head(processed_dataset)
+        return processed_dataset.train_test_split(test_size=0.1)
     except Exception as e:
-        print(f"Data error: {str(e)}")
+        print("Error loading dataset ", e)
         raise
 
-def tokenize_data(train_dataset, test_dataset):
-    """Basic tokenization"""
-    global tokenizer
-    print(f"Tokenizing with {MODEL_NAME}")
+def tokenize_data(dataset):
+    print("Tokenizing data with model ", MODEL_NAME)
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     tokenizer.pad_token = tokenizer.eos_token
 
     def preprocess_function(examples):
-        texts = [f"{q} {tokenizer.eos_token} {a}" for q, a in zip(examples["question"], examples["answer"])]
-        return tokenizer(
-            texts,
-            max_length=MAX_LENGTH,
+        inputs = [f"question: {q} answer: {a}" for q, a in zip(examples["question"], examples["answer"])]
+
+        model_inputs = tokenizer(
+            inputs,
+            max_length=128,
             truncation=True,
-            padding="max_length",
-            return_tensors="pt"
+            padding='max_length'
         )
 
-    train_tokenized = train_dataset.map(preprocess_function, batched=True, remove_columns=['question', 'answer'])
-    test_tokenized = test_dataset.map(preprocess_function, batched=True, remove_columns=['question', 'answer'])
-    return train_tokenized, test_tokenized
+        model_inputs["labels"] = model_inputs["input_ids"].copy()
+        return model_inputs
+
+    return dataset.map(preprocess_function, batched=True)
 
-def fine_tune_model(train_data, test_data):
-    """Optimized training"""
-    print("Starting fine-tuning")
+def fine_tune_model(tokenized_dataset):
+    print("Starting fine-tuning process")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False
+    )
 
     training_args = TrainingArguments(
         output_dir="./results",
-        evaluation_strategy="steps",
-        eval_steps=500,
-        learning_rate=3e-5,
-        per_device_train_batch_size=BATCH_SIZE,
-        per_device_eval_batch_size=BATCH_SIZE,
-        num_train_epochs=4,
+        eval_strategy="epoch",
+        learning_rate=5e-5,
+        per_device_train_batch_size=4,
+        per_device_eval_batch_size=4,
+        num_train_epochs=3,
         weight_decay=0.01,
-        warmup_ratio=0.1,
+        save_total_limit=3,
         fp16=torch.cuda.is_available(),
+        push_to_hub=False,
+        report_to="none",
         logging_steps=100,
-        save_steps=1000,
-        save_total_limit=2,
-        load_best_model_at_end=True,
-        report_to="none"  # Disable W&B logging
+        save_steps=500
     )
 
     trainer = Trainer(
         model=model,
         args=training_args,
-        train_dataset=train_data,
-        eval_dataset=test_data,
-        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
+        train_dataset=tokenized_dataset["train"],
+        eval_dataset=tokenized_dataset["test"],
+        data_collator=data_collator
    )
 
     trainer.train()
+    print("Training completed, saving model")
     model.save_pretrained(FINETUNED_MODEL_NAME)
     tokenizer.save_pretrained(FINETUNED_MODEL_NAME)
     return model
 
 def initialize_chatbot():
-    """Initialize generation pipeline"""
-    global chatbot_pipe, tokenizer
-    print(f"Loading {FINETUNED_MODEL_NAME}")
+    global chatbot_pipe
+    print("Initializing chatbot with model ", FINETUNED_MODEL_NAME)
     try:
+        model = AutoModelForCausalLM.from_pretrained(FINETUNED_MODEL_NAME)
         tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL_NAME)
         tokenizer.pad_token = tokenizer.eos_token
-        model = AutoModelForCausalLM.from_pretrained(FINETUNED_MODEL_NAME)
 
         chatbot_pipe = pipeline(
             "text-generation",
@@ -134,48 +122,54 @@ def initialize_chatbot():
             tokenizer=tokenizer,
             device=0 if torch.cuda.is_available() else -1
         )
+        print("Chatbot initialized successfully")
     except Exception as e:
-        print(f"Initialization failed: {str(e)}")
-        raise
+        print("Error initializing chatbot ", e)
+        return None
+    return chatbot_pipe
 
 def generate_response(message, history):
-    """Direct generation without prompt engineering"""
-    if not chatbot_pipe:
-        return "System initializing..."
+    if chatbot_pipe is None:
+        print("Chatbot pipeline not initialized")
+        return "System error: Chatbot not ready"
 
     try:
+        print("Generating response for query ", message)
         response = chatbot_pipe(
-            message,
-            max_length=MAX_LENGTH,
+            f"question: {message} answer:",
+            max_length=128,
             do_sample=True,
             temperature=0.7,
-            top_k=50,
-            top_p=0.9,
-            repetition_penalty=1.2,
-            num_return_sequences=1
+            top_p=0.9
         )[0]['generated_text']
-
-        return response.split(tokenizer.eos_token)[-1].strip()
+        final_response = response.split("answer:")[-1].strip()
+        print("Generated response ", final_response)
+        return final_response
     except Exception as e:
-        print(f"Generation error: {str(e)}")
-        return "Please try again later."
+        print("Error generating response ", e)
+        return "Sorry, I encountered an error processing your request"
 
 def deploy_chatbot():
-    """Gradio interface"""
+    print("Launching chatbot interface")
     demo = gr.ChatInterface(
         fn=generate_response,
-        title="Shopify Assistant",
+        title="Mujtaba's Shopify Assistant",
+        description="Ask about products, shipping, or store policies",
         examples=[
-            "Does this work with iPhone 15?",
-            "What's the return policy?",
-            "Do you ship internationally?"
-        ]
+            "Will this work with iPhone 15?",
+            "What's the return window?",
+            "Do you ship to Lahore?"
+        ],
+        theme="soft",
+        cache_examples=False
     )
     return demo
 
 if __name__ == "__main__":
-    train_data, test_data = load_and_preprocess_data()
-    train_tokenized, test_tokenized = tokenize_data(train_data, test_data)
-    model = fine_tune_model(train_tokenized, test_tokenized)
+    dataset = load_and_preprocess_data()
+    tokenized_data = tokenize_data(dataset)
+
+    model = fine_tune_model(tokenized_data)
+
     initialize_chatbot()
     deploy_chatbot().launch()
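
A minimal sketch (not part of the commit) of how the saved checkpoint could be queried outside the Gradio UI, using the same "question: ... answer:" prompt template and decoding settings that generate_response now uses. It assumes fine-tuning has already run and written the MujtabaShopifyChatbot directory locally; the example question is illustrative only.

    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

    # Load the checkpoint saved by fine_tune_model()
    tokenizer = AutoTokenizer.from_pretrained("MujtabaShopifyChatbot")
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained("MujtabaShopifyChatbot")

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1,
    )

    # Same prompt format and sampling parameters as generate_response()
    raw = pipe(
        "question: What's the return window? answer:",  # hypothetical test query
        max_length=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )[0]["generated_text"]

    # Strip the prompt prefix the same way the app does
    print(raw.split("answer:")[-1].strip())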