Prathamesh1420 committed · Commit 3d7d1bb (verified) · 1 parent: 0dd84ea

Update app.py

Files changed (1)
  1. app.py +348 -200
app.py CHANGED
@@ -27,22 +27,27 @@ pinecone_api_key = os.environ.get("PINECONE_API_KEY")
27
  mlflow_tracking_uri = os.environ.get("MLFLOW_TRACKING_URI")
28
 
29
  # ------------------ DagsHub & MLflow Setup ------------------
30
- dagshub.init(
31
- repo_owner='prathamesh.khade20',
32
- repo_name='Maintenance_AI_website',
33
- mlflow=True
34
- )
35
-
36
- mlflow.set_tracking_uri(mlflow_tracking_uri)
37
- mlflow.set_experiment("Maintenance-RAG-Chatbot")
38
- mlflow.langchain.autolog()
 
 
39
 
40
  # ------------------ RAG Evaluator ------------------
41
  class RAGEvaluator:
42
  def __init__(self):
43
- self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
44
- self.bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
45
- self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
46
 
47
  def load_gpt2_model(self):
48
  model = GPT2LMHeadModel.from_pretrained('gpt2')
@@ -51,187 +56,264 @@ class RAGEvaluator:
51
 
52
  # BLEU, ROUGE
53
  def evaluate_bleu_rouge(self, candidates, references):
54
- bleu_score = corpus_bleu(candidates, [references]).score
55
- scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
56
- rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]
57
- rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
58
- rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
59
- rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)
60
- return bleu_score, rouge1, rouge2, rougeL
61
 
62
  # BERT Score
63
  def evaluate_bert_score(self, candidates, references):
64
- P, R, F1 = score(candidates, references, lang="en", model_type='bert-base-multilingual-cased')
65
- return P.mean().item(), R.mean().item(), F1.mean().item()
66
 
67
  # Perplexity
68
  def evaluate_perplexity(self, text):
69
- encodings = self.gpt2_tokenizer(text, return_tensors='pt')
70
- max_length = self.gpt2_model.config.n_positions
71
- stride = 512
72
- lls = []
73
- for i in range(0, encodings.input_ids.size(1), stride):
74
- begin_loc = max(i + stride - max_length, 0)
75
- end_loc = min(i + stride, encodings.input_ids.size(1))
76
- trg_len = end_loc - i
77
- input_ids = encodings.input_ids[:, begin_loc:end_loc]
78
- target_ids = input_ids.clone()
79
- target_ids[:, :-trg_len] = -100
80
- with torch.no_grad():
81
- outputs = self.gpt2_model(input_ids, labels=target_ids)
82
- log_likelihood = outputs[0] * trg_len
83
- lls.append(log_likelihood)
84
- ppl = torch.exp(torch.stack(lls).sum() / end_loc)
85
- return ppl.item()
86
 
87
  # Diversity
88
  def evaluate_diversity(self, texts):
89
- all_tokens = []
90
- for text in texts:
91
- tokens = self.tokenizer.tokenize(text)
92
- all_tokens.extend(tokens)
93
- unique_bigrams = set()
94
- for i in range(len(all_tokens) - 1):
95
- unique_bigrams.add((all_tokens[i], all_tokens[i+1]))
96
- return len(unique_bigrams) / len(all_tokens) if all_tokens else 0
97
 
98
  # Racial bias
99
  def evaluate_racial_bias(self, text):
100
- results = self.bias_pipeline([text], candidate_labels=["hate speech", "not hate speech"])
101
- bias_score = results[0]['scores'][results[0]['labels'].index('hate speech')]
102
- return bias_score
103
 
104
  # METEOR
105
  def evaluate_meteor(self, candidates, references):
106
- meteor_scores = []
107
- for ref, cand in zip(references, candidates):
108
- ref_tokens = self.tokenizer.tokenize(ref)
109
- cand_tokens = self.tokenizer.tokenize(cand)
110
- common_tokens = set(ref_tokens) & set(cand_tokens)
111
- precision = len(common_tokens) / len(cand_tokens) if cand_tokens else 0
112
- recall = len(common_tokens) / len(ref_tokens) if ref_tokens else 0
113
- if precision + recall == 0:
114
- f_score = 0
115
- else:
116
- f_score = (10 * precision * recall) / (9 * precision + recall)
117
- meteor_scores.append(f_score)
118
- return sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
119
 
120
  # CHRF
121
  def evaluate_chrf(self, candidates, references):
122
- chrf_scores = []
123
- for ref, cand in zip(references, candidates):
124
- ref_chars = list(ref)
125
- cand_chars = list(cand)
126
- ref_ngrams = set()
127
- cand_ngrams = set()
128
- for i in range(len(ref_chars) - 5):
129
- ref_ngrams.add(tuple(ref_chars[i:i+6]))
130
- for i in range(len(cand_chars) - 5):
131
- cand_ngrams.add(tuple(cand_chars[i:i+6]))
132
- common_ngrams = ref_ngrams & cand_ngrams
133
- precision = len(common_ngrams) / len(cand_ngrams) if cand_ngrams else 0
134
- recall = len(common_ngrams) / len(ref_ngrams) if ref_ngrams else 0
135
- chrf_score = 2 * precision * recall / (precision + recall) if precision + recall else 0
136
- chrf_scores.append(chrf_score)
137
- return sum(chrf_scores) / len(chrf_scores) if chrf_scores else 0
138
 
139
  # Readability
140
  def evaluate_readability(self, text):
141
- words = re.findall(r'\b\w+\b', text.lower())
142
- sentences = re.split(r'[.!?]+', text)
143
- num_words = len(words)
144
- num_sentences = len([s for s in sentences if s.strip()])
145
- avg_word_length = sum(len(word) for word in words) / num_words if num_words else 0
146
- words_per_sentence = num_words / num_sentences if num_sentences else 0
147
- flesch_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * avg_word_length)
148
- flesch_grade = (0.39 * words_per_sentence) + (11.8 * avg_word_length) - 15.59
149
- return flesch_ease, flesch_grade
150
 
151
  # MAUVE
152
  def evaluate_mauve(self, reference_texts, generated_texts):
153
- out = compute_mauve(
154
- p_text=reference_texts,
155
- q_text=generated_texts,
156
- device_id=0,
157
- max_text_length=1024,
158
- verbose=False
159
- )
160
- return out.mauve
161
 
162
  def evaluate_all(self, question, response, reference):
163
- candidates = [response]
164
- references = [reference]
165
- bleu, rouge1, rouge2, rougeL = self.evaluate_bleu_rouge(candidates, references)
166
- bert_p, bert_r, bert_f1 = self.evaluate_bert_score(candidates, references)
167
- perplexity = self.evaluate_perplexity(response)
168
- diversity = self.evaluate_diversity(candidates)
169
- racial_bias = self.evaluate_racial_bias(response)
170
- meteor = self.evaluate_meteor(candidates, references)
171
- chrf = self.evaluate_chrf(candidates, references)
172
- flesch_ease, flesch_grade = self.evaluate_readability(response)
173
- mauve_score = self.evaluate_mauve(references, candidates) if len(references) > 1 else 0.0
174
- return {
175
- "BLEU": bleu,
176
- "ROUGE-1": rouge1,
177
- "ROUGE-2": rouge2,
178
- "ROUGE-L": rougeL,
179
- "BERT_Precision": bert_p,
180
- "BERT_Recall": bert_r,
181
- "BERT_F1": bert_f1,
182
- "Perplexity": perplexity,
183
- "Diversity": diversity,
184
- "Racial_Bias": racial_bias,
185
- "MAUVE": mauve_score,
186
- "METEOR": meteor,
187
- "CHRF": chrf,
188
- "Flesch_Reading_Ease": flesch_ease,
189
- "Flesch_Kincaid_Grade": flesch_grade,
190
- }
191
 
192
  # Initialize evaluator
193
  evaluator = RAGEvaluator()
194
 
195
  # ------------------ Pinecone ------------------
196
  def init_pinecone():
197
- pc = Pinecone(api_key=pinecone_api_key)
198
- return pc.Index("rag-granite-index")
199
 
200
  index = init_pinecone()
201
 
202
  # ------------------ Embeddings ------------------
203
- embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
204
 
205
  def get_retrieved_context(query: str, top_k=3):
206
- start = time.time()
207
- query_embedding = embeddings_model.embed_query(query)
208
- mlflow.log_metric("embedding_latency", time.time() - start)
209
- results = index.query(
210
- namespace="rag-ns",
211
- vector=query_embedding,
212
- top_k=top_k,
213
- include_metadata=True
214
- )
215
- mlflow.log_metric("retrieved_chunks", len(results['matches']))
216
- return "\n".join([m['metadata']['text'] for m in results['matches']])
217
 
218
  # ------------------ Custom LLM ------------------
219
  class LitServeLLM(LLM):
220
  endpoint_url: str
221
 
222
- @mlflow.trace
223
  def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
224
- payload = {"prompt": prompt}
225
- start_time = time.time()
226
- response = requests.post(self.endpoint_url, json=payload)
227
- mlflow.log_metric("lit_serve_latency", time.time() - start_time)
228
- if response.status_code == 200:
229
- data = response.json()
230
- mlflow.log_metric("response_tokens", len(data.get("response", "").split()))
231
- return data.get("response", "").strip()
232
- else:
233
- mlflow.log_metric("request_errors", 1)
234
- raise ValueError(f"Request failed: {response.status_code}")
235
 
236
  @property
237
  def _identifying_params(self) -> Mapping[str, Any]:
@@ -241,50 +323,89 @@ class LitServeLLM(LLM):
241
  def _llm_type(self) -> str:
242
  return "litserve_llm"
243
 
244
- model = LitServeLLM(endpoint_url="https://8001-01k2h9d9mervcmgfn66ybkpwvq.cloudspaces.litng.ai/predict")
245
 
246
  prompt = PromptTemplate(
247
  input_variables=["context", "question"],
248
  template="""
249
  You are a smart assistant. Based on the provided context, answer the question in 1–2 lines only.
250
  If the context has more details, summarize it concisely.
 
251
  Context:
252
  {context}
 
253
  Question: {question}
 
254
  Answer:
255
  """
256
  )
257
 
258
- llm_chain = LLMChain(llm=model, prompt=prompt)
259
 
260
  # ------------------ RAG Pipeline ------------------
261
  def get_rag_response(question):
262
- """Get the complete RAG response without streaming"""
263
- retrieved_context = get_retrieved_context(question)
264
- full_response = llm_chain.invoke({
265
- "context": retrieved_context,
266
- "question": question
267
- })["text"].strip()
268
-
269
- if "Answer:" in full_response:
270
- full_response = full_response.split("Answer:", 1)[-1].strip()
271
-
272
- return full_response, retrieved_context
273
 
274
  def rag_pipeline_stream(question):
275
  """Streaming version of RAG pipeline"""
276
- full_response, _ = get_rag_response(question)
277
-
278
- # Stream word by word
279
- words = full_response.split()
280
- current_text = ""
281
- for word in words:
282
- current_text += word + " "
283
- yield current_text
284
- time.sleep(0.05) # Adjust speed as needed
285
 
286
  # ------------------ Gradio UI ------------------
287
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
288
  gr.Markdown("""
289
  # 🛠 Maintenance AI Assistant
290
  *Your intelligent companion for maintenance queries and troubleshooting*
@@ -292,7 +413,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
292
 
293
  usage_counter = gr.State(value=0)
294
  session_start = gr.State(value=datetime.now().isoformat())
295
- current_response = gr.State(value="") # Store current response for evaluation
296
 
297
  with gr.Row():
298
  with gr.Column(scale=1):
@@ -328,49 +449,68 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
328
 
329
  def track_usage(question, count, session_start, feedback_value=None):
330
  """Track usage and get response"""
331
- count += 1
332
- with mlflow.start_run(run_name=f"User-Interaction-{count}", nested=True):
333
- mlflow.log_param("question", question)
334
- mlflow.log_param("session_start", session_start)
335
- mlflow.log_param("user_feedback", feedback_value or "No feedback")
336
-
337
- if feedback_value:
338
- mlflow.log_metric("helpful_responses", 1 if feedback_value == "Helpful" else 0)
339
-
340
- mlflow.log_metric("total_queries", count)
341
-
342
- # Get response and context
343
- response, context = get_rag_response(question)
344
 
345
- # Log response metrics
346
- mlflow.log_metric("response_length", len(response))
347
- mlflow.log_metric("response_tokens", len(response.split()))
348
-
349
- return response, count, session_start, response
350
 
351
  def evaluate_response(question, response):
352
  """Evaluate the response and return metrics"""
353
- if not question or not response:
354
- return gr.update(value={}, visible=False)
355
 
356
  try:
357
  context = get_retrieved_context(question)
358
  metrics = evaluator.evaluate_all(question, response, context)
359
 
360
- # Log metrics to MLflow
361
- for metric_name, metric_value in metrics.items():
362
- if isinstance(metric_value, (int, float)):
363
- mlflow.log_metric(metric_name, metric_value)
 
364
 
365
  return gr.update(value=metrics, visible=True)
366
  except Exception as e:
367
  print(f"Evaluation error: {e}")
368
- return gr.update(value={"error": str(e)}, visible=True)
369
 
370
  def clear_chat():
371
  """Clear the chat interface"""
372
  return "", "", gr.update(visible=False)
373
374
  # Main interaction flow
375
  ask_button.click(
376
  fn=lambda: ("", gr.update(visible=False)), # Clear previous metrics
@@ -399,8 +539,16 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
399
  )
400
 
401
  # Feedback handling
402
  feedback.change(
403
- fn=lambda feedback_val: mlflow.log_metric("user_feedback_score", 1 if feedback_val == "Helpful" else 0),
404
  inputs=[feedback],
405
  outputs=[]
406
  )
@@ -409,6 +557,6 @@ if __name__ == "__main__":
409
  demo.launch(
410
  server_name="0.0.0.0",
411
  server_port=7860,
412
- share=True,
413
  show_error=True
414
  )
 
27
  mlflow_tracking_uri = os.environ.get("MLFLOW_TRACKING_URI")
28
 
29
  # ------------------ DagsHub & MLflow Setup ------------------
30
+ try:
31
+ dagshub.init(
32
+ repo_owner='prathamesh.khade20',
33
+ repo_name='Maintenance_AI_website',
34
+ mlflow=True
35
+ )
36
+ mlflow.set_tracking_uri(mlflow_tracking_uri)
37
+ mlflow.set_experiment("Maintenance-RAG-Chatbot")
38
+ mlflow.langchain.autolog()
39
+ except Exception as e:
40
+ print(f"MLflow/DagsHub initialization failed: {e}")
41
 
42
  # ------------------ RAG Evaluator ------------------
43
  class RAGEvaluator:
44
  def __init__(self):
45
+ try:
46
+ self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
47
+ self.bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
48
+ self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
49
+ except Exception as e:
50
+ print(f"Evaluator initialization failed: {e}")
51
 
52
  def load_gpt2_model(self):
53
  model = GPT2LMHeadModel.from_pretrained('gpt2')
 
56
 
57
  # BLEU, ROUGE
58
  def evaluate_bleu_rouge(self, candidates, references):
59
+ try:
60
+ bleu_score = corpus_bleu(candidates, [references]).score
61
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
62
+ rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]
63
+ rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
64
+ rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
65
+ rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)
66
+ return bleu_score, rouge1, rouge2, rougeL
67
+ except Exception as e:
68
+ print(f"BLEU/ROUGE evaluation failed: {e}")
69
+ return 0, 0, 0, 0
70
 
71
  # BERT Score
72
  def evaluate_bert_score(self, candidates, references):
73
+ try:
74
+ P, R, F1 = score(candidates, references, lang="en", model_type='bert-base-multilingual-cased')
75
+ return P.mean().item(), R.mean().item(), F1.mean().item()
76
+ except Exception as e:
77
+ print(f"BERT score evaluation failed: {e}")
78
+ return 0, 0, 0
79
 
80
  # Perplexity
81
  def evaluate_perplexity(self, text):
82
+ try:
83
+ encodings = self.gpt2_tokenizer(text, return_tensors='pt')
84
+ max_length = self.gpt2_model.config.n_positions
85
+ stride = 512
86
+ lls = []
87
+ for i in range(0, encodings.input_ids.size(1), stride):
88
+ begin_loc = max(i + stride - max_length, 0)
89
+ end_loc = min(i + stride, encodings.input_ids.size(1))
90
+ trg_len = end_loc - i
91
+ input_ids = encodings.input_ids[:, begin_loc:end_loc]
92
+ target_ids = input_ids.clone()
93
+ target_ids[:, :-trg_len] = -100
94
+ with torch.no_grad():
95
+ outputs = self.gpt2_model(input_ids, labels=target_ids)
96
+ log_likelihood = outputs[0] * trg_len
97
+ lls.append(log_likelihood)
98
+ ppl = torch.exp(torch.stack(lls).sum() / end_loc)
99
+ return ppl.item()
100
+ except Exception as e:
101
+ print(f"Perplexity evaluation failed: {e}")
102
+ return 1000.0 # High perplexity indicates error
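For readers verifying the windowed perplexity above: `outputs[0]` is the model's mean cross-entropy over the target tokens of a window, so multiplying by `trg_len` recovers that window's summed negative log-likelihood, and the returned value is
\[
\mathrm{PPL} = \exp\!\left(\frac{\sum_{w}\mathrm{NLL}_w}{N}\right),
\]
with N equal to the total number of tokens scored (`end_loc` after the last window). This is the usual sliding-window perplexity recipe for GPT-2.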
103
 
104
  # Diversity
105
  def evaluate_diversity(self, texts):
106
+ try:
107
+ all_tokens = []
108
+ for text in texts:
109
+ tokens = self.tokenizer.tokenize(text)
110
+ all_tokens.extend(tokens)
111
+ unique_bigrams = set()
112
+ for i in range(len(all_tokens) - 1):
113
+ unique_bigrams.add((all_tokens[i], all_tokens[i+1]))
114
+ return len(unique_bigrams) / len(all_tokens) if all_tokens else 0
115
+ except Exception as e:
116
+ print(f"Diversity evaluation failed: {e}")
117
+ return 0
118
 
119
  # Racial bias
120
  def evaluate_racial_bias(self, text):
121
+ try:
122
+ results = self.bias_pipeline([text], candidate_labels=["hate speech", "not hate speech"])
123
+ bias_score = results[0]['scores'][results[0]['labels'].index('hate speech')]
124
+ return bias_score
125
+ except Exception as e:
126
+ print(f"Bias evaluation failed: {e}")
127
+ return 0
128
 
129
  # METEOR
130
  def evaluate_meteor(self, candidates, references):
131
+ try:
132
+ meteor_scores = []
133
+ for ref, cand in zip(references, candidates):
134
+ ref_tokens = self.tokenizer.tokenize(ref)
135
+ cand_tokens = self.tokenizer.tokenize(cand)
136
+ common_tokens = set(ref_tokens) & set(cand_tokens)
137
+ precision = len(common_tokens) / len(cand_tokens) if cand_tokens else 0
138
+ recall = len(common_tokens) / len(ref_tokens) if ref_tokens else 0
139
+ if precision + recall == 0:
140
+ f_score = 0
141
+ else:
142
+ f_score = (10 * precision * recall) / (9 * precision + recall)
143
+ meteor_scores.append(f_score)
144
+ return sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
145
+ except Exception as e:
146
+ print(f"METEOR evaluation failed: {e}")
147
+ return 0
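A note for anyone comparing this helper with the published metric: the harmonic mean used here matches METEOR's recall-weighted F-mean,
\[
F_{\text{mean}} = \frac{10\,P\,R}{R + 9P},
\]
but the matching is plain unigram set overlap, so the stemming/synonym modules and the fragmentation penalty of full METEOR are not applied; the scores are best read as an approximation.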
148
 
149
  # CHRF
150
  def evaluate_chrf(self, candidates, references):
151
+ try:
152
+ chrf_scores = []
153
+ for ref, cand in zip(references, candidates):
154
+ ref_chars = list(ref)
155
+ cand_chars = list(cand)
156
+ ref_ngrams = set()
157
+ cand_ngrams = set()
158
+ for i in range(len(ref_chars) - 5):
159
+ ref_ngrams.add(tuple(ref_chars[i:i+6]))
160
+ for i in range(len(cand_chars) - 5):
161
+ cand_ngrams.add(tuple(cand_chars[i:i+6]))
162
+ common_ngrams = ref_ngrams & cand_ngrams
163
+ precision = len(common_ngrams) / len(cand_ngrams) if cand_ngrams else 0
164
+ recall = len(common_ngrams) / len(ref_ngrams) if ref_ngrams else 0
165
+ chrf_score = 2 * precision * recall / (precision + recall) if precision + recall else 0
166
+ chrf_scores.append(chrf_score)
167
+ return sum(chrf_scores) / len(chrf_scores) if chrf_scores else 0
168
+ except Exception as e:
169
+ print(f"CHRF evaluation failed: {e}")
170
+ return 0
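For comparison with the standard metric: chrF averages character n-gram precision and recall over n = 1…6 and combines them with a recall-weighted F-score,
\[
\mathrm{chrF}_\beta = \frac{(1+\beta^2)\,P\,R}{\beta^2\,P + R},\qquad \beta > 1,
\]
whereas the helper above uses only character 6-grams with a plain F1 (β = 1), so its numbers will not line up with sacreBLEU's chrF and serve only as a rough internal score.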
171
 
172
  # Readability
173
  def evaluate_readability(self, text):
174
+ try:
175
+ words = re.findall(r'\b\w+\b', text.lower())
176
+ sentences = re.split(r'[.!?]+', text)
177
+ num_words = len(words)
178
+ num_sentences = len([s for s in sentences if s.strip()])
179
+ avg_word_length = sum(len(word) for word in words) / num_words if num_words else 0
180
+ words_per_sentence = num_words / num_sentences if num_sentences else 0
181
+ flesch_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * avg_word_length)
182
+ flesch_grade = (0.39 * words_per_sentence) + (11.8 * avg_word_length) - 15.59
183
+ return flesch_ease, flesch_grade
184
+ except Exception as e:
185
+ print(f"Readability evaluation failed: {e}")
186
+ return 0, 0
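The coefficients in these two formulas are the standard Flesch ones, but the published definitions use syllables per word rather than characters per word:
\[
\text{FRE} = 206.835 - 1.015\,\frac{\text{total words}}{\text{total sentences}} - 84.6\,\frac{\text{total syllables}}{\text{total words}}
\]
\[
\text{FKGL} = 0.39\,\frac{\text{total words}}{\text{total sentences}} + 11.8\,\frac{\text{total syllables}}{\text{total words}} - 15.59
\]
Because the code substitutes average word length in characters for syllables per word, its outputs are shifted relative to textbook Flesch values and are only meaningful as relative scores.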
187
 
188
  # MAUVE
189
  def evaluate_mauve(self, reference_texts, generated_texts):
190
+ try:
191
+ out = compute_mauve(
192
+ p_text=reference_texts,
193
+ q_text=generated_texts,
194
+ device_id=0,
195
+ max_text_length=1024,
196
+ verbose=False
197
+ )
198
+ return out.mauve
199
+ except Exception as e:
200
+ print(f"MAUVE evaluation failed: {e}")
201
+ return 0.0
202
 
203
  def evaluate_all(self, question, response, reference):
204
+ try:
205
+ candidates = [response]
206
+ references = [reference]
207
+ bleu, rouge1, rouge2, rougeL = self.evaluate_bleu_rouge(candidates, references)
208
+ bert_p, bert_r, bert_f1 = self.evaluate_bert_score(candidates, references)
209
+ perplexity = self.evaluate_perplexity(response)
210
+ diversity = self.evaluate_diversity(candidates)
211
+ racial_bias = self.evaluate_racial_bias(response)
212
+ meteor = self.evaluate_meteor(candidates, references)
213
+ chrf = self.evaluate_chrf(candidates, references)
214
+ flesch_ease, flesch_grade = self.evaluate_readability(response)
215
+ mauve_score = self.evaluate_mauve(references, candidates) if len(references) > 1 else 0.0
216
+
217
+ return {
218
+ "BLEU": bleu,
219
+ "ROUGE-1": rouge1,
220
+ "ROUGE-2": rouge2,
221
+ "ROUGE-L": rougeL,
222
+ "BERT_Precision": bert_p,
223
+ "BERT_Recall": bert_r,
224
+ "BERT_F1": bert_f1,
225
+ "Perplexity": perplexity,
226
+ "Diversity": diversity,
227
+ "Racial_Bias": racial_bias,
228
+ "MAUVE": mauve_score,
229
+ "METEOR": meteor,
230
+ "CHRF": chrf,
231
+ "Flesch_Reading_Ease": flesch_ease,
232
+ "Flesch_Kincaid_Grade": flesch_grade,
233
+ }
234
+ except Exception as e:
235
+ print(f"Complete evaluation failed: {e}")
236
+ return {"error": str(e)}
237
 
238
  # Initialize evaluator
239
  evaluator = RAGEvaluator()
240
 
241
  # ------------------ Pinecone ------------------
242
  def init_pinecone():
243
+ try:
244
+ pc = Pinecone(api_key=pinecone_api_key)
245
+ return pc.Index("rag-granite-index")
246
+ except Exception as e:
247
+ print(f"Pinecone initialization failed: {e}")
248
+ return None
249
 
250
  index = init_pinecone()
251
 
252
  # ------------------ Embeddings ------------------
253
+ try:
254
+ embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
255
+ except Exception as e:
256
+ print(f"Embeddings initialization failed: {e}")
257
+ embeddings_model = None
258
 
259
  def get_retrieved_context(query: str, top_k=3):
260
+ if not index or not embeddings_model:
261
+ return "No context available - system initialization failed"
262
+
263
+ try:
264
+ start = time.time()
265
+ query_embedding = embeddings_model.embed_query(query)
266
+ if mlflow.active_run():
267
+ mlflow.log_metric("embedding_latency", time.time() - start)
268
+
269
+ results = index.query(
270
+ namespace="rag-ns",
271
+ vector=query_embedding,
272
+ top_k=top_k,
273
+ include_metadata=True
274
+ )
275
+
276
+ if mlflow.active_run():
277
+ mlflow.log_metric("retrieved_chunks", len(results['matches']))
278
+
279
+ return "\n".join([m['metadata']['text'] for m in results['matches']])
280
+ except Exception as e:
281
+ print(f"Context retrieval failed: {e}")
282
+ return f"Context retrieval error: {str(e)}"
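If it helps to sanity-check the retrieval path after this change, a minimal smoke test (not part of the commit; the question text is made up) could look like:

```python
# Illustrative check of the retriever defined above; "rag-ns" and top_k=3 come from this diff.
if index is not None and embeddings_model is not None:
    ctx = get_retrieved_context("How do I reset the conveyor belt motor?", top_k=3)
    print(ctx[:300])  # preview the concatenated chunk text
else:
    print("Retrieval stack not initialized; check PINECONE_API_KEY and the embedding model download.")
```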
283
 
284
  # ------------------ Custom LLM ------------------
285
  class LitServeLLM(LLM):
286
  endpoint_url: str
287
 
 
288
  def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
289
+ try:
290
+ payload = {"prompt": prompt}
291
+ start_time = time.time()
292
+ response = requests.post(self.endpoint_url, json=payload, timeout=30)
293
+
294
+ if mlflow.active_run():
295
+ mlflow.log_metric("lit_serve_latency", time.time() - start_time)
296
+
297
+ if response.status_code == 200:
298
+ data = response.json()
299
+ if mlflow.active_run():
300
+ mlflow.log_metric("response_tokens", len(data.get("response", "").split()))
301
+ return data.get("response", "").strip()
302
+ else:
303
+ if mlflow.active_run():
304
+ mlflow.log_metric("request_errors", 1)
305
+ error_msg = f"Request failed with status {response.status_code}"
306
+ print(f"LLM Error: {error_msg}")
307
+ return f"I apologize, but I'm currently experiencing technical difficulties. Error: {error_msg}"
308
+
309
+ except requests.exceptions.Timeout:
310
+ error_msg = "Request timeout - service unavailable"
311
+ print(f"LLM Timeout: {error_msg}")
312
+ return f"I apologize, but the service is currently unavailable. Please try again later."
313
+ except Exception as e:
314
+ error_msg = f"Connection error: {str(e)}"
315
+ print(f"LLM Connection Error: {error_msg}")
316
+ return f"I apologize, but I'm having trouble connecting to the service. Please try again later."
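As a rough usage sketch (not part of the commit), the wrapper can be exercised on its own; this assumes a langchain-core version in which `LLM` inherits the Runnable `invoke` method, which recent releases do, and it reuses the endpoint URL from this diff.

```python
# Illustrative only: the prompt is made up and the reply depends on the live LitServe endpoint.
llm = LitServeLLM(endpoint_url="https://8001-01k2h9d9mervcmgfn66ybkpwvq.cloudspaces.litng.ai/predict")
print(llm.invoke("Summarize preventive maintenance for centrifugal pumps in two lines."))
```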
317
 
318
  @property
319
  def _identifying_params(self) -> Mapping[str, Any]:
 
323
  def _llm_type(self) -> str:
324
  return "litserve_llm"
325
 
326
+ # Initialize model with fallback
327
+ try:
328
+ model = LitServeLLM(endpoint_url="https://8001-01k2h9d9mervcmgfn66ybkpwvq.cloudspaces.litng.ai/predict")
329
+ except Exception as e:
330
+ print(f"Model initialization failed: {e}")
331
+ model = None
332
 
333
  prompt = PromptTemplate(
334
  input_variables=["context", "question"],
335
  template="""
336
  You are a smart assistant. Based on the provided context, answer the question in 1–2 lines only.
337
  If the context has more details, summarize it concisely.
338
+
339
  Context:
340
  {context}
341
+
342
  Question: {question}
343
+
344
  Answer:
345
  """
346
  )
347
 
348
+ # Initialize chain with error handling
349
+ try:
350
+ if model:
351
+ llm_chain = LLMChain(llm=model, prompt=prompt)
352
+ else:
353
+ llm_chain = None
354
+ print("LLM chain not initialized due to model failure")
355
+ except Exception as e:
356
+ print(f"LLM chain initialization failed: {e}")
357
+ llm_chain = None
358
 
359
  # ------------------ RAG Pipeline ------------------
360
  def get_rag_response(question):
361
+ """Get the complete RAG response with error handling"""
362
+ try:
363
+ # Get context
364
+ retrieved_context = get_retrieved_context(question)
365
+
366
+ # If LLM chain is not available, return fallback response
367
+ if not llm_chain:
368
+ fallback_response = "I'm currently experiencing technical difficulties. Please try again later or contact support."
369
+ return fallback_response, retrieved_context
370
+
371
+ # Get response from LLM
372
+ result = llm_chain.invoke({
373
+ "context": retrieved_context,
374
+ "question": question
375
+ })
376
+
377
+ full_response = result["text"].strip()
378
+
379
+ if "Answer:" in full_response:
380
+ full_response = full_response.split("Answer:", 1)[-1].strip()
381
+
382
+ return full_response, retrieved_context
383
+
384
+ except Exception as e:
385
+ error_msg = f"Error generating response: {str(e)}"
386
+ print(f"RAG pipeline error: {error_msg}")
387
+ return f"I apologize, but I encountered an error while processing your request. Please try again. Error: {str(e)}", "Error retrieving context"
388
 
389
  def rag_pipeline_stream(question):
390
  """Streaming version of RAG pipeline"""
391
+ try:
392
+ full_response, _ = get_rag_response(question)
393
+
394
+ # Stream word by word
395
+ words = full_response.split()
396
+ current_text = ""
397
+ for word in words:
398
+ current_text += word + " "
399
+ yield current_text
400
+ time.sleep(0.05) # Adjust speed as needed
401
+
402
+ except Exception as e:
403
+ error_msg = f"Error in streaming: {str(e)}"
404
+ print(f"Streaming error: {error_msg}")
405
+ yield "I apologize, but I encountered an error while generating the response."
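Because `rag_pipeline_stream` is a plain Python generator, its word-by-word output can also be observed outside Gradio by iterating it directly (the question below is hypothetical):

```python
# Print each progressively longer partial response on one line.
for partial in rag_pipeline_stream("Why does the hydraulic press lose pressure?"):
    print(partial, end="\r", flush=True)
print()
```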
406
 
407
  # ------------------ Gradio UI ------------------
408
+ with gr.Blocks(theme=gr.themes.Soft(), title="Maintenance AI Assistant") as demo:
409
  gr.Markdown("""
410
  # 🛠 Maintenance AI Assistant
411
  *Your intelligent companion for maintenance queries and troubleshooting*
 
413
 
414
  usage_counter = gr.State(value=0)
415
  session_start = gr.State(value=datetime.now().isoformat())
416
+ current_response = gr.State(value="")
417
 
418
  with gr.Row():
419
  with gr.Column(scale=1):
 
449
 
450
  def track_usage(question, count, session_start, feedback_value=None):
451
  """Track usage and get response"""
452
+ if not question.strip():
453
+ return "Please enter a question.", count, session_start, ""
454
 
455
+ count += 1
456
+
457
+ try:
458
+ with mlflow.start_run(run_name=f"User-Interaction-{count}", nested=True) if mlflow_tracking_uri else dummy_context():
459
+ if mlflow_tracking_uri:
460
+ mlflow.log_param("question", question)
461
+ mlflow.log_param("session_start", session_start)
462
+ mlflow.log_param("user_feedback", feedback_value or "No feedback")
463
+
464
+ if feedback_value:
465
+ mlflow.log_metric("helpful_responses", 1 if feedback_value == "Helpful" else 0)
466
+
467
+ mlflow.log_metric("total_queries", count)
468
+
469
+ # Get response and context
470
+ response, context = get_rag_response(question)
471
+
472
+ if mlflow_tracking_uri:
473
+ mlflow.log_metric("response_length", len(response))
474
+ mlflow.log_metric("response_tokens", len(response.split()))
475
+
476
+ return response, count, session_start, response
477
+
478
+ except Exception as e:
479
+ print(f"Tracking error: {e}")
480
+ error_msg = f"System error: {str(e)}"
481
+ return error_msg, count, session_start, error_msg
482
 
483
  def evaluate_response(question, response):
484
  """Evaluate the response and return metrics"""
485
+ if not question or not response or "error" in response.lower() or "apologize" in response.lower():
486
+ return gr.update(value={"info": "Evaluation skipped due to error response"}, visible=True)
487
 
488
  try:
489
  context = get_retrieved_context(question)
490
  metrics = evaluator.evaluate_all(question, response, context)
491
 
492
+ # Log metrics to MLflow if available
493
+ if mlflow_tracking_uri and mlflow.active_run():
494
+ for metric_name, metric_value in metrics.items():
495
+ if isinstance(metric_value, (int, float)):
496
+ mlflow.log_metric(metric_name, metric_value)
497
 
498
  return gr.update(value=metrics, visible=True)
499
  except Exception as e:
500
  print(f"Evaluation error: {e}")
501
+ return gr.update(value={"error": f"Evaluation failed: {str(e)}"}, visible=True)
502
 
503
  def clear_chat():
504
  """Clear the chat interface"""
505
  return "", "", gr.update(visible=False)
506
 
507
+ # Dummy context manager for when MLflow is not available
508
+ class dummy_context:
509
+ def __enter__(self):
510
+ return self
511
+ def __exit__(self, *args):
512
+ pass
513
+
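For reference, the standard library already ships an equivalent no-op context manager, so the same guard used in `track_usage` can be written with `contextlib.nullcontext` instead of a hand-rolled class (note that `dummy_context` is defined after `track_usage` references it, which is legal because Python resolves the name at call time, but easy to trip over). Everything below is an illustrative sketch, not part of the commit:

```python
from contextlib import nullcontext  # stdlib no-op context manager (Python 3.7+)

# Same pattern as in track_usage above, without a custom class:
run_ctx = mlflow.start_run(run_name="User-Interaction", nested=True) if mlflow_tracking_uri else nullcontext()
with run_ctx:
    ...  # log params/metrics here when MLflow is available
```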
514
  # Main interaction flow
515
  ask_button.click(
516
  fn=lambda: ("", gr.update(visible=False)), # Clear previous metrics
 
539
  )
540
 
541
  # Feedback handling
542
+ def handle_feedback(feedback_val):
543
+ try:
544
+ if mlflow_tracking_uri and mlflow.active_run():
545
+ mlflow.log_metric("user_feedback_score", 1 if feedback_val == "Helpful" else 0)
546
+ except:
547
+ pass
548
+ return
549
+
550
  feedback.change(
551
+ fn=handle_feedback,
552
  inputs=[feedback],
553
  outputs=[]
554
  )
 
557
  demo.launch(
558
  server_name="0.0.0.0",
559
  server_port=7860,
560
+ share=False, # Disable sharing to avoid the warning
561
  show_error=True
562
  )