Gül Sena Altıntaş committed
Commit 6c9db61 · 1 Parent(s): 6383574

Updated app for summary markdown tables
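The change adds two copy-paste-ready export strings to the app: a Markdown summary table and a CSV summary, built by the new `generate_compact_summary_markdown` and `generate_csv_summary` helpers and returned as two extra outputs of `run_evaluation`. As a rough sketch (not part of this commit), the new helpers can also be previewed outside the Gradio UI along these lines, assuming `app.py` and its dependencies (gradio, torch, transformers, plotly, pandas, psutil) are importable and using toy placeholder data:

```python
# Illustrative sketch only -- not part of this commit.
# Previews the new summary tables without launching the Gradio app.
from app import (
    generate_compact_summary_markdown,
    generate_csv_summary,
    generate_summary_stats,
)

# Toy inputs shaped like parse_dataset() / evaluate_model_on_questions() output.
questions = [
    {"question": "What is 2+2?", "correct_answer": "4", "choices": ["4", "3", "2", "5"]},
]
results = {
    "Llama-3.2-1B": [  # display name -> per-question results (placeholder values)
        {
            "question_idx": 0,
            "predicted": "4",
            "correct": True,
            "confidence": 0.91,
            "choice_likelihoods": {},
            "choice_probabilities": {},
            "raw_response": "",
        }
    ],
}

stats = generate_summary_stats(questions, results)
print(generate_compact_summary_markdown(questions, results, stats))  # Markdown tables
print(generate_csv_summary(questions, results, stats))               # CSV text
```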

Files changed (1): app.py +508 -259
app.py CHANGED
@@ -1,32 +1,35 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
4
  import plotly.graph_objects as go
5
- from collections import Counter
6
- import torch
7
- from transformers import AutoTokenizer, AutoModelForCausalLM
8
- import re
9
- import logging
10
- from typing import List, Dict, Any
11
- import gc
12
- import os
13
  import psutil
 
 
14
 
15
 
16
  def get_memory_usage():
17
  """Return (gpu_mem_used_MB, gpu_mem_total_MB, ram_used_MB, ram_total_MB)"""
18
  # System RAM
19
  vm = psutil.virtual_memory()
20
- ram_used_mb = vm.used / (1024 ** 2)
21
- ram_total_mb = vm.total / (1024 ** 2)
22
 
23
  # GPU memory
24
  if torch.cuda.is_available():
25
  gpu_idx = torch.cuda.current_device()
26
  torch.cuda.synchronize()
27
- gpu_mem_alloc = torch.cuda.memory_allocated(gpu_idx) / (1024 ** 2)
28
- gpu_mem_reserved = torch.cuda.memory_reserved(gpu_idx) / (1024 ** 2)
29
- gpu_mem_total = torch.cuda.get_device_properties(gpu_idx).total_memory / (1024 ** 2)
 
 
30
  gpu_mem_used = max(gpu_mem_alloc, gpu_mem_reserved) # safe estimate
31
  else:
32
  gpu_mem_used = 0
@@ -41,77 +44,85 @@ logger = logging.getLogger(__name__)
41
 
42
  # Model configurations - maps display names to HF model paths
43
  PREDEFINED_MODELS = [
44
- "meta-llama/Llama-3.2-1B",
45
- "google/gemma-2-2b",
46
- "Qwen/Qwen3-0.6B",
47
- "Qwen/Qwen2.5-0.5B",
48
- "Qwen/Qwen2.5-1.5B",
49
- "bigscience/bloom-560m",
50
- "CohereForAI/aya-expanse-8b",
51
- "common-pile/comma-v0.1-2t",
52
- "google/byt5-small",
53
- "gsaltintas/supertoken_models-llama_gpt2",
54
- "gsaltintas/supertoken_models-llama_google-gemma-2-2b"
55
  ]
56
  # Global cache for loaded models
57
  model_cache = {}
58
 
 
59
  def parse_dataset(text):
60
  """Parse the input dataset text into structured questions"""
61
  if not text.strip():
62
  return [], "Please enter your dataset"
63
-
64
- lines = text.strip().split('\n')
65
  if len(lines) < 2:
66
  return [], "Dataset must have at least a header and one question"
67
-
68
  # Skip header and detect delimiter
69
  first_data_line = lines[1] if len(lines) > 1 else lines[0]
70
- delimiter = '\t' if '\t' in first_data_line else ','
71
-
72
  questions = []
73
  errors = []
74
-
75
  for i, line in enumerate(lines[1:], 2): # Start from line 2 (after header)
76
  line = line.strip()
77
  if not line:
78
  continue
79
-
80
  parts = [part.strip().strip('"') for part in line.split(delimiter)]
81
-
82
  if len(parts) < 5:
83
  errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
84
  continue
85
-
86
  question = {
87
- 'question': parts[0],
88
- 'correct_answer': parts[1],
89
- 'choices': [parts[2], parts[3], parts[4]]
90
  }
91
-
92
  # Ensure correct answer is in choices
93
- if question['correct_answer'] not in question['choices']:
94
- question['choices'].append(question['correct_answer'])
95
-
96
  questions.append(question)
97
-
98
- error_msg = '\n'.join(errors) if errors else ""
99
  return questions, error_msg
100
 
 
101
  def setup_tokenizer(model_path):
102
  tokenizer_name = model_path
103
  if "supertoken" in model_path:
104
- from huggingface_hub import list_repo_files, hf_hub_download
105
  import json
 
 
 
106
  files = list_repo_files(model_path)
107
  if "tokenizer_config.json" in files:
108
- tokenizer_path = hf_hub_download(repo_id=model_path, filename="tokenizer_config.json")
 
 
109
  with open(tokenizer_path) as f:
110
  tok_config = json.load(f)["data"]["tokenizer"]
111
  if tok_config["name"] == "huggingface":
112
  tokenizer_name = tok_config["path"]
113
  # todo: tiktoken
114
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True, legacy=True)
 
 
115
  return tokenizer
116
 
117
 
@@ -124,90 +135,96 @@ def load_model_and_tokenizer(model_path, progress_callback=None):
124
  logger.info(f"Current GPU memory: {gpu_used:.1f}/{gpu_total:.1f} MB")
125
  logger.info(f"Current RAM: {ram_used:.1f}/{ram_total:.1f} MB")
126
 
127
- use_cache = not (
128
- (gpu_total > 0 and gpu_used / gpu_total > 0.8) or
129
- (ram_used / ram_total > 0.8)
130
- ) or model_path in model_cache
 
 
 
131
  if not use_cache:
132
  logger.warning("High memory usage detected — disabling model cache.")
133
 
134
-
135
  if use_cache and model_path in model_cache:
136
  logger.info(f"Using cached model: {model_path}")
137
  if progress_callback:
138
  progress_callback(1.0, f"✅ Using cached model: {model_path}")
139
  return model_cache[model_path]
140
-
141
  try:
142
  if progress_callback:
143
  progress_callback(0.1, f"🔄 Starting to load model: {model_path}")
144
-
145
-
146
  # Check if CUDA is available
147
  device = "cuda" if torch.cuda.is_available() else "cpu"
148
  logger.info(f"Loading model: {model_path} using device: {device}")
149
-
150
  if progress_callback:
151
  progress_callback(0.2, f"📥 Loading tokenizer for {model_path}...")
152
-
153
  # Load tokenizer
154
  tokenizer = setup_tokenizer(model_path)
155
-
156
  # Add pad token if missing
157
  if tokenizer.pad_token is None:
158
  tokenizer.pad_token = tokenizer.eos_token
159
-
160
  if progress_callback:
161
- progress_callback(0.5, f"🧠 Loading model weights for {model_path}... (this may take a while)")
162
-
 
 
 
163
  logger.info(os.getcwd())
164
  # Load model with appropriate settings
165
  model = AutoModelForCausalLM.from_pretrained(
166
  model_path,
167
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
168
- device_map="auto" if device== "cuda" else None,
169
  trust_remote_code=True,
170
- low_cpu_mem_usage=True
171
  )
172
-
173
- model_info = {
174
- 'tokenizer': tokenizer,
175
- 'model': model,
176
- 'device': device
177
- }
178
-
179
  if use_cache:
180
  model_cache[model_path] = model_info
181
-
182
  if progress_callback:
183
  progress_callback(1.0, f"✅ Successfully loaded model: {model_path}")
184
-
185
  return model_info
186
-
187
  except Exception as e:
188
  import code
 
189
  error_msg = f"❌ Error loading model {model_path}: {str(e)}"
190
  logger.error(error_msg)
191
  # code.interact(local=dict(globals(), **locals()))
192
  if progress_callback:
193
  progress_callback(0.0, error_msg)
194
  return None
195
-
 
196
  def calculate_choice_likelihood(model, tokenizer, question, choice):
197
  """Calculate the log-likelihood of the choice given the question prompt"""
198
  try:
199
  prompt = f"Question: {question}\nAnswer: "
200
- prompt=question
201
  full_text = f"{prompt} {choice}"
202
 
203
  # Tokenize full input (prompt + answer)
204
- input_ids = tokenizer.encode(full_text, return_tensors="pt", add_special_tokens=False).to(model.device)
205
- prompt_ids = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
206
 
207
  if input_ids.size(1) <= prompt_ids.size(1):
208
  logger.warning("Answer tokens are empty after tokenization.")
209
  return float("-inf")
210
-
211
  with torch.no_grad():
212
  outputs = model(input_ids)
213
  logits = outputs.logits
@@ -215,7 +232,9 @@ def calculate_choice_likelihood(model, tokenizer, question, choice):
215
  # Get logits for the answer tokens only
216
  answer_len = input_ids.size(1) - prompt_ids.size(1)
217
  target_ids = input_ids[:, -answer_len:]
218
- logits = logits[:, prompt_ids.size(1)-1:-1, :] # shifted for next-token prediction
 
 
219
 
220
  log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
221
  token_log_probs = log_probs.gather(2, target_ids.unsqueeze(-1)).squeeze(-1)
@@ -228,76 +247,95 @@ def calculate_choice_likelihood(model, tokenizer, question, choice):
228
  return float("-inf")
229
 
230
 
231
-
232
  def evaluate_model_on_questions(model_path, questions, progress_callback=None):
233
  """Evaluate a single model on all questions using likelihood-based scoring"""
234
-
235
- model_info = load_model_and_tokenizer(model_path, progress_callback=progress_callback)
236
-
 
 
237
  if model_info is None:
238
- return [{'error': f'Failed to load model {model_path}'}] * len(questions)
239
-
240
  results = []
241
- model = model_info['model']
242
- tokenizer = model_info['tokenizer']
243
-
244
  for i, question in enumerate(questions):
245
  try:
246
  # Calculate likelihood for each choice
247
  choice_likelihoods = {}
248
  choice_probs = {}
249
-
250
- for choice in question['choices']:
251
- likelihood = calculate_choice_likelihood(model, tokenizer, question['question'], choice)
 
 
252
  choice_likelihoods[choice] = likelihood
253
-
254
  # Convert log probabilities to probabilities for confidence scoring
255
  max_log_prob = max(choice_likelihoods.values())
256
- choice_probs = {choice: torch.exp(torch.tensor(log_prob - max_log_prob)).item()
257
- for choice, log_prob in choice_likelihoods.items()}
258
-
 
 
259
  # Normalize probabilities
260
  total_prob = sum(choice_probs.values())
261
  if total_prob > 0:
262
- choice_probs = {choice: prob / total_prob for choice, prob in choice_probs.items()}
263
-
 
 
264
  # Select the choice with highest likelihood
265
- predicted_choice = max(choice_likelihoods.keys(), key=lambda x: choice_likelihoods[x])
266
- is_correct = predicted_choice == question['correct_answer']
267
-
 
 
268
  # Confidence is the probability of the selected choice
269
  confidence = choice_probs.get(predicted_choice, 0.0)
270
-
271
- results.append({
272
- 'question_idx': i,
273
- 'predicted': predicted_choice,
274
- 'correct': is_correct,
275
- 'confidence': confidence,
276
- 'choice_likelihoods': choice_likelihoods,
277
- 'choice_probabilities': choice_probs,
278
- 'raw_response': f"Likelihoods: {choice_likelihoods}"
279
- })
280
-
 
 
281
  if progress_callback:
282
  # Use remaining 80% for evaluation progress
283
  evaluation_progress = 0.2 + (i + 1) / len(questions) * 0.8
284
- progress_callback(evaluation_progress, f"🔍 Evaluating {model_path}: {i+1}/{len(questions)} questions (likelihood-based)")
285
-
 
 
 
286
  except Exception as e:
287
  logger.error(f"Error evaluating question {i} with {model_path}: {str(e)}")
288
- results.append({
289
- 'question_idx': i,
290
- 'predicted': question['choices'][0] if question['choices'] else '',
291
- 'correct': False,
292
- 'confidence': 0.0,
293
- 'choice_likelihoods': {},
294
- 'choice_probabilities': {},
295
- 'raw_response': f"Error: {str(e)}"
296
- })
297
-
 
 
298
  return results
299
 
300
- def run_evaluation(dataset_text, selected_predefined, custom_models_text="", progress=gr.Progress()):
 
 
 
301
  """Main evaluation function"""
302
  if not dataset_text.strip():
303
  return (
@@ -305,159 +343,194 @@ def run_evaluation(dataset_text, selected_predefined, custom_models_text="", pro
305
  "<p>No data provided</p>",
306
  None,
307
  None,
308
- gr.update(visible=True)
 
 
309
  )
310
-
311
  # Parse custom models
312
  custom_models = []
313
  if custom_models_text is None:
314
  custom_models_text = ""
315
  if custom_models_text.strip():
316
- custom_models = [model.strip() for model in custom_models_text.strip().split('\n') if model.strip()]
317
-
 
 
 
 
318
  # Combine selected models
319
  all_models = []
320
-
321
  # Add predefined models
322
  all_models.extend(selected_predefined)
323
  all_models.extend(custom_models)
324
-
325
  if not all_models:
326
  return (
327
  "Please select at least one model or add custom models",
328
  "<p>No models selected</p>",
329
  None,
330
  None,
331
- gr.update(visible=False)
 
 
332
  )
333
-
334
  # Parse dataset
335
  questions, parse_error = parse_dataset(dataset_text)
336
-
337
  if parse_error:
338
  return (
339
  f"Dataset parsing error:\n{parse_error}",
340
  "<p>Failed to parse dataset</p>",
341
  None,
342
  None,
343
- gr.update(visible=True)
 
 
344
  )
345
-
346
  if not questions:
347
  return (
348
  "No valid questions found in dataset",
349
  "<p>No questions to evaluate</p>",
350
  None,
351
  None,
352
- gr.update(visible=True)
 
 
353
  )
354
-
355
  # Run evaluation
356
  progress(0, "Starting evaluation...")
357
  results = {}
358
  total_steps = len(all_models) * len(questions)
359
  current_step = 0
360
-
361
  summary_md = create_summary_markdown({})
362
  for model_path in all_models:
363
- display_name = model_path.split('/')[-1] if '/' in model_path else model_path
364
  try:
 
365
  def model_progress(p, msg):
366
  nonlocal current_step
367
  current_step = int(p * len(questions))
368
  overall_progress = current_step / total_steps
369
  progress(overall_progress, msg)
370
-
371
- model_results = evaluate_model_on_questions(model_path, questions, model_progress)
 
 
372
  results[display_name] = model_results
373
-
374
  except Exception as e:
375
  logger.error(f"Failed to evaluate {display_name}: {str(e)}")
376
- results[display_name] = [{'error': str(e)}] * len(questions)
377
-
378
  # Clean up GPU memory
379
  if torch.cuda.is_available():
380
  torch.cuda.empty_cache()
381
  gc.collect()
382
-
383
  # Generate outputs
384
  summary_stats = generate_summary_stats(questions, results)
385
  summary_md = create_summary_markdown(summary_stats)
386
  detailed_html = create_detailed_results_html(questions, results)
387
  accuracy_chart = create_accuracy_chart(summary_stats)
388
  confidence_chart = create_confidence_chart(results)
389
-
 
 
 
 
 
 
390
  return (
391
  summary_md,
392
  detailed_html,
393
  accuracy_chart,
394
  confidence_chart,
395
- gr.update(visible=True)
 
 
396
  )
397
 
 
398
  def generate_summary_stats(questions, results):
399
  """Generate summary statistics for all models"""
400
  summary = {}
401
-
402
  for model, model_results in results.items():
403
- if not model_results or 'error' in model_results[0]:
404
  summary[model] = {
405
- 'accuracy': 0.0,
406
- 'correct': 0,
407
- 'total': len(questions),
408
- 'avg_confidence': 0.0,
409
- 'error': model_results[0].get('error', 'Unknown error') if model_results else 'No results'
 
 
410
  }
411
  continue
412
-
413
- correct_count = sum(1 for r in model_results if r.get('correct', False))
414
  total_count = len(model_results)
415
  accuracy = correct_count / total_count if total_count > 0 else 0
416
-
417
  # Calculate average confidence
418
- avg_confidence = sum(r.get('confidence', 0) for r in model_results) / total_count if total_count > 0 else 0
419
-
 
 
 
 
420
  summary[model] = {
421
- 'accuracy': accuracy,
422
- 'correct': correct_count,
423
- 'total': total_count,
424
- 'avg_confidence': avg_confidence
425
  }
426
-
427
  return summary
428
 
 
429
  def create_summary_markdown(summary_stats):
430
  """Create markdown summary of results"""
431
  if not summary_stats:
432
  return "No results available"
433
-
434
  # Sort by accuracy
435
- sorted_models = sorted(summary_stats.items(), key=lambda x: x[1]['accuracy'], reverse=True)
436
-
 
 
437
  lines = ["## 🏆 Model Performance Summary\n"]
438
-
439
  for i, (model, stats) in enumerate(sorted_models):
440
- if 'error' in stats:
441
  lines.append(f"❌ **{model}**: Error - {stats['error']}")
442
  continue
443
-
444
- accuracy_pct = stats['accuracy'] * 100
445
- medal = "🥇" if i == 0 else "🥈" if i == 1 else "🥉" if i == 2 else f"{i+1}."
446
-
447
  lines.append(
448
  f"{medal} **{model}**: {accuracy_pct:.1f}% "
449
  f"({stats['correct']}/{stats['total']} correct, "
450
  f"avg confidence: {stats['avg_confidence']:.2f})"
451
  )
452
-
453
  return "\n".join(lines)
454
 
 
455
  def create_detailed_results_html(questions, results):
456
  """Create detailed HTML results for each question"""
457
  if not questions or not results:
458
  return "<p>No detailed results available</p>"
459
-
460
- html_parts = ["""
 
461
  <style>
462
  .question-card {
463
  background: white;
@@ -544,128 +617,277 @@ def create_detailed_results_html(questions, results):
544
  font-family: monospace;
545
  }
546
  </style>
547
- """]
548
-
 
549
  for q_idx, question in enumerate(questions):
550
  html_parts.append(f"""
551
  <div class="question-card">
552
  <div class="question-header">
553
  <span class="question-number">Q{q_idx + 1}</span>
554
  </div>
555
- <div class="question-text">{question['question']}</div>
556
  <div class="choices">
557
  <strong>Choices:</strong><br>
558
- {' | '.join(f'{chr(65+i)}) {choice}' for i, choice in enumerate(question['choices']))}
559
  </div>
560
  <div class="correct-answer">
561
- <strong>✓ Correct Answer:</strong> {question['correct_answer']}
562
  </div>
563
  <div class="model-results">
564
  """)
565
-
566
  # Add results for each model
567
  for model, model_results in results.items():
568
  if q_idx < len(model_results):
569
  result = model_results[q_idx]
570
-
571
- if 'error' in result:
572
  html_parts.append(f"""
573
  <div class="model-result result-error">
574
  <div>⚠️ {model}</div>
575
  <div style="font-size: 12px; margin-top: 4px;">
576
  Error occurred
577
  </div>
578
- <div class="raw-response">{result.get('raw_response', 'Unknown error')}</div>
579
  </div>
580
  """)
581
  else:
582
- result_class = 'result-correct' if result.get('correct', False) else 'result-incorrect'
583
- icon = '✅' if result.get('correct', False) else '❌'
584
-
 
 
 
 
585
  html_parts.append(f"""
586
  <div class="model-result {result_class}">
587
  <div>{icon} {model}</div>
588
  <div style="font-size: 12px; margin-top: 4px;">
589
- "{result.get('predicted', 'No prediction')}"
590
  </div>
591
- <div class="raw-response">Raw: "{result.get('raw_response', '')}"</div>
592
  </div>
593
  """)
594
-
595
  html_parts.append("""
596
  </div>
597
  </div>
598
  """)
599
-
600
  return "".join(html_parts)
601
 
 
602
  def create_accuracy_chart(summary_stats):
603
  """Create accuracy comparison chart"""
604
  if not summary_stats:
605
  return None
606
-
607
  models = []
608
  accuracies = []
609
-
610
  for model, stats in summary_stats.items():
611
- if 'error' not in stats:
612
  models.append(model)
613
- accuracies.append(stats['accuracy'] * 100)
614
-
615
  if not models:
616
  return None
617
-
618
- fig = go.Figure(data=[
619
- go.Bar(
620
- x=models,
621
- y=accuracies,
622
- marker_color='lightblue',
623
- text=[f'{acc:.1f}%' for acc in accuracies],
624
- textposition='auto',
625
- )
626
- ])
627
-
 
 
628
  fig.update_layout(
629
  title="Model Accuracy Comparison",
630
  xaxis_title="Models",
631
  yaxis_title="Accuracy (%)",
632
  template="plotly_white",
633
- showlegend=False
634
  )
635
-
636
  return fig
637
 
 
638
  def create_confidence_chart(results):
639
  """Create confidence distribution chart"""
640
  if not results:
641
  return None
642
-
643
  data = []
644
  for model, model_results in results.items():
645
  for result in model_results:
646
- if 'error' not in result and 'confidence' in result:
647
- data.append({
648
- 'Model': model,
649
- 'Confidence': result['confidence'],
650
- 'Correct': 'Correct' if result.get('correct', False) else 'Incorrect'
651
- })
652
-
 
 
 
 
653
  if not data:
654
  return None
655
-
656
  df = pd.DataFrame(data)
657
-
658
  fig = px.box(
659
  df,
660
- x='Model',
661
- y='Confidence',
662
- color='Correct',
663
  title="Confidence Distribution by Model and Correctness",
664
- template="plotly_white"
665
  )
666
-
667
  return fig
668
 
 
 
 
 
 
669
  # Sample datasets for quick testing
670
  SAMPLE_DATASETS = {
671
  "Custom (enter below)": "",
@@ -678,18 +900,16 @@ What is 2+2?,4,3,2,5
678
  What is 5*3?,15,12,16,18
679
  What is 10-7?,3,7,4,2
680
  What is 8/2?,4,3,2,5""",
681
-
682
  "World Capitals": """Question,Correct Answer,Choice1,Choice2,Choice3
683
  What is the capital of France?,Paris,London,Berlin,Rome
684
  What is the capital of Japan?,Tokyo,Seoul,Beijing,Bangkok
685
  What is the capital of Brazil?,Brasília,Rio de Janeiro,São Paulo,Salvador
686
  What is the capital of Australia?,Canberra,Sydney,Melbourne,Perth""",
687
-
688
  "Science Quiz": """Question,Correct Answer,Choice1,Choice2,Choice3
689
  What is the chemical symbol for gold?,Au,Ag,Ca,K
690
  Which planet is closest to the Sun?,Mercury,Venus,Earth,Mars
691
  What is the speed of light?,299792458 m/s,300000000 m/s,2992458 m/s,299000000 m/s
692
- What gas do plants absorb from the atmosphere?,Carbon dioxide,Oxygen,Nitrogen,Hydrogen"""
693
  }
694
 
695
  # Custom CSS
@@ -704,7 +924,9 @@ css = """
704
  """
705
 
706
  # Create Gradio interface
707
- with gr.Blocks(title="🤖 Model Performance Comparison", theme=gr.themes.Soft(), css=css) as demo:
 
 
708
  gr.Markdown("""
709
  # 🤖 Model Performance Comparison Tool
710
 
@@ -718,7 +940,7 @@ with gr.Blocks(title="🤖 Model Performance Comparison", theme=gr.themes.Soft()
718
  - Detailed question-by-question results
719
  - Performance charts and statistics
720
  """)
721
-
722
  with gr.Row():
723
  with gr.Column(scale=2):
724
  # Sample dataset selector
@@ -726,9 +948,9 @@ with gr.Blocks(title="🤖 Model Performance Comparison", theme=gr.themes.Soft()
726
  choices=list(SAMPLE_DATASETS.keys()),
727
  value="Custom (enter below)",
728
  label="Choose sample dataset or enter your own",
729
- interactive=True
730
  )
731
-
732
  # Dataset input
733
  dataset_input = gr.Textbox(
734
  label="Dataset (CSV/TSV format)",
@@ -739,16 +961,16 @@ Question,Correct Answer,Choice1,Choice2,Choice3
739
  What is 2+2?,4,3,2,5
740
  What is the capital of France?,Paris,London,Berlin,Paris""",
741
  lines=8,
742
- max_lines=15
743
  )
744
-
745
  gr.Markdown("""
746
  **Format Requirements**:
747
  - First line: header (will be ignored), leave empty if no header
748
  - Each data line: Question, Correct Answer, Choice1, Choice2, Choice3
749
  - Use commas or tabs as separators
750
  """)
751
-
752
  with gr.Column(scale=1):
753
  # Model selection
754
  with gr.Tabs():
@@ -757,9 +979,9 @@ What is the capital of France?,Paris,London,Berlin,Paris""",
757
  choices=PREDEFINED_MODELS,
758
  value=[PREDEFINED_MODELS[0]],
759
  label="Select from popular models",
760
- interactive=True
761
  )
762
-
763
  with gr.TabItem("➕ Custom Models"):
764
  custom_models_input = gr.Textbox(
765
  label="Custom HuggingFace Model Paths",
@@ -770,7 +992,7 @@ bigscience/bloom-560m""",
770
  lines=5,
771
  info="Add any HuggingFace model path. One model per line.",
772
  )
773
-
774
  gr.Markdown("""
775
  **Examples of valid model paths**:
776
  - `microsoft/DialoGPT-medium`
@@ -778,57 +1000,84 @@ bigscience/bloom-560m""",
778
  - `facebook/opt-350m`
779
  - Your own fine-tuned models!
780
  """)
781
-
782
  # Evaluate button
783
- evaluate_btn = gr.Button(
784
- "⚡ Run Evaluation",
785
- variant="primary",
786
- scale=1
787
- )
788
-
789
  gr.Markdown("""
790
  **⚠️ Note**:
791
  - Larger models require more GPU memory, currently we only run on CPU
792
  - First run will download models (may take time)
793
  - Models are cached for subsequent runs
794
  """)
795
-
796
  # Results section
797
  with gr.Column(visible=True) as results_section:
798
  gr.Markdown("## 📊 Results")
799
-
800
  summary_output = gr.Markdown(
801
- value="Results will appear here...",
802
- label="Performance Summary"
803
  )
804
-
805
  with gr.Row():
806
  accuracy_plot = gr.Plot(label="Accuracy Comparison")
807
  confidence_plot = gr.Plot(label="Confidence Analysis")
808
-
 
 
 
 
 
 
809
  detailed_results = gr.HTML(
810
  value="<p>Detailed results will appear here...</p>",
811
- label="Detailed Question-by-Question Results"
812
  )
813
-
814
  # Event handlers
815
  def update_dataset_from_sample(sample_name):
816
  if sample_name in SAMPLE_DATASETS:
817
  return gr.update(value=SAMPLE_DATASETS[sample_name])
818
  return gr.update()
819
-
820
  sample_selector.change(
821
- fn=update_dataset_from_sample,
822
- inputs=sample_selector,
823
- outputs=dataset_input
824
  )
825
-
826
  evaluate_btn.click(
827
  fn=run_evaluation,
828
  inputs=[dataset_input, predefined_selector, custom_models_input],
829
- outputs=[summary_output, detailed_results, accuracy_plot, confidence_plot, results_section]
 
 
 
 
830
  )
831
-
832
  gr.Markdown("""
833
  ---
834
  ### About Model Evaluation
@@ -852,4 +1101,4 @@ bigscience/bloom-560m""",
852
  """)
853
 
854
  if __name__ == "__main__":
855
- demo.launch()
 
1
+ import gc
2
+ import logging
3
+ import os
4
+ import re
5
+ from collections import Counter
6
+ from typing import Any, Dict, List
7
+
8
  import gradio as gr
9
  import pandas as pd
10
  import plotly.express as px
11
  import plotly.graph_objects as go
 
 
 
12
  import psutil
13
+ import torch
14
+ from transformers import AutoModelForCausalLM, AutoTokenizer
15
 
16
 
17
  def get_memory_usage():
18
  """Return (gpu_mem_used_MB, gpu_mem_total_MB, ram_used_MB, ram_total_MB)"""
19
  # System RAM
20
  vm = psutil.virtual_memory()
21
+ ram_used_mb = vm.used / (1024**2)
22
+ ram_total_mb = vm.total / (1024**2)
23
 
24
  # GPU memory
25
  if torch.cuda.is_available():
26
  gpu_idx = torch.cuda.current_device()
27
  torch.cuda.synchronize()
28
+ gpu_mem_alloc = torch.cuda.memory_allocated(gpu_idx) / (1024**2)
29
+ gpu_mem_reserved = torch.cuda.memory_reserved(gpu_idx) / (1024**2)
30
+ gpu_mem_total = torch.cuda.get_device_properties(gpu_idx).total_memory / (
31
+ 1024**2
32
+ )
33
  gpu_mem_used = max(gpu_mem_alloc, gpu_mem_reserved) # safe estimate
34
  else:
35
  gpu_mem_used = 0
 
44
 
45
  # Model configurations - maps display names to HF model paths
46
  PREDEFINED_MODELS = [
47
+ "meta-llama/Llama-3.2-1B",
48
+ "google/gemma-2-2b",
49
+ "Qwen/Qwen3-0.6B",
50
+ "Qwen/Qwen2.5-0.5B",
51
+ "Qwen/Qwen2.5-1.5B",
52
+ "bigscience/bloom-560m",
53
+ "CohereForAI/aya-expanse-8b",
54
+ "common-pile/comma-v0.1-2t",
55
+ "google/byt5-small",
56
+ "gsaltintas/supertoken_models-llama_gpt2",
57
+ "gsaltintas/supertoken_models-llama_google-gemma-2-2b",
58
  ]
59
  # Global cache for loaded models
60
  model_cache = {}
61
 
62
+
63
  def parse_dataset(text):
64
  """Parse the input dataset text into structured questions"""
65
  if not text.strip():
66
  return [], "Please enter your dataset"
67
+
68
+ lines = text.strip().split("\n")
69
  if len(lines) < 2:
70
  return [], "Dataset must have at least a header and one question"
71
+
72
  # Skip header and detect delimiter
73
  first_data_line = lines[1] if len(lines) > 1 else lines[0]
74
+ delimiter = "\t" if "\t" in first_data_line else ","
75
+
76
  questions = []
77
  errors = []
78
+
79
  for i, line in enumerate(lines[1:], 2): # Start from line 2 (after header)
80
  line = line.strip()
81
  if not line:
82
  continue
83
+
84
  parts = [part.strip().strip('"') for part in line.split(delimiter)]
85
+
86
  if len(parts) < 5:
87
  errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
88
  continue
89
+
90
  question = {
91
+ "question": parts[0],
92
+ "correct_answer": parts[1],
93
+ "choices": [parts[2], parts[3], parts[4]],
94
  }
95
+
96
  # Ensure correct answer is in choices
97
+ if question["correct_answer"] not in question["choices"]:
98
+ question["choices"].append(question["correct_answer"])
99
+
100
  questions.append(question)
101
+
102
+ error_msg = "\n".join(errors) if errors else ""
103
  return questions, error_msg
104
 
105
+
106
  def setup_tokenizer(model_path):
107
  tokenizer_name = model_path
108
  if "supertoken" in model_path:
 
109
  import json
110
+
111
+ from huggingface_hub import hf_hub_download, list_repo_files
112
+
113
  files = list_repo_files(model_path)
114
  if "tokenizer_config.json" in files:
115
+ tokenizer_path = hf_hub_download(
116
+ repo_id=model_path, filename="tokenizer_config.json"
117
+ )
118
  with open(tokenizer_path) as f:
119
  tok_config = json.load(f)["data"]["tokenizer"]
120
  if tok_config["name"] == "huggingface":
121
  tokenizer_name = tok_config["path"]
122
  # todo: tiktoken
123
+ tokenizer = AutoTokenizer.from_pretrained(
124
+ tokenizer_name, trust_remote_code=True, legacy=True
125
+ )
126
  return tokenizer
127
 
128
 
 
135
  logger.info(f"Current GPU memory: {gpu_used:.1f}/{gpu_total:.1f} MB")
136
  logger.info(f"Current RAM: {ram_used:.1f}/{ram_total:.1f} MB")
137
 
138
+ use_cache = (
139
+ not (
140
+ (gpu_total > 0 and gpu_used / gpu_total > 0.8)
141
+ or (ram_used / ram_total > 0.8)
142
+ )
143
+ or model_path in model_cache
144
+ )
145
  if not use_cache:
146
  logger.warning("High memory usage detected — disabling model cache.")
147
 
 
148
  if use_cache and model_path in model_cache:
149
  logger.info(f"Using cached model: {model_path}")
150
  if progress_callback:
151
  progress_callback(1.0, f"✅ Using cached model: {model_path}")
152
  return model_cache[model_path]
153
+
154
  try:
155
  if progress_callback:
156
  progress_callback(0.1, f"🔄 Starting to load model: {model_path}")
157
+
 
158
  # Check if CUDA is available
159
  device = "cuda" if torch.cuda.is_available() else "cpu"
160
  logger.info(f"Loading model: {model_path} using device: {device}")
161
+
162
  if progress_callback:
163
  progress_callback(0.2, f"📥 Loading tokenizer for {model_path}...")
164
+
165
  # Load tokenizer
166
  tokenizer = setup_tokenizer(model_path)
167
+
168
  # Add pad token if missing
169
  if tokenizer.pad_token is None:
170
  tokenizer.pad_token = tokenizer.eos_token
171
+
172
  if progress_callback:
173
+ progress_callback(
174
+ 0.5,
175
+ f"🧠 Loading model weights for {model_path}... (this may take a while)",
176
+ )
177
+
178
  logger.info(os.getcwd())
179
  # Load model with appropriate settings
180
  model = AutoModelForCausalLM.from_pretrained(
181
  model_path,
182
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
183
+ device_map="auto" if device == "cuda" else None,
184
  trust_remote_code=True,
185
+ low_cpu_mem_usage=True,
186
  )
187
+
188
+ model_info = {"tokenizer": tokenizer, "model": model, "device": device}
189
+
 
 
 
 
190
  if use_cache:
191
  model_cache[model_path] = model_info
192
+
193
  if progress_callback:
194
  progress_callback(1.0, f"✅ Successfully loaded model: {model_path}")
195
+
196
  return model_info
197
+
198
  except Exception as e:
199
  import code
200
+
201
  error_msg = f"❌ Error loading model {model_path}: {str(e)}"
202
  logger.error(error_msg)
203
  # code.interact(local=dict(globals(), **locals()))
204
  if progress_callback:
205
  progress_callback(0.0, error_msg)
206
  return None
207
+
208
+
209
  def calculate_choice_likelihood(model, tokenizer, question, choice):
210
  """Calculate the log-likelihood of the choice given the question prompt"""
211
  try:
212
  prompt = f"Question: {question}\nAnswer: "
213
+ prompt = question
214
  full_text = f"{prompt} {choice}"
215
 
216
  # Tokenize full input (prompt + answer)
217
+ input_ids = tokenizer.encode(
218
+ full_text, return_tensors="pt", add_special_tokens=False
219
+ ).to(model.device)
220
+ prompt_ids = tokenizer.encode(
221
+ prompt, return_tensors="pt", add_special_tokens=False
222
+ ).to(model.device)
223
 
224
  if input_ids.size(1) <= prompt_ids.size(1):
225
  logger.warning("Answer tokens are empty after tokenization.")
226
  return float("-inf")
227
+
228
  with torch.no_grad():
229
  outputs = model(input_ids)
230
  logits = outputs.logits
 
232
  # Get logits for the answer tokens only
233
  answer_len = input_ids.size(1) - prompt_ids.size(1)
234
  target_ids = input_ids[:, -answer_len:]
235
+ logits = logits[
236
+ :, prompt_ids.size(1) - 1 : -1, :
237
+ ] # shifted for next-token prediction
238
 
239
  log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
240
  token_log_probs = log_probs.gather(2, target_ids.unsqueeze(-1)).squeeze(-1)
 
247
  return float("-inf")
248
 
249
 
 
250
  def evaluate_model_on_questions(model_path, questions, progress_callback=None):
251
  """Evaluate a single model on all questions using likelihood-based scoring"""
252
+
253
+ model_info = load_model_and_tokenizer(
254
+ model_path, progress_callback=progress_callback
255
+ )
256
+
257
  if model_info is None:
258
+ return [{"error": f"Failed to load model {model_path}"}] * len(questions)
259
+
260
  results = []
261
+ model = model_info["model"]
262
+ tokenizer = model_info["tokenizer"]
263
+
264
  for i, question in enumerate(questions):
265
  try:
266
  # Calculate likelihood for each choice
267
  choice_likelihoods = {}
268
  choice_probs = {}
269
+
270
+ for choice in question["choices"]:
271
+ likelihood = calculate_choice_likelihood(
272
+ model, tokenizer, question["question"], choice
273
+ )
274
  choice_likelihoods[choice] = likelihood
275
+
276
  # Convert log probabilities to probabilities for confidence scoring
277
  max_log_prob = max(choice_likelihoods.values())
278
+ choice_probs = {
279
+ choice: torch.exp(torch.tensor(log_prob - max_log_prob)).item()
280
+ for choice, log_prob in choice_likelihoods.items()
281
+ }
282
+
283
  # Normalize probabilities
284
  total_prob = sum(choice_probs.values())
285
  if total_prob > 0:
286
+ choice_probs = {
287
+ choice: prob / total_prob for choice, prob in choice_probs.items()
288
+ }
289
+
290
  # Select the choice with highest likelihood
291
+ predicted_choice = max(
292
+ choice_likelihoods.keys(), key=lambda x: choice_likelihoods[x]
293
+ )
294
+ is_correct = predicted_choice == question["correct_answer"]
295
+
296
  # Confidence is the probability of the selected choice
297
  confidence = choice_probs.get(predicted_choice, 0.0)
298
+
299
+ results.append(
300
+ {
301
+ "question_idx": i,
302
+ "predicted": predicted_choice,
303
+ "correct": is_correct,
304
+ "confidence": confidence,
305
+ "choice_likelihoods": choice_likelihoods,
306
+ "choice_probabilities": choice_probs,
307
+ "raw_response": f"Likelihoods: {choice_likelihoods}",
308
+ }
309
+ )
310
+
311
  if progress_callback:
312
  # Use remaining 80% for evaluation progress
313
  evaluation_progress = 0.2 + (i + 1) / len(questions) * 0.8
314
+ progress_callback(
315
+ evaluation_progress,
316
+ f"🔍 Evaluating {model_path}: {i + 1}/{len(questions)} questions (likelihood-based)",
317
+ )
318
+
319
  except Exception as e:
320
  logger.error(f"Error evaluating question {i} with {model_path}: {str(e)}")
321
+ results.append(
322
+ {
323
+ "question_idx": i,
324
+ "predicted": question["choices"][0] if question["choices"] else "",
325
+ "correct": False,
326
+ "confidence": 0.0,
327
+ "choice_likelihoods": {},
328
+ "choice_probabilities": {},
329
+ "raw_response": f"Error: {str(e)}",
330
+ }
331
+ )
332
+
333
  return results
334
 
335
+
336
+ def run_evaluation(
337
+ dataset_text, selected_predefined, custom_models_text="", progress=gr.Progress()
338
+ ):
339
  """Main evaluation function"""
340
  if not dataset_text.strip():
341
  return (
 
343
  "<p>No data provided</p>",
344
  None,
345
  None,
346
+ gr.update(visible=True),
347
+ "", # markdown_summary
348
+ "", # csv_summary
349
  )
350
+
351
  # Parse custom models
352
  custom_models = []
353
  if custom_models_text is None:
354
  custom_models_text = ""
355
  if custom_models_text.strip():
356
+ custom_models = [
357
+ model.strip()
358
+ for model in custom_models_text.strip().split("\n")
359
+ if model.strip()
360
+ ]
361
+
362
  # Combine selected models
363
  all_models = []
364
+
365
  # Add predefined models
366
  all_models.extend(selected_predefined)
367
  all_models.extend(custom_models)
368
+
369
  if not all_models:
370
  return (
371
  "Please select at least one model or add custom models",
372
  "<p>No models selected</p>",
373
  None,
374
  None,
375
+ gr.update(visible=False),
376
+ "",
377
+ "",
378
  )
379
+
380
  # Parse dataset
381
  questions, parse_error = parse_dataset(dataset_text)
382
+
383
  if parse_error:
384
  return (
385
  f"Dataset parsing error:\n{parse_error}",
386
  "<p>Failed to parse dataset</p>",
387
  None,
388
  None,
389
+ gr.update(visible=True),
390
+ "",
391
+ "",
392
  )
393
+
394
  if not questions:
395
  return (
396
  "No valid questions found in dataset",
397
  "<p>No questions to evaluate</p>",
398
  None,
399
  None,
400
+ gr.update(visible=True),
401
+ "",
402
+ "",
403
  )
404
+
405
  # Run evaluation
406
  progress(0, "Starting evaluation...")
407
  results = {}
408
  total_steps = len(all_models) * len(questions)
409
  current_step = 0
410
+
411
  summary_md = create_summary_markdown({})
412
  for model_path in all_models:
413
+ display_name = model_path.split("/")[-1] if "/" in model_path else model_path
414
  try:
415
+
416
  def model_progress(p, msg):
417
  nonlocal current_step
418
  current_step = int(p * len(questions))
419
  overall_progress = current_step / total_steps
420
  progress(overall_progress, msg)
421
+
422
+ model_results = evaluate_model_on_questions(
423
+ model_path, questions, model_progress
424
+ )
425
  results[display_name] = model_results
426
+
427
  except Exception as e:
428
  logger.error(f"Failed to evaluate {display_name}: {str(e)}")
429
+ results[display_name] = [{"error": str(e)}] * len(questions)
430
+
431
  # Clean up GPU memory
432
  if torch.cuda.is_available():
433
  torch.cuda.empty_cache()
434
  gc.collect()
435
+
436
  # Generate outputs
437
  summary_stats = generate_summary_stats(questions, results)
438
  summary_md = create_summary_markdown(summary_stats)
439
  detailed_html = create_detailed_results_html(questions, results)
440
  accuracy_chart = create_accuracy_chart(summary_stats)
441
  confidence_chart = create_confidence_chart(results)
442
+
443
+ # Generate compact summaries
444
+ markdown_summary = generate_compact_summary_markdown(
445
+ questions, results, summary_stats
446
+ )
447
+ csv_summary = generate_csv_summary(questions, results, summary_stats)
448
+
449
  return (
450
  summary_md,
451
  detailed_html,
452
  accuracy_chart,
453
  confidence_chart,
454
+ gr.update(visible=True),
455
+ markdown_summary,
456
+ csv_summary,
457
  )
458
 
459
+
460
  def generate_summary_stats(questions, results):
461
  """Generate summary statistics for all models"""
462
  summary = {}
463
+
464
  for model, model_results in results.items():
465
+ if not model_results or "error" in model_results[0]:
466
  summary[model] = {
467
+ "accuracy": 0.0,
468
+ "correct": 0,
469
+ "total": len(questions),
470
+ "avg_confidence": 0.0,
471
+ "error": model_results[0].get("error", "Unknown error")
472
+ if model_results
473
+ else "No results",
474
  }
475
  continue
476
+
477
+ correct_count = sum(1 for r in model_results if r.get("correct", False))
478
  total_count = len(model_results)
479
  accuracy = correct_count / total_count if total_count > 0 else 0
480
+
481
  # Calculate average confidence
482
+ avg_confidence = (
483
+ sum(r.get("confidence", 0) for r in model_results) / total_count
484
+ if total_count > 0
485
+ else 0
486
+ )
487
+
488
  summary[model] = {
489
+ "accuracy": accuracy,
490
+ "correct": correct_count,
491
+ "total": total_count,
492
+ "avg_confidence": avg_confidence,
493
  }
494
+
495
  return summary
496
 
497
+
498
  def create_summary_markdown(summary_stats):
499
  """Create markdown summary of results"""
500
  if not summary_stats:
501
  return "No results available"
502
+
503
  # Sort by accuracy
504
+ sorted_models = sorted(
505
+ summary_stats.items(), key=lambda x: x[1]["accuracy"], reverse=True
506
+ )
507
+
508
  lines = ["## 🏆 Model Performance Summary\n"]
509
+
510
  for i, (model, stats) in enumerate(sorted_models):
511
+ if "error" in stats:
512
  lines.append(f"❌ **{model}**: Error - {stats['error']}")
513
  continue
514
+
515
+ accuracy_pct = stats["accuracy"] * 100
516
+ medal = "🥇" if i == 0 else "🥈" if i == 1 else "🥉" if i == 2 else f"{i + 1}."
517
+
518
  lines.append(
519
  f"{medal} **{model}**: {accuracy_pct:.1f}% "
520
  f"({stats['correct']}/{stats['total']} correct, "
521
  f"avg confidence: {stats['avg_confidence']:.2f})"
522
  )
523
+
524
  return "\n".join(lines)
525
 
526
+
527
  def create_detailed_results_html(questions, results):
528
  """Create detailed HTML results for each question"""
529
  if not questions or not results:
530
  return "<p>No detailed results available</p>"
531
+
532
+ html_parts = [
533
+ """
534
  <style>
535
  .question-card {
536
  background: white;
 
617
  font-family: monospace;
618
  }
619
  </style>
620
+ """
621
+ ]
622
+
623
  for q_idx, question in enumerate(questions):
624
  html_parts.append(f"""
625
  <div class="question-card">
626
  <div class="question-header">
627
  <span class="question-number">Q{q_idx + 1}</span>
628
  </div>
629
+ <div class="question-text">{question["question"]}</div>
630
  <div class="choices">
631
  <strong>Choices:</strong><br>
632
+ {" | ".join(f"{chr(65 + i)}) {choice}" for i, choice in enumerate(question["choices"]))}
633
  </div>
634
  <div class="correct-answer">
635
+ <strong>✓ Correct Answer:</strong> {question["correct_answer"]}
636
  </div>
637
  <div class="model-results">
638
  """)
639
+
640
  # Add results for each model
641
  for model, model_results in results.items():
642
  if q_idx < len(model_results):
643
  result = model_results[q_idx]
644
+
645
+ if "error" in result:
646
  html_parts.append(f"""
647
  <div class="model-result result-error">
648
  <div>⚠️ {model}</div>
649
  <div style="font-size: 12px; margin-top: 4px;">
650
  Error occurred
651
  </div>
652
+ <div class="raw-response">{result.get("raw_response", "Unknown error")}</div>
653
  </div>
654
  """)
655
  else:
656
+ result_class = (
657
+ "result-correct"
658
+ if result.get("correct", False)
659
+ else "result-incorrect"
660
+ )
661
+ icon = "✅" if result.get("correct", False) else "❌"
662
+
663
  html_parts.append(f"""
664
  <div class="model-result {result_class}">
665
  <div>{icon} {model}</div>
666
  <div style="font-size: 12px; margin-top: 4px;">
667
+ "{result.get("predicted", "No prediction")}"
668
  </div>
669
+ <div class="raw-response">Raw: "{result.get("raw_response", "")}"</div>
670
  </div>
671
  """)
672
+
673
  html_parts.append("""
674
  </div>
675
  </div>
676
  """)
677
+
678
  return "".join(html_parts)
679
 
680
+
681
  def create_accuracy_chart(summary_stats):
682
  """Create accuracy comparison chart"""
683
  if not summary_stats:
684
  return None
685
+
686
  models = []
687
  accuracies = []
688
+
689
  for model, stats in summary_stats.items():
690
+ if "error" not in stats:
691
  models.append(model)
692
+ accuracies.append(stats["accuracy"] * 100)
693
+
694
  if not models:
695
  return None
696
+
697
+ fig = go.Figure(
698
+ data=[
699
+ go.Bar(
700
+ x=models,
701
+ y=accuracies,
702
+ marker_color="lightblue",
703
+ text=[f"{acc:.1f}%" for acc in accuracies],
704
+ textposition="auto",
705
+ )
706
+ ]
707
+ )
708
+
709
  fig.update_layout(
710
  title="Model Accuracy Comparison",
711
  xaxis_title="Models",
712
  yaxis_title="Accuracy (%)",
713
  template="plotly_white",
714
+ showlegend=False,
715
  )
716
+
717
  return fig
718
 
719
+
720
  def create_confidence_chart(results):
721
  """Create confidence distribution chart"""
722
  if not results:
723
  return None
724
+
725
  data = []
726
  for model, model_results in results.items():
727
  for result in model_results:
728
+ if "error" not in result and "confidence" in result:
729
+ data.append(
730
+ {
731
+ "Model": model,
732
+ "Confidence": result["confidence"],
733
+ "Correct": "Correct"
734
+ if result.get("correct", False)
735
+ else "Incorrect",
736
+ }
737
+ )
738
+
739
  if not data:
740
  return None
741
+
742
  df = pd.DataFrame(data)
743
+
744
  fig = px.box(
745
  df,
746
+ x="Model",
747
+ y="Confidence",
748
+ color="Correct",
749
  title="Confidence Distribution by Model and Correctness",
750
+ template="plotly_white",
751
  )
752
+
753
  return fig
754
 
755
+
756
+ def generate_compact_summary_markdown(questions, results, summary_stats):
757
+ """Generate a compact markdown summary table for copy-pasting"""
758
+ logger.info("compaaact summary")
759
+ if not summary_stats or not questions or not results:
760
+ return "No data available for summary"
761
+
762
+ lines = ["# Model Performance Summary\n"]
763
+
764
+ # Accuracy Summary Table
765
+ lines.append("## 📊 Accuracy Summary\n")
766
+ lines.append("| Rank | Model | Accuracy | Correct | Total | Avg Confidence |")
767
+ lines.append("|------|-------|----------|---------|-------|----------------|")
768
+
769
+ # Sort by accuracy
770
+ sorted_models = sorted(
771
+ summary_stats.items(), key=lambda x: x[1].get("accuracy", 0), reverse=True
772
+ )
773
+
774
+ for i, (model, stats) in enumerate(sorted_models):
775
+ if "error" in stats:
776
+ lines.append(f"| {i + 1} | {model} | ERROR | - | - | - |")
777
+ else:
778
+ accuracy_pct = stats["accuracy"] * 100
779
+ lines.append(
780
+ f"| {i + 1} | {model} | {accuracy_pct:.1f}% | {stats['correct']} | {stats['total']} | {stats['avg_confidence']:.3f} |"
781
+ )
782
+
783
+ lines.append("\n")
784
+
785
+ # Detailed Results Table
786
+ lines.append("## 📋 Detailed Question Results\n")
787
+
788
+ # Get all model names for header
789
+ model_names = list(results.keys())
790
+ header = "| Q# | Question | Correct Answer |" + "".join(
791
+ [f" {model} |" for model in model_names]
792
+ )
793
+ separator = "|" + "|".join(
794
+ ["-" * (len(col.strip()) + 2) for col in header.split("|")[1:]]
795
+ )
796
+
797
+ lines.append(header)
798
+ lines.append(separator)
799
+
800
+ for q_idx, question in enumerate(questions):
801
+ # Truncate long questions for table readability
802
+ question_text = question["question"]
803
+ if len(question_text) > 50:
804
+ question_text = question_text[:47] + "..."
805
+
806
+ row = f"| {q_idx + 1} | {question_text} | {question['correct_answer']} |"
807
+
808
+ for model in model_names:
809
+ if q_idx < len(results[model]) and "error" not in results[model][q_idx]:
810
+ result = results[model][q_idx]
811
+ predicted = result.get("predicted", "N/A")
812
+ is_correct = result.get("correct", False)
813
+ confidence = result.get("confidence", 0)
814
+
815
+ # Add emoji for visual feedback
816
+ status_emoji = "✅" if is_correct else "❌"
817
+ row += f" {status_emoji} {predicted} ({confidence:.2f}) |"
818
+ else:
819
+ row += " ⚠️ ERROR |"
820
+
821
+ lines.append(row)
822
+
823
+ lines.append("\n")
824
+
825
+ # Legend
826
+ lines.append("### Legend")
827
+ lines.append("- ✅ = Correct answer")
828
+ lines.append("- ❌ = Incorrect answer")
829
+ lines.append("- ⚠️ = Error occurred")
830
+ lines.append("- Numbers in parentheses = Confidence score")
831
+ logger.info("\n".join(lines))
832
+ return "\n".join(lines)
833
+
834
+
835
+ def generate_csv_summary(questions, results, summary_stats):
836
+ """Generate CSV format summary"""
837
+ # TODO: add CSV file download if necessary
838
+ if not summary_stats or not questions or not results:
839
+ return "No data available"
840
+
841
+ lines = []
842
+
843
+ # Accuracy summary header
844
+ lines.append("# ACCURACY SUMMARY")
845
+ lines.append("Rank,Model,Accuracy_Percent,Correct,Total,Avg_Confidence")
846
+
847
+ sorted_models = sorted(
848
+ summary_stats.items(), key=lambda x: x[1].get("accuracy", 0), reverse=True
849
+ )
850
+ for i, (model, stats) in enumerate(sorted_models):
851
+ if "error" in stats:
852
+ lines.append(f"{i + 1},{model},ERROR,-,-,-")
853
+ else:
854
+ accuracy_pct = stats["accuracy"] * 100
855
+ lines.append(
856
+ f"{i + 1},{model},{accuracy_pct:.1f},{stats['correct']},{stats['total']},{stats['avg_confidence']:.3f}"
857
+ )
858
+
859
+ lines.append("")
860
+ lines.append("# DETAILED RESULTS")
861
+
862
+ # Header for detailed results
863
+ model_names = list(results.keys())
864
+ header = "Question_ID,Question,Correct_Answer," + ",".join(
865
+ [
866
+ f"{model}_Predicted,{model}_Correct,{model}_Confidence"
867
+ for model in model_names
868
+ ]
869
+ )
870
+ lines.append(header)
871
+
872
+ # Detailed results
873
+ for q_idx, question in enumerate(questions):
874
+ row = f'{q_idx + 1},"{question["question"]}",{question["correct_answer"]}'
875
+
876
+ for model in model_names:
877
+ if q_idx < len(results[model]) and "error" not in results[model][q_idx]:
878
+ result = results[model][q_idx]
879
+ predicted = result.get("predicted", "N/A")
880
+ is_correct = str(result.get("correct", False))
881
+ confidence = result.get("confidence", 0)
882
+ row += f",{predicted},{is_correct},{confidence:.3f}"
883
+ else:
884
+ row += ",ERROR,FALSE,0"
885
+
886
+ lines.append(row)
887
+
888
+ return "\n".join(lines)
889
+
890
+
891
  # Sample datasets for quick testing
892
  SAMPLE_DATASETS = {
893
  "Custom (enter below)": "",
 
900
  What is 5*3?,15,12,16,18
901
  What is 10-7?,3,7,4,2
902
  What is 8/2?,4,3,2,5""",
 
903
  "World Capitals": """Question,Correct Answer,Choice1,Choice2,Choice3
904
  What is the capital of France?,Paris,London,Berlin,Rome
905
  What is the capital of Japan?,Tokyo,Seoul,Beijing,Bangkok
906
  What is the capital of Brazil?,Brasília,Rio de Janeiro,São Paulo,Salvador
907
  What is the capital of Australia?,Canberra,Sydney,Melbourne,Perth""",
 
908
  "Science Quiz": """Question,Correct Answer,Choice1,Choice2,Choice3
909
  What is the chemical symbol for gold?,Au,Ag,Ca,K
910
  Which planet is closest to the Sun?,Mercury,Venus,Earth,Mars
911
  What is the speed of light?,299792458 m/s,300000000 m/s,2992458 m/s,299000000 m/s
912
+ What gas do plants absorb from the atmosphere?,Carbon dioxide,Oxygen,Nitrogen,Hydrogen""",
913
  }
914
 
915
  # Custom CSS
 
924
  """
925
 
926
  # Create Gradio interface
927
+ with gr.Blocks(
928
+ title="🤖 Model Performance Comparison", theme=gr.themes.Soft(), css=css
929
+ ) as demo:
930
  gr.Markdown("""
931
  # 🤖 Model Performance Comparison Tool
932
 
 
940
  - Detailed question-by-question results
941
  - Performance charts and statistics
942
  """)
943
+
944
  with gr.Row():
945
  with gr.Column(scale=2):
946
  # Sample dataset selector
 
948
  choices=list(SAMPLE_DATASETS.keys()),
949
  value="Custom (enter below)",
950
  label="Choose sample dataset or enter your own",
951
+ interactive=True,
952
  )
953
+
954
  # Dataset input
955
  dataset_input = gr.Textbox(
956
  label="Dataset (CSV/TSV format)",
 
961
  What is 2+2?,4,3,2,5
962
  What is the capital of France?,Paris,London,Berlin,Paris""",
963
  lines=8,
964
+ max_lines=15,
965
  )
966
+
967
  gr.Markdown("""
968
  **Format Requirements**:
969
  - First line: header (will be ignored), leave empty if no header
970
  - Each data line: Question, Correct Answer, Choice1, Choice2, Choice3
971
  - Use commas or tabs as separators
972
  """)
973
+
974
  with gr.Column(scale=1):
975
  # Model selection
976
  with gr.Tabs():
 
979
  choices=PREDEFINED_MODELS,
980
  value=[PREDEFINED_MODELS[0]],
981
  label="Select from popular models",
982
+ interactive=True,
983
  )
984
+
985
  with gr.TabItem("➕ Custom Models"):
986
  custom_models_input = gr.Textbox(
987
  label="Custom HuggingFace Model Paths",
 
992
  lines=5,
993
  info="Add any HuggingFace model path. One model per line.",
994
  )
995
+
996
  gr.Markdown("""
997
  **Examples of valid model paths**:
998
  - `microsoft/DialoGPT-medium`
 
1000
  - `facebook/opt-350m`
1001
  - Your own fine-tuned models!
1002
  """)
1003
+
1004
  # Evaluate button
1005
+ evaluate_btn = gr.Button("⚡ Run Evaluation", variant="primary", scale=1)
1006
+
 
 
 
 
1007
  gr.Markdown("""
1008
  **⚠️ Note**:
1009
  - Larger models require more GPU memory, currently we only run on CPU
1010
  - First run will download models (may take time)
1011
  - Models are cached for subsequent runs
1012
  """)
1013
+
1014
  # Results section
1015
  with gr.Column(visible=True) as results_section:
1016
  gr.Markdown("## 📊 Results")
1017
+
1018
  summary_output = gr.Markdown(
1019
+ value="Results will appear here...", label="Performance Summary"
 
1020
  )
1021
+
1022
  with gr.Row():
1023
  accuracy_plot = gr.Plot(label="Accuracy Comparison")
1024
  confidence_plot = gr.Plot(label="Confidence Analysis")
1025
+
1026
+ # NEW: Export Section
1027
+ gr.Markdown("## 📥 Export Results")
1028
+
1029
+ with gr.Row():
1030
+ with gr.Column():
1031
+ gr.Markdown("### 📋 Markdown Table Format")
1032
+ markdown_summary_output = gr.Textbox(
1033
+ label="Markdown Summary (Copy & Paste Ready)",
1034
+ lines=15,
1035
+ max_lines=25,
1036
+ show_copy_button=True,
1037
+ interactive=False,
1038
+ value="",
1039
+ )
1040
+
1041
+ with gr.Column():
1042
+ gr.Markdown("### 📊 CSV Format")
1043
+ csv_summary_output = gr.Textbox(
1044
+ label="CSV Summary (Copy & Paste Ready)",
1045
+ lines=15,
1046
+ max_lines=25,
1047
+ show_copy_button=True,
1048
+ interactive=False,
1049
+ value="",
1050
+ )
1051
+
1052
  detailed_results = gr.HTML(
1053
  value="<p>Detailed results will appear here...</p>",
1054
+ label="Detailed Question-by-Question Results",
1055
  )
1056
+
1057
  # Event handlers
1058
  def update_dataset_from_sample(sample_name):
1059
  if sample_name in SAMPLE_DATASETS:
1060
  return gr.update(value=SAMPLE_DATASETS[sample_name])
1061
  return gr.update()
1062
+
1063
  sample_selector.change(
1064
+ fn=update_dataset_from_sample, inputs=sample_selector, outputs=dataset_input
 
 
1065
  )
1066
+
1067
  evaluate_btn.click(
1068
  fn=run_evaluation,
1069
  inputs=[dataset_input, predefined_selector, custom_models_input],
1070
+ outputs=[
1071
+ summary_output,
1072
+ detailed_results,
1073
+ accuracy_plot,
1074
+ confidence_plot,
1075
+ results_section,
1076
+ markdown_summary_output,
1077
+ csv_summary_output,
1078
+ ],
1079
  )
1080
+
1081
  gr.Markdown("""
1082
  ---
1083
  ### About Model Evaluation
 
1101
  """)
1102
 
1103
  if __name__ == "__main__":
1104
+ demo.launch()
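For reference, the strings shown in the two new export textboxes follow the header rows hard-coded in `generate_compact_summary_markdown` and `generate_csv_summary` above; an abbreviated, illustrative preview (placeholder model name and numbers, not real results) looks like this:

```text
## 📊 Accuracy Summary

| Rank | Model | Accuracy | Correct | Total | Avg Confidence |
|------|-------|----------|---------|-------|----------------|
| 1 | Llama-3.2-1B | 75.0% | 3 | 4 | 0.812 |

# ACCURACY SUMMARY
Rank,Model,Accuracy_Percent,Correct,Total,Avg_Confidence
1,Llama-3.2-1B,75.0,3,4,0.812
```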