CoderBak committed on
Commit
6d0cdbd
·
verified ·
1 Parent(s): fde17c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +413 -365
app.py CHANGED
@@ -11,6 +11,8 @@ import sqlite3
11
  import math
12
  import time
13
  from huggingface_hub import hf_hub_download
 
 
14
 
15
  # 翻译表
16
  SUBJECT_TRANS = {
@@ -30,8 +32,8 @@ MODEL_TRANS = {
30
  "still-3-1.5b-preview": "STILL-3-1.5B-Preview",
31
  "deepseek-r1-distill-qwen-32b": "DeepSeek-R1-Distill-Qwen-32B",
32
  "light-r1-7b-ds": "Light-R1-7B-DS",
33
- #"openmath-nemotron-32b": "OpenMath-Nemotron-32B",
34
- #"qwen3-235b-a22b": "Qwen3-235B-A22B",
35
  "skywork-or1-32b-preview": "Skywork-OR1-32B-Preview",
36
  "deepscaler-1.5b-preview": "DeepScaler-1.5B-Preview",
37
  "deepseek-r1-distill-qwen-7b": "DeepSeek-R1-Distill-Qwen-7B",
@@ -44,6 +46,11 @@ MODEL_TRANS = {
44
  "skywork-or1-math-7b": "Skywork-OR1-Math-7B",
45
  "skywork-or1-7b-preview": "Skywork-OR1-7B-Preview",
46
  "qwen3-30b-a3b": "Qwen3-30B-A3B",
 
 
 
 
 
47
  # 添加更多模型映射
48
  }
49
 
@@ -72,7 +79,6 @@ class ModelDatabase:
72
  self.conn.execute("PRAGMA temp_store = MEMORY") # 临时表存储在内存中
73
  self.conn.execute("PRAGMA mmap_size = 8589934592") # 尝试使用8GB内存映射
74
  self.conn.row_factory = sqlite3.Row
75
- print("Database connection established with optimized parameters")
76
 
77
  # 创建索引以加速查询
78
  self._ensure_indices()
@@ -96,7 +102,7 @@ class ModelDatabase:
96
  cursor.execute("CREATE INDEX IF NOT EXISTS idx_problems_unique_id ON problems(unique_id)")
97
  cursor.execute("ANALYZE") # 分析表以优化查询计划
98
  except Exception as e:
99
- print(f"Error creating indices: {e}")
100
 
101
  def get_available_models(self):
102
  """Get list of all available models"""
@@ -110,8 +116,7 @@ class ModelDatabase:
110
  models = [row['model_name'] for row in cursor.fetchall()]
111
  self._models_cache = models # 存储到实例缓存
112
  return models
113
- except sqlite3.OperationalError as e:
114
- print(f"Error in get_available_models: {e}")
115
  return []
116
 
117
  def get_available_datasets(self):
@@ -126,8 +131,7 @@ class ModelDatabase:
126
  datasets = [row['dataset'].upper() for row in cursor.fetchall()]
127
  self._datasets_cache = datasets # 存储到实例缓存
128
  return datasets
129
- except sqlite3.OperationalError as e:
130
- print(f"Error in get_available_datasets: {e}")
131
  return DATASETS
132
 
133
  def get_model_statistics(self, model_name, dataset):
@@ -173,8 +177,7 @@ class ModelDatabase:
173
 
174
  self._cache[cache_key] = stats_data
175
  return stats_data
176
- except sqlite3.OperationalError as e:
177
- print(f"Database error in get_model_statistics: {e}")
178
  return [["Database Error", "No data available"]]
179
 
180
  def get_all_model_accuracies(self, dataset):
@@ -194,8 +197,7 @@ class ModelDatabase:
194
  results = [(row['model_name'], row['accuracy']) for row in cursor.fetchall()]
195
  self._cache[cache_key] = results
196
  return results
197
- except sqlite3.OperationalError as e:
198
- print(f"Error in get_all_model_accuracies: {e}")
199
  return []
200
 
201
  def get_problems_by_model_dataset(self, model_name, dataset):
@@ -222,8 +224,7 @@ class ModelDatabase:
222
  sorted_results = sorted(results, key=lambda x: int(re.search(r'\d+', x[0]).group(0)) if re.search(r'\d+', x[0]) else 0)
223
  self._cache[cache_key] = sorted_results
224
  return sorted_results
225
- except sqlite3.OperationalError as e:
226
- print(f"Database error in get_problems_by_model_dataset: {e}")
227
  return []
228
 
229
  def get_problem_data(self, model_name, dataset, problem_id):
@@ -248,8 +249,7 @@ class ModelDatabase:
248
  # 转为字典存储,避免SQLite连接依赖
249
  self._problem_cache[problem_cache_key] = dict(problem)
250
  problem = self._problem_cache[problem_cache_key]
251
- except Exception as e:
252
- print(f"Error fetching problem data: {e}")
253
  return None, None
254
 
255
  if not problem:
@@ -279,8 +279,7 @@ class ModelDatabase:
279
  responses = [dict(r) for r in responses]
280
  self._response_cache[resp_cache_key] = responses
281
  return problem, responses
282
- except Exception as e:
283
- print(f"Error fetching responses: {e}")
284
  return problem, None
285
  else:
286
  # 获取所有模型对此问题的响应
@@ -305,8 +304,7 @@ class ModelDatabase:
305
  responses = [dict(r) for r in responses]
306
  self._response_cache[resp_cache_key] = responses
307
  return problem, responses
308
- except Exception as e:
309
- print(f"Error fetching all responses: {e}")
310
  return problem, None
311
 
312
  def get_model_responses(self, selected_models, dataset, problem_id):
@@ -341,36 +339,46 @@ class ModelDatabase:
341
  """清除指定部分或全部缓存"""
342
  if section == 'main' or section is None:
343
  self._cache = {}
344
- print("Cleared main cache")
345
  if section == 'problem' or section is None:
346
  self._problem_cache = {}
347
- print("Cleared problem cache")
348
  if section == 'response' or section is None:
349
  self._response_cache = {}
350
- print("Cleared response cache")
351
  if section == 'models' or section is None:
352
  if hasattr(self, '_models_cache'):
353
  self._models_cache = None
354
  if hasattr(self, '_datasets_cache'):
355
  self._datasets_cache = None
356
- print("Cleared metadata cache")
357
 
358
  def close(self):
359
  """关闭数据库连接并释放资源"""
360
  if hasattr(self, 'conn') and self.conn:
361
  try:
362
  self.conn.close()
363
- print("Database connection closed")
364
- except Exception as e:
365
- print(f"Error closing database: {e}")
366
 
367
  # 清理所有缓存
368
  self.clear_cache()
369
 
370
  def format_latex(text):
371
  if text is None: return ""
372
- text = text.replace('\\', '\\\\') # Escape backslashes for JS/HTML
 
 
373
  text = text.replace('\n', '<br>')
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  return text
375
 
376
  def get_gradient_color(accuracy, color_map='RdYlGn'):
@@ -378,7 +386,7 @@ def get_gradient_color(accuracy, color_map='RdYlGn'):
378
  return "#505050" # Default for missing or invalid accuracy
379
  try:
380
  # 使用更深的颜色映射
381
- cmap = plt.cm.get_cmap(color_map)
382
  rgba = cmap(float(accuracy))
383
 
384
  # 确保颜色足够深以与白色文本形成对比
@@ -391,8 +399,7 @@ def get_gradient_color(accuracy, color_map='RdYlGn'):
391
  # 转回十六进制
392
  hex_color = mpl.colors.rgb2hex((r, g, b, a))
393
  return hex_color
394
- except Exception as e:
395
- print(f"Error getting gradient color: {e}")
396
  return "#505050"
397
 
398
  def get_contrasting_text_color(bg_color):
@@ -425,135 +432,172 @@ def get_contrasting_text_color(bg_color):
425
  # 其他颜色根据亮度决定
426
  return "#000" if yiq > 160 else "#fff"
427
 
428
- def format_sample(sample, show_correctness=True):
 
429
  if sample is None: return ""
430
  sample_dict = dict(sample) if hasattr(sample, 'keys') else sample if isinstance(sample, dict) else {}
431
- if not sample_dict: return "<div>No sample data</div>"
432
 
433
- formatted_response = format_latex(sample_dict.get('response', ''))
434
  extracted = sample_dict.get('extracted', '')
435
- formatted_extracted = format_latex(extracted) if extracted else ""
436
  correctness = sample_dict.get('correctness', 0)
437
  correctness_label = "✓ Correct" if correctness else "✗ Incorrect"
438
  correctness_color = "var(--color-green)" if correctness else "var(--color-red)"
439
- html = f"<div style='font-size: 0.85em; padding: 10px; border-radius: 8px;' class='dark-mode-compatible dark-mode-bg-secondary'>"
440
 
441
- # 将correctness和extracted放在同一行
 
 
 
 
 
 
 
442
  if show_correctness:
443
- html += f"<div style='display: flex; align-items: center; margin-bottom: 5px;'>"
 
444
  html += f"<span style='color: {correctness_color}; font-weight: bold; margin-right: 10px;'>{correctness_label}</span>"
445
- if formatted_extracted:
446
- html += f"<span style='background-color: rgba(0,0,0,0.05); padding: 2px 5px; border-radius: 3px;'><b>Extracted:</b> {formatted_extracted}</span>"
 
 
 
 
 
 
 
 
 
 
 
447
  html += f"</div>"
448
 
449
- html += f"<div style='white-space: pre-wrap;' class='math-content'>{formatted_response}</div>"
450
  html += "</div>"
451
  return html
452
 
453
- def create_comparison_chart(df, sort_by='avg_accuracy', ascending=False):
454
- if df.empty: return None, "No data available for chart"
455
- if sort_by not in df.columns and not df.index.name == sort_by:
456
- # Try to find a valid sort column if avg_accuracy is not present (e.g. single model comparison)
457
- potential_sort_cols = [col for col in df.columns if 'accuracy' in col]
458
- if potential_sort_cols:
459
- sort_by = potential_sort_cols[0] # Default to first accuracy column
460
- else: # Fallback to index if no accuracy column
461
- sort_by = df.index.name if df.index.name else 'unique_id'
462
- if sort_by not in df.index.name and sort_by not in df.columns : # df.index.name might be None
463
- return None, f"Sort column '{sort_by}' not found in DataFrame."
464
 
465
- try:
466
- if sort_by in df.columns:
467
- df_sorted = df.sort_values(by=sort_by, ascending=ascending)
468
- elif sort_by == df.index.name:
469
- df_sorted = df.sort_index(ascending=ascending) # Sorting by index
470
- else:
471
- df_sorted = df # No sort if column not found and not index
472
- except KeyError:
473
- return None, f"Sort column '{sort_by}' not found."
474
-
475
- df_sorted['problem_id_display'] = df_sorted.index.to_series().apply(lambda x: re.search(r'\d+', str(x)).group(0) if re.search(r'\d+', str(x)) else str(x))
476
- accuracy_cols_to_plot = [col for col in df_sorted.columns if col.endswith('_accuracy')]
477
- if not accuracy_cols_to_plot:
478
- return None, "No accuracy columns to plot."
479
-
480
- fig, ax = plt.subplots(figsize=(10, max(6, len(df_sorted) * 0.25)))
481
- cmap = plt.cm.get_cmap('RdYlGn')
482
- num_models = len(accuracy_cols_to_plot)
483
- bar_height = 0.8 / num_models if num_models > 0 else 0.8
484
- y_pos_base = np.arange(len(df_sorted))
485
-
486
- for i, col_name in enumerate(accuracy_cols_to_plot):
487
- model_label = col_name.replace('_accuracy', '')
488
- accuracies = df_sorted[col_name].fillna(0) # Fill NaN for plotting
489
- # Offset y_pos for grouped bars
490
- y_pos = y_pos_base - (bar_height * num_models / 2) + (i * bar_height) + (bar_height / 2)
491
-
492
- bars = ax.barh(y_pos, accuracies, height=bar_height, label=model_label, alpha=0.8)
493
- for bar_idx, bar_val in enumerate(accuracies):
494
- bar.set_color(cmap(bar_val)) # Color individual bars
495
- if bar_val > 0:
496
- ax.text(max(0.01, bar_val + 0.01), y_pos[bar_idx], f'{bar_val:.0%}', va='center', ha='left', fontsize=8)
497
 
498
- ax.set_yticks(y_pos_base)
499
- ax.set_yticklabels(df_sorted['problem_id_display'])
500
- ax.set_xlim(0, 1.1)
501
- ax.set_xlabel('Accuracy')
502
- ax.set_ylabel('Problem ID')
503
- ax.legend(loc='lower right', fontsize='small')
504
- ax.grid(True, axis='x', linestyle='--', alpha=0.6)
505
- plt.tight_layout()
506
- chart_path = "comparison_chart.png"
507
- plt.savefig(chart_path, dpi=120, bbox_inches='tight')
508
- plt.close(fig)
509
- return chart_path, df_sorted.index.tolist()
510
 
511
- def create_problem_grid_html(problems, mode='default'):
512
- """Create HTML for problem grid buttons. The JS function will be defined globally."""
513
- if not problems:
514
- return "<div>No problems found for this model/dataset. Please select a model and dataset.</div>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
 
516
- html_buttons = ""
 
 
 
 
 
 
 
 
 
 
 
 
517
  try:
518
- sorted_problems = sorted(
519
- [(str(p[0]), float(p[1]) if p[1] is not None else 0.0, p[2]) for p in problems],
520
- key=lambda x: int(re.search(r'\d+', x[0]).group(0)) if re.search(r'\d+', x[0]) else 0
521
- )
522
  except Exception as e:
523
- print(f"Error sorting problems: {e}. Problems: {problems[:5]}")
524
- return f"<div>Error displaying problems. Check logs. {e}</div>"
525
 
526
- for pid, accuracy, _ in sorted_problems:
527
- match = re.search(r'\d+', pid)
528
- num_display = match.group(0) if match else pid
529
- acc_pct = int(accuracy * 100)
 
 
 
 
 
530
 
531
- # 获取背景颜色
532
- bg_color = get_gradient_color(accuracy)
533
- # 统一使用白色文本,添加!important确保不被覆盖
534
- text_color = "#ffffff"
535
-
536
- html_buttons += f"""
537
- <div
538
- data-problem-id=\"{pid}\"
539
- class=\"problem-btn\"
540
- title=\"ID: {pid} - Acc: {acc_pct}%\"
541
- style='background-color: {bg_color}; color: {text_color} !important;
542
- border-radius: 4px; padding: 5px; text-align: center; font-size: 0.7em;
543
- min-height: 36px; user-select: none; width: 100%;
544
- display: flex; flex-direction: column; justify-content: center;
545
- overflow: hidden; text-overflow: ellipsis; white-space: nowrap;'>
546
- <div style="font-weight: bold; color: {text_color} !important;">{num_display}</div>
547
- <div style="color: {text_color} !important;">{acc_pct}%</div>
548
- </div>
549
- """
550
 
551
- # 添加自定义样式强制文本颜色为白色
552
- custom_style = "<style>.problem-btn, .problem-btn div { color: white !important; }</style>"
553
- # 根据模式设置每行显示的列数
554
- grid_cols = 20 if mode == 'comparison' else 10
555
- grid_html = f"{custom_style}<div style='display: grid; grid-template-columns: repeat({grid_cols}, 1fr); gap: 4px;'>{html_buttons}</div>"
556
- return grid_html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
557
 
558
  def handle_problem_select(problem_id_from_js, current_model_state, current_dataset_state, mode='default'):
559
  global db
@@ -562,9 +606,6 @@ def handle_problem_select(problem_id_from_js, current_model_state, current_datas
562
  dataset_name = current_dataset_state.value if hasattr(current_dataset_state, 'value') else current_dataset_state
563
  problem_id = problem_id_from_js.value if hasattr(problem_id_from_js, 'value') else problem_id_from_js
564
 
565
- print(f"**************** PYTHON HANDLER CALLED ***************")
566
- print(f"[Backend] handle_problem_select: problem_id='{problem_id}', length={len(str(problem_id)) if problem_id else 0}")
567
-
568
  # 处理纯数字输入,构建完整unique_id
569
  if problem_id and problem_id.isdigit():
570
  # 构建格式:OlymMATH-HARD-0-EN 或类似格式
@@ -574,26 +615,9 @@ def handle_problem_select(problem_id_from_js, current_model_state, current_datas
574
  language, difficulty = parts
575
  # 构建完整ID
576
  problem_id = f"OlymMATH-{difficulty}-{problem_id}-{language}"
577
- print(f"[Backend] Constructed full problem_id from number: '{problem_id}'")
578
-
579
- print(f"[Backend] Raw problem_id received (exact characters):")
580
- if problem_id:
581
- print(f"'{problem_id}'")
582
- # Print character by character for debugging
583
- print("Character by character:")
584
- for i, c in enumerate(str(problem_id)):
585
- print(f" Position {i}: '{c}' (ord={ord(c)})")
586
-
587
- print(f"[Backend] model='{model_name}', dataset='{dataset_name}'")
588
-
589
- # Debugging - dump more information about the objects
590
- print(f"[Debug] problem_id_from_js type: {type(problem_id_from_js)}")
591
- print(f"[Debug] current_model_state type: {type(current_model_state)}")
592
- print(f"[Debug] current_dataset_state type: {type(current_dataset_state)}")
593
 
594
  if not problem_id or not dataset_name:
595
  error_message = f"Missing data: problem_id='{problem_id}', dataset='{dataset_name}'"
596
- print(f"[Backend] {error_message}")
597
  return "Please fill in all the fields.", "No answer available.", "", gr.State([])
598
 
599
  # For comparison mode, we might not have a model selected yet
@@ -604,24 +628,32 @@ def handle_problem_select(problem_id_from_js, current_model_state, current_datas
604
 
605
  if not problem_data:
606
  error_message = f"Problem data not found: problem_id='{problem_id}', dataset='{dataset_name}'"
607
- print(f"[Backend] {error_message}")
608
  return f"Problem not found: {problem_id}. Please check the ID and try again.", "No answer available.", "", gr.State([])
609
 
610
  problem_dict = dict(problem_data)
611
- problem_content = f"{format_latex(problem_dict.get('problem', ''))}"
612
- answer_content = f"{format_latex(problem_dict.get('answer', ''))}"
 
 
 
 
 
 
 
 
 
 
 
613
 
614
  # For comparison without model, we don't have samples to display
615
  return problem_content, answer_content, "", gr.State([])
616
  except Exception as e:
617
  error_message = f"Database error: {str(e)}"
618
- print(f"[Backend] {error_message}")
619
  return f"Database error occurred. Please try again.", "No answer available.", "", gr.State([])
620
 
621
  # The regular flow for model-specific data
622
  if not model_name:
623
  error_message = f"Missing data: model='{model_name}'"
624
- print(f"[Backend] {error_message}")
625
  return "Please fill in all the fields.", "No answer available.", "", gr.State([])
626
 
627
  # The problem_id from JS should be the full unique_id. No reconstruction needed normally.
@@ -630,19 +662,29 @@ def handle_problem_select(problem_id_from_js, current_model_state, current_datas
630
 
631
  if not problem_data:
632
  error_message = f"Problem data not found: problem_id='{problem_id}', model='{model_name}', dataset='{dataset_name}'"
633
- print(f"[Backend] {error_message}")
634
  return f"Problem not found: {problem_id}. Please check the ID and try again.", "No answer available.", "", gr.State([])
635
  except Exception as e:
636
  error_message = f"Database error: {str(e)}"
637
- print(f"[Backend] {error_message}")
638
  return f"Database error occurred. Please try again.", "No answer available.", "", gr.State([])
639
 
640
  problem_dict = dict(problem_data)
641
  problem_display_num = re.search(r'\d+', problem_id).group(0) if re.search(r'\d+', problem_id) else problem_id
642
 
643
- problem_content = f"{format_latex(problem_dict.get('problem', ''))}"
644
- answer_content = f"{format_latex(problem_dict.get('answer', ''))}"
 
 
 
 
 
 
 
 
 
 
 
645
 
 
646
  if not responses_data:
647
  samples_grid_html = "<div>No samples available for this problem.</div>"
648
  # 返回空的样本数据状态
@@ -720,8 +762,8 @@ def handle_problem_select(problem_id_from_js, current_model_state, current_datas
720
 
721
  samples_grid_html += '</div>'
722
 
723
- # 第三行和第四行 - 对于默认模式(非比较模式)
724
- if mode != 'comparison' and actual_display_count > 2*samples_per_row:
725
  # 第三行
726
  row_samples = displayed_samples[2*samples_per_row:3*samples_per_row]
727
  if row_samples:
@@ -787,106 +829,58 @@ def handle_problem_select(problem_id_from_js, current_model_state, current_datas
787
  </div>
788
  """
789
 
790
- print(f"[Backend] Successfully prepared display for problem {problem_id}")
791
  # 获取第一个样本作为初始样本
792
  if samples_data:
793
  # 这样样本会在选择问题后立即显示
794
- first_sample = format_sample(samples_data[0])
795
  return problem_content, answer_content, final_html, gr.State(samples_data)
796
  else:
797
  return problem_content, answer_content, final_html, gr.State([])
798
 
799
- def handle_sample_select(sample_number, samples_data):
800
- print(f"[Backend] handle_sample_select: sample_number='{sample_number}', samples_data_type={type(samples_data)}")
801
-
802
- # 确保从Gradio State对象中提取实际值
803
- if hasattr(samples_data, 'value'):
804
- samples_list = samples_data.value
805
- print(f"[Backend] Extracted samples_data from State: length={len(samples_list) if isinstance(samples_list, list) else 'not_list'}")
806
- else:
807
- samples_list = samples_data
808
-
809
- # 确保样本编号是整数
810
- try:
811
- sample_idx = int(sample_number)
812
- except ValueError:
813
- return "<div style='color: red; padding: 10px; border: 1px solid red; border-radius: 5px;'>Error: Sample number must be an integer.</div>"
814
-
815
- # 确保样本数据存在且为非空列表
816
- if not samples_list or not isinstance(samples_list, list) or len(samples_list) == 0:
817
- return "<div>No sample data available. Please select a problem first.</div>"
818
-
819
- # 检查索引是否在有效范围内,如果不在范围内,显示错误消息
820
- if sample_idx < 0:
821
- return f"<div style='color: red; padding: 10px; border: 1px solid red; border-radius: 5px;'>Error: Sample number {sample_idx} is out of range. Valid range is 0 to {len(samples_list) - 1}.</div>"
822
-
823
- if sample_idx >= len(samples_list):
824
- return f"<div style='color: red; padding: 10px; border: 1px solid red; border-radius: 5px;'>Error: Sample number {sample_idx} is out of range. Valid range is 0 to {len(samples_list) - 1}.</div>"
825
-
826
- # 获取所选样本的数据
827
- try:
828
- sample = samples_list[sample_idx]
829
- formatted_sample = format_sample(sample)
830
- return formatted_sample
831
- except Exception as e:
832
- print(f"[Backend] Error formatting sample: {e}")
833
- return f"<div style='color: red; padding: 10px; border: 1px solid red; border-radius: 5px;'>Error displaying sample {sample_idx}: {str(e)}</div>"
834
 
835
- def handle_first_sample(samples_data):
836
- """处理并显示第一个样本(索引0)"""
837
- # 确保从Gradio State对象中提取实际值
838
- if hasattr(samples_data, 'value'):
839
- samples_list = samples_data.value
840
- else:
841
- samples_list = samples_data
842
-
843
- # 检查样本数据是否存在
844
- if not samples_list or not isinstance(samples_list, list) or len(samples_list) == 0:
845
- return "<div>No sample data available. Please select the problem and dataset first.</div>"
846
-
847
- # 直接获取第一个样本,避免错误处理逻辑
848
  try:
849
- sample = samples_list[0]
850
- formatted_sample = format_sample(sample)
851
- return formatted_sample
 
852
  except Exception as e:
853
- print(f"[Backend] Error formatting first sample: {e}")
854
- return f"<div style='color: red; padding: 10px; border: 1px solid red; border-radius: 5px;'>Error displaying first sample: {str(e)}</div>"
855
 
856
- def handle_comparison_problem_update(problem_id, dataset_state):
857
- """处理比较页面的问题更新,仅更新问题和答案内容,不需要模型"""
858
- global db
859
- # 确保从Gradio State对象中提取实际值
860
- dataset_name = dataset_state.value if hasattr(dataset_state, 'value') else dataset_state
861
- problem_id_value = problem_id.value if hasattr(problem_id, 'value') else problem_id
862
-
863
- if not problem_id_value or not dataset_name:
864
- return "Please select a dataset and enter a problem ID.", "No answer available."
865
 
866
- # 处理纯数字输入,构建完整unique_id
867
- if problem_id_value and problem_id_value.isdigit():
868
- # 构建格式:OlymMATH-HARD-0-EN 或类似格式
869
- parts = dataset_name.split('-')
870
- if len(parts) == 2: # 确保格式正确 (例如 "EN-HARD")
871
- language, difficulty = parts
872
- # 构建完整ID
873
- problem_id_value = f"OlymMATH-{difficulty}-{problem_id_value}-{language}"
 
 
 
 
 
 
 
 
 
 
 
874
 
875
- try:
876
- # 只获取问题数据,不获取特定模型的响应
877
- problem_data, _ = db.get_problem_data(None, dataset_name, problem_id_value)
878
-
879
- if not problem_data:
880
- return f"Problem not found: {problem_id_value}. Please check the ID and try again.", "No answer available."
881
-
882
- problem_dict = dict(problem_data)
883
- problem_content = f"{format_latex(problem_dict.get('problem', ''))}"
884
- answer_content = f"{format_latex(problem_dict.get('answer', ''))}"
885
-
886
- return problem_content, answer_content
887
- except Exception as e:
888
- print(f"[Backend] Error in handle_comparison_problem_update: {e}")
889
- return f"Error: {str(e)}", "No answer available."
890
 
891
  def create_ui(db_path):
892
  global db
@@ -896,7 +890,7 @@ def create_ui(db_path):
896
  if not AVAILABLE_DATASETS:
897
  AVAILABLE_DATASETS = ["EN-HARD", "EN-EASY", "ZH-HARD", "ZH-EASY"] # Fallback
898
 
899
- # Modified CSS
900
  custom_css = """
901
  .padding.svelte-phx28p { padding: unset !important; }
902
  body, .gradio-container { font-family: sans-serif; font-size: 0.95em; line-height: 1.6; }
@@ -904,6 +898,7 @@ def create_ui(db_path):
904
  .sample-btn:hover { transform: translateY(-1px); box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
905
  .problem-grid-container { overflow-y: auto; }
906
  .math-content { overflow-x: auto; padding: 5px; }
 
907
  h1, h2, h3, h4, h5 { margin-top: 0.8em; margin-bottom: 0.4em; color: var(--color-text); }
908
  .gradio-tabs > div[role='tablist'] button { font-size: 0.9em; padding: 8px 12px; }
909
  .gr-dropdown select { font-size: 0.9em; }
@@ -929,49 +924,50 @@ def create_ui(db_path):
929
  //border-radius: 8px;
930
  //margin-top: 10px;
931
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
932
  """
933
 
934
  with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
935
- # Define global JavaScript functions here, including MathJax and problem selection
936
- gr.HTML("""
937
- <script>
938
- // Function to load MathJax
939
- function loadMathJax() {
940
- if (window.MathJax) return;
941
- window.MathJax = {
942
- tex: {
943
- inlineMath: [['$', '$'], ['\\(', '\\)']],
944
- displayMath: [['$$', '$$'], ['\\[', '\\]']],
945
- processEscapes: true, tags: 'ams'
946
- },
947
- svg: { fontCache: 'global' },
948
- startup: {
949
- ready: () => {
950
- window.MathJax.startup.defaultReady();
951
- window.typesetMath = (elements) => {
952
- if (window.MathJax && window.MathJax.startup && window.MathJax.startup.document && window.MathJax.startup.document.source && window.MathJax.startup.document.source.typeset) {
953
- window.MathJax.startup.document.source.typeset(elements);
954
- }
955
- };
956
- }
957
- }
958
- };
959
- const script = document.createElement('script');
960
- script.src = 'https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js';
961
- script.async = true; script.id = 'MathJax-script';
962
- document.head.appendChild(script);
963
- }
964
- if (document.readyState === 'loading') {
965
- document.addEventListener('DOMContentLoaded', loadMathJax);
966
- } else {
967
- loadMathJax();
968
- }
969
- </script>
970
- """)
971
-
972
- # Hidden Textbox to communicate Problem ID from JS to Python
973
- # IMPORTANT: elem_id must match what JavaScript's document.getElementById uses.
974
-
975
 
976
  current_dataset_state = gr.State(value=AVAILABLE_DATASETS[0] if AVAILABLE_DATASETS else "")
977
  current_model_state = gr.State(value=None)
@@ -1036,9 +1032,25 @@ def create_ui(db_path):
1036
  with gr.Column(scale=3, min_width=400):
1037
  with gr.Tabs():
1038
  with gr.TabItem("Problem Statement"):
1039
- problem_markdown_output = gr.Markdown("Please fill in all the fields.")
 
 
 
 
 
 
 
 
1040
  with gr.TabItem("Reference Answer"):
1041
- answer_markdown_output = gr.Markdown("No answer available.")
 
 
 
 
 
 
 
 
1042
 
1043
  # 样本网格
1044
  samples_grid_output = gr.HTML("")
@@ -1056,11 +1068,23 @@ def create_ui(db_path):
1056
  every=0.5
1057
  )
1058
 
1059
- # 样本内容显示区域
1060
- sample_content_output = gr.HTML(
1061
  value="<div>Select a problem first to view samples.</div>",
1062
- elem_classes="math-content dark-mode-bg-secondary",
1063
- elem_id="sample-content-area"
 
 
 
 
 
 
 
 
 
 
 
 
1064
  )
1065
 
1066
  with gr.TabItem("Model Comparison"):
@@ -1088,9 +1112,25 @@ def create_ui(db_path):
1088
  with gr.Column(scale=1):
1089
  with gr.Tabs():
1090
  with gr.TabItem("Problem Statement"):
1091
- comp_problem_markdown_output = gr.Markdown("Please select models and problem.")
 
 
 
 
 
 
 
 
1092
  with gr.TabItem("Reference Answer"):
1093
- comp_answer_markdown_output = gr.Markdown("No answer available.")
 
 
 
 
 
 
 
 
1094
 
1095
  # 左右两部分模型比较
1096
  with gr.Row(variant='compact'):
@@ -1122,11 +1162,23 @@ def create_ui(db_path):
1122
  every=0.5
1123
  )
1124
 
1125
- # 样本内容显示区域
1126
- comp_sample_content_output_left = gr.HTML(
1127
  value="<div>Select a problem first to view samples.</div>",
1128
- elem_classes="math-content dark-mode-bg-secondary",
1129
- elem_id="comp-sample-content-area-left"
 
 
 
 
 
 
 
 
 
 
 
 
1130
  )
1131
 
1132
  # 右侧模型
@@ -1157,11 +1209,23 @@ def create_ui(db_path):
1157
  every=0.5
1158
  )
1159
 
1160
- # 样本内容显示区域
1161
- comp_sample_content_output_right = gr.HTML(
1162
  value="<div>Select a problem first to view samples.</div>",
1163
- elem_classes="math-content dark-mode-bg-secondary",
1164
- elem_id="comp-sample-content-area-right"
 
 
 
 
 
 
 
 
 
 
 
 
1165
  )
1166
 
1167
  # --- Event Handlers ---
@@ -1201,7 +1265,6 @@ def create_ui(db_path):
1201
  gr.Dropdown(choices=comp_model_choices if comp_model_choices else [], value=None)
1202
 
1203
  def update_problem_grid_and_stats(selected_model_formatted, selected_dataset, mode='default'):
1204
- print(f"[Backend] update_problem_grid_and_stats: model_formatted='{selected_model_formatted}', dataset='{selected_dataset}'")
1205
  if not selected_model_formatted or not selected_dataset:
1206
  # Return empty/default values for all outputs, including the state
1207
  return gr.DataFrame(value=[]), gr.HTML("<div>Please select a model and dataset first.</div>"), None
@@ -1219,7 +1282,6 @@ def create_ui(db_path):
1219
  problem_list = db.get_problems_by_model_dataset(model_name, selected_dataset)
1220
  grid_html = create_problem_grid_html(problem_list, mode=mode)
1221
 
1222
- print(f"[Backend] update_problem_grid_and_stats: New model_name for state: {model_name}")
1223
  # Correctly return the actual value for the current_model_state output
1224
  return gr.DataFrame(value=stats_data), gr.HTML(value=grid_html), model_name
1225
 
@@ -1238,9 +1300,9 @@ def create_ui(db_path):
1238
  inputs=[],
1239
  outputs=[sample_number_input]
1240
  ).then(
1241
- lambda: ("Please fill in all the fields.", "No answer available.", "", gr.State([]), "<div>Select a problem first to view samples.</div>"),
1242
  inputs=[],
1243
- outputs=[problem_markdown_output, answer_markdown_output, samples_grid_output, current_samples_data_state, sample_content_output]
1244
  )
1245
 
1246
  # Initial population of model dropdowns based on default dataset
@@ -1253,9 +1315,9 @@ def create_ui(db_path):
1253
  inputs=[current_dataset_state],
1254
  outputs=[model_stats_df, problem_grid_html_output, current_dataset_state]
1255
  ).then(
1256
- lambda: ("Please fill in all the fields.", "No answer available.", "", gr.State([]), "<div>Select a problem first to view samples.</div>"),
1257
  inputs=[],
1258
- outputs=[problem_markdown_output, answer_markdown_output, samples_grid_output, current_samples_data_state, sample_content_output]
1259
  ).then(
1260
  # 重置Sample Number为0
1261
  fn=lambda: "0",
@@ -1319,12 +1381,12 @@ def create_ui(db_path):
1319
  problem_content, answer_content, samples_grid_html, new_samples_data = handle_problem_select_comparison(current_problem_id, new_model_state, current_dataset)
1320
 
1321
  # 获取第一个样本的内容
1322
- first_sample_content = handle_first_sample(new_samples_data)
1323
 
1324
- return grid_html, new_model_state, problem_content, answer_content, samples_grid_html, new_samples_data, first_sample_content
1325
  else:
1326
  # 没有问题ID,只返回更新的模型状态
1327
- return grid_html, new_model_state, "Please enter a problem ID.", "No answer available.", "", gr.State([]), "<div>Select a problem first to view samples.</div>"
1328
 
1329
  # 修改model_dropdown的处理函数,以重新查询当前问题响应 - 比较页面右侧
1330
  def update_model_and_requery_problem_right(model_dropdown_value, current_dataset, current_problem_id):
@@ -1337,18 +1399,18 @@ def create_ui(db_path):
1337
  _, _, samples_grid_html, new_samples_data = handle_problem_select_comparison(current_problem_id, new_model_state, current_dataset)
1338
 
1339
  # 获取第一个样本的内容
1340
- first_sample_content = handle_first_sample(new_samples_data)
1341
 
1342
- return grid_html, new_model_state, samples_grid_html, new_samples_data, first_sample_content
1343
  else:
1344
  # 没有问题ID,只返回更新的模型状态
1345
- return grid_html, new_model_state, "", gr.State([]), "<div>Select a problem first to view samples.</div>"
1346
 
1347
  # 左侧模型选择事件
1348
  comp_model_dropdown_left.change(
1349
  fn=update_model_and_requery_problem_left,
1350
  inputs=[comp_model_dropdown_left, comp_dataset_state, comp_problem_state_input],
1351
- outputs=[comp_problem_grid_html_output_left, comp_model_state_left, comp_problem_markdown_output, comp_answer_markdown_output, comp_samples_grid_output_left, comp_samples_data_state_left, comp_sample_content_output_left]
1352
  ).then(
1353
  # 重置Sample Number为0
1354
  fn=lambda: "0",
@@ -1360,7 +1422,7 @@ def create_ui(db_path):
1360
  comp_model_dropdown_right.change(
1361
  fn=update_model_and_requery_problem_right,
1362
  inputs=[comp_model_dropdown_right, comp_dataset_state, comp_problem_state_input],
1363
- outputs=[comp_problem_grid_html_output_right, comp_model_state_right, comp_samples_grid_output_right, comp_samples_data_state_right, comp_sample_content_output_right]
1364
  ).then(
1365
  # 重置Sample Number为0
1366
  fn=lambda: "0",
@@ -1368,7 +1430,21 @@ def create_ui(db_path):
1368
  outputs=[comp_sample_number_input_right]
1369
  )
1370
 
1371
- # 问题选择事件 - 左侧模型
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1372
  comp_problem_state_input.change(
1373
  fn=handle_problem_select_comparison,
1374
  inputs=[comp_problem_state_input, comp_model_state_left, comp_dataset_state],
@@ -1381,7 +1457,7 @@ def create_ui(db_path):
1381
  ).then(
1382
  fn=handle_first_sample,
1383
  inputs=[comp_samples_data_state_left],
1384
- outputs=[comp_sample_content_output_left]
1385
  )
1386
 
1387
  # 问题选择事件 - 右侧模型
@@ -1397,25 +1473,10 @@ def create_ui(db_path):
1397
  ).then(
1398
  fn=handle_first_sample,
1399
  inputs=[comp_samples_data_state_right],
1400
- outputs=[comp_sample_content_output_right]
1401
- )
1402
-
1403
- # 左侧样本选择
1404
- comp_sample_number_input_left.change(
1405
- fn=handle_sample_select,
1406
- inputs=[comp_sample_number_input_left, comp_samples_data_state_left],
1407
- outputs=[comp_sample_content_output_left]
1408
- )
1409
-
1410
- # 右侧样本选择
1411
- comp_sample_number_input_right.change(
1412
- fn=handle_sample_select,
1413
- inputs=[comp_sample_number_input_right, comp_samples_data_state_right],
1414
- outputs=[comp_sample_content_output_right]
1415
  )
1416
 
1417
  # This is the crucial link: problem_state_input is changed by user, triggers this Python callback.
1418
- print("Setting up problem_state_input change handler...")
1419
  problem_state_input.change(
1420
  fn=handle_problem_select,
1421
  inputs=[problem_state_input, current_model_state, current_dataset_state],
@@ -1428,7 +1489,7 @@ def create_ui(db_path):
1428
  ).then(
1429
  fn=handle_first_sample,
1430
  inputs=[current_samples_data_state],
1431
- outputs=[sample_content_output]
1432
  )
1433
 
1434
  # Also listen for direct input event which may be more reliable than change
@@ -1444,20 +1505,20 @@ def create_ui(db_path):
1444
  ).then(
1445
  fn=handle_first_sample,
1446
  inputs=[current_samples_data_state],
1447
- outputs=[sample_content_output]
1448
  )
1449
 
1450
  # 添加样本编号的事件处理
1451
  sample_number_input.change(
1452
  fn=handle_sample_select,
1453
  inputs=[sample_number_input, current_samples_data_state],
1454
- outputs=[sample_content_output]
1455
  )
1456
 
1457
  sample_number_input.input(
1458
  fn=handle_sample_select,
1459
  inputs=[sample_number_input, current_samples_data_state],
1460
- outputs=[sample_content_output]
1461
  )
1462
 
1463
  # 修改model_dropdown.change处理函数,以重新查询当前问题响应
@@ -1470,17 +1531,17 @@ def create_ui(db_path):
1470
  problem_content, answer_content, samples_grid_html, new_samples_data = handle_problem_select(current_problem_id, new_model_state, current_dataset)
1471
 
1472
  # 获取第一个样本的内容
1473
- first_sample_content = handle_first_sample(new_samples_data)
1474
 
1475
- return stats_df, grid_html, new_model_state, problem_content, answer_content, samples_grid_html, new_samples_data, first_sample_content
1476
  else:
1477
  # 没有问题ID,只返回更新的模型状态
1478
- return stats_df, grid_html, new_model_state, "Please fill in all the fields.", "No answer available.", "", gr.State([]), "<div>Select a problem first to view samples.</div>"
1479
 
1480
  model_dropdown.change(
1481
  fn=update_model_and_requery_problem,
1482
  inputs=[model_dropdown, current_dataset_state, problem_state_input],
1483
- outputs=[model_stats_df, problem_grid_html_output, current_model_state, problem_markdown_output, answer_markdown_output, samples_grid_output, current_samples_data_state, sample_content_output]
1484
  ).then(
1485
  # 重置Sample Number为0
1486
  fn=lambda: "0",
@@ -1501,20 +1562,17 @@ def monitor_memory_usage():
1501
 
1502
  # 如果内存使用超过12GB (激进设置),清理缓存
1503
  if memory_usage_mb > 12000: # 12GB
1504
- print(f"Memory usage high ({memory_usage_mb:.1f} MB), clearing caches...")
1505
  if db:
1506
  db.clear_cache('response') # 优先清理响应缓存
1507
  gc.collect()
1508
  # 如果内存使用超过14GB,更激进地清理
1509
  if memory_usage_mb > 14000: # 14GB
1510
- print(f"Memory usage critical ({memory_usage_mb:.1f} MB), clearing all caches...")
1511
  if db:
1512
  db.clear_cache() # 清理所有缓存
1513
  gc.collect()
1514
 
1515
  return f"Memory: {memory_usage_mb:.1f} MB"
1516
  except Exception as e:
1517
- print(f"Error monitoring memory: {e}")
1518
  return "Memory monitor error"
1519
 
1520
  # 修改主函数以使用优化策略
@@ -1523,7 +1581,6 @@ if __name__ == "__main__":
1523
 
1524
  # 检查数据库文件是否存在,如果不存在则从 Hugging Face 下载
1525
  if not os.path.exists(DB_PATH):
1526
- print(f"Database file not found at {DB_PATH}. Attempting to download from Hugging Face...")
1527
  try:
1528
  # 从环境变量获取 HF_TOKEN
1529
  hf_token = os.environ.get("HF_TOKEN")
@@ -1537,9 +1594,7 @@ if __name__ == "__main__":
1537
  repo_type="dataset",
1538
  token=hf_token
1539
  )
1540
- print(f"Successfully downloaded database file to {DB_PATH}")
1541
  except Exception as e:
1542
- print(f"Error downloading database: {str(e)}")
1543
  # 创建一个显示错误信息的简单 Gradio 应用
1544
  with gr.Blocks() as error_demo:
1545
  gr.Markdown(f"# Error: Database Download Failed\n{str(e)}\nPlease ensure HF_TOKEN is set correctly and try again.")
@@ -1547,19 +1602,13 @@ if __name__ == "__main__":
1547
  exit(1)
1548
 
1549
  if os.path.exists(DB_PATH):
1550
- # 报告数据库大小
1551
- db_size = os.path.getsize(DB_PATH)
1552
- print(f"Database size: {db_size / (1024*1024*1024):.2f} GB")
1553
-
1554
  # 创建UI并启动
1555
- print("Initializing database connection...")
1556
  db = ModelDatabase(DB_PATH)
1557
 
1558
  # 添加清理函数
1559
  def cleanup():
1560
  global db
1561
  if db:
1562
- print("Shutting down, cleaning up resources...")
1563
  db.close()
1564
 
1565
  # 注册清理函数
@@ -1576,8 +1625,7 @@ if __name__ == "__main__":
1576
  inbrowser=False
1577
  )
1578
  else:
1579
- print(f"Database file not found at {DB_PATH}. Please ensure it exists.")
1580
- # Optionally, create a dummy DB or a message App
1581
  with gr.Blocks() as error_demo:
1582
  gr.Markdown(f"# Error: Database Not Found\nCould not find `{DB_PATH}`. Please ensure the database file is correctly placed and accessible.")
1583
  error_demo.launch(server_name="0.0.0.0")
 
11
  import math
12
  import time
13
  from huggingface_hub import hf_hub_download
14
+ import psutil
15
+ import gc
16
 
17
  # 翻译表
18
  SUBJECT_TRANS = {
 
32
  "still-3-1.5b-preview": "STILL-3-1.5B-Preview",
33
  "deepseek-r1-distill-qwen-32b": "DeepSeek-R1-Distill-Qwen-32B",
34
  "light-r1-7b-ds": "Light-R1-7B-DS",
35
+ "openmath-nemotron-32b": "OpenMath-Nemotron-32B",
36
+ "qwen3-235b-a22b": "Qwen3-235B-A22B",
37
  "skywork-or1-32b-preview": "Skywork-OR1-32B-Preview",
38
  "deepscaler-1.5b-preview": "DeepScaler-1.5B-Preview",
39
  "deepseek-r1-distill-qwen-7b": "DeepSeek-R1-Distill-Qwen-7B",
 
46
  "skywork-or1-math-7b": "Skywork-OR1-Math-7B",
47
  "skywork-or1-7b-preview": "Skywork-OR1-7B-Preview",
48
  "qwen3-30b-a3b": "Qwen3-30B-A3B",
49
+ "deepseek-r1": "DeepSeek-R1",
50
+ "glm-z1-air": "GLM-Z1-Air",
51
+ "gemini-2.5-pro-exp-03-25": "Gemini 2.5 Pro Exp 0325",
52
+ "o3-mini-high": "OpenAI o3-mini (high)",
53
+ "qwen3-0.6b": "Qwen3-0.6B"
54
  # 添加更多模型映射
55
  }
56
 
 
79
  self.conn.execute("PRAGMA temp_store = MEMORY") # 临时表存储在内存中
80
  self.conn.execute("PRAGMA mmap_size = 8589934592") # 尝试使用8GB内存映射
81
  self.conn.row_factory = sqlite3.Row
 
82
 
83
  # 创建索引以加速查询
84
  self._ensure_indices()
 
102
  cursor.execute("CREATE INDEX IF NOT EXISTS idx_problems_unique_id ON problems(unique_id)")
103
  cursor.execute("ANALYZE") # 分析表以优化查询计划
104
  except Exception as e:
105
+ pass
106
 
107
  def get_available_models(self):
108
  """Get list of all available models"""
 
116
  models = [row['model_name'] for row in cursor.fetchall()]
117
  self._models_cache = models # 存储到实例缓存
118
  return models
119
+ except sqlite3.OperationalError:
 
120
  return []
121
 
122
  def get_available_datasets(self):
 
131
  datasets = [row['dataset'].upper() for row in cursor.fetchall()]
132
  self._datasets_cache = datasets # 存储到实例缓存
133
  return datasets
134
+ except sqlite3.OperationalError:
 
135
  return DATASETS
136
 
137
  def get_model_statistics(self, model_name, dataset):
 
177
 
178
  self._cache[cache_key] = stats_data
179
  return stats_data
180
+ except sqlite3.OperationalError:
 
181
  return [["Database Error", "No data available"]]
182
 
183
  def get_all_model_accuracies(self, dataset):
 
197
  results = [(row['model_name'], row['accuracy']) for row in cursor.fetchall()]
198
  self._cache[cache_key] = results
199
  return results
200
+ except sqlite3.OperationalError:
 
201
  return []
202
 
203
  def get_problems_by_model_dataset(self, model_name, dataset):
 
224
  sorted_results = sorted(results, key=lambda x: int(re.search(r'\d+', x[0]).group(0)) if re.search(r'\d+', x[0]) else 0)
225
  self._cache[cache_key] = sorted_results
226
  return sorted_results
227
+ except sqlite3.OperationalError:
 
228
  return []
229
 
230
  def get_problem_data(self, model_name, dataset, problem_id):
 
249
  # 转为字典存储,避免SQLite连接依赖
250
  self._problem_cache[problem_cache_key] = dict(problem)
251
  problem = self._problem_cache[problem_cache_key]
252
+ except Exception:
 
253
  return None, None
254
 
255
  if not problem:
 
279
  responses = [dict(r) for r in responses]
280
  self._response_cache[resp_cache_key] = responses
281
  return problem, responses
282
+ except Exception:
 
283
  return problem, None
284
  else:
285
  # 获��所有模型对此问题的响应
 
304
  responses = [dict(r) for r in responses]
305
  self._response_cache[resp_cache_key] = responses
306
  return problem, responses
307
+ except Exception:
 
308
  return problem, None
309
 
310
  def get_model_responses(self, selected_models, dataset, problem_id):
 
339
  """清除指定部分或全部缓存"""
340
  if section == 'main' or section is None:
341
  self._cache = {}
 
342
  if section == 'problem' or section is None:
343
  self._problem_cache = {}
 
344
  if section == 'response' or section is None:
345
  self._response_cache = {}
 
346
  if section == 'models' or section is None:
347
  if hasattr(self, '_models_cache'):
348
  self._models_cache = None
349
  if hasattr(self, '_datasets_cache'):
350
  self._datasets_cache = None
 
351
 
352
  def close(self):
353
  """关闭数据库连接并释放资源"""
354
  if hasattr(self, 'conn') and self.conn:
355
  try:
356
  self.conn.close()
357
+ except Exception:
358
+ pass
 
359
 
360
  # 清理所有缓存
361
  self.clear_cache()
362
 
363
  def format_latex(text):
364
  if text is None: return ""
365
+ # Process the text for proper LaTeX rendering with KaTeX
366
+ # KaTeX requires LaTeX backslashes to be preserved
367
+ # Only replace newlines with HTML breaks
368
  text = text.replace('\n', '<br>')
369
+ # Wrap in a span that KaTeX can detect and render
370
+ return f'<span class="math-inline">{text}</span>'
371
+
372
def format_markdown_with_math(text):
    """Return *text* with every line ending normalised to ``\\n``.

    No HTML is added and LaTeX is left untouched; rendering is left to the
    Markdown component that receives the result. ``None`` yields "".
    """
    if text is None:
        return ""

    # CRLF must be collapsed before lone CRs, otherwise each CRLF would
    # turn into two newlines.
    without_crlf = text.replace('\r\n', '\n')
    unix_newlines = without_crlf.replace('\r', '\n')
    return unix_newlines
383
 
384
  def get_gradient_color(accuracy, color_map='RdYlGn'):
 
386
  return "#505050" # Default for missing or invalid accuracy
387
  try:
388
  # 使用更深的颜色映射
389
+ cmap = plt.colormaps.get_cmap(color_map)
390
  rgba = cmap(float(accuracy))
391
 
392
  # 确保颜色足够深以与白色文本形成对比
 
399
  # 转回十六进制
400
  hex_color = mpl.colors.rgb2hex((r, g, b, a))
401
  return hex_color
402
+ except Exception:
 
403
  return "#505050"
404
 
405
  def get_contrasting_text_color(bg_color):
 
432
  # 其他颜色根据亮度决定
433
  return "#000" if yiq > 160 else "#fff"
434
 
435
def format_sample_metadata(sample, show_correctness=True):
    """Render one sample's metadata as an HTML snippet.

    Args:
        sample: Mapping-like object (anything exposing ``keys``) that may
            hold ``extracted``, ``correctness``, ``output_tokens`` and
            ``reasoning_tokens``; ``None`` is accepted.
        show_correctness: When true, emit the info row with the verdict,
            the extracted answer (wrapped in ``$...$``) and whichever token
            counts are present. When false, only the empty outer container
            is returned.

    Returns:
        ``""`` for ``None``, ``"No sample data"`` for an empty or
        non-mapping sample, otherwise the HTML string.
    """
    # Imported under its own name because "html" would be the obvious
    # local variable name in this function.
    from html import escape

    if sample is None:
        return ""
    sample_dict = dict(sample) if hasattr(sample, 'keys') else {}
    if not sample_dict:
        return "No sample data"

    # Extract the fields to display
    extracted = sample_dict.get('extracted', '')
    is_correct = bool(sample_dict.get('correctness', 0))
    correctness_label = "✓ Correct" if is_correct else "✗ Incorrect"
    correctness_color = "var(--color-green)" if is_correct else "var(--color-red)"

    # Token information (None means "not recorded", 0 is a real value)
    output_tokens = sample_dict.get('output_tokens', None)
    reasoning_tokens = sample_dict.get('reasoning_tokens', None)

    chip_style = "background-color: rgba(0,0,0,0.05); padding: 2px 5px; border-radius: 3px;"

    pieces = [
        "<div style='font-size: 0.85em; padding: 10px; border-radius: 8px; "
        "margin-bottom: 5px;' class='dark-mode-compatible dark-mode-bg-secondary'>"
    ]

    if show_correctness:
        pieces.append(
            "<div style='display: flex; flex-wrap: wrap; align-items: center; margin-bottom: 5px;'>"
        )
        # Correctness indicator
        pieces.append(
            f"<span style='color: {correctness_color}; font-weight: bold; "
            f"margin-right: 10px;'>{correctness_label}</span>"
        )

        # Extracted answer: comes from model output, so escape it before
        # embedding it in markup.
        if extracted:
            pieces.append(
                f"<span style='{chip_style} margin-right: 10px;'>"
                f"<b>Extracted:</b> ${escape(str(extracted))}$</span>"
            )

        # Output token count
        if output_tokens is not None:
            pieces.append(
                f"<span style='{chip_style} margin-right: 10px;'>"
                f"<b>Output Tokens:</b> {escape(str(output_tokens))}</span>"
            )

        # Reasoning token count, only when available
        if reasoning_tokens is not None:
            pieces.append(
                f"<span style='{chip_style}'>"
                f"<b>Reasoning Tokens:</b> {escape(str(reasoning_tokens))}</span>"
            )

        pieces.append("</div>")

    pieces.append("</div>")
    return "".join(pieces)
476
 
477
def format_sample_response(sample):
    """Return a sample's response text, safe to hand to a Markdown renderer.

    The reasoning-style tags ``<think>``, ``<reasoning>`` and ``<answer>``
    (opening and closing) are entity-escaped so they show up literally
    instead of being parsed as HTML. All other text is returned unchanged.

    Args:
        sample: Mapping-like object with an optional ``response`` entry,
            or ``None``.

    Returns:
        ``""`` for ``None``, ``"No sample data"`` for an empty or
        non-mapping sample, otherwise the processed response text.
    """
    if sample is None:
        return ""
    sample_dict = dict(sample) if hasattr(sample, 'keys') else {}
    if not sample_dict:
        return "No sample data"

    # A missing key and a stored NULL (None) both mean "no text".
    response = sample_dict.get('response') or ''
    if not isinstance(response, str):
        response = str(response)

    # Escape only the known special tags; everything else stays Markdown.
    for tag in ("think", "reasoning", "answer"):
        response = response.replace(f"<{tag}>", f"&lt;{tag}&gt;")
        response = response.replace(f"</{tag}>", f"&lt;/{tag}&gt;")

    return response
498
 
499
def handle_sample_select(sample_number, samples_data):
    """Resolve a sample index into its ``(metadata_html, response_markdown)``.

    Args:
        sample_number: Index typed by the user; anything ``int()`` accepts.
        samples_data: List of samples, or an object whose ``value``
            attribute holds that list.

    Returns:
        A 2-tuple. On success it is the formatted metadata and response of
        the selected sample; on any problem the first element is a message
        for the user and the second is ``""``.
    """
    # Unwrap a state-like wrapper if one was passed instead of the raw list.
    samples_list = samples_data.value if hasattr(samples_data, 'value') else samples_data

    # The index must be an integer. A cleared input arrives as None, which
    # makes int() raise TypeError rather than ValueError.
    try:
        sample_idx = int(sample_number)
    except (TypeError, ValueError):
        return "Error: Sample number must be an integer.", ""

    # There must be a non-empty list to index into.
    if not samples_list or not isinstance(samples_list, list):
        return "No sample data available. Please select a problem first.", ""

    # Reject out-of-range indices (negative ones included) with a hint.
    if not 0 <= sample_idx < len(samples_list):
        err_msg = (
            f"**Error:** Sample number {sample_idx} is out of range. "
            f"Valid range is 0 to {len(samples_list) - 1}."
        )
        return err_msg, ""

    # Format the selected sample; this is a UI callback, so formatting
    # failures are reported to the user instead of being raised.
    try:
        sample = samples_list[sample_idx]
        formatted_metadata = format_sample_metadata(sample)
        formatted_response = format_sample_response(sample)
        return formatted_metadata, formatted_response
    except Exception as e:
        err_msg = f"**Error displaying sample {sample_idx}:** {str(e)}"
        return err_msg, ""
534
 
535
def handle_first_sample(samples_data):
    """Format the sample at index 0 as ``(metadata_html, response_markdown)``.

    ``samples_data`` is either the sample list itself or an object exposing
    it through a ``value`` attribute. When no usable list is available, or
    formatting fails, the first tuple element carries a message and the
    second is ``""``.
    """
    # Unwrap a state-like wrapper; plain values pass through unchanged.
    samples_list = getattr(samples_data, 'value', samples_data)

    # Only a non-empty list counts as sample data.
    has_samples = isinstance(samples_list, list) and len(samples_list) > 0
    if not has_samples:
        return "No sample data available. Please select the problem and dataset first.", ""

    try:
        first = samples_list[0]
        return format_sample_metadata(first), format_sample_response(first)
    except Exception as e:
        return f"**Error displaying first sample:** {str(e)}", ""
556
 
557
def handle_comparison_problem_update(problem_id, dataset_state):
    """Look up a problem for the comparison page, independent of any model.

    Args:
        problem_id: Problem identifier, or an object exposing it via
            ``value``. A purely numeric value is expanded to
            ``OlymMATH-{difficulty}-{n}-{language}`` using the dataset name
            (expected in ``LANG-DIFFICULTY`` form, e.g. ``EN-HARD``).
        dataset_state: Dataset name, or an object exposing it via ``value``.

    Returns:
        ``(problem_markdown, answer_markdown)``. When input is missing, the
        problem is unknown, or the lookup fails, the first element is a
        message and the second is ``"No answer available."``.
    """
    global db
    # Unwrap state-like wrappers to get the actual values.
    dataset_name = dataset_state.value if hasattr(dataset_state, 'value') else dataset_state
    problem_id_value = problem_id.value if hasattr(problem_id, 'value') else problem_id

    # Normalise the id to a stripped string so padded or numeric input
    # neither crashes ``isdigit`` nor misses the lookup.
    problem_id_value = "" if problem_id_value is None else str(problem_id_value).strip()

    if not problem_id_value or not dataset_name:
        return "Please select a dataset and enter a problem ID.", "No answer available."

    # Bare number: build the full unique id, e.g. OlymMATH-HARD-0-EN.
    if problem_id_value.isdigit():
        parts = dataset_name.split('-')
        if len(parts) == 2:  # only when the dataset looks like "EN-HARD"
            language, difficulty = parts
            problem_id_value = f"OlymMATH-{difficulty}-{problem_id_value}-{language}"

    try:
        # Fetch the problem only; no model-specific responses are needed.
        problem_data, _ = db.get_problem_data(None, dataset_name, problem_id_value)

        if not problem_data:
            return f"Problem not found: {problem_id_value}. Please check the ID and try again.", "No answer available."

        problem_dict = dict(problem_data)

        # A NULL column comes back as None and a numeric answer may not be
        # a str; coerce both so the regex below cannot fail on a valid row.
        raw_problem = problem_dict.get('problem')
        problem_content = format_markdown_with_math('' if raw_problem is None else str(raw_problem))

        raw_answer = problem_dict.get('answer')
        answer_text = '' if raw_answer is None else str(raw_answer)

        # Turn display math ($$...$$) into inline math ($...$); DOTALL lets
        # the pattern span multiple lines.
        answer_text = re.sub(r'\$\$(.*?)\$\$', r'$\1$', answer_text, flags=re.DOTALL)

        # An answer without any dollar sign is wrapped so it renders as math.
        if '$' not in answer_text and answer_text.strip():
            answer_text = f"${answer_text}$"

        answer_content = format_markdown_with_math(answer_text)

        return problem_content, answer_content
    except Exception as e:
        # UI callback boundary: show the failure instead of raising.
        return f"Error: {str(e)}", "No answer available."
601
 
602
  def handle_problem_select(problem_id_from_js, current_model_state, current_dataset_state, mode='default'):
603
  global db
 
606
  dataset_name = current_dataset_state.value if hasattr(current_dataset_state, 'value') else current_dataset_state
607
  problem_id = problem_id_from_js.value if hasattr(problem_id_from_js, 'value') else problem_id_from_js
608
 
 
 
 
609
  # 处理纯数字输入,构建完整unique_id
610
  if problem_id and problem_id.isdigit():
611
  # 构建格式:OlymMATH-HARD-0-EN 或类似格式
 
615
  language, difficulty = parts
616
  # 构建完整ID
617
  problem_id = f"OlymMATH-{difficulty}-{problem_id}-{language}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
618
 
619
  if not problem_id or not dataset_name:
620
  error_message = f"Missing data: problem_id='{problem_id}', dataset='{dataset_name}'"
 
621
  return "Please fill in all the fields.", "No answer available.", "", gr.State([])
622
 
623
  # For comparison mode, we might not have a model selected yet
 
628
 
629
  if not problem_data:
630
  error_message = f"Problem data not found: problem_id='{problem_id}', dataset='{dataset_name}'"
 
631
  return f"Problem not found: {problem_id}. Please check the ID and try again.", "No answer available.", "", gr.State([])
632
 
633
  problem_dict = dict(problem_data)
634
+ # Process problem and answer text for Markdown rendering
635
+ problem_content = format_markdown_with_math(problem_dict.get('problem', ''))
636
+
637
+ # 将答案中的双美元符号替换为单美元符号
638
+ answer_text = problem_dict.get('answer', '')
639
+ # 先将$$...$$替换为单个$...$,使用re.DOTALL处理多行
640
+ answer_text = re.sub(r'\$\$(.*?)\$\$', r'$\1$', answer_text, flags=re.DOTALL)
641
+
642
+ # 检查答案是否已经包含美元符号,如果没有则添加
643
+ if '$' not in answer_text and answer_text.strip():
644
+ answer_text = f"${answer_text}$"
645
+
646
+ answer_content = format_markdown_with_math(answer_text)
647
 
648
  # For comparison without model, we don't have samples to display
649
  return problem_content, answer_content, "", gr.State([])
650
  except Exception as e:
651
  error_message = f"Database error: {str(e)}"
 
652
  return f"Database error occurred. Please try again.", "No answer available.", "", gr.State([])
653
 
654
  # The regular flow for model-specific data
655
  if not model_name:
656
  error_message = f"Missing data: model='{model_name}'"
 
657
  return "Please fill in all the fields.", "No answer available.", "", gr.State([])
658
 
659
  # The problem_id from JS should be the full unique_id. No reconstruction needed normally.
 
662
 
663
  if not problem_data:
664
  error_message = f"Problem data not found: problem_id='{problem_id}', model='{model_name}', dataset='{dataset_name}'"
 
665
  return f"Problem not found: {problem_id}. Please check the ID and try again.", "No answer available.", "", gr.State([])
666
  except Exception as e:
667
  error_message = f"Database error: {str(e)}"
 
668
  return f"Database error occurred. Please try again.", "No answer available.", "", gr.State([])
669
 
670
  problem_dict = dict(problem_data)
671
  problem_display_num = re.search(r'\d+', problem_id).group(0) if re.search(r'\d+', problem_id) else problem_id
672
 
673
+ # Process problem and answer text for Markdown rendering
674
+ problem_content = format_markdown_with_math(problem_dict.get('problem', ''))
675
+
676
+ # 将答案中的双美元符号替换为单美元符号
677
+ answer_text = problem_dict.get('answer', '')
678
+ # 先将$$...$$替换为单个$...$,使用re.DOTALL处理多行
679
+ answer_text = re.sub(r'\$\$(.*?)\$\$', r'$\1$', answer_text, flags=re.DOTALL)
680
+
681
+ # 检查答案是否已经包含美元符号,如果没有则添加
682
+ if '$' not in answer_text and answer_text.strip():
683
+ answer_text = f"${answer_text}$"
684
+
685
+ answer_content = format_markdown_with_math(answer_text)
686
 
687
+ # Rest of the function remains the same
688
  if not responses_data:
689
  samples_grid_html = "<div>No samples available for this problem.</div>"
690
  # 返回空的样本数据状态
 
762
 
763
  samples_grid_html += '</div>'
764
 
765
+ # 第三行和第四行 - 允许所有模式显示完整的64个样本
766
+ if actual_display_count > 2*samples_per_row:
767
  # 第三行
768
  row_samples = displayed_samples[2*samples_per_row:3*samples_per_row]
769
  if row_samples:
 
829
  </div>
830
  """
831
 
 
832
  # 获取第一个样本作为初始样本
833
  if samples_data:
834
  # 这样样本会在选择问题后立即显示
 
835
  return problem_content, answer_content, final_html, gr.State(samples_data)
836
  else:
837
  return problem_content, answer_content, final_html, gr.State([])
838
 
839
def create_problem_grid_html(problems, mode='default'):
    """Build the HTML grid of problem tiles.

    Args:
        problems: Iterable of ``(problem_id, accuracy, extra)`` triples.
            ``accuracy`` may be ``None`` (treated as 0.0); the third item
            is not used for rendering.
        mode: ``'comparison'`` lays the grid out in 20 columns, any other
            value in 10.

    Returns:
        An HTML string. Tiles are ordered by the first number found in each
        id (ids without a number sort as 0). A plain message ``<div>`` is
        returned when there are no problems or the rows cannot be read.
    """
    from html import escape  # local import keeps this block self-contained

    if not problems:
        return "<div>No problems found for this model/dataset. Please select a model and dataset.</div>"

    number_re = re.compile(r'\d+')

    def numeric_key(entry):
        # Order by the first run of digits in the id.
        found = number_re.search(entry[0])
        return int(found.group(0)) if found else 0

    try:
        sorted_problems = sorted(
            ((str(p[0]), float(p[1]) if p[1] is not None else 0.0, p[2]) for p in problems),
            key=numeric_key,
        )
    except Exception as e:
        # UI boundary: malformed rows produce a visible message, not a crash.
        return f"<div>Error displaying problems. Check logs. {escape(str(e), quote=False)}</div>"

    # All tiles use white text; !important keeps theme styles from overriding it.
    text_color = "#ffffff"

    buttons = []
    for pid, accuracy, _ in sorted_problems:
        found = number_re.search(pid)
        num_display = found.group(0) if found else pid
        # int() truncates, so e.g. 0.999 is shown as 99%, never 100%.
        acc_pct = int(accuracy * 100)
        bg_color = get_gradient_color(accuracy)

        # The id goes into attributes and text, so escape it for both.
        safe_pid = escape(pid)
        safe_num = escape(num_display)

        buttons.append(f"""
        <div
            data-problem-id="{safe_pid}"
            class="problem-btn"
            title="ID: {safe_pid} - Acc: {acc_pct}%"
            style='background-color: {bg_color}; color: {text_color} !important;
                   border-radius: 4px; padding: 5px; text-align: center; font-size: 0.7em;
                   min-height: 36px; user-select: none; width: 100%;
                   display: flex; flex-direction: column; justify-content: center;
                   overflow: hidden; text-overflow: ellipsis; white-space: nowrap;'>
            <div style="font-weight: bold; color: {text_color} !important;">{safe_num}</div>
            <div style="color: {text_color} !important;">{acc_pct}%</div>
        </div>
        """)

    html_buttons = "".join(buttons)

    # Extra rule that forces white text on the tiles and their children.
    custom_style = "<style>.problem-btn, .problem-btn div { color: white !important; }</style>"
    # Number of columns depends on the page the grid is shown on.
    grid_cols = 20 if mode == 'comparison' else 10
    grid_html = f"{custom_style}<div style='display: grid; grid-template-columns: repeat({grid_cols}, 1fr); gap: 4px;'>{html_buttons}</div>"
    return grid_html
 
 
 
 
 
 
 
 
 
884
 
885
  def create_ui(db_path):
886
  global db
 
890
  if not AVAILABLE_DATASETS:
891
  AVAILABLE_DATASETS = ["EN-HARD", "EN-EASY", "ZH-HARD", "ZH-EASY"] # Fallback
892
 
893
+ # Add MathJax support to the CSS
894
  custom_css = """
895
  .padding.svelte-phx28p { padding: unset !important; }
896
  body, .gradio-container { font-family: sans-serif; font-size: 0.95em; line-height: 1.6; }
 
898
  .sample-btn:hover { transform: translateY(-1px); box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
899
  .problem-grid-container { overflow-y: auto; }
900
  .math-content { overflow-x: auto; padding: 5px; }
901
+ .sample-response { overflow-y: clip !important; max-height: none !important; height: auto !important; }
902
  h1, h2, h3, h4, h5 { margin-top: 0.8em; margin-bottom: 0.4em; color: var(--color-text); }
903
  .gradio-tabs > div[role='tablist'] button { font-size: 0.9em; padding: 8px 12px; }
904
  .gr-dropdown select { font-size: 0.9em; }
 
924
  //border-radius: 8px;
925
  //margin-top: 10px;
926
  }
927
+
928
+ /* MathJax Styles for Gradio's Built-in LaTeX */
929
+ .math-inline, .math-display {
930
+ font-size: 110%;
931
+ }
932
+ .math-container p {
933
+ margin: 0.5em 0;
934
+ }
935
+
936
+ /* Markdown content styles */
937
+ .gr-markdown strong {
938
+ font-weight: bold;
939
+ }
940
+ .gr-markdown em {
941
+ font-style: italic;
942
+ }
943
+ .gr-markdown ul, .gr-markdown ol {
944
+ padding-left: 2em;
945
+ margin: 0.5em 0;
946
+ }
947
+ .gr-markdown blockquote {
948
+ border-left: 3px solid #ccc;
949
+ margin: 0.5em 0;
950
+ padding-left: 1em;
951
+ color: #666;
952
+ }
953
+ .gr-markdown pre, .gr-markdown code {
954
+ background-color: rgba(0,0,0,0.05);
955
+ padding: 2px 4px;
956
+ border-radius: 3px;
957
+ font-family: monospace;
958
+ }
959
+ .gr-markdown table {
960
+ border-collapse: collapse;
961
+ margin: 0.5em 0;
962
+ }
963
+ .gr-markdown th, .gr-markdown td {
964
+ border: 1px solid #ddd;
965
+ padding: 4px 8px;
966
+ }
967
  """
968
 
969
  with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
970
+ # Remove KaTeX loading script since we're using Gradio's native Markdown with LaTeX
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
971
 
972
  current_dataset_state = gr.State(value=AVAILABLE_DATASETS[0] if AVAILABLE_DATASETS else "")
973
  current_model_state = gr.State(value=None)
 
1032
  with gr.Column(scale=3, min_width=400):
1033
  with gr.Tabs():
1034
  with gr.TabItem("Problem Statement"):
1035
+ problem_markdown_output = gr.Markdown(
1036
+ "Please fill in all the fields.",
1037
+ latex_delimiters=[
1038
+ {"left": "$", "right": "$", "display": False},
1039
+ {"left": "$$", "right": "$$", "display": True},
1040
+ {"left": "\\(", "right": "\\)", "display": False},
1041
+ {"left": "\\[", "right": "\\]", "display": True}
1042
+ ]
1043
+ )
1044
  with gr.TabItem("Reference Answer"):
1045
+ answer_markdown_output = gr.Markdown(
1046
+ "No answer available.",
1047
+ latex_delimiters=[
1048
+ {"left": "$", "right": "$", "display": False},
1049
+ {"left": "$$", "right": "$$", "display": True},
1050
+ {"left": "\\(", "right": "\\)", "display": False},
1051
+ {"left": "\\[", "right": "\\]", "display": True}
1052
+ ]
1053
+ )
1054
 
1055
  # 样本网格
1056
  samples_grid_output = gr.HTML("")
 
1068
  every=0.5
1069
  )
1070
 
1071
+ # 样本内容显示区域 - 使用HTML和Markdown组件分别显示元数据和响应内容
1072
+ sample_metadata_output = gr.HTML(
1073
  value="<div>Select a problem first to view samples.</div>",
1074
+ elem_classes="sample-metadata dark-mode-bg-secondary",
1075
+ elem_id="sample-metadata-area"
1076
+ )
1077
+
1078
+ sample_response_output = gr.Markdown(
1079
+ value="Select a problem first to view samples.",
1080
+ elem_classes="sample-response dark-mode-bg-secondary",
1081
+ elem_id="sample-response-area",
1082
+ latex_delimiters=[
1083
+ {"left": "$", "right": "$", "display": False},
1084
+ {"left": "$$", "right": "$$", "display": True},
1085
+ {"left": "\\(", "right": "\\)", "display": False},
1086
+ {"left": "\\[", "right": "\\]", "display": True}
1087
+ ]
1088
  )
1089
 
1090
  with gr.TabItem("Model Comparison"):
 
1112
  with gr.Column(scale=1):
1113
  with gr.Tabs():
1114
  with gr.TabItem("Problem Statement"):
1115
+ comp_problem_markdown_output = gr.Markdown(
1116
+ "Please select models and problem.",
1117
+ latex_delimiters=[
1118
+ {"left": "$", "right": "$", "display": False},
1119
+ {"left": "$$", "right": "$$", "display": True},
1120
+ {"left": "\\(", "right": "\\)", "display": False},
1121
+ {"left": "\\[", "right": "\\]", "display": True}
1122
+ ]
1123
+ )
1124
  with gr.TabItem("Reference Answer"):
1125
+ comp_answer_markdown_output = gr.Markdown(
1126
+ "No answer available.",
1127
+ latex_delimiters=[
1128
+ {"left": "$", "right": "$", "display": False},
1129
+ {"left": "$$", "right": "$$", "display": True},
1130
+ {"left": "\\(", "right": "\\)", "display": False},
1131
+ {"left": "\\[", "right": "\\]", "display": True}
1132
+ ]
1133
+ )
1134
 
1135
  # 左右两部分模型比较
1136
  with gr.Row(variant='compact'):
 
1162
  every=0.5
1163
  )
1164
 
1165
+ # 样本内容显示区域 - 使用HTML和Markdown组件分别显示元数据和响应内容
1166
+ comp_sample_metadata_output_left = gr.HTML(
1167
  value="<div>Select a problem first to view samples.</div>",
1168
+ elem_classes="sample-metadata dark-mode-bg-secondary",
1169
+ elem_id="comp-sample-metadata-area-left"
1170
+ )
1171
+
1172
+ comp_sample_response_output_left = gr.Markdown(
1173
+ value="Select a problem first to view samples.",
1174
+ elem_classes="sample-response dark-mode-bg-secondary",
1175
+ elem_id="comp-sample-response-area-left",
1176
+ latex_delimiters=[
1177
+ {"left": "$", "right": "$", "display": False},
1178
+ {"left": "$$", "right": "$$", "display": True},
1179
+ {"left": "\\(", "right": "\\)", "display": False},
1180
+ {"left": "\\[", "right": "\\]", "display": True}
1181
+ ]
1182
  )
1183
 
1184
  # 右侧模型
 
1209
  every=0.5
1210
  )
1211
 
1212
+ # 样本内容显示区域 - 使用HTML和Markdown组件分别显示元数据和响应内容
1213
+ comp_sample_metadata_output_right = gr.HTML(
1214
  value="<div>Select a problem first to view samples.</div>",
1215
+ elem_classes="sample-metadata dark-mode-bg-secondary",
1216
+ elem_id="comp-sample-metadata-area-right"
1217
+ )
1218
+
1219
+ comp_sample_response_output_right = gr.Markdown(
1220
+ value="Select a problem first to view samples.",
1221
+ elem_classes="sample-response dark-mode-bg-secondary",
1222
+ elem_id="comp-sample-response-area-right",
1223
+ latex_delimiters=[
1224
+ {"left": "$", "right": "$", "display": False},
1225
+ {"left": "$$", "right": "$$", "display": True},
1226
+ {"left": "\\(", "right": "\\)", "display": False},
1227
+ {"left": "\\[", "right": "\\]", "display": True}
1228
+ ]
1229
  )
1230
 
1231
  # --- Event Handlers ---
 
1265
  gr.Dropdown(choices=comp_model_choices if comp_model_choices else [], value=None)
1266
 
1267
  def update_problem_grid_and_stats(selected_model_formatted, selected_dataset, mode='default'):
 
1268
  if not selected_model_formatted or not selected_dataset:
1269
  # Return empty/default values for all outputs, including the state
1270
  return gr.DataFrame(value=[]), gr.HTML("<div>Please select a model and dataset first.</div>"), None
 
1282
  problem_list = db.get_problems_by_model_dataset(model_name, selected_dataset)
1283
  grid_html = create_problem_grid_html(problem_list, mode=mode)
1284
 
 
1285
  # Correctly return the actual value for the current_model_state output
1286
  return gr.DataFrame(value=stats_data), gr.HTML(value=grid_html), model_name
1287
 
 
1300
  inputs=[],
1301
  outputs=[sample_number_input]
1302
  ).then(
1303
# Reset the problem text, answer text, sample grid, samples state and both
# sample panels to their placeholder values (same order as `outputs`).
# The samples state receives a plain empty list rather than `gr.State([])`:
# a value returned for a State output is stored as the session value itself,
# so returning a component would leave a `gr.State` object where the sample
# handlers expect a list.
lambda: ("Please fill in all the fields.", "No answer available.", "", [], "<div>Select a problem first to view samples.</div>", ""),
1304
  inputs=[],
1305
+ outputs=[problem_markdown_output, answer_markdown_output, samples_grid_output, current_samples_data_state, sample_metadata_output, sample_response_output]
1306
  )
1307
 
1308
  # Initial population of model dropdowns based on default dataset
 
1315
  inputs=[current_dataset_state],
1316
  outputs=[model_stats_df, problem_grid_html_output, current_dataset_state]
1317
  ).then(
1318
# Clear the problem text, answer text, sample grid, samples state and both
# sample panels back to placeholders (same order as `outputs`).
# The samples state is reset with a plain `[]` instead of `gr.State([])`:
# whatever is returned for a State output becomes the stored session value,
# so a component instance here would later reach the sample handlers in
# place of the expected list.
lambda: ("Please fill in all the fields.", "No answer available.", "", [], "<div>Select a problem first to view samples.</div>", ""),
1319
  inputs=[],
1320
+ outputs=[problem_markdown_output, answer_markdown_output, samples_grid_output, current_samples_data_state, sample_metadata_output, sample_response_output]
1321
  ).then(
1322
  # 重置Sample Number为0
1323
  fn=lambda: "0",
 
1381
  problem_content, answer_content, samples_grid_html, new_samples_data = handle_problem_select_comparison(current_problem_id, new_model_state, current_dataset)
1382
 
1383
  # 获取第一个样本的内容
1384
+ first_metadata, first_response = handle_first_sample(new_samples_data)
1385
 
1386
+ return grid_html, new_model_state, problem_content, answer_content, samples_grid_html, new_samples_data, first_metadata, first_response
1387
  else:
1388
  # 没有问题ID,只返回更新的模型状态
1389
+ return grid_html, new_model_state, "Please enter a problem ID.", "No answer available.", "", gr.State([]), "<div>Select a problem first to view samples.</div>", ""
1390
 
1391
  # 修改model_dropdown的处理函数,以重新查询当前问题响应 - 比较页面右侧
1392
  def update_model_and_requery_problem_right(model_dropdown_value, current_dataset, current_problem_id):
 
1399
  _, _, samples_grid_html, new_samples_data = handle_problem_select_comparison(current_problem_id, new_model_state, current_dataset)
1400
 
1401
  # 获取第一个样本的内容
1402
+ first_metadata, first_response = handle_first_sample(new_samples_data)
1403
 
1404
+ return grid_html, new_model_state, samples_grid_html, new_samples_data, first_metadata, first_response
1405
  else:
1406
  # 没有问题ID,只返回更新的模型状态
1407
+ return grid_html, new_model_state, "", gr.State([]), "<div>Select a problem first to view samples.</div>", ""
1408
 
1409
  # 左侧模型选择事件
1410
  comp_model_dropdown_left.change(
1411
  fn=update_model_and_requery_problem_left,
1412
  inputs=[comp_model_dropdown_left, comp_dataset_state, comp_problem_state_input],
1413
+ outputs=[comp_problem_grid_html_output_left, comp_model_state_left, comp_problem_markdown_output, comp_answer_markdown_output, comp_samples_grid_output_left, comp_samples_data_state_left, comp_sample_metadata_output_left, comp_sample_response_output_left]
1414
  ).then(
1415
  # 重置Sample Number为0
1416
  fn=lambda: "0",
 
1422
  comp_model_dropdown_right.change(
1423
  fn=update_model_and_requery_problem_right,
1424
  inputs=[comp_model_dropdown_right, comp_dataset_state, comp_problem_state_input],
1425
+ outputs=[comp_problem_grid_html_output_right, comp_model_state_right, comp_samples_grid_output_right, comp_samples_data_state_right, comp_sample_metadata_output_right, comp_sample_response_output_right]
1426
  ).then(
1427
  # 重置Sample Number为0
1428
  fn=lambda: "0",
 
1430
  outputs=[comp_sample_number_input_right]
1431
  )
1432
 
1433
+ # 左侧样本选择
1434
+ comp_sample_number_input_left.change(
1435
+ fn=handle_sample_select,
1436
+ inputs=[comp_sample_number_input_left, comp_samples_data_state_left],
1437
+ outputs=[comp_sample_metadata_output_left, comp_sample_response_output_left]
1438
+ )
1439
+
1440
+ # 右侧样本选择
1441
+ comp_sample_number_input_right.change(
1442
+ fn=handle_sample_select,
1443
+ inputs=[comp_sample_number_input_right, comp_samples_data_state_right],
1444
+ outputs=[comp_sample_metadata_output_right, comp_sample_response_output_right]
1445
+ )
1446
+
1447
+ # 为比较页面问题选择事件添加处理
1448
  comp_problem_state_input.change(
1449
  fn=handle_problem_select_comparison,
1450
  inputs=[comp_problem_state_input, comp_model_state_left, comp_dataset_state],
 
1457
  ).then(
1458
  fn=handle_first_sample,
1459
  inputs=[comp_samples_data_state_left],
1460
+ outputs=[comp_sample_metadata_output_left, comp_sample_response_output_left]
1461
  )
1462
 
1463
  # 问题选择事件 - 右侧模型
 
1473
  ).then(
1474
  fn=handle_first_sample,
1475
  inputs=[comp_samples_data_state_right],
1476
+ outputs=[comp_sample_metadata_output_right, comp_sample_response_output_right]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1477
  )
1478
 
1479
  # This is the crucial link: problem_state_input is changed by user, triggers this Python callback.
 
1480
  problem_state_input.change(
1481
  fn=handle_problem_select,
1482
  inputs=[problem_state_input, current_model_state, current_dataset_state],
 
1489
  ).then(
1490
  fn=handle_first_sample,
1491
  inputs=[current_samples_data_state],
1492
+ outputs=[sample_metadata_output, sample_response_output]
1493
  )
1494
 
1495
  # Also listen for direct input event which may be more reliable than change
 
1505
  ).then(
1506
  fn=handle_first_sample,
1507
  inputs=[current_samples_data_state],
1508
+ outputs=[sample_metadata_output, sample_response_output]
1509
  )
1510
 
1511
  # 添加样本编号的事件处理
1512
  sample_number_input.change(
1513
  fn=handle_sample_select,
1514
  inputs=[sample_number_input, current_samples_data_state],
1515
+ outputs=[sample_metadata_output, sample_response_output]
1516
  )
1517
 
1518
  sample_number_input.input(
1519
  fn=handle_sample_select,
1520
  inputs=[sample_number_input, current_samples_data_state],
1521
+ outputs=[sample_metadata_output, sample_response_output]
1522
  )
1523
 
1524
  # 修改model_dropdown.change处理函数,以重新查询当前问题响应
 
1531
  problem_content, answer_content, samples_grid_html, new_samples_data = handle_problem_select(current_problem_id, new_model_state, current_dataset)
1532
 
1533
  # 获取第一个样本的内容
1534
+ first_metadata, first_response = handle_first_sample(new_samples_data)
1535
 
1536
+ return stats_df, grid_html, new_model_state, problem_content, answer_content, samples_grid_html, new_samples_data, first_metadata, first_response
1537
  else:
1538
  # 没有问题ID,只返回更新的模型状态
1539
+ return stats_df, grid_html, new_model_state, "Please fill in all the fields.", "No answer available.", "", gr.State([]), "<div>Select a problem first to view samples.</div>", ""
1540
 
1541
  model_dropdown.change(
1542
  fn=update_model_and_requery_problem,
1543
  inputs=[model_dropdown, current_dataset_state, problem_state_input],
1544
+ outputs=[model_stats_df, problem_grid_html_output, current_model_state, problem_markdown_output, answer_markdown_output, samples_grid_output, current_samples_data_state, sample_metadata_output, sample_response_output]
1545
  ).then(
1546
  # 重置Sample Number为0
1547
  fn=lambda: "0",
 
1562
 
1563
  # 如果内存使用超过12GB (激进设置),清理缓存
1564
  if memory_usage_mb > 12000: # 12GB
 
1565
  if db:
1566
  db.clear_cache('response') # 优先清理响应缓存
1567
  gc.collect()
1568
  # 如果内存使用超过14GB,更激进地清理
1569
  if memory_usage_mb > 14000: # 14GB
 
1570
  if db:
1571
  db.clear_cache() # 清理所有缓存
1572
  gc.collect()
1573
 
1574
  return f"Memory: {memory_usage_mb:.1f} MB"
1575
  except Exception as e:
 
1576
  return "Memory monitor error"
1577
 
1578
  # 修改主函数以使用优化策略
 
1581
 
1582
  # 检查数据库文件是否存在,如果不存在则从 Hugging Face 下载
1583
  if not os.path.exists(DB_PATH):
 
1584
  try:
1585
  # 从环境变量获取 HF_TOKEN
1586
  hf_token = os.environ.get("HF_TOKEN")
 
1594
  repo_type="dataset",
1595
  token=hf_token
1596
  )
 
1597
  except Exception as e:
 
1598
  # 创建一个显示错误信息的简单 Gradio 应用
1599
  with gr.Blocks() as error_demo:
1600
  gr.Markdown(f"# Error: Database Download Failed\n{str(e)}\nPlease ensure HF_TOKEN is set correctly and try again.")
 
1602
  exit(1)
1603
 
1604
  if os.path.exists(DB_PATH):
 
 
 
 
1605
  # 创建UI并启动
 
1606
  db = ModelDatabase(DB_PATH)
1607
 
1608
  # 添加清理函数
1609
  def cleanup():
1610
  global db
1611
  if db:
 
1612
  db.close()
1613
 
1614
  # 注册清理函数
 
1625
  inbrowser=False
1626
  )
1627
  else:
1628
+ # 创建一个显示错误信息的简单 Gradio 应用
 
1629
  with gr.Blocks() as error_demo:
1630
  gr.Markdown(f"# Error: Database Not Found\nCould not find `{DB_PATH}`. Please ensure the database file is correctly placed and accessible.")
1631
  error_demo.launch(server_name="0.0.0.0")