akera committed
Commit 2644208 (verified) · Parent(s): 4a955b1

Update src/plotting.py

Files changed (1): src/plotting.py (+276, −338)
src/plotting.py CHANGED
@@ -17,6 +17,7 @@ from config import (
     MODEL_CATEGORIES,
     CHART_CONFIG,
     STATISTICAL_CONFIG,
+    SAMPLE_SIZE_RECOMMENDATIONS,
 )
 
 # Scientific plotting style
@@ -34,58 +35,50 @@ def create_scientific_leaderboard_plot(
     df: pd.DataFrame, track: str, metric: str = "quality", top_n: int = 15
 ) -> go.Figure:
     """Create scientific leaderboard plot with confidence intervals."""
-
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(
             text="No models available for this track",
-            xref="paper",
-            yref="paper",
-            x=0.5,
-            y=0.5,
-            showarrow=False,
-            font=dict(size=16),
+            xref="paper", yref="paper",
+            x=0.5, y=0.5, showarrow=False,
+            font=dict(size=16)
         )
         fig.update_layout(title=f"No Data Available - {track.title()} Track")
         return fig
-
+
     # Get top N models for this track
     metric_col = f"{track}_{metric}"
     ci_lower_col = f"{track}_ci_lower"
     ci_upper_col = f"{track}_ci_upper"
-
+
     if metric_col not in df.columns:
         fig = go.Figure()
         fig.add_annotation(
             text=f"Metric {metric} not available for {track} track",
-            xref="paper",
-            yref="paper",
-            x=0.5,
-            y=0.5,
-            showarrow=False,
+            xref="paper", yref="paper",
+            x=0.5, y=0.5, showarrow=False,
         )
         return fig
-
+
     # Filter and sort
     valid_models = df[(df[metric_col] > 0)].head(top_n)
-
+
     if valid_models.empty:
         fig = go.Figure()
         fig.add_annotation(text="No valid models found", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     # Create color mapping by category
     category_colors = {}
     for i, category in enumerate(MODEL_CATEGORIES.keys()):
         category_colors[category] = MODEL_CATEGORIES[category]["color"]
-
-    colors = [
-        category_colors.get(cat, "#808080") for cat in valid_models["model_category"]
-    ]
-
+
+    colors = [category_colors.get(cat, "#808080") for cat in valid_models["model_category"]]
+
     # Main bar plot
     fig = go.Figure()
-
+
     # Add bars with error bars if confidence intervals available
     if ci_lower_col in valid_models.columns and ci_upper_col in valid_models.columns:
         error_y = dict(
@@ -98,34 +91,30 @@ def create_scientific_leaderboard_plot(
         )
     else:
         error_y = None
-
-    fig.add_trace(
-        go.Bar(
-            y=valid_models["model_name"],
-            x=valid_models[metric_col],
-            orientation="h",
-            marker=dict(color=colors, line=dict(color="black", width=0.5)),
-            error_x=error_y,
-            text=[f"{score:.3f}" for score in valid_models[metric_col]],
-            textposition="auto",
-            hovertemplate=(
-                "<b>%{y}</b><br>"
-                + f"{metric.title()}: %{{x:.4f}}<br>"
-                + "Category: %{customdata[0]}<br>"
-                + "Author: %{customdata[1]}<br>"
-                + "Samples: %{customdata[2]}<br>"
-                + "<extra></extra>"
-            ),
-            customdata=list(
-                zip(
-                    valid_models["model_category"],
-                    valid_models["author"],
-                    valid_models.get(f"{track}_samples", [0] * len(valid_models)),
-                )
-            ),
-        )
-    )
-
+
+    fig.add_trace(go.Bar(
+        y=valid_models["model_name"],
+        x=valid_models[metric_col],
+        orientation="h",
+        marker=dict(color=colors, line=dict(color="black", width=0.5)),
+        error_x=error_y,
+        text=[f"{score:.3f}" for score in valid_models[metric_col]],
+        textposition="auto",
+        hovertemplate=(
+            "<b>%{y}</b><br>" +
+            f"{metric.title()}: %{{x:.4f}}<br>" +
+            "Category: %{customdata[0]}<br>" +
+            "Author: %{customdata[1]}<br>" +
+            "Samples: %{customdata[2]}<br>" +
+            "<extra></extra>"
        ),
+        customdata=list(zip(
+            valid_models["model_category"],
+            valid_models["author"],
+            valid_models.get(f"{track}_samples", [0] * len(valid_models))
+        )),
+    ))
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -138,24 +127,21 @@ def create_scientific_leaderboard_plot(
         paper_bgcolor="white",
         font=dict(size=12),
     )
-
+
     # Reverse y-axis to show best model at top
     fig.update_yaxes(autorange="reversed")
-
+
     # Add category legend
     for category, info in MODEL_CATEGORIES.items():
         if category in valid_models["model_category"].values:
-            fig.add_trace(
-                go.Scatter(
-                    x=[None],
-                    y=[None],
-                    mode="markers",
-                    marker=dict(size=10, color=info["color"]),
-                    name=info["name"],
-                    showlegend=True,
-                )
-            )
-
+            fig.add_trace(go.Scatter(
+                x=[None], y=[None],
+                mode="markers",
+                marker=dict(size=10, color=info["color"]),
+                name=info["name"],
+                showlegend=True,
+            ))
+
     return fig
 
 
@@ -163,63 +149,57 @@ def create_language_pair_heatmap_scientific(
     model_results: Dict, track: str, metric: str = "quality_score"
 ) -> go.Figure:
     """Create research-grade language pair heatmap with proper axes."""
-
+
     if not model_results or "tracks" not in model_results:
         fig = go.Figure()
-        fig.add_annotation(
-            text="No model results available", x=0.5, y=0.5, showarrow=False
-        )
+        fig.add_annotation(text="No model results available", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     track_data = model_results["tracks"].get(track, {})
     if track_data.get("error") or "pair_metrics" not in track_data:
         fig = go.Figure()
-        fig.add_annotation(
-            text=f"No data available for {track} track", x=0.5, y=0.5, showarrow=False
-        )
+        fig.add_annotation(text=f"No data available for {track} track", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     pair_metrics = track_data["pair_metrics"]
     track_languages = EVALUATION_TRACKS[track]["languages"]
-
+
     # Create matrix for heatmap
     n_langs = len(track_languages)
     matrix = np.full((n_langs, n_langs), np.nan)
-
+
     for i, src_lang in enumerate(track_languages):
         for j, tgt_lang in enumerate(track_languages):
             if src_lang != tgt_lang:
                 pair_key = f"{src_lang}_to_{tgt_lang}"
                 if pair_key in pair_metrics and metric in pair_metrics[pair_key]:
                     matrix[i, j] = pair_metrics[pair_key][metric]["mean"]
-
+
     # Create language labels
     lang_labels = [LANGUAGE_NAMES.get(lang, lang.upper()) for lang in track_languages]
-
+
     # Create heatmap
-    fig = go.Figure(
-        data=go.Heatmap(
-            z=matrix,
-            x=lang_labels,
-            y=lang_labels,
-            colorscale="Viridis",
-            showscale=True,
-            colorbar=dict(
-                title=f"{metric.replace('_', ' ').title()}",
-                titleside="right",
-                len=0.8,
-            ),
-            hovertemplate=(
-                "Source: %{y}<br>"
-                + "Target: %{x}<br>"
-                + f"{metric.replace('_', ' ').title()}: %{{z:.3f}}<br>"
-                + "<extra></extra>"
-            ),
-            zmin=0,
-            zmax=1 if metric == "quality_score" else None,
-        )
-    )
-
+    fig = go.Figure(data=go.Heatmap(
+        z=matrix,
+        x=lang_labels,
+        y=lang_labels,
+        colorscale="Viridis",
+        showscale=True,
+        colorbar=dict(
+            title=f"{metric.replace('_', ' ').title()}",
+            titleside="right",
+            len=0.8,
+        ),
+        hovertemplate=(
+            "Source: %{y}<br>" +
+            "Target: %{x}<br>" +
+            f"{metric.replace('_', ' ').title()}: %{{z:.3f}}<br>" +
+            "<extra></extra>"
+        ),
+        zmin=0,
+        zmax=1 if metric == "quality_score" else None,
+    ))
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -232,93 +212,87 @@ def create_language_pair_heatmap_scientific(
         xaxis=dict(side="bottom"),
         yaxis=dict(autorange="reversed"),  # Source languages from top to bottom
     )
-
+
     return fig
 
 
 def create_statistical_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
     """Create statistical comparison plot showing confidence intervals."""
-
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     metric_col = f"{track}_quality"
     ci_lower_col = f"{track}_ci_lower"
     ci_upper_col = f"{track}_ci_upper"
-
+
     # Filter to models with data for this track
     valid_models = df[
-        (df[metric_col] > 0) & (df[ci_lower_col].notna()) & (df[ci_upper_col].notna())
+        (df[metric_col] > 0) &
+        (df[ci_lower_col].notna()) &
+        (df[ci_upper_col].notna())
     ].head(10)
-
+
     if valid_models.empty:
         fig = go.Figure()
-        fig.add_annotation(
-            text="No models with confidence intervals", x=0.5, y=0.5, showarrow=False
-        )
+        fig.add_annotation(text="No models with confidence intervals", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     fig = go.Figure()
-
+
     # Add confidence intervals as error bars
     for i, (_, model) in enumerate(valid_models.iterrows()):
         category = model["model_category"]
         color = MODEL_CATEGORIES.get(category, {}).get("color", "#808080")
-
+
         # Main point
-        fig.add_trace(
-            go.Scatter(
-                x=[model[metric_col]],
-                y=[i],
-                mode="markers",
-                marker=dict(
-                    size=12,
-                    color=color,
-                    line=dict(color="black", width=1),
-                ),
-                name=model["model_name"],
-                showlegend=False,
-                hovertemplate=(
-                    f"<b>{model['model_name']}</b><br>"
-                    + f"Quality: {model[metric_col]:.4f}<br>"
-                    + f"95% CI: [{model[ci_lower_col]:.4f}, {model[ci_upper_col]:.4f}]<br>"
-                    + f"Category: {category}<br>"
-                    + "<extra></extra>"
-                ),
-            )
-        )
-
+        fig.add_trace(go.Scatter(
+            x=[model[metric_col]],
+            y=[i],
+            mode="markers",
+            marker=dict(
+                size=12,
+                color=color,
+                line=dict(color="black", width=1),
+            ),
+            name=model["model_name"],
+            showlegend=False,
+            hovertemplate=(
+                f"<b>{model['model_name']}</b><br>" +
+                f"Quality: {model[metric_col]:.4f}<br>" +
+                f"95% CI: [{model[ci_lower_col]:.4f}, {model[ci_upper_col]:.4f}]<br>" +
+                f"Category: {category}<br>" +
+                "<extra></extra>"
            ),
+        ))
+
         # Confidence interval line
-        fig.add_trace(
-            go.Scatter(
-                x=[model[ci_lower_col], model[ci_upper_col]],
-                y=[i, i],
-                mode="lines",
-                line=dict(color=color, width=3),
-                showlegend=False,
-                hoverinfo="skip",
-            )
-        )
-
+        fig.add_trace(go.Scatter(
+            x=[model[ci_lower_col], model[ci_upper_col]],
+            y=[i, i],
+            mode="lines",
+            line=dict(color=color, width=3),
+            showlegend=False,
+            hoverinfo="skip",
+        ))
+
         # CI endpoints
-        fig.add_trace(
-            go.Scatter(
-                x=[model[ci_lower_col], model[ci_upper_col]],
-                y=[i, i],
-                mode="markers",
-                marker=dict(
-                    symbol="line-ns",
-                    size=10,
-                    color=color,
-                    line=dict(width=2),
-                ),
-                showlegend=False,
-                hoverinfo="skip",
-            )
-        )
-
+        fig.add_trace(go.Scatter(
+            x=[model[ci_lower_col], model[ci_upper_col]],
+            y=[i, i],
+            mode="markers",
+            marker=dict(
+                symbol="line-ns",
+                size=10,
+                color=color,
+                line=dict(width=2),
+            ),
+            showlegend=False,
+            hoverinfo="skip",
+        ))
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -336,56 +310,52 @@ def create_statistical_comparison_plot(df: pd.DataFrame, track: str) -> go.Figur
         plot_bgcolor="white",
         paper_bgcolor="white",
     )
-
+
     return fig
 
 
 def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
     """Create category-wise comparison plot."""
-
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     metric_col = f"{track}_quality"
     adequate_col = f"{track}_adequate"
-
+
     # Filter to adequate models
     valid_models = df[df[adequate_col] & (df[metric_col] > 0)]
-
+
     if valid_models.empty:
         fig = go.Figure()
-        fig.add_annotation(
-            text="No adequate models found", x=0.5, y=0.5, showarrow=False
-        )
+        fig.add_annotation(text="No adequate models found", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     fig = go.Figure()
-
+
     # Create box plot for each category
     for category, info in MODEL_CATEGORIES.items():
         category_models = valid_models[valid_models["model_category"] == category]
-
+
         if len(category_models) > 0:
-            fig.add_trace(
-                go.Box(
-                    y=category_models[metric_col],
-                    name=info["name"],
-                    marker_color=info["color"],
-                    boxpoints="all",  # Show all points
-                    jitter=0.3,
-                    pointpos=-1.8,
-                    hovertemplate=(
-                        f"<b>{info['name']}</b><br>"
-                        + "Quality: %{y:.4f}<br>"
-                        + "Model: %{customdata}<br>"
-                        + "<extra></extra>"
-                    ),
-                    customdata=category_models["model_name"],
-                )
-            )
-
+            fig.add_trace(go.Box(
+                y=category_models[metric_col],
+                name=info["name"],
+                marker_color=info["color"],
+                boxpoints="all",  # Show all points
+                jitter=0.3,
+                pointpos=-1.8,
+                hovertemplate=(
+                    f"<b>{info['name']}</b><br>" +
+                    "Quality: %{y:.4f}<br>" +
+                    "Model: %{customdata}<br>" +
+                    "<extra></extra>"
                ),
+                customdata=category_models["model_name"],
+            ))
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -397,202 +367,183 @@ def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
         plot_bgcolor="white",
         paper_bgcolor="white",
     )
-
+
     return fig
 
 
 def create_adequacy_analysis_plot(df: pd.DataFrame) -> go.Figure:
     """Create analysis plot for statistical adequacy across tracks."""
-
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     fig = make_subplots(
-        rows=2,
-        cols=2,
+        rows=2, cols=2,
         subplot_titles=(
             "Sample Sizes by Track",
-            "Statistical Adequacy Distribution",
+            "Statistical Adequacy Distribution",
             "Scientific Adequacy Scores",
-            "Model Categories Distribution",
+            "Model Categories Distribution"
        ),
         specs=[
             [{"type": "bar"}, {"type": "pie"}],
-            [{"type": "histogram"}, {"type": "bar"}],
-        ],
+            [{"type": "histogram"}, {"type": "bar"}]
+        ]
     )
-
+
     # Sample sizes by track
     track_names = []
     sample_counts = []
-
+
     for track in EVALUATION_TRACKS.keys():
         samples_col = f"{track}_samples"
         if samples_col in df.columns:
             total_samples = df[df[samples_col] > 0][samples_col].sum()
             track_names.append(track.replace("_", " ").title())
            sample_counts.append(total_samples)
-
+
     if track_names:
         fig.add_trace(
-            go.Bar(x=track_names, y=sample_counts, name="Samples"), row=1, col=1
+            go.Bar(x=track_names, y=sample_counts, name="Samples"),
+            row=1, col=1
         )
-
+
     # Statistical adequacy distribution
     adequacy_bins = pd.cut(
-        df["scientific_adequacy_score"],
+        df["scientific_adequacy_score"],
         bins=[0, 0.3, 0.6, 0.8, 1.0],
-        labels=["Poor", "Fair", "Good", "Excellent"],
+        labels=["Poor", "Fair", "Good", "Excellent"]
    )
     adequacy_counts = adequacy_bins.value_counts()
-
+
     if not adequacy_counts.empty:
         fig.add_trace(
             go.Pie(
                 labels=adequacy_counts.index,
                 values=adequacy_counts.values,
-                name="Adequacy",
+                name="Adequacy"
             ),
-            row=1,
-            col=2,
+            row=1, col=2
        )
-
+
     # Scientific adequacy scores histogram
     fig.add_trace(
         go.Histogram(
-            x=df["scientific_adequacy_score"], nbinsx=20, name="Adequacy Scores"
+            x=df["scientific_adequacy_score"],
+            nbinsx=20,
+            name="Adequacy Scores"
         ),
-        row=2,
-        col=1,
+        row=2, col=1
    )
-
+
     # Model categories distribution
     category_counts = df["model_category"].value_counts()
-    category_colors = [
-        MODEL_CATEGORIES.get(cat, {}).get("color", "#808080")
-        for cat in category_counts.index
-    ]
-
+    category_colors = [MODEL_CATEGORIES.get(cat, {}).get("color", "#808080") for cat in category_counts.index]
+
     fig.add_trace(
         go.Bar(
             x=category_counts.index,
            y=category_counts.values,
             marker_color=category_colors,
-            name="Categories",
+            name="Categories"
         ),
-        row=2,
-        col=2,
+        row=2, col=2
    )
-
+
     fig.update_layout(
-        title="📊 Scientific Evaluation Analysis", height=800, showlegend=False
+        title="📊 Scientific Evaluation Analysis",
+        height=800,
+        showlegend=False
    )
-
+
     return fig
 
 
 def create_cross_track_analysis_plot(df: pd.DataFrame) -> go.Figure:
     """Create cross-track performance correlation analysis."""
-
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     # Get models with data in multiple tracks
     quality_cols = [f"{track}_quality" for track in EVALUATION_TRACKS.keys()]
     available_cols = [col for col in quality_cols if col in df.columns]
-
+
     if len(available_cols) < 2:
         fig = go.Figure()
-        fig.add_annotation(
-            text="Need at least 2 tracks for comparison", x=0.5, y=0.5, showarrow=False
-        )
+        fig.add_annotation(text="Need at least 2 tracks for comparison", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     # Filter to models with data in multiple tracks
     multi_track_models = df.copy()
     for col in available_cols:
         multi_track_models = multi_track_models[multi_track_models[col] > 0]
-
+
     if len(multi_track_models) < 3:
         fig = go.Figure()
-        fig.add_annotation(
-            text="Insufficient models for cross-track analysis",
-            x=0.5,
-            y=0.5,
-            showarrow=False,
-        )
+        fig.add_annotation(text="Insufficient models for cross-track analysis", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     # Create scatter plot matrix
-    track_pairs = [
-        (available_cols[i], available_cols[j])
-        for i in range(len(available_cols))
-        for j in range(i + 1, len(available_cols))
-    ]
-
+    track_pairs = [(available_cols[i], available_cols[j])
+                   for i in range(len(available_cols))
+                   for j in range(i+1, len(available_cols))]
+
     if not track_pairs:
         fig = go.Figure()
-        fig.add_annotation(
-            text="No track pairs available", x=0.5, y=0.5, showarrow=False
-        )
+        fig.add_annotation(text="No track pairs available", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     # Use first pair for demonstration
     x_col, y_col = track_pairs[0]
     x_track = x_col.replace("_quality", "").replace("_", " ").title()
     y_track = y_col.replace("_quality", "").replace("_", " ").title()
-
+
     fig = go.Figure()
-
+
     # Color by category
     for category, info in MODEL_CATEGORIES.items():
-        category_models = multi_track_models[
-            multi_track_models["model_category"] == category
-        ]
-
+        category_models = multi_track_models[multi_track_models["model_category"] == category]
+
         if len(category_models) > 0:
-            fig.add_trace(
-                go.Scatter(
-                    x=category_models[x_col],
-                    y=category_models[y_col],
-                    mode="markers",
-                    marker=dict(
-                        size=10,
-                        color=info["color"],
-                        line=dict(color="black", width=1),
-                    ),
-                    name=info["name"],
-                    text=category_models["model_name"],
-                    hovertemplate=(
-                        "<b>%{text}</b><br>"
-                        + f"{x_track}: %{{x:.4f}}<br>"
-                        + f"{y_track}: %{{y:.4f}}<br>"
-                        + f"Category: {info['name']}<br>"
-                        + "<extra></extra>"
-                    ),
-                )
-            )
-
+            fig.add_trace(go.Scatter(
+                x=category_models[x_col],
+                y=category_models[y_col],
+                mode="markers",
+                marker=dict(
+                    size=10,
+                    color=info["color"],
+                    line=dict(color="black", width=1),
+                ),
+                name=info["name"],
+                text=category_models["model_name"],
+                hovertemplate=(
+                    "<b>%{text}</b><br>" +
+                    f"{x_track}: %{{x:.4f}}<br>" +
+                    f"{y_track}: %{{y:.4f}}<br>" +
+                    f"Category: {info['name']}<br>" +
+                    "<extra></extra>"
                ),
+            ))
+
     # Add diagonal line for reference
     min_val = min(multi_track_models[x_col].min(), multi_track_models[y_col].min())
     max_val = max(multi_track_models[x_col].max(), multi_track_models[y_col].max())
-
-    fig.add_trace(
-        go.Scatter(
-            x=[min_val, max_val],
-            y=[min_val, max_val],
-            mode="lines",
-            line=dict(dash="dash", color="gray", width=2),
-            name="Perfect Correlation",
-            showlegend=False,
-            hoverinfo="skip",
-        )
-    )
-
+
+    fig.add_trace(go.Scatter(
+        x=[min_val, max_val],
+        y=[min_val, max_val],
+        mode="lines",
+        line=dict(dash="dash", color="gray", width=2),
+        name="Perfect Correlation",
+        showlegend=False,
+        hoverinfo="skip",
+    ))
+
     fig.update_layout(
         title=f"🔄 Cross-Track Performance: {x_track} vs {y_track}",
         xaxis_title=f"{x_track} Quality Score",
@@ -602,82 +553,71 @@ def create_cross_track_analysis_plot(df: pd.DataFrame) -> go.Figure:
         plot_bgcolor="white",
         paper_bgcolor="white",
     )
-
+
     return fig
 
 
-def create_scientific_model_detail_plot(
-    model_results: Dict, model_name: str, track: str
-) -> go.Figure:
+def create_scientific_model_detail_plot(model_results: Dict, model_name: str, track: str) -> go.Figure:
     """Create detailed scientific analysis for a specific model."""
-
+
     if not model_results or "tracks" not in model_results:
         fig = go.Figure()
-        fig.add_annotation(
-            text="No model results available", x=0.5, y=0.5, showarrow=False
-        )
+        fig.add_annotation(text="No model results available", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     track_data = model_results["tracks"].get(track, {})
     if track_data.get("error") or "pair_metrics" not in track_data:
         fig = go.Figure()
-        fig.add_annotation(
-            text=f"No data for {track} track", x=0.5, y=0.5, showarrow=False
-        )
+        fig.add_annotation(text=f"No data for {track} track", x=0.5, y=0.5, showarrow=False)
        return fig
-
+
     pair_metrics = track_data["pair_metrics"]
     track_languages = EVALUATION_TRACKS[track]["languages"]
-
+
     # Extract data for plotting
     pairs = []
     quality_means = []
     quality_cis = []
     bleu_means = []
     sample_counts = []
-
+
     for src in track_languages:
         for tgt in track_languages:
             if src == tgt:
                 continue
-
+
             pair_key = f"{src}_to_{tgt}"
             if pair_key in pair_metrics:
                 metrics = pair_metrics[pair_key]
-
+
                 if "quality_score" in metrics and "sample_count" in metrics:
                     pair_label = f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"
                     pairs.append(pair_label)
-
+
                     quality_stats = metrics["quality_score"]
                     quality_means.append(quality_stats["mean"])
-                    quality_cis.append(
-                        [quality_stats["ci_lower"], quality_stats["ci_upper"]]
-                    )
-
+                    quality_cis.append([quality_stats["ci_lower"], quality_stats["ci_upper"]])
+
                     bleu_stats = metrics.get("bleu", {"mean": 0})
                     bleu_means.append(bleu_stats["mean"])
-
+
                     sample_counts.append(metrics["sample_count"])
-
+
     if not pairs:
         fig = go.Figure()
-        fig.add_annotation(
-            text="No language pair data available", x=0.5, y=0.5, showarrow=False
-        )
+        fig.add_annotation(text="No language pair data available", x=0.5, y=0.5, showarrow=False)
         return fig
-
+
     # Create subplots
     fig = make_subplots(
-        rows=2,
-        cols=1,
+        rows=2, cols=1,
         subplot_titles=(
             "Quality Scores by Language Pair (with 95% CI)",
-            "BLEU Scores by Language Pair",
+            "BLEU Scores by Language Pair"
        ),
         vertical_spacing=0.15,
     )
-
+
     # Quality scores with confidence intervals
     error_y = dict(
         type="data",
@@ -687,7 +627,7 @@ def create_scientific_model_detail_plot(
         thickness=2,
         width=4,
     )
-
+
     fig.add_trace(
         go.Bar(
             x=pairs,
@@ -698,17 +638,16 @@ def create_scientific_model_detail_plot(
             text=[f"{score:.3f}" for score in quality_means],
             textposition="outside",
             hovertemplate=(
-                "<b>%{x}</b><br>"
-                + "Quality: %{y:.4f}<br>"
-                + "Samples: %{customdata}<br>"
-                + "<extra></extra>"
+                "<b>%{x}</b><br>" +
+                "Quality: %{y:.4f}<br>" +
+                "Samples: %{customdata}<br>" +
+                "<extra></extra>"
             ),
             customdata=sample_counts,
         ),
-        row=1,
-        col=1,
+        row=1, col=1
    )
-
+
     # BLEU scores
     fig.add_trace(
         go.Bar(
@@ -719,10 +658,9 @@ def create_scientific_model_detail_plot(
             text=[f"{score:.1f}" for score in bleu_means],
             textposition="outside",
         ),
-        row=2,
-        col=1,
+        row=2, col=1
    )
-
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -731,9 +669,9 @@ def create_scientific_model_detail_plot(
         showlegend=False,
         margin=dict(l=50, r=50, t=100, b=150),
     )
-
+
     # Rotate x-axis labels
     fig.update_xaxes(tickangle=45, row=1, col=1)
     fig.update_xaxes(tickangle=45, row=2, col=1)
-
-    return fig
+
+    return fig
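
For reviewers who want to exercise the reformatted leaderboard function locally, here is a minimal driving sketch. Everything in it that is not visible in the diff is an assumption: the track name "salt" is a placeholder that must match a key of EVALUATION_TRACKS in config.py, the category labels must match MODEL_CATEGORIES keys, and the import path may differ. The column layout itself (model_name, author, model_category, plus the per-track {track}_quality, {track}_ci_lower, {track}_ci_upper and {track}_samples columns) is exactly what the code above reads.

# Hedged usage sketch for create_scientific_leaderboard_plot.
# ASSUMPTIONS: the track name "salt" and the categories "open_source"/"commercial"
# are placeholders; they must match EVALUATION_TRACKS / MODEL_CATEGORIES keys
# defined in config.py, and the import path may be src.plotting instead.
import pandas as pd
from plotting import create_scientific_leaderboard_plot

toy = pd.DataFrame({
    "model_name": ["model-a", "model-b"],
    "author": ["team-x", "team-y"],
    "model_category": ["open_source", "commercial"],
    "salt_quality": [0.71, 0.64],    # read as f"{track}_{metric}"
    "salt_ci_lower": [0.68, 0.60],   # 95% CI bounds drawn as error bars
    "salt_ci_upper": [0.74, 0.68],
    "salt_samples": [500, 480],      # surfaced in the hover template
})

fig = create_scientific_leaderboard_plot(toy, track="salt", metric="quality", top_n=10)
fig.write_html("leaderboard.html")  # or fig.show()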
 
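The per-model views consume a nested model_results dict rather than the leaderboard DataFrame. The shape sketched below is inferred from the lookups in the diff (tracks → track → pair_metrics → "<src>_to_<tgt>" → quality_score → mean/ci_lower/ci_upper, plus sample_count and an optional bleu block); the track name and language codes are placeholders and only render if they appear in EVALUATION_TRACKS[track]["languages"].

# Hedged sketch of the model_results structure read by
# create_language_pair_heatmap_scientific and create_scientific_model_detail_plot.
# ASSUMPTIONS: "salt", "eng" and "lug" are placeholders; real values come from
# EVALUATION_TRACKS / LANGUAGE_NAMES in config.py, and the import path may differ.
from plotting import (
    create_language_pair_heatmap_scientific,
    create_scientific_model_detail_plot,
)

model_results = {
    "tracks": {
        "salt": {
            "pair_metrics": {
                "eng_to_lug": {
                    "quality_score": {"mean": 0.62, "ci_lower": 0.58, "ci_upper": 0.66},
                    "bleu": {"mean": 21.4},   # optional; code falls back to {"mean": 0}
                    "sample_count": 250,
                },
                "lug_to_eng": {
                    "quality_score": {"mean": 0.70, "ci_lower": 0.66, "ci_upper": 0.74},
                    "bleu": {"mean": 27.9},
                    "sample_count": 250,
                },
            },
        },
    },
}

heatmap = create_language_pair_heatmap_scientific(model_results, track="salt")
detail = create_scientific_model_detail_plot(model_results, "model-a", "salt")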