akera committed on
Commit
75faa66
·
verified ·
1 Parent(s): 83243ea

Update src/plotting.py

Browse files
Files changed (1) hide show
  1. src/plotting.py +126 -370
src/plotting.py CHANGED
@@ -1,7 +1,4 @@
1
  # src/plotting.py
2
- import json
3
- import matplotlib.pyplot as plt
4
- import matplotlib.gridspec as gridspec
5
  import plotly.graph_objects as go
6
  import plotly.express as px
7
  from plotly.subplots import make_subplots
@@ -18,25 +15,13 @@ from config import (
18
  EVALUATION_TRACKS,
19
  MODEL_CATEGORIES,
20
  CHART_CONFIG,
21
- STATISTICAL_CONFIG,
22
- SAMPLE_SIZE_RECOMMENDATIONS,
23
  )
24
 
25
- # Scientific plotting style
26
- plt.style.use("default")
27
- plt.rcParams["figure.facecolor"] = "white"
28
- plt.rcParams["axes.facecolor"] = "white"
29
- plt.rcParams["font.size"] = 10
30
- plt.rcParams["axes.labelsize"] = 12
31
- plt.rcParams["axes.titlesize"] = 14
32
- plt.rcParams["xtick.labelsize"] = 10
33
- plt.rcParams["ytick.labelsize"] = 10
34
-
35
 
36
- def create_scientific_leaderboard_plot(
37
  df: pd.DataFrame, track: str, metric: str = "quality", top_n: int = 15
38
  ) -> go.Figure:
39
- """Create scientific leaderboard plot with confidence intervals."""
40
 
41
  if df.empty:
42
  fig = go.Figure()
@@ -46,7 +31,11 @@ def create_scientific_leaderboard_plot(
46
  x=0.5, y=0.5, showarrow=False,
47
  font=dict(size=16)
48
  )
49
- fig.update_layout(title=f"No Data Available - {track.title()} Track")
 
 
 
 
50
  return fig
51
 
52
  # Get top N models for this track
@@ -72,18 +61,15 @@ def create_scientific_leaderboard_plot(
72
  return fig
73
 
74
  # Create color mapping by category
75
- category_colors = {}
76
- for i, category in enumerate(MODEL_CATEGORIES.keys()):
77
- category_colors[category] = MODEL_CATEGORIES[category]["color"]
78
-
79
- colors = [category_colors.get(cat, "#808080") for cat in valid_models["model_category"]]
80
 
81
  # Main bar plot
82
  fig = go.Figure()
83
 
84
  # Add bars with error bars if confidence intervals available
 
85
  if ci_lower_col in valid_models.columns and ci_upper_col in valid_models.columns:
86
- error_y = dict(
87
  type="data",
88
  array=valid_models[ci_upper_col] - valid_models[metric_col],
89
  arrayminus=valid_models[metric_col] - valid_models[ci_lower_col],
@@ -91,15 +77,13 @@ def create_scientific_leaderboard_plot(
91
  thickness=2,
92
  width=4,
93
  )
94
- else:
95
- error_y = None
96
 
97
  fig.add_trace(go.Bar(
98
  y=valid_models["model_name"],
99
  x=valid_models[metric_col],
100
  orientation="h",
101
  marker=dict(color=colors, line=dict(color="black", width=0.5)),
102
- error_x=error_y,
103
  text=[f"{score:.3f}" for score in valid_models[metric_col]],
104
  textposition="auto",
105
  hovertemplate=(
@@ -125,32 +109,21 @@ def create_scientific_leaderboard_plot(
125
  yaxis_title="Models",
126
  height=max(400, len(valid_models) * 35 + 100),
127
  margin=dict(l=20, r=20, t=60, b=20),
128
- plot_bgcolor="white",
129
- paper_bgcolor="white",
130
  font=dict(size=12),
131
  )
132
 
133
  # Reverse y-axis to show best model at top
134
  fig.update_yaxes(autorange="reversed")
135
 
136
- # Add category legend
137
- for category, info in MODEL_CATEGORIES.items():
138
- if category in valid_models["model_category"].values:
139
- fig.add_trace(go.Scatter(
140
- x=[None], y=[None],
141
- mode="markers",
142
- marker=dict(size=10, color=info["color"]),
143
- name=info["name"],
144
- showlegend=True,
145
- ))
146
-
147
  return fig
148
 
149
 
150
- def create_language_pair_heatmap_scientific(
151
  model_results: Dict, track: str, metric: str = "quality_score"
152
  ) -> go.Figure:
153
- """Create research-grade language pair heatmap with proper axes."""
154
 
155
  if not model_results or "tracks" not in model_results:
156
  fig = go.Figure()
@@ -212,14 +185,16 @@ def create_language_pair_heatmap_scientific(
212
  width=700,
213
  font=dict(size=12),
214
  xaxis=dict(side="bottom"),
215
- yaxis=dict(autorange="reversed"), # Source languages from top to bottom
 
 
216
  )
217
 
218
  return fig
219
 
220
 
221
- def create_statistical_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
222
- """Create statistical comparison plot showing confidence intervals."""
223
 
224
  if df.empty:
225
  fig = go.Figure()
@@ -279,26 +254,11 @@ def create_statistical_comparison_plot(df: pd.DataFrame, track: str) -> go.Figur
279
  showlegend=False,
280
  hoverinfo="skip",
281
  ))
282
-
283
- # CI endpoints
284
- fig.add_trace(go.Scatter(
285
- x=[model[ci_lower_col], model[ci_upper_col]],
286
- y=[i, i],
287
- mode="markers",
288
- marker=dict(
289
- symbol="line-ns",
290
- size=10,
291
- color=color,
292
- line=dict(width=2),
293
- ),
294
- showlegend=False,
295
- hoverinfo="skip",
296
- ))
297
 
298
  # Customize layout
299
  track_info = EVALUATION_TRACKS[track]
300
  fig.update_layout(
301
- title=f"📊 {track_info['name']} - Statistical Comparison",
302
  xaxis_title="Quality Score",
303
  yaxis_title="Models",
304
  height=max(400, len(valid_models) * 40 + 100),
@@ -309,371 +269,167 @@ def create_statistical_comparison_plot(df: pd.DataFrame, track: str) -> go.Figur
309
  autorange="reversed",
310
  ),
311
  showlegend=False,
312
- plot_bgcolor="white",
313
- paper_bgcolor="white",
314
  )
315
 
316
  return fig
317
 
318
 
319
def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
    """Create a box plot of quality scores grouped by model category.

    Args:
        df: Leaderboard frame. The columns ``{track}_quality``,
            ``{track}_adequate``, ``model_category`` and ``model_name``
            are read.
        track: Key into ``EVALUATION_TRACKS``; also used as the column-name
            prefix.

    Returns:
        A figure holding one ``go.Box`` per ``MODEL_CATEGORIES`` entry that
        has at least one adequate model with a positive score. When there is
        nothing to plot, a placeholder figure carrying a short message is
        returned instead.
    """

    def _message_figure(message: str) -> go.Figure:
        # Paper coordinates keep the text centred; the default data
        # coordinates depend on whatever range the empty axes pick.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message, x=0.5, y=0.5,
            xref="paper", yref="paper", showarrow=False,
        )
        return placeholder

    if df.empty:
        return _message_figure("No data available")

    metric_col = f"{track}_quality"
    adequate_col = f"{track}_adequate"

    # A frame lacking any of the columns read below cannot be plotted;
    # show the placeholder rather than raising KeyError.
    required = (metric_col, adequate_col, "model_category", "model_name")
    if any(column not in df.columns for column in required):
        return _message_figure("No data available")

    # Keep only models flagged adequate that also have a positive score.
    valid_models = df[df[adequate_col] & (df[metric_col] > 0)]
    if valid_models.empty:
        return _message_figure("No adequate models found")

    fig = go.Figure()

    # One box per configured category that actually has models.
    for category, info in MODEL_CATEGORIES.items():
        category_models = valid_models[valid_models["model_category"] == category]
        if category_models.empty:
            continue

        fig.add_trace(go.Box(
            y=category_models[metric_col],
            name=info["name"],
            marker_color=info["color"],
            boxpoints="all",  # Show every model as an individual point
            jitter=0.3,
            pointpos=-1.8,
            hovertemplate=(
                f"<b>{info['name']}</b><br>" +
                "Quality: %{y:.4f}<br>" +
                "Model: %{customdata}<br>" +
                "<extra></extra>"
            ),
            customdata=category_models["model_name"],
        ))

    track_info = EVALUATION_TRACKS[track]
    fig.update_layout(
        title=f"📈 {track_info['name']} - Performance by Category",
        xaxis_title="Model Category",
        yaxis_title="Quality Score",
        height=500,
        showlegend=False,
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    return fig
374
-
375
-
376
def create_adequacy_analysis_plot(df: pd.DataFrame) -> go.Figure:
    """Create a 2x2 overview of sample sizes, adequacy and categories.

    Panels:
        (1, 1) bar       - summed positive ``{track}_samples`` per track
        (1, 2) pie       - ``scientific_adequacy_score`` binned into
                           Poor / Fair / Good / Excellent
        (2, 1) histogram - raw ``scientific_adequacy_score`` values
        (2, 2) bar       - number of models per ``model_category``

    A panel whose source column is missing from ``df`` is left empty.

    Args:
        df: Leaderboard frame.

    Returns:
        The 2x2 figure, or a placeholder figure when ``df`` is empty.
    """

    if df.empty:
        fig = go.Figure()
        # Paper coordinates keep the message centred on the empty figure.
        fig.add_annotation(
            text="No data available", x=0.5, y=0.5,
            xref="paper", yref="paper", showarrow=False,
        )
        return fig

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            "Sample Sizes by Track",
            "Statistical Adequacy Distribution",
            "Scientific Adequacy Scores",
            "Model Categories Distribution"
        ),
        specs=[
            [{"type": "bar"}, {"type": "pie"}],
            [{"type": "histogram"}, {"type": "bar"}]
        ]
    )

    # Sample sizes by track (only tracks whose column exists in df).
    track_names = []
    sample_counts = []
    for track in EVALUATION_TRACKS:
        samples_col = f"{track}_samples"
        if samples_col not in df.columns:
            continue
        samples = df[samples_col]
        track_names.append(track.replace("_", " ").title())
        sample_counts.append(samples[samples > 0].sum())

    if track_names:
        fig.add_trace(
            go.Bar(x=track_names, y=sample_counts, name="Samples"),
            row=1, col=1
        )

    if "scientific_adequacy_score" in df.columns:
        scores = df["scientific_adequacy_score"]

        # include_lowest=True puts a score of exactly 0 into "Poor";
        # with the default half-open first bin it would become NaN and
        # vanish from the pie.
        adequacy_bins = pd.cut(
            scores,
            bins=[0, 0.3, 0.6, 0.8, 1.0],
            labels=["Poor", "Fair", "Good", "Excellent"],
            include_lowest=True,
        )
        adequacy_counts = adequacy_bins.value_counts()

        # value_counts on a categorical lists every label, even with a
        # count of 0, so test the total rather than emptiness.
        if adequacy_counts.sum() > 0:
            fig.add_trace(
                go.Pie(
                    labels=[str(label) for label in adequacy_counts.index],
                    values=adequacy_counts.values,
                    name="Adequacy"
                ),
                row=1, col=2
            )

        # Scientific adequacy scores histogram
        fig.add_trace(
            go.Histogram(
                x=scores,
                nbinsx=20,
                name="Adequacy Scores"
            ),
            row=2, col=1
        )

    if "model_category" in df.columns:
        # Model categories distribution; unknown categories are drawn grey.
        category_counts = df["model_category"].value_counts()
        category_colors = [
            MODEL_CATEGORIES.get(cat, {}).get("color", "#808080")
            for cat in category_counts.index
        ]

        fig.add_trace(
            go.Bar(
                x=category_counts.index,
                y=category_counts.values,
                marker_color=category_colors,
                name="Categories"
            ),
            row=2, col=2
        )

    fig.update_layout(
        title="📊 Scientific Evaluation Analysis",
        height=800,
        showlegend=False
    )

    return fig
464
 
465
 
466
def create_cross_track_analysis_plot(df: pd.DataFrame) -> go.Figure:
    """Create a scatter plot comparing quality scores on two tracks.

    The two tracks compared are the first two ``EVALUATION_TRACKS`` keys
    whose ``{track}_quality`` column exists in ``df``. Only models with a
    positive score on every available track are plotted, coloured by
    ``MODEL_CATEGORIES``. A dashed y = x line is added for reference.

    Args:
        df: Leaderboard frame. ``{track}_quality`` columns,
            ``model_category`` and ``model_name`` are read.

    Returns:
        The scatter figure, or a placeholder figure with a message when
        fewer than two tracks or fewer than three models qualify.
    """

    def _message_figure(message: str) -> go.Figure:
        # Paper coordinates keep the text centred on an empty figure.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message, x=0.5, y=0.5,
            xref="paper", yref="paper", showarrow=False,
        )
        return placeholder

    if df.empty:
        return _message_figure("No data available")

    # Quality columns that are actually present, in EVALUATION_TRACKS order.
    quality_cols = [f"{track}_quality" for track in EVALUATION_TRACKS]
    available_cols = [col for col in quality_cols if col in df.columns]

    if len(available_cols) < 2:
        return _message_figure("Need at least 2 tracks for comparison")

    if "model_category" not in df.columns or "model_name" not in df.columns:
        return _message_figure("No data available")

    # Keep models with a positive score on every available track.
    scored_everywhere = (df[available_cols] > 0).all(axis=1)
    multi_track_models = df[scored_everywhere]

    if len(multi_track_models) < 3:
        return _message_figure("Insufficient models for cross-track analysis")

    # Only the first pair of tracks is plotted. At least two columns are
    # guaranteed by the check above, so this pair always exists.
    x_col, y_col = available_cols[0], available_cols[1]
    x_track = x_col.replace("_quality", "").replace("_", " ").title()
    y_track = y_col.replace("_quality", "").replace("_", " ").title()

    fig = go.Figure()

    # One marker trace per configured category that has models.
    for category, info in MODEL_CATEGORIES.items():
        category_models = multi_track_models[
            multi_track_models["model_category"] == category
        ]
        if category_models.empty:
            continue

        fig.add_trace(go.Scatter(
            x=category_models[x_col],
            y=category_models[y_col],
            mode="markers",
            marker=dict(
                size=10,
                color=info["color"],
                line=dict(color="black", width=1),
            ),
            name=info["name"],
            text=category_models["model_name"],
            hovertemplate=(
                "<b>%{text}</b><br>" +
                f"{x_track}: %{{x:.4f}}<br>" +
                f"{y_track}: %{{y:.4f}}<br>" +
                f"Category: {info['name']}<br>" +
                "<extra></extra>"
            ),
        ))

    # Diagonal reference line spanning the combined range of both axes.
    min_val = min(multi_track_models[x_col].min(), multi_track_models[y_col].min())
    max_val = max(multi_track_models[x_col].max(), multi_track_models[y_col].max())

    fig.add_trace(go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode="lines",
        line=dict(dash="dash", color="gray", width=2),
        name="Perfect Correlation",
        showlegend=False,
        hoverinfo="skip",
    ))

    fig.update_layout(
        title=f"🔄 Cross-Track Performance: {x_track} vs {y_track}",
        xaxis_title=f"{x_track} Quality Score",
        yaxis_title=f"{y_track} Quality Score",
        height=600,
        width=600,
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    return fig
560
-
561
-
562
def create_scientific_model_detail_plot(model_results: Dict, model_name: str, track: str) -> go.Figure:
    """Create a two-row per-language-pair breakdown for a single model.

    Row 1 shows the mean quality score per language pair with error bars
    built from ``ci_lower`` / ``ci_upper``; row 2 shows the mean BLEU score.

    Args:
        model_results: Mapping with a ``"tracks"`` entry. The entry for
            ``track`` is expected to hold ``"pair_metrics"`` keyed by
            ``"{src}_to_{tgt}"``.
        model_name: Used only in the figure title.
        track: Key into ``EVALUATION_TRACKS``; its ``"languages"`` list
            defines which pairs are looked up.

    Returns:
        The two-row figure, or a placeholder figure with a message when no
        usable data is found.
    """

    def _message_figure(message: str) -> go.Figure:
        # Paper coordinates keep the text centred on an empty figure.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message, x=0.5, y=0.5,
            xref="paper", yref="paper", showarrow=False,
        )
        return placeholder

    if not model_results or "tracks" not in model_results:
        return _message_figure("No model results available")

    track_data = model_results["tracks"].get(track, {})
    if (
        track_data.get("error")
        or "pair_metrics" not in track_data
        or track not in EVALUATION_TRACKS  # unknown track: nothing to iterate
    ):
        return _message_figure(f"No data for {track} track")

    pair_metrics = track_data["pair_metrics"]
    track_languages = EVALUATION_TRACKS[track]["languages"]

    # Parallel lists, one entry per plotted language pair.
    pairs = []
    quality_means = []
    ci_above = []  # ci_upper - mean
    ci_below = []  # mean - ci_lower
    bleu_means = []
    sample_counts = []

    for src in track_languages:
        for tgt in track_languages:
            if src == tgt:
                continue

            metrics = pair_metrics.get(f"{src}_to_{tgt}")
            if not metrics or "quality_score" not in metrics or "sample_count" not in metrics:
                continue

            quality_stats = metrics["quality_score"]
            mean = quality_stats.get("mean")
            if mean is None:
                # Without a mean there is no bar to draw for this pair.
                continue

            pairs.append(f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}")
            quality_means.append(mean)
            # A missing CI bound collapses to a zero-length error bar.
            ci_above.append(quality_stats.get("ci_upper", mean) - mean)
            ci_below.append(mean - quality_stats.get("ci_lower", mean))

            # BLEU is optional; default to 0 when absent or lacking "mean".
            bleu_stats = metrics.get("bleu") or {}
            bleu_means.append(bleu_stats.get("mean", 0))

            sample_counts.append(metrics["sample_count"])

    if not pairs:
        return _message_figure("No language pair data available")

    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=(
            "Quality Scores by Language Pair (with 95% CI)",
            "BLEU Scores by Language Pair"
        ),
        vertical_spacing=0.15,
    )

    # Quality scores with asymmetric confidence-interval error bars.
    error_y = dict(
        type="data",
        array=ci_above,
        arrayminus=ci_below,
        visible=True,
        thickness=2,
        width=4,
    )

    fig.add_trace(
        go.Bar(
            x=pairs,
            y=quality_means,
            error_y=error_y,
            name="Quality Score",
            marker_color="steelblue",
            text=[f"{score:.3f}" for score in quality_means],
            textposition="outside",
            hovertemplate=(
                "<b>%{x}</b><br>" +
                "Quality: %{y:.4f}<br>" +
                "Samples: %{customdata}<br>" +
                "<extra></extra>"
            ),
            customdata=sample_counts,
        ),
        row=1, col=1
    )

    # BLEU scores
    fig.add_trace(
        go.Bar(
            x=pairs,
            y=bleu_means,
            name="BLEU Score",
            marker_color="coral",
            text=[f"{score:.1f}" for score in bleu_means],
            textposition="outside",
        ),
        row=2, col=1
    )

    track_info = EVALUATION_TRACKS[track]
    fig.update_layout(
        title=f"🔬 Detailed Analysis: {model_name} - {track_info['name']}",
        height=900,
        showlegend=False,
        margin=dict(l=50, r=50, t=100, b=150),
    )

    # Rotate the pair labels on both rows so they do not overlap.
    fig.update_xaxes(tickangle=45, row=1, col=1)
    fig.update_xaxes(tickangle=45, row=2, col=1)

    return fig
 
1
  # src/plotting.py
 
 
 
2
  import plotly.graph_objects as go
3
  import plotly.express as px
4
  from plotly.subplots import make_subplots
 
15
  EVALUATION_TRACKS,
16
  MODEL_CATEGORIES,
17
  CHART_CONFIG,
 
 
18
  )
19
 
 
 
 
 
 
 
 
 
 
 
20
 
21
+ def create_leaderboard_plot(
22
  df: pd.DataFrame, track: str, metric: str = "quality", top_n: int = 15
23
  ) -> go.Figure:
24
+ """Create leaderboard plot with confidence intervals."""
25
 
26
  if df.empty:
27
  fig = go.Figure()
 
31
  x=0.5, y=0.5, showarrow=False,
32
  font=dict(size=16)
33
  )
34
+ fig.update_layout(
35
+ title=f"No Data Available - {track.title()} Track",
36
+ paper_bgcolor="rgba(0,0,0,0)",
37
+ plot_bgcolor="rgba(0,0,0,0)"
38
+ )
39
  return fig
40
 
41
  # Get top N models for this track
 
61
  return fig
62
 
63
  # Create color mapping by category
64
+ colors = [MODEL_CATEGORIES.get(cat, {}).get("color", "#808080") for cat in valid_models["model_category"]]
 
 
 
 
65
 
66
  # Main bar plot
67
  fig = go.Figure()
68
 
69
  # Add bars with error bars if confidence intervals available
70
+ error_x = None
71
  if ci_lower_col in valid_models.columns and ci_upper_col in valid_models.columns:
72
+ error_x = dict(
73
  type="data",
74
  array=valid_models[ci_upper_col] - valid_models[metric_col],
75
  arrayminus=valid_models[metric_col] - valid_models[ci_lower_col],
 
77
  thickness=2,
78
  width=4,
79
  )
 
 
80
 
81
  fig.add_trace(go.Bar(
82
  y=valid_models["model_name"],
83
  x=valid_models[metric_col],
84
  orientation="h",
85
  marker=dict(color=colors, line=dict(color="black", width=0.5)),
86
+ error_x=error_x,
87
  text=[f"{score:.3f}" for score in valid_models[metric_col]],
88
  textposition="auto",
89
  hovertemplate=(
 
109
  yaxis_title="Models",
110
  height=max(400, len(valid_models) * 35 + 100),
111
  margin=dict(l=20, r=20, t=60, b=20),
112
+ paper_bgcolor="rgba(0,0,0,0)",
113
+ plot_bgcolor="rgba(0,0,0,0)",
114
  font=dict(size=12),
115
  )
116
 
117
  # Reverse y-axis to show best model at top
118
  fig.update_yaxes(autorange="reversed")
119
 
 
 
 
 
 
 
 
 
 
 
 
120
  return fig
121
 
122
 
123
+ def create_language_pair_heatmap(
124
  model_results: Dict, track: str, metric: str = "quality_score"
125
  ) -> go.Figure:
126
+ """Create language pair heatmap for a model."""
127
 
128
  if not model_results or "tracks" not in model_results:
129
  fig = go.Figure()
 
185
  width=700,
186
  font=dict(size=12),
187
  xaxis=dict(side="bottom"),
188
+ yaxis=dict(autorange="reversed"),
189
+ paper_bgcolor="rgba(0,0,0,0)",
190
+ plot_bgcolor="rgba(0,0,0,0)",
191
  )
192
 
193
  return fig
194
 
195
 
196
+ def create_performance_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
197
+ """Create performance comparison plot showing confidence intervals."""
198
 
199
  if df.empty:
200
  fig = go.Figure()
 
254
  showlegend=False,
255
  hoverinfo="skip",
256
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
  # Customize layout
259
  track_info = EVALUATION_TRACKS[track]
260
  fig.update_layout(
261
+ title=f"📊 {track_info['name']} - Performance Comparison",
262
  xaxis_title="Quality Score",
263
  yaxis_title="Models",
264
  height=max(400, len(valid_models) * 40 + 100),
 
269
  autorange="reversed",
270
  ),
271
  showlegend=False,
272
+ paper_bgcolor="rgba(0,0,0,0)",
273
+ plot_bgcolor="rgba(0,0,0,0)",
274
  )
275
 
276
  return fig
277
 
278
 
279
def create_language_pair_comparison_plot(pairs_df: pd.DataFrame, track: str) -> go.Figure:
    """Create grouped bar charts comparing every model on every language pair.

    Row 1 plots ``Quality Score`` and row 2 plots ``BLEU``. Each model
    contributes one bar trace per row, coloured by its category, and both
    rows share an x-axis ordered by sorted language-pair label.

    Args:
        pairs_df: Long-format frame. The columns ``Language Pair``,
            ``Model``, ``Category``, ``Quality Score`` and ``BLEU`` are read.
        track: Key into ``EVALUATION_TRACKS``; used for the figure title.

    Returns:
        The two-row figure, or a placeholder figure with a message when
        there is nothing to compare.
    """

    def _message_figure(message: str) -> go.Figure:
        # Paper-anchored text, with the transparent background used by the
        # other figures in this module.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message, x=0.5, y=0.5,
            xref="paper", yref="paper", showarrow=False,
        )
        placeholder.update_layout(
            paper_bgcolor="rgba(0,0,0,0)",
            plot_bgcolor="rgba(0,0,0,0)",
        )
        return placeholder

    if pairs_df.empty:
        return _message_figure("No language pair data available")

    required = ("Language Pair", "Model", "Category", "Quality Score", "BLEU")
    if any(column not in pairs_df.columns for column in required):
        return _message_figure("Insufficient data for comparison")

    # Drop NaN before sorting: sorted() cannot compare NaN with strings.
    language_pairs = sorted(pairs_df['Language Pair'].dropna().unique())
    models = sorted(pairs_df['Model'].dropna().unique())

    if len(language_pairs) == 0 or len(models) == 0:
        return _message_figure("Insufficient data for comparison")

    # One subplot per metric, stacked vertically.
    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=('Quality Score by Language Pair', 'BLEU Score by Language Pair'),
        vertical_spacing=0.1,
        shared_xaxes=True
    )

    for model in models:
        # Never empty: `model` was taken from this very column.
        model_data = pairs_df[pairs_df['Model'] == model]
        category = model_data['Category'].iloc[0]
        color = MODEL_CATEGORIES.get(category, {}).get('color', '#808080')

        # Quality Score comparison
        fig.add_trace(
            go.Bar(
                name=model,
                x=model_data['Language Pair'],
                y=model_data['Quality Score'],
                marker_color=color,
                opacity=0.8,
                legendgroup=model,
                showlegend=True,
                hovertemplate=(
                    f"<b>{model}</b><br>" +
                    "Language Pair: %{x}<br>" +
                    "Quality Score: %{y:.4f}<br>" +
                    f"Category: {category}<br>" +
                    "<extra></extra>"
                )
            ),
            row=1, col=1
        )

        # BLEU Score comparison; shares the legend entry of the trace above.
        fig.add_trace(
            go.Bar(
                name=model,
                x=model_data['Language Pair'],
                y=model_data['BLEU'],
                marker_color=color,
                opacity=0.8,
                legendgroup=model,
                showlegend=False,
                hovertemplate=(
                    f"<b>{model}</b><br>" +
                    "Language Pair: %{x}<br>" +
                    "BLEU: %{y:.2f}<br>" +
                    f"Category: {category}<br>" +
                    "<extra></extra>"
                )
            ),
            row=2, col=1
        )

    track_info = EVALUATION_TRACKS[track]
    fig.update_layout(
        title=f"📊 {track_info['name']} - Language Pair Performance Comparison",
        height=800,
        barmode='group',
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )

    # Pin the category order to the sorted pair list; otherwise it follows
    # the row order of whichever trace is drawn first.
    fig.update_xaxes(categoryorder="array", categoryarray=language_pairs)

    # Rotate x-axis labels for better readability
    fig.update_xaxes(tickangle=45, row=2, col=1)
    fig.update_yaxes(title_text="Quality Score", row=1, col=1)
    fig.update_yaxes(title_text="BLEU Score", row=2, col=1)

    return fig
380
 
381
 
382
def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
    """Create a box plot of quality scores grouped by model category.

    Args:
        df: Leaderboard frame. The columns ``{track}_quality``,
            ``model_category`` and ``model_name`` are read.
        track: Key into ``EVALUATION_TRACKS``; also used as the column-name
            prefix.

    Returns:
        A figure holding one ``go.Box`` per ``MODEL_CATEGORIES`` entry that
        has at least one model with a positive score. When there is nothing
        to plot, a placeholder figure carrying a short message is returned
        instead.
    """

    def _message_figure(message: str) -> go.Figure:
        # Paper-anchored text, with the transparent background used by the
        # other figures in this module.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message, x=0.5, y=0.5,
            xref="paper", yref="paper", showarrow=False,
        )
        placeholder.update_layout(
            paper_bgcolor="rgba(0,0,0,0)",
            plot_bgcolor="rgba(0,0,0,0)",
        )
        return placeholder

    if df.empty:
        return _message_figure("No data available")

    metric_col = f"{track}_quality"

    # A frame lacking any of the columns read below cannot be plotted;
    # show the placeholder rather than raising KeyError.
    required = (metric_col, "model_category", "model_name")
    if any(column not in df.columns for column in required):
        return _message_figure("No data available")

    # Keep only models that have a positive score on this track.
    valid_models = df[df[metric_col] > 0]
    if valid_models.empty:
        return _message_figure("No valid models found")

    fig = go.Figure()

    # One box per configured category that actually has models.
    for category, info in MODEL_CATEGORIES.items():
        category_models = valid_models[valid_models["model_category"] == category]
        if category_models.empty:
            continue

        fig.add_trace(go.Box(
            y=category_models[metric_col],
            name=info["name"],
            marker_color=info["color"],
            boxpoints="all",  # Show every model as an individual point
            jitter=0.3,
            pointpos=-1.8,
            hovertemplate=(
                f"<b>{info['name']}</b><br>" +
                "Quality: %{y:.4f}<br>" +
                "Model: %{customdata}<br>" +
                "<extra></extra>"
            ),
            customdata=category_models["model_name"],
        ))

    track_info = EVALUATION_TRACKS[track]
    fig.update_layout(
        title=f"📈 {track_info['name']} - Performance by Category",
        xaxis_title="Model Category",
        yaxis_title="Quality Score",
        height=500,
        showlegend=False,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
    )

    return fig