Update src/plotting.py
src/plotting.py
CHANGED (+276, -338)
[The left-hand side of this diff (the 338 deleted lines) was lost in extraction; the hunk headers and the updated right-hand side are reconstructed below, with added lines marked "+".]

@@ -17,6 +17,7 @@ from config import (
     MODEL_CATEGORIES,
     CHART_CONFIG,
     STATISTICAL_CONFIG,
+    SAMPLE_SIZE_RECOMMENDATIONS,
 )
 
 # Scientific plotting style
@@ -34,58 +35,50 @@ def create_scientific_leaderboard_plot(
     df: pd.DataFrame, track: str, metric: str = "quality", top_n: int = 15
 ) -> go.Figure:
     """Create scientific leaderboard plot with confidence intervals."""
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(
             text="No models available for this track",
+            xref="paper", yref="paper",
+            x=0.5, y=0.5, showarrow=False,
+            font=dict(size=16)
         )
         fig.update_layout(title=f"No Data Available - {track.title()} Track")
         return fig
+
     # Get top N models for this track
     metric_col = f"{track}_{metric}"
     ci_lower_col = f"{track}_ci_lower"
     ci_upper_col = f"{track}_ci_upper"
+
     if metric_col not in df.columns:
         fig = go.Figure()
         fig.add_annotation(
             text=f"Metric {metric} not available for {track} track",
+            xref="paper", yref="paper",
+            x=0.5, y=0.5, showarrow=False,
         )
         return fig
+
     # Filter and sort
     valid_models = df[(df[metric_col] > 0)].head(top_n)
+
     if valid_models.empty:
         fig = go.Figure()
         fig.add_annotation(text="No valid models found", x=0.5, y=0.5, showarrow=False)
         return fig
+
     # Create color mapping by category
     category_colors = {}
     for i, category in enumerate(MODEL_CATEGORIES.keys()):
         category_colors[category] = MODEL_CATEGORIES[category]["color"]
+
+    colors = [category_colors.get(cat, "#808080") for cat in valid_models["model_category"]]
+
     # Main bar plot
     fig = go.Figure()
+
     # Add bars with error bars if confidence intervals available
     if ci_lower_col in valid_models.columns and ci_upper_col in valid_models.columns:
         error_y = dict(
@@ -98,34 +91,30 @@ def create_scientific_leaderboard_plot(
         )
     else:
         error_y = None
+
+    fig.add_trace(go.Bar(
+        y=valid_models["model_name"],
+        x=valid_models[metric_col],
+        orientation="h",
+        marker=dict(color=colors, line=dict(color="black", width=0.5)),
+        error_x=error_y,
+        text=[f"{score:.3f}" for score in valid_models[metric_col]],
+        textposition="auto",
+        hovertemplate=(
+            "<b>%{y}</b><br>" +
+            f"{metric.title()}: %{{x:.4f}}<br>" +
+            "Category: %{customdata[0]}<br>" +
+            "Author: %{customdata[1]}<br>" +
+            "Samples: %{customdata[2]}<br>" +
+            "<extra></extra>"
+        ),
+        customdata=list(zip(
+            valid_models["model_category"],
+            valid_models["author"],
+            valid_models.get(f"{track}_samples", [0] * len(valid_models))
+        )),
+    ))
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -138,24 +127,21 @@ def create_scientific_leaderboard_plot(
         paper_bgcolor="white",
         font=dict(size=12),
     )
+
     # Reverse y-axis to show best model at top
     fig.update_yaxes(autorange="reversed")
+
     # Add category legend
     for category, info in MODEL_CATEGORIES.items():
         if category in valid_models["model_category"].values:
+            fig.add_trace(go.Scatter(
+                x=[None], y=[None],
+                mode="markers",
+                marker=dict(size=10, color=info["color"]),
+                name=info["name"],
+                showlegend=True,
+            ))
+
     return fig
 
 
@@ -163,63 +149,57 @@ def create_language_pair_heatmap_scientific(
     model_results: Dict, track: str, metric: str = "quality_score"
 ) -> go.Figure:
     """Create research-grade language pair heatmap with proper axes."""
+
     if not model_results or "tracks" not in model_results:
         fig = go.Figure()
+        fig.add_annotation(text="No model results available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     track_data = model_results["tracks"].get(track, {})
     if track_data.get("error") or "pair_metrics" not in track_data:
         fig = go.Figure()
+        fig.add_annotation(text=f"No data available for {track} track", x=0.5, y=0.5, showarrow=False)
         return fig
+
     pair_metrics = track_data["pair_metrics"]
     track_languages = EVALUATION_TRACKS[track]["languages"]
+
     # Create matrix for heatmap
     n_langs = len(track_languages)
     matrix = np.full((n_langs, n_langs), np.nan)
+
     for i, src_lang in enumerate(track_languages):
         for j, tgt_lang in enumerate(track_languages):
             if src_lang != tgt_lang:
                 pair_key = f"{src_lang}_to_{tgt_lang}"
                 if pair_key in pair_metrics and metric in pair_metrics[pair_key]:
                     matrix[i, j] = pair_metrics[pair_key][metric]["mean"]
+
     # Create language labels
     lang_labels = [LANGUAGE_NAMES.get(lang, lang.upper()) for lang in track_languages]
+
     # Create heatmap
+    fig = go.Figure(data=go.Heatmap(
+        z=matrix,
+        x=lang_labels,
+        y=lang_labels,
+        colorscale="Viridis",
+        showscale=True,
+        colorbar=dict(
+            title=f"{metric.replace('_', ' ').title()}",
+            titleside="right",
+            len=0.8,
+        ),
+        hovertemplate=(
+            "Source: %{y}<br>" +
+            "Target: %{x}<br>" +
+            f"{metric.replace('_', ' ').title()}: %{{z:.3f}}<br>" +
+            "<extra></extra>"
+        ),
+        zmin=0,
+        zmax=1 if metric == "quality_score" else None,
+    ))
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -232,93 +212,87 @@ def create_language_pair_heatmap_scientific(
         xaxis=dict(side="bottom"),
         yaxis=dict(autorange="reversed"),  # Source languages from top to bottom
     )
+
     return fig
 
 
 def create_statistical_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
     """Create statistical comparison plot showing confidence intervals."""
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     metric_col = f"{track}_quality"
     ci_lower_col = f"{track}_ci_lower"
     ci_upper_col = f"{track}_ci_upper"
+
     # Filter to models with data for this track
     valid_models = df[
+        (df[metric_col] > 0) &
+        (df[ci_lower_col].notna()) &
+        (df[ci_upper_col].notna())
     ].head(10)
+
     if valid_models.empty:
         fig = go.Figure()
+        fig.add_annotation(text="No models with confidence intervals", x=0.5, y=0.5, showarrow=False)
         return fig
+
     fig = go.Figure()
+
     # Add confidence intervals as error bars
     for i, (_, model) in enumerate(valid_models.iterrows()):
         category = model["model_category"]
         color = MODEL_CATEGORIES.get(category, {}).get("color", "#808080")
+
         # Main point
+        fig.add_trace(go.Scatter(
+            x=[model[metric_col]],
+            y=[i],
+            mode="markers",
+            marker=dict(
+                size=12,
+                color=color,
+                line=dict(color="black", width=1),
+            ),
+            name=model["model_name"],
+            showlegend=False,
+            hovertemplate=(
+                f"<b>{model['model_name']}</b><br>" +
+                f"Quality: {model[metric_col]:.4f}<br>" +
+                f"95% CI: [{model[ci_lower_col]:.4f}, {model[ci_upper_col]:.4f}]<br>" +
+                f"Category: {category}<br>" +
+                "<extra></extra>"
+            ),
+        ))
+
         # Confidence interval line
+        fig.add_trace(go.Scatter(
+            x=[model[ci_lower_col], model[ci_upper_col]],
+            y=[i, i],
+            mode="lines",
+            line=dict(color=color, width=3),
+            showlegend=False,
+            hoverinfo="skip",
+        ))
+
        # CI endpoints
+        fig.add_trace(go.Scatter(
+            x=[model[ci_lower_col], model[ci_upper_col]],
+            y=[i, i],
+            mode="markers",
+            marker=dict(
+                symbol="line-ns",
+                size=10,
+                color=color,
+                line=dict(width=2),
+            ),
+            showlegend=False,
+            hoverinfo="skip",
+        ))
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -336,56 +310,52 @@ def create_statistical_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
         plot_bgcolor="white",
         paper_bgcolor="white",
     )
+
     return fig
 
 
 def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
     """Create category-wise comparison plot."""
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     metric_col = f"{track}_quality"
     adequate_col = f"{track}_adequate"
+
     # Filter to adequate models
     valid_models = df[df[adequate_col] & (df[metric_col] > 0)]
+
     if valid_models.empty:
         fig = go.Figure()
+        fig.add_annotation(text="No adequate models found", x=0.5, y=0.5, showarrow=False)
         return fig
+
     fig = go.Figure()
+
     # Create box plot for each category
     for category, info in MODEL_CATEGORIES.items():
         category_models = valid_models[valid_models["model_category"] == category]
+
         if len(category_models) > 0:
+            fig.add_trace(go.Box(
+                y=category_models[metric_col],
+                name=info["name"],
+                marker_color=info["color"],
+                boxpoints="all",  # Show all points
+                jitter=0.3,
+                pointpos=-1.8,
+                hovertemplate=(
+                    f"<b>{info['name']}</b><br>" +
+                    "Quality: %{y:.4f}<br>" +
+                    "Model: %{customdata}<br>" +
+                    "<extra></extra>"
+                ),
+                customdata=category_models["model_name"],
+            ))
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -397,202 +367,183 @@ def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
         plot_bgcolor="white",
         paper_bgcolor="white",
     )
+
     return fig
 
 
 def create_adequacy_analysis_plot(df: pd.DataFrame) -> go.Figure:
     """Create analysis plot for statistical adequacy across tracks."""
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     fig = make_subplots(
+        rows=2, cols=2,
         subplot_titles=(
             "Sample Sizes by Track",
+            "Statistical Adequacy Distribution",
             "Scientific Adequacy Scores",
+            "Model Categories Distribution"
         ),
         specs=[
             [{"type": "bar"}, {"type": "pie"}],
+            [{"type": "histogram"}, {"type": "bar"}]
+        ]
     )
+
     # Sample sizes by track
     track_names = []
     sample_counts = []
+
     for track in EVALUATION_TRACKS.keys():
         samples_col = f"{track}_samples"
         if samples_col in df.columns:
             total_samples = df[df[samples_col] > 0][samples_col].sum()
             track_names.append(track.replace("_", " ").title())
             sample_counts.append(total_samples)
+
     if track_names:
         fig.add_trace(
+            go.Bar(x=track_names, y=sample_counts, name="Samples"),
+            row=1, col=1
         )
+
     # Statistical adequacy distribution
     adequacy_bins = pd.cut(
+        df["scientific_adequacy_score"],
         bins=[0, 0.3, 0.6, 0.8, 1.0],
+        labels=["Poor", "Fair", "Good", "Excellent"]
     )
     adequacy_counts = adequacy_bins.value_counts()
+
     if not adequacy_counts.empty:
         fig.add_trace(
             go.Pie(
                 labels=adequacy_counts.index,
                 values=adequacy_counts.values,
+                name="Adequacy"
             ),
+            row=1, col=2
         )
+
     # Scientific adequacy scores histogram
     fig.add_trace(
         go.Histogram(
+            x=df["scientific_adequacy_score"],
+            nbinsx=20,
+            name="Adequacy Scores"
         ),
+        row=2, col=1
     )
+
     # Model categories distribution
     category_counts = df["model_category"].value_counts()
+    category_colors = [MODEL_CATEGORIES.get(cat, {}).get("color", "#808080") for cat in category_counts.index]
+
     fig.add_trace(
         go.Bar(
             x=category_counts.index,
             y=category_counts.values,
             marker_color=category_colors,
+            name="Categories"
        ),
+        row=2, col=2
     )
+
     fig.update_layout(
+        title="📊 Scientific Evaluation Analysis",
+        height=800,
+        showlegend=False
     )
+
     return fig
 
 
 def create_cross_track_analysis_plot(df: pd.DataFrame) -> go.Figure:
     """Create cross-track performance correlation analysis."""
+
     if df.empty:
         fig = go.Figure()
         fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     # Get models with data in multiple tracks
     quality_cols = [f"{track}_quality" for track in EVALUATION_TRACKS.keys()]
     available_cols = [col for col in quality_cols if col in df.columns]
+
     if len(available_cols) < 2:
         fig = go.Figure()
+        fig.add_annotation(text="Need at least 2 tracks for comparison", x=0.5, y=0.5, showarrow=False)
         return fig
+
     # Filter to models with data in multiple tracks
     multi_track_models = df.copy()
     for col in available_cols:
         multi_track_models = multi_track_models[multi_track_models[col] > 0]
+
     if len(multi_track_models) < 3:
         fig = go.Figure()
+        fig.add_annotation(text="Insufficient models for cross-track analysis", x=0.5, y=0.5, showarrow=False)
         return fig
+
     # Create scatter plot matrix
+    track_pairs = [(available_cols[i], available_cols[j])
+                   for i in range(len(available_cols))
+                   for j in range(i+1, len(available_cols))]
+
     if not track_pairs:
         fig = go.Figure()
+        fig.add_annotation(text="No track pairs available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     # Use first pair for demonstration
     x_col, y_col = track_pairs[0]
     x_track = x_col.replace("_quality", "").replace("_", " ").title()
     y_track = y_col.replace("_quality", "").replace("_", " ").title()
+
     fig = go.Figure()
+
     # Color by category
     for category, info in MODEL_CATEGORIES.items():
+        category_models = multi_track_models[multi_track_models["model_category"] == category]
+
         if len(category_models) > 0:
+            fig.add_trace(go.Scatter(
+                x=category_models[x_col],
+                y=category_models[y_col],
+                mode="markers",
+                marker=dict(
+                    size=10,
+                    color=info["color"],
+                    line=dict(color="black", width=1),
+                ),
+                name=info["name"],
+                text=category_models["model_name"],
+                hovertemplate=(
+                    "<b>%{text}</b><br>" +
+                    f"{x_track}: %{{x:.4f}}<br>" +
+                    f"{y_track}: %{{y:.4f}}<br>" +
+                    f"Category: {info['name']}<br>" +
+                    "<extra></extra>"
+                ),
+            ))
+
     # Add diagonal line for reference
     min_val = min(multi_track_models[x_col].min(), multi_track_models[y_col].min())
     max_val = max(multi_track_models[x_col].max(), multi_track_models[y_col].max())
+
+    fig.add_trace(go.Scatter(
+        x=[min_val, max_val],
+        y=[min_val, max_val],
+        mode="lines",
+        line=dict(dash="dash", color="gray", width=2),
+        name="Perfect Correlation",
+        showlegend=False,
+        hoverinfo="skip",
+    ))
+
     fig.update_layout(
         title=f"🔄 Cross-Track Performance: {x_track} vs {y_track}",
         xaxis_title=f"{x_track} Quality Score",
@@ -602,82 +553,71 @@ def create_cross_track_analysis_plot(df: pd.DataFrame) -> go.Figure:
         plot_bgcolor="white",
         paper_bgcolor="white",
     )
+
     return fig
 
 
+def create_scientific_model_detail_plot(model_results: Dict, model_name: str, track: str) -> go.Figure:
     """Create detailed scientific analysis for a specific model."""
+
     if not model_results or "tracks" not in model_results:
         fig = go.Figure()
+        fig.add_annotation(text="No model results available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     track_data = model_results["tracks"].get(track, {})
     if track_data.get("error") or "pair_metrics" not in track_data:
         fig = go.Figure()
+        fig.add_annotation(text=f"No data for {track} track", x=0.5, y=0.5, showarrow=False)
         return fig
+
     pair_metrics = track_data["pair_metrics"]
     track_languages = EVALUATION_TRACKS[track]["languages"]
+
     # Extract data for plotting
     pairs = []
     quality_means = []
     quality_cis = []
     bleu_means = []
     sample_counts = []
+
     for src in track_languages:
         for tgt in track_languages:
             if src == tgt:
                 continue
+
             pair_key = f"{src}_to_{tgt}"
             if pair_key in pair_metrics:
                 metrics = pair_metrics[pair_key]
+
                 if "quality_score" in metrics and "sample_count" in metrics:
                     pair_label = f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"
                     pairs.append(pair_label)
+
                     quality_stats = metrics["quality_score"]
                     quality_means.append(quality_stats["mean"])
+                    quality_cis.append([quality_stats["ci_lower"], quality_stats["ci_upper"]])
+
                     bleu_stats = metrics.get("bleu", {"mean": 0})
                     bleu_means.append(bleu_stats["mean"])
+
                     sample_counts.append(metrics["sample_count"])
+
     if not pairs:
         fig = go.Figure()
+        fig.add_annotation(text="No language pair data available", x=0.5, y=0.5, showarrow=False)
         return fig
+
     # Create subplots
     fig = make_subplots(
+        rows=2, cols=1,
         subplot_titles=(
             "Quality Scores by Language Pair (with 95% CI)",
+            "BLEU Scores by Language Pair"
         ),
         vertical_spacing=0.15,
     )
+
     # Quality scores with confidence intervals
     error_y = dict(
         type="data",
@@ -687,7 +627,7 @@ def create_scientific_model_detail_plot(
         thickness=2,
         width=4,
     )
+
     fig.add_trace(
         go.Bar(
             x=pairs,
@@ -698,17 +638,16 @@ def create_scientific_model_detail_plot(
             text=[f"{score:.3f}" for score in quality_means],
             textposition="outside",
             hovertemplate=(
+                "<b>%{x}</b><br>" +
+                "Quality: %{y:.4f}<br>" +
+                "Samples: %{customdata}<br>" +
+                "<extra></extra>"
             ),
             customdata=sample_counts,
         ),
+        row=1, col=1
     )
+
     # BLEU scores
     fig.add_trace(
         go.Bar(
@@ -719,10 +658,9 @@ def create_scientific_model_detail_plot(
             text=[f"{score:.1f}" for score in bleu_means],
             textposition="outside",
         ),
+        row=2, col=1
    )
+
     # Customize layout
     track_info = EVALUATION_TRACKS[track]
     fig.update_layout(
@@ -731,9 +669,9 @@ def create_scientific_model_detail_plot(
         showlegend=False,
         margin=dict(l=50, r=50, t=100, b=150),
     )
+
     # Rotate x-axis labels
     fig.update_xaxes(tickangle=45, row=1, col=1)
     fig.update_xaxes(tickangle=45, row=2, col=1)
+
+    return fig
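
Note on dependencies: plotting.py imports MODEL_CATEGORIES, CHART_CONFIG, STATISTICAL_CONFIG, SAMPLE_SIZE_RECOMMENDATIONS, and (elsewhere) EVALUATION_TRACKS and LANGUAGE_NAMES from config, but config.py is not part of this diff. Judging only from the access patterns in the code above (MODEL_CATEGORIES[category]["color"], EVALUATION_TRACKS[track]["languages"], LANGUAGE_NAMES.get(lang, lang.upper())), a minimal stand-in would look like the sketch below; every key, track, language, and color is a placeholder for illustration, not a value from this repo.

# Hypothetical stand-in for src/config.py; field names are inferred from
# how plotting.py reads them, and all concrete values are invented.
MODEL_CATEGORIES = {
    "open_source": {"name": "Open Source", "color": "#2ca02c"},
    "commercial": {"name": "Commercial", "color": "#1f77b4"},
}
EVALUATION_TRACKS = {
    # Each track needs a "languages" list; other fields (read via track_info)
    # are used in layout code this diff does not show.
    "flores": {"name": "FLORES", "languages": ["en", "fr", "sw"]},
}
LANGUAGE_NAMES = {"en": "English", "fr": "French", "sw": "Swahili"}
# The code shown above never inspects the contents of these three.
CHART_CONFIG = {}
STATISTICAL_CONFIG = {}
SAMPLE_SIZE_RECOMMENDATIONS = {}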
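
For reference, a minimal driver for the leaderboard plot under the stand-in config above. Column names follow the f"{track}_{metric}" convention the module uses throughout; rows are listed best-first because create_scientific_leaderboard_plot only filters and takes .head(top_n), it does not sort. The import path and all numbers are illustrative.

import pandas as pd
from plotting import create_scientific_leaderboard_plot  # path depends on the Space layout

# Toy leaderboard frame; every value is invented.
df = pd.DataFrame({
    "model_name": ["model-a", "model-b"],
    "author": ["alice", "bob"],
    "model_category": ["open_source", "commercial"],
    "flores_quality": [0.71, 0.64],   # f"{track}_{metric}" columns
    "flores_ci_lower": [0.68, 0.60],
    "flores_ci_upper": [0.74, 0.68],
    "flores_samples": [1012, 1012],
})

fig = create_scientific_leaderboard_plot(df, track="flores")
fig.show()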
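
The heatmap and model-detail plots take a nested results dict rather than a DataFrame. A toy instance consistent with the lookups in the code (tracks -> track -> pair_metrics -> "src_to_tgt", with mean/ci_lower/ci_upper under each metric and a sample_count per pair); again, every number is invented and the track and languages assume the stand-in config:

from plotting import (
    create_language_pair_heatmap_scientific,
    create_scientific_model_detail_plot,
)

model_results = {
    "tracks": {
        "flores": {
            "pair_metrics": {
                "en_to_sw": {
                    "quality_score": {"mean": 0.62, "ci_lower": 0.58, "ci_upper": 0.66},
                    "bleu": {"mean": 24.1},
                    "sample_count": 506,
                },
                "sw_to_en": {
                    "quality_score": {"mean": 0.70, "ci_lower": 0.66, "ci_upper": 0.74},
                    "bleu": {"mean": 31.5},
                    "sample_count": 506,
                },
            }
        }
    }
}

heatmap = create_language_pair_heatmap_scientific(model_results, track="flores")
detail = create_scientific_model_detail_plot(model_results, "model-a", track="flores")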