Spaces:
Sleeping
Sleeping
Update src/plotting.py
Browse files- src/plotting.py +632 -489
src/plotting.py
CHANGED
@@ -1,8 +1,6 @@
|
|
1 |
# src/plotting.py
|
2 |
import matplotlib.pyplot as plt
|
3 |
import matplotlib.gridspec as gridspec
|
4 |
-
import matplotlib.colors as mcolors
|
5 |
-
from colorsys import rgb_to_hls, hls_to_rgb
|
6 |
import plotly.graph_objects as go
|
7 |
import plotly.express as px
|
8 |
from plotly.subplots import make_subplots
|
@@ -10,587 +8,732 @@ import pandas as pd
|
|
10 |
import numpy as np
|
11 |
from collections import defaultdict
|
12 |
from typing import Dict, List, Optional, Union
|
13 |
-
from config import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
|
16 |
-
plt.
|
17 |
-
plt.rcParams[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
-
def create_leaderboard_ranking_plot(df: pd.DataFrame, metric: str = 'quality_score', top_n: int = 15) -> go.Figure:
|
20 |
-
"""Create interactive leaderboard ranking plot using Plotly."""
|
21 |
-
|
22 |
if df.empty:
|
23 |
fig = go.Figure()
|
24 |
fig.add_annotation(
|
25 |
-
text="No
|
26 |
-
xref="paper",
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
)
|
30 |
-
fig.update_layout(title="No Data Available")
|
31 |
return fig
|
32 |
-
|
33 |
-
#
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
go.Bar(
|
39 |
-
y=
|
40 |
-
x=
|
41 |
-
orientation=
|
42 |
-
marker=dict(
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
colorbar=dict(title=metric.replace('_', ' ').title())
|
47 |
-
),
|
48 |
-
text=[f"{score:.3f}" for score in top_models[metric]],
|
49 |
-
textposition='auto',
|
50 |
hovertemplate=(
|
51 |
-
"<b>%{y}</b><br>"
|
52 |
-
f"{metric.
|
53 |
-
"
|
54 |
-
"
|
55 |
-
"<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
),
|
57 |
-
customdata=list(zip(top_models['author'], top_models['coverage_rate']))
|
58 |
)
|
59 |
-
|
60 |
-
|
|
|
|
|
61 |
fig.update_layout(
|
62 |
-
title=f"🏆
|
63 |
-
xaxis_title=f"{metric.
|
64 |
yaxis_title="Models",
|
65 |
-
height=max(400, len(
|
66 |
margin=dict(l=20, r=20, t=60, b=20),
|
67 |
-
plot_bgcolor=
|
68 |
-
paper_bgcolor=
|
|
|
69 |
)
|
70 |
-
|
71 |
# Reverse y-axis to show best model at top
|
72 |
fig.update_yaxes(autorange="reversed")
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
return fig
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
if df.empty:
|
80 |
fig = go.Figure()
|
81 |
fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
|
82 |
-
fig.update_layout(title="No Data Available")
|
83 |
return fig
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
92 |
fig = go.Figure()
|
93 |
-
fig.add_annotation(
|
94 |
-
|
|
|
95 |
return fig
|
96 |
-
|
97 |
-
# Metrics to include in radar chart
|
98 |
-
metrics = ['quality_score', 'bleu', 'chrf', 'rouge1', 'rougeL']
|
99 |
-
metric_labels = ['Quality Score', 'BLEU (/100)', 'ChrF', 'ROUGE-1', 'ROUGE-L']
|
100 |
-
|
101 |
fig = go.Figure()
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
),
|
135 |
-
showlegend=
|
136 |
-
|
137 |
-
|
138 |
)
|
139 |
-
|
140 |
return fig
|
141 |
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
|
|
146 |
fig = go.Figure()
|
147 |
-
fig.add_annotation(text="No
|
148 |
-
fig.update_layout(title="No Language Pair Data Available")
|
149 |
return fig
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
if pair_key in pair_metrics and metric in pair_metrics[pair_key]:
|
162 |
-
matrix[i, j] = pair_metrics[pair_key][metric]
|
163 |
-
else:
|
164 |
-
matrix[i, j] = np.nan
|
165 |
-
else:
|
166 |
-
matrix[i, j] = np.nan
|
167 |
-
|
168 |
-
# Create language labels
|
169 |
-
lang_labels = [LANGUAGE_NAMES.get(lang, lang) for lang in languages]
|
170 |
-
|
171 |
-
fig = go.Figure(data=go.Heatmap(
|
172 |
-
z=matrix,
|
173 |
-
x=lang_labels,
|
174 |
-
y=lang_labels,
|
175 |
-
colorscale='Viridis',
|
176 |
-
showscale=True,
|
177 |
-
colorbar=dict(title=metric.replace('_', ' ').title()),
|
178 |
-
hovertemplate=(
|
179 |
-
"Source: %{y}<br>" +
|
180 |
-
"Target: %{x}<br>" +
|
181 |
-
f"{metric.replace('_', ' ').title()}: %{{z:.3f}}<br>" +
|
182 |
-
"<extra></extra>"
|
183 |
)
|
184 |
-
|
185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
fig.update_layout(
|
187 |
-
title=f"
|
188 |
-
xaxis_title="
|
189 |
-
yaxis_title="
|
190 |
-
height=
|
191 |
-
|
|
|
|
|
192 |
)
|
193 |
-
|
194 |
return fig
|
195 |
|
196 |
-
|
197 |
-
|
198 |
-
|
|
|
199 |
if df.empty:
|
200 |
fig = go.Figure()
|
201 |
fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
|
202 |
-
fig.update_layout(title="No Data Available")
|
203 |
return fig
|
204 |
-
|
205 |
fig = make_subplots(
|
206 |
-
rows=2,
|
|
|
207 |
subplot_titles=(
|
208 |
-
"
|
209 |
-
"
|
210 |
-
"
|
211 |
-
"
|
212 |
),
|
213 |
-
specs=[
|
214 |
-
|
|
|
|
|
215 |
)
|
216 |
-
|
217 |
-
#
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
226 |
)
|
227 |
-
|
228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
fig.add_trace(
|
230 |
-
go.
|
231 |
-
x=df[
|
232 |
-
y=df['quality_score'],
|
233 |
-
mode='markers',
|
234 |
-
text=df['model_name'],
|
235 |
-
name="Quality vs Coverage"
|
236 |
),
|
237 |
-
row=
|
|
|
238 |
)
|
239 |
-
|
240 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
fig.add_trace(
|
242 |
-
go.
|
243 |
-
x=
|
244 |
-
y=
|
245 |
-
|
246 |
-
|
247 |
-
name="Quality vs Samples"
|
248 |
),
|
249 |
-
row=2,
|
|
|
250 |
)
|
251 |
-
|
252 |
-
# Google comparable coverage
|
253 |
-
google_coverage = df['google_pairs_covered'].value_counts().sort_index()
|
254 |
-
fig.add_trace(
|
255 |
-
go.Bar(x=google_coverage.index, y=google_coverage.values, name="Google Coverage"),
|
256 |
-
row=2, col=2
|
257 |
-
)
|
258 |
-
|
259 |
fig.update_layout(
|
260 |
-
title="
|
261 |
-
height=800,
|
262 |
-
showlegend=False
|
263 |
)
|
264 |
-
|
265 |
return fig
|
266 |
|
267 |
-
|
268 |
-
|
269 |
-
|
|
|
270 |
if df.empty:
|
271 |
fig = go.Figure()
|
272 |
fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
|
273 |
-
fig.update_layout(title="No Data Available")
|
274 |
return fig
|
275 |
-
|
276 |
-
# Convert submission_date to datetime
|
277 |
-
df_copy = df.copy()
|
278 |
-
df_copy['submission_date'] = pd.to_datetime(df_copy['submission_date'])
|
279 |
-
df_copy = df_copy.sort_values('submission_date')
|
280 |
-
|
281 |
-
fig = go.Figure()
|
282 |
-
|
283 |
-
# Add scatter plot for each submission
|
284 |
-
fig.add_trace(go.Scatter(
|
285 |
-
x=df_copy['submission_date'],
|
286 |
-
y=df_copy['quality_score'],
|
287 |
-
mode='markers+lines',
|
288 |
-
marker=dict(
|
289 |
-
size=10,
|
290 |
-
color=df_copy['quality_score'],
|
291 |
-
colorscale='Viridis',
|
292 |
-
showscale=True,
|
293 |
-
colorbar=dict(title="Quality Score")
|
294 |
-
),
|
295 |
-
text=df_copy['model_name'],
|
296 |
-
hovertemplate=(
|
297 |
-
"<b>%{text}</b><br>" +
|
298 |
-
"Date: %{x}<br>" +
|
299 |
-
"Quality Score: %{y:.4f}<br>" +
|
300 |
-
"<extra></extra>"
|
301 |
-
),
|
302 |
-
name="Models"
|
303 |
-
))
|
304 |
-
|
305 |
-
# Add trend line
|
306 |
-
if len(df_copy) > 1:
|
307 |
-
z = np.polyfit(range(len(df_copy)), df_copy['quality_score'], 1)
|
308 |
-
trend_line = np.poly1d(z)(range(len(df_copy)))
|
309 |
-
|
310 |
-
fig.add_trace(go.Scatter(
|
311 |
-
x=df_copy['submission_date'],
|
312 |
-
y=trend_line,
|
313 |
-
mode='lines',
|
314 |
-
line=dict(dash='dash', color='red'),
|
315 |
-
name="Trend",
|
316 |
-
hoverinfo='skip'
|
317 |
-
))
|
318 |
-
|
319 |
-
fig.update_layout(
|
320 |
-
title="📅 Model Performance Timeline",
|
321 |
-
xaxis_title="Submission Date",
|
322 |
-
yaxis_title="Quality Score",
|
323 |
-
height=500
|
324 |
-
)
|
325 |
-
|
326 |
-
return fig
|
327 |
|
328 |
-
|
329 |
-
""
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
fig = go.Figure()
|
336 |
fig.add_annotation(
|
337 |
-
text="No
|
338 |
-
x=0.5, y=0.5, showarrow=False
|
339 |
)
|
340 |
-
fig.update_layout(title="No Google Comparable Models")
|
341 |
return fig
|
342 |
-
|
|
|
|
|
|
|
|
|
|
|
343 |
fig = go.Figure()
|
344 |
-
|
345 |
-
#
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
369 |
fig.update_layout(
|
370 |
-
title="
|
371 |
-
xaxis_title="
|
372 |
-
yaxis_title="Quality Score",
|
373 |
-
height=
|
|
|
|
|
|
|
374 |
)
|
375 |
-
|
376 |
return fig
|
377 |
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
382 |
fig = go.Figure()
|
383 |
-
fig.add_annotation(
|
384 |
-
|
|
|
385 |
return fig
|
386 |
-
|
387 |
-
pair_metrics =
|
388 |
-
|
389 |
-
|
|
|
390 |
pairs = []
|
391 |
-
|
392 |
-
|
|
|
393 |
sample_counts = []
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
409 |
if not pairs:
|
410 |
fig = go.Figure()
|
411 |
-
fig.add_annotation(
|
412 |
-
|
|
|
413 |
return fig
|
414 |
-
|
415 |
-
# Create
|
416 |
fig = make_subplots(
|
417 |
-
rows=2,
|
|
|
418 |
subplot_titles=(
|
419 |
-
|
420 |
-
|
421 |
),
|
422 |
vertical_spacing=0.15,
|
423 |
-
row_heights=[0.45, 0.45]
|
424 |
)
|
425 |
-
|
426 |
-
#
|
427 |
-
|
428 |
-
|
429 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
430 |
fig.add_trace(
|
431 |
go.Bar(
|
432 |
x=pairs,
|
433 |
-
y=
|
434 |
-
|
435 |
-
name="
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
440 |
),
|
441 |
-
row=1,
|
|
|
442 |
)
|
443 |
-
|
444 |
-
#
|
445 |
fig.add_trace(
|
446 |
go.Bar(
|
447 |
x=pairs,
|
448 |
-
y=
|
449 |
-
|
450 |
-
|
451 |
-
text=[f"{score:.
|
452 |
-
textposition=
|
453 |
-
textfont=dict(size=10),
|
454 |
-
showlegend=False
|
455 |
),
|
456 |
-
row=2,
|
|
|
457 |
)
|
458 |
-
|
459 |
-
#
|
|
|
460 |
fig.update_layout(
|
|
|
461 |
height=900,
|
462 |
-
|
463 |
-
|
464 |
-
x=0.5,
|
465 |
-
xanchor='center'
|
466 |
-
),
|
467 |
-
showlegend=True,
|
468 |
-
margin=dict(l=50, r=50, t=100, b=150)
|
469 |
-
)
|
470 |
-
|
471 |
-
# Update x-axes to rotate labels properly
|
472 |
-
fig.update_xaxes(
|
473 |
-
tickangle=45,
|
474 |
-
tickfont=dict(size=10),
|
475 |
-
row=1, col=1
|
476 |
)
|
477 |
-
fig.update_xaxes(
|
478 |
-
tickangle=45,
|
479 |
-
tickfont=dict(size=10),
|
480 |
-
row=2, col=1
|
481 |
-
)
|
482 |
-
|
483 |
-
# Update y-axes
|
484 |
-
fig.update_yaxes(title_text="BLEU Score", row=1, col=1)
|
485 |
-
fig.update_yaxes(title_text="Quality Score", row=2, col=1)
|
486 |
-
|
487 |
-
# Add legend manually for Google vs UG40 only
|
488 |
-
fig.add_trace(
|
489 |
-
go.Scatter(
|
490 |
-
x=[None], y=[None],
|
491 |
-
mode='markers',
|
492 |
-
marker=dict(size=15, color='#1f77b4', symbol='square'),
|
493 |
-
name="Google Comparable",
|
494 |
-
showlegend=True
|
495 |
-
)
|
496 |
-
)
|
497 |
-
|
498 |
-
fig.add_trace(
|
499 |
-
go.Scatter(
|
500 |
-
x=[None], y=[None],
|
501 |
-
mode='markers',
|
502 |
-
marker=dict(size=15, color='#ff7f0e', symbol='square'),
|
503 |
-
name="UG40 Only",
|
504 |
-
showlegend=True
|
505 |
-
)
|
506 |
-
)
|
507 |
-
|
508 |
-
return fig
|
509 |
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
subplot_titles=(
|
516 |
-
"Sample Distribution",
|
517 |
-
"Primary Metrics",
|
518 |
-
"Error Analysis",
|
519 |
-
"Coverage Summary"
|
520 |
-
),
|
521 |
-
specs=[[{"type": "pie"}, {"type": "bar"}],
|
522 |
-
[{"type": "bar"}, {"type": "bar"}]]
|
523 |
-
)
|
524 |
-
|
525 |
-
# Sample distribution (pie chart)
|
526 |
-
coverage = validation_info.get('coverage', 0.8)
|
527 |
-
fig.add_trace(
|
528 |
-
go.Pie(
|
529 |
-
labels=["Evaluated", "Missing"],
|
530 |
-
values=[coverage * 100, (1 - coverage) * 100],
|
531 |
-
name="Samples"
|
532 |
-
),
|
533 |
-
row=1, col=1
|
534 |
-
)
|
535 |
-
|
536 |
-
# Primary metrics
|
537 |
-
if 'summary' in evaluation_results:
|
538 |
-
metrics_data = evaluation_results['summary']['primary_metrics']
|
539 |
-
metric_names = list(metrics_data.keys())
|
540 |
-
metric_values = list(metrics_data.values())
|
541 |
-
|
542 |
-
fig.add_trace(
|
543 |
-
go.Bar(
|
544 |
-
x=metric_names,
|
545 |
-
y=metric_values,
|
546 |
-
name="Metrics",
|
547 |
-
text=[f"{val:.3f}" for val in metric_values],
|
548 |
-
textposition='auto'
|
549 |
-
),
|
550 |
-
row=1, col=2
|
551 |
-
)
|
552 |
-
|
553 |
-
# Error analysis (CER, WER)
|
554 |
-
if 'averages' in evaluation_results:
|
555 |
-
error_metrics = ['cer', 'wer']
|
556 |
-
error_values = [evaluation_results['averages'].get(m, 0) for m in error_metrics]
|
557 |
-
|
558 |
-
fig.add_trace(
|
559 |
-
go.Bar(
|
560 |
-
x=error_metrics,
|
561 |
-
y=error_values,
|
562 |
-
name="Errors",
|
563 |
-
text=[f"{val:.3f}" for val in error_values],
|
564 |
-
textposition='auto'
|
565 |
-
),
|
566 |
-
row=2, col=1
|
567 |
-
)
|
568 |
-
|
569 |
-
# Coverage summary
|
570 |
-
if 'summary' in evaluation_results:
|
571 |
-
summary = evaluation_results['summary']
|
572 |
-
coverage_labels = ["Total Samples", "Lang Pairs", "Google Pairs"]
|
573 |
-
coverage_values = [
|
574 |
-
summary.get('total_samples', 0),
|
575 |
-
summary.get('language_pairs_covered', 0),
|
576 |
-
summary.get('google_comparable_pairs', 0)
|
577 |
-
]
|
578 |
-
|
579 |
-
fig.add_trace(
|
580 |
-
go.Bar(
|
581 |
-
x=coverage_labels,
|
582 |
-
y=coverage_values,
|
583 |
-
name="Coverage",
|
584 |
-
text=[f"{val}" for val in coverage_values],
|
585 |
-
textposition='auto'
|
586 |
-
),
|
587 |
-
row=2, col=2
|
588 |
-
)
|
589 |
-
|
590 |
-
fig.update_layout(
|
591 |
-
title="📋 Submission Summary",
|
592 |
-
height=700,
|
593 |
-
showlegend=False
|
594 |
-
)
|
595 |
-
|
596 |
-
return fig
|
|
|
1 |
# src/plotting.py
|
2 |
import matplotlib.pyplot as plt
|
3 |
import matplotlib.gridspec as gridspec
|
|
|
|
|
4 |
import plotly.graph_objects as go
|
5 |
import plotly.express as px
|
6 |
from plotly.subplots import make_subplots
|
|
|
8 |
import numpy as np
|
9 |
from collections import defaultdict
|
10 |
from typing import Dict, List, Optional, Union
|
11 |
+
from config import (
|
12 |
+
LANGUAGE_NAMES,
|
13 |
+
ALL_UG40_LANGUAGES,
|
14 |
+
GOOGLE_SUPPORTED_LANGUAGES,
|
15 |
+
METRICS_CONFIG,
|
16 |
+
EVALUATION_TRACKS,
|
17 |
+
MODEL_CATEGORIES,
|
18 |
+
CHART_CONFIG,
|
19 |
+
STATISTICAL_CONFIG,
|
20 |
+
)
|
21 |
|
22 |
+
# Scientific plotting style: start from matplotlib's defaults, then apply the
# white-background and font-size overrides in a single batch.
plt.style.use("default")
plt.rcParams.update(
    {
        "figure.facecolor": "white",
        "axes.facecolor": "white",
        "font.size": 10,
        "axes.labelsize": 12,
        "axes.titlesize": 14,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
    }
)
|
31 |
+
|
32 |
+
|
33 |
+
def create_scientific_leaderboard_plot(
    df: pd.DataFrame, track: str, metric: str = "quality", top_n: int = 15
) -> go.Figure:
    """Create a horizontal leaderboard bar chart for one evaluation track.

    Args:
        df: Leaderboard table. Reads the columns ``{track}_{metric}``,
            ``model_name``, ``model_category`` and ``author``; uses
            ``{track}_ci_lower``, ``{track}_ci_upper`` and ``{track}_samples``
            when they are present.
        track: Key into ``EVALUATION_TRACKS``; also the column-name prefix.
        metric: Metric suffix of the column to plot.
        top_n: Maximum number of models to show.

    Returns:
        A Plotly figure. When there is nothing to plot (empty frame, missing
        metric column, or no model with a positive score) the figure only
        contains a centred explanatory message.
    """

    def _message_figure(message, title=None, font=None):
        # Paper coordinates keep the message centred regardless of the
        # (empty) axis ranges.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message,
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
            font=font,
        )
        if title is not None:
            placeholder.update_layout(title=title)
        return placeholder

    if df.empty:
        return _message_figure(
            "No models available for this track",
            title=f"No Data Available - {track.title()} Track",
            font=dict(size=16),
        )

    metric_col = f"{track}_{metric}"
    ci_lower_col = f"{track}_ci_lower"
    ci_upper_col = f"{track}_ci_upper"

    if metric_col not in df.columns:
        return _message_figure(f"Metric {metric} not available for {track} track")

    # Keep positive scores only, then rank best-first so that head() really
    # is the top N. The stable sort preserves the incoming order among ties.
    valid_models = (
        df[df[metric_col] > 0]
        .sort_values(metric_col, ascending=False, kind="mergesort")
        .head(top_n)
    )

    if valid_models.empty:
        return _message_figure("No valid models found")

    # Bar colour per model, grey for categories unknown to MODEL_CATEGORIES.
    colors = [
        MODEL_CATEGORIES.get(cat, {}).get("color", "#808080")
        for cat in valid_models["model_category"]
    ]

    # NOTE(review): the CI columns carry no metric name and are paired with
    # `{track}_quality` elsewhere in this file, so they are only drawn for the
    # quality metric; subtracting them from another metric would give
    # meaningless (possibly negative) error-bar lengths. Confirm with the
    # producer of these columns.
    has_ci = (
        metric == "quality"
        and ci_lower_col in valid_models.columns
        and ci_upper_col in valid_models.columns
    )
    error_x = None
    if has_ci:
        error_x = dict(
            type="data",
            array=valid_models[ci_upper_col] - valid_models[metric_col],
            arrayminus=valid_models[metric_col] - valid_models[ci_lower_col],
            visible=True,
            thickness=2,
            width=4,
        )

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            y=valid_models["model_name"],
            x=valid_models[metric_col],
            orientation="h",
            marker=dict(color=colors, line=dict(color="black", width=0.5)),
            error_x=error_x,
            text=[f"{score:.3f}" for score in valid_models[metric_col]],
            textposition="auto",
            # The legend is reserved for the category swatches added below.
            showlegend=False,
            hovertemplate=(
                "<b>%{y}</b><br>"
                + f"{metric.title()}: %{{x:.4f}}<br>"
                + "Category: %{customdata[0]}<br>"
                + "Author: %{customdata[1]}<br>"
                + "Samples: %{customdata[2]}<br>"
                + "<extra></extra>"
            ),
            customdata=list(
                zip(
                    valid_models["model_category"],
                    valid_models["author"],
                    valid_models.get(f"{track}_samples", [0] * len(valid_models)),
                )
            ),
        )
    )

    track_info = EVALUATION_TRACKS[track]
    axis_label = f"{metric.title()} Score"
    if has_ci:
        # Only advertise the interval when error bars are actually drawn.
        axis_label += " (with 95% CI)"

    fig.update_layout(
        title=f"🏆 {track_info['name']} - {metric.title()} Score",
        xaxis_title=axis_label,
        yaxis_title="Models",
        height=max(400, len(valid_models) * 35 + 100),
        margin=dict(l=20, r=20, t=60, b=20),
        plot_bgcolor="white",
        paper_bgcolor="white",
        font=dict(size=12),
    )

    # Reverse y-axis to show best model at top
    fig.update_yaxes(autorange="reversed")

    # Data-less marker traces act as legend swatches for the categories that
    # actually occur among the plotted models.
    shown_categories = set(valid_models["model_category"])
    for category, info in MODEL_CATEGORIES.items():
        if category in shown_categories:
            fig.add_trace(
                go.Scatter(
                    x=[None],
                    y=[None],
                    mode="markers",
                    marker=dict(size=10, color=info["color"]),
                    name=info["name"],
                    showlegend=True,
                )
            )

    return fig
|
160 |
|
161 |
+
|
162 |
+
def create_language_pair_heatmap_scientific(
    model_results: Dict, track: str, metric: str = "quality_score"
) -> go.Figure:
    """Create a source-by-target heatmap of one metric for a single track.

    Args:
        model_results: Mapping with a ``"tracks"`` entry; the selected track's
            ``"pair_metrics"`` is looked up by ``"{src}_to_{tgt}"`` keys.
        track: Key into ``EVALUATION_TRACKS`` and ``model_results["tracks"]``.
        metric: Metric name inside each pair entry. The entry may be a dict
            with a ``"mean"`` field or a plain number.

    Returns:
        A Plotly heatmap (rows = source language, columns = target language),
        or a figure containing only a centred message when no data exists.
    """

    def _message_figure(message):
        # Paper coordinates keep the message centred on an axis-less figure.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message,
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return placeholder

    if not model_results or "tracks" not in model_results:
        return _message_figure("No model results available")

    track_data = model_results["tracks"].get(track, {})
    if track_data.get("error") or "pair_metrics" not in track_data:
        return _message_figure(f"No data available for {track} track")

    pair_metrics = track_data["pair_metrics"]
    track_info = EVALUATION_TRACKS[track]
    track_languages = track_info["languages"]

    # Square matrix; the diagonal and pairs without data stay NaN.
    n_langs = len(track_languages)
    matrix = np.full((n_langs, n_langs), np.nan)

    for i, src_lang in enumerate(track_languages):
        for j, tgt_lang in enumerate(track_languages):
            if src_lang == tgt_lang:
                continue
            pair_key = f"{src_lang}_to_{tgt_lang}"
            if pair_key in pair_metrics and metric in pair_metrics[pair_key]:
                value = pair_metrics[pair_key][metric]
                # Accept both {"mean": ...} summaries and bare numbers.
                if isinstance(value, dict):
                    value = value.get("mean")
                if value is not None:
                    matrix[i, j] = value

    lang_labels = [LANGUAGE_NAMES.get(lang, lang.upper()) for lang in track_languages]
    metric_label = metric.replace("_", " ").title()

    # Plotly requires zmin and zmax to be set together. Quality scores use the
    # fixed 0..1 scale; other metrics run from 0 to the observed maximum, and
    # fall back to automatic scaling when there is nothing positive to show.
    if metric == "quality_score":
        z_bounds = dict(zmin=0, zmax=1)
    elif np.isfinite(matrix).any() and np.nanmax(matrix) > 0:
        z_bounds = dict(zmin=0, zmax=float(np.nanmax(matrix)))
    else:
        z_bounds = {}

    fig = go.Figure(
        data=go.Heatmap(
            z=matrix,
            x=lang_labels,
            y=lang_labels,
            colorscale="Viridis",
            showscale=True,
            colorbar=dict(
                # `title.side` replaces the deprecated `titleside` attribute.
                title=dict(text=f"{metric_label}", side="right"),
                len=0.8,
            ),
            hovertemplate=(
                "Source: %{y}<br>"
                + "Target: %{x}<br>"
                + f"{metric_label}: %{{z:.3f}}<br>"
                + "<extra></extra>"
            ),
            **z_bounds,
        )
    )

    fig.update_layout(
        title=f"🗺️ {track_info['name']} - {metric_label} by Language Pair",
        xaxis_title="Target Language",
        yaxis_title="Source Language",
        height=600,
        width=700,
        font=dict(size=12),
        xaxis=dict(side="bottom"),
        yaxis=dict(autorange="reversed"),  # Source languages from top to bottom
    )

    return fig
|
237 |
+
|
238 |
+
|
239 |
+
def create_statistical_comparison_plot(
    df: pd.DataFrame, track: str, top_n: int = 10
) -> go.Figure:
    """Create a dot-and-interval plot of quality scores with their 95% CIs.

    Args:
        df: Leaderboard table. Reads ``{track}_quality``, ``{track}_ci_lower``,
            ``{track}_ci_upper``, ``model_name`` and ``model_category``.
        track: Key into ``EVALUATION_TRACKS``; also the column-name prefix.
        top_n: Maximum number of models (first rows of ``df`` that qualify)
            to display. Defaults to 10.

    Returns:
        A Plotly figure with one row per model, or a figure containing only a
        centred message when no model has a positive score and both CI bounds.
    """

    def _message_figure(message):
        # Paper coordinates keep the message centred on an axis-less figure.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message,
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return placeholder

    if df.empty:
        return _message_figure("No data available")

    metric_col = f"{track}_quality"
    ci_lower_col = f"{track}_ci_lower"
    ci_upper_col = f"{track}_ci_upper"

    # Without all three columns there is nothing to compare; report that
    # instead of raising KeyError.
    if any(col not in df.columns for col in (metric_col, ci_lower_col, ci_upper_col)):
        return _message_figure("No models with confidence intervals")

    # Filter to models with data for this track
    has_score = df[metric_col] > 0
    has_interval = df[ci_lower_col].notna() & df[ci_upper_col].notna()
    valid_models = df[has_score & has_interval].head(top_n)

    if valid_models.empty:
        return _message_figure("No models with confidence intervals")

    fig = go.Figure()

    for row_pos, (_, model) in enumerate(valid_models.iterrows()):
        category = model["model_category"]
        color = MODEL_CATEGORIES.get(category, {}).get("color", "#808080")
        ci_span = [model[ci_lower_col], model[ci_upper_col]]

        # Point estimate
        fig.add_trace(
            go.Scatter(
                x=[model[metric_col]],
                y=[row_pos],
                mode="markers",
                marker=dict(
                    size=12,
                    color=color,
                    line=dict(color="black", width=1),
                ),
                name=model["model_name"],
                showlegend=False,
                hovertemplate=(
                    f"<b>{model['model_name']}</b><br>"
                    + f"Quality: {model[metric_col]:.4f}<br>"
                    + f"95% CI: [{model[ci_lower_col]:.4f}, {model[ci_upper_col]:.4f}]<br>"
                    + f"Category: {category}<br>"
                    + "<extra></extra>"
                ),
            )
        )

        # Confidence interval line
        fig.add_trace(
            go.Scatter(
                x=ci_span,
                y=[row_pos, row_pos],
                mode="lines",
                line=dict(color=color, width=3),
                showlegend=False,
                hoverinfo="skip",
            )
        )

        # CI end caps. "line-ns" is drawn with the marker outline, so the
        # outline colour has to be set for the caps to match the interval.
        fig.add_trace(
            go.Scatter(
                x=ci_span,
                y=[row_pos, row_pos],
                mode="markers",
                marker=dict(
                    symbol="line-ns",
                    size=10,
                    color=color,
                    line=dict(width=2, color=color),
                ),
                showlegend=False,
                hoverinfo="skip",
            )
        )

    track_info = EVALUATION_TRACKS[track]
    fig.update_layout(
        title=f"📊 {track_info['name']} - Statistical Comparison",
        xaxis_title="Quality Score",
        yaxis_title="Models",
        height=max(400, len(valid_models) * 40 + 100),
        # Rows are plotted at integer positions; label them with model names
        # and put the first row at the top.
        yaxis=dict(
            tickmode="array",
            tickvals=list(range(len(valid_models))),
            ticktext=valid_models["model_name"].tolist(),
            autorange="reversed",
        ),
        showlegend=False,
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    return fig
|
341 |
|
342 |
+
|
343 |
+
def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
    """Create one box plot per model category for a track's quality score.

    Args:
        df: Leaderboard table. Reads ``{track}_quality``, ``{track}_adequate``,
            ``model_category`` and ``model_name``.
        track: Key into ``EVALUATION_TRACKS``; also the column-name prefix.

    Returns:
        A Plotly figure with a box (and all individual points) for every
        category in ``MODEL_CATEGORIES`` that has at least one adequate model
        with a positive score, or a figure containing only a centred message.
    """

    def _message_figure(message):
        # Paper coordinates keep the message centred on an axis-less figure.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message,
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return placeholder

    if df.empty:
        return _message_figure("No data available")

    metric_col = f"{track}_quality"
    adequate_col = f"{track}_adequate"

    # A frame that lacks the track's columns has no adequate models to show.
    if metric_col not in df.columns or adequate_col not in df.columns:
        return _message_figure("No adequate models found")

    # `.eq(True)` treats missing adequacy flags as "not adequate" instead of
    # feeding NaN into the boolean mask.
    is_adequate = df[adequate_col].eq(True)
    valid_models = df[is_adequate & (df[metric_col] > 0)]

    if valid_models.empty:
        return _message_figure("No adequate models found")

    fig = go.Figure()

    for category, info in MODEL_CATEGORIES.items():
        category_models = valid_models[valid_models["model_category"] == category]
        if category_models.empty:
            continue

        fig.add_trace(
            go.Box(
                y=category_models[metric_col],
                name=info["name"],
                marker_color=info["color"],
                boxpoints="all",  # Show all points
                jitter=0.3,
                pointpos=-1.8,
                hovertemplate=(
                    f"<b>{info['name']}</b><br>"
                    + "Quality: %{y:.4f}<br>"
                    + "Model: %{customdata}<br>"
                    + "<extra></extra>"
                ),
                customdata=category_models["model_name"],
            )
        )

    track_info = EVALUATION_TRACKS[track]
    fig.update_layout(
        title=f"📈 {track_info['name']} - Performance by Category",
        xaxis_title="Model Category",
        yaxis_title="Quality Score",
        height=500,
        showlegend=False,
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    return fig
|
402 |
|
403 |
+
|
404 |
+
def create_adequacy_analysis_plot(df: pd.DataFrame) -> go.Figure:
    """Create a 2x2 dashboard summarising statistical adequacy across tracks.

    Panels:
        1. Bar chart of total samples per track (from ``{track}_samples``).
        2. Pie chart of ``scientific_adequacy_score`` binned into
           Poor / Fair / Good / Excellent.
        3. Histogram of ``scientific_adequacy_score``.
        4. Bar chart of model counts per ``model_category``.

    Args:
        df: Leaderboard table. A panel is left empty when the column it needs
            is not present.

    Returns:
        The assembled Plotly figure, or a figure containing only a centred
        message when ``df`` is empty.
    """
    if df.empty:
        fig = go.Figure()
        # Paper coordinates keep the message centred on an axis-less figure.
        fig.add_annotation(
            text="No data available",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return fig

    fig = make_subplots(
        rows=2,
        cols=2,
        subplot_titles=(
            "Sample Sizes by Track",
            "Statistical Adequacy Distribution",
            "Scientific Adequacy Scores",
            "Model Categories Distribution",
        ),
        specs=[
            [{"type": "bar"}, {"type": "pie"}],
            [{"type": "histogram"}, {"type": "bar"}],
        ],
    )

    # Sample sizes by track
    track_names = []
    sample_counts = []
    for track in EVALUATION_TRACKS:
        samples_col = f"{track}_samples"
        if samples_col in df.columns:
            positive = df[samples_col] > 0
            track_names.append(track.replace("_", " ").title())
            sample_counts.append(df.loc[positive, samples_col].sum())

    if track_names:
        fig.add_trace(
            go.Bar(x=track_names, y=sample_counts, name="Samples"), row=1, col=1
        )

    if "scientific_adequacy_score" in df.columns:
        adequacy_scores = df["scientific_adequacy_score"]

        # Statistical adequacy distribution. include_lowest=True closes the
        # first interval on the left so a score of exactly 0 counts as "Poor"
        # instead of being silently dropped.
        adequacy_bins = pd.cut(
            adequacy_scores,
            bins=[0, 0.3, 0.6, 0.8, 1.0],
            labels=["Poor", "Fair", "Good", "Excellent"],
            include_lowest=True,
        )
        adequacy_counts = adequacy_bins.value_counts()

        if not adequacy_counts.empty:
            fig.add_trace(
                go.Pie(
                    labels=adequacy_counts.index,
                    values=adequacy_counts.values,
                    name="Adequacy",
                ),
                row=1,
                col=2,
            )

        # Scientific adequacy scores histogram
        fig.add_trace(
            go.Histogram(x=adequacy_scores, nbinsx=20, name="Adequacy Scores"),
            row=2,
            col=1,
        )

    if "model_category" in df.columns:
        # Model categories distribution, grey for categories unknown to
        # MODEL_CATEGORIES.
        category_counts = df["model_category"].value_counts()
        category_colors = [
            MODEL_CATEGORIES.get(cat, {}).get("color", "#808080")
            for cat in category_counts.index
        ]

        fig.add_trace(
            go.Bar(
                x=category_counts.index,
                y=category_counts.values,
                marker_color=category_colors,
                name="Categories",
            ),
            row=2,
            col=2,
        )

    fig.update_layout(
        title="📊 Scientific Evaluation Analysis", height=800, showlegend=False
    )

    return fig
|
494 |
|
495 |
+
|
496 |
+
def create_cross_track_analysis_plot(df: pd.DataFrame) -> go.Figure:
    """Plot model quality on one evaluation track against another.

    Draws a scatter of per-model quality scores for the first two tracks of
    ``EVALUATION_TRACKS`` that have a ``<track>_quality`` column in ``df``.
    Models are grouped into one trace per ``MODEL_CATEGORIES`` entry, and a
    dashed ``y = x`` line is added as a visual reference.

    Args:
        df: Leaderboard frame. Reads the ``<track>_quality`` columns plus
            ``model_category`` and ``model_name``.

    Returns:
        The scatter figure, or a figure holding only an explanatory
        annotation when ``df`` is empty, fewer than two quality columns are
        present, or fewer than three models score above zero on every
        available track.
    """

    def _message_figure(message: str) -> go.Figure:
        """Return an empty figure carrying a single text note."""
        placeholder = go.Figure()
        # Paper coordinates keep the note's position independent of whatever
        # axis ranges the empty figure ends up with.
        placeholder.add_annotation(
            text=message,
            x=0.5,
            y=0.5,
            xref="paper",
            yref="paper",
            showarrow=False,
        )
        return placeholder

    if df.empty:
        return _message_figure("No data available")

    # Quality columns that actually exist, in EVALUATION_TRACKS order
    available_cols = [
        col
        for col in (f"{track}_quality" for track in EVALUATION_TRACKS)
        if col in df.columns
    ]
    if len(available_cols) < 2:
        return _message_figure("Need at least 2 tracks for comparison")

    # Keep only models with a positive score on every available track
    multi_track_models = df.copy()
    for col in available_cols:
        multi_track_models = multi_track_models[multi_track_models[col] > 0]

    if len(multi_track_models) < 3:
        return _message_figure("Insufficient models for cross-track analysis")

    # Only the first pair of tracks is plotted. At least two columns are
    # guaranteed at this point, so the pair always exists.
    x_col, y_col = available_cols[0], available_cols[1]
    x_track = x_col.replace("_quality", "").replace("_", " ").title()
    y_track = y_col.replace("_quality", "").replace("_", " ").title()

    fig = go.Figure()

    def _add_group(models: pd.DataFrame, label: str, color: str) -> None:
        """Add one scatter trace for a group of models sharing a label."""
        fig.add_trace(
            go.Scatter(
                x=models[x_col],
                y=models[y_col],
                mode="markers",
                marker=dict(
                    size=10,
                    color=color,
                    line=dict(color="black", width=1),
                ),
                name=label,
                text=models["model_name"],
                hovertemplate=(
                    "<b>%{text}</b><br>"
                    + f"{x_track}: %{{x:.4f}}<br>"
                    + f"{y_track}: %{{y:.4f}}<br>"
                    + f"Category: {label}<br>"
                    + "<extra></extra>"
                ),
            )
        )

    # One trace per configured category, colored from MODEL_CATEGORIES
    categories = multi_track_models["model_category"]
    for category, info in MODEL_CATEGORIES.items():
        category_models = multi_track_models[categories == category]
        if len(category_models) > 0:
            _add_group(category_models, info["name"], info["color"])

    # Models with a category missing from MODEL_CATEGORIES passed the filter
    # and shape the diagonal below, so draw them too instead of dropping them.
    uncategorized = multi_track_models[~categories.isin(list(MODEL_CATEGORIES))]
    if len(uncategorized) > 0:
        _add_group(uncategorized, "Other", "#808080")

    # Diagonal reference line spanning the observed score range
    min_val = min(multi_track_models[x_col].min(), multi_track_models[y_col].min())
    max_val = max(multi_track_models[x_col].max(), multi_track_models[y_col].max())

    fig.add_trace(
        go.Scatter(
            x=[min_val, max_val],
            y=[min_val, max_val],
            mode="lines",
            line=dict(dash="dash", color="gray", width=2),
            name="Perfect Correlation",
            showlegend=False,
            hoverinfo="skip",
        )
    )

    fig.update_layout(
        title=f"🔄 Cross-Track Performance: {x_track} vs {y_track}",
        xaxis_title=f"{x_track} Quality Score",
        yaxis_title=f"{y_track} Quality Score",
        height=600,
        width=600,
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    return fig
|
607 |
|
608 |
+
|
609 |
+
def create_scientific_model_detail_plot(
    model_results: Dict, model_name: str, track: str
) -> go.Figure:
    """Build a two-row per-language-pair breakdown for one model on one track.

    The top row shows mean quality scores as bars with error bars spanning
    ``ci_lower``..``ci_upper``; the bottom row shows mean BLEU scores.

    Args:
        model_results: Evaluation results. Reads
            ``model_results["tracks"][track]["pair_metrics"]``, keyed by
            ``"<src>_to_<tgt>"``. Each entry used must provide
            ``quality_score`` (with ``mean``, ``ci_lower``, ``ci_upper``) and
            ``sample_count``; ``bleu`` (with ``mean``) is optional and
            defaults to 0. Entries lacking ``quality_score`` or
            ``sample_count`` are skipped.
        model_name: Label used in the figure title.
        track: Key into ``EVALUATION_TRACKS``; its ``languages`` list decides
            which pairs are looked up and in what order.

    Returns:
        The two-row figure, or a figure holding only an explanatory
        annotation when results, the track, or usable pair data are missing.
    """

    def _message_figure(message: str) -> go.Figure:
        """Return an empty figure carrying a single text note."""
        placeholder = go.Figure()
        # Paper coordinates keep the note's position independent of whatever
        # axis ranges the empty figure ends up with.
        placeholder.add_annotation(
            text=message,
            x=0.5,
            y=0.5,
            xref="paper",
            yref="paper",
            showarrow=False,
        )
        return placeholder

    if not model_results or "tracks" not in model_results:
        return _message_figure("No model results available")

    track_data = model_results["tracks"].get(track, {})
    # A track absent from the configuration is treated like any other
    # missing-data case instead of raising KeyError further down.
    track_info = EVALUATION_TRACKS.get(track)
    if (
        track_info is None
        or track_data.get("error")
        or "pair_metrics" not in track_data
    ):
        return _message_figure(f"No data for {track} track")

    pair_metrics = track_data["pair_metrics"]
    track_languages = track_info["languages"]

    # Collect per-pair series in track language order
    pairs = []
    quality_means = []
    quality_cis = []
    bleu_means = []
    sample_counts = []

    for src in track_languages:
        for tgt in track_languages:
            if src == tgt:
                continue

            metrics = pair_metrics.get(f"{src}_to_{tgt}")
            if metrics is None:
                continue
            if "quality_score" not in metrics or "sample_count" not in metrics:
                continue

            pairs.append(
                f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"
            )

            quality_stats = metrics["quality_score"]
            quality_means.append(quality_stats["mean"])
            quality_cis.append(
                [quality_stats["ci_lower"], quality_stats["ci_upper"]]
            )

            bleu_stats = metrics.get("bleu", {"mean": 0})
            bleu_means.append(bleu_stats["mean"])

            sample_counts.append(metrics["sample_count"])

    if not pairs:
        return _message_figure("No language pair data available")

    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=(
            "Quality Scores by Language Pair (with 95% CI)",
            "BLEU Scores by Language Pair",
        ),
        vertical_spacing=0.15,
    )

    # Plotly takes error bars as offsets from the bar value, so convert the
    # absolute CI bounds into distances above and below the mean.
    error_y = dict(
        type="data",
        array=[ci[1] - mean for ci, mean in zip(quality_cis, quality_means)],
        arrayminus=[mean - ci[0] for ci, mean in zip(quality_cis, quality_means)],
        visible=True,
        thickness=2,
        width=4,
    )

    # Row 1: quality scores with confidence intervals
    fig.add_trace(
        go.Bar(
            x=pairs,
            y=quality_means,
            error_y=error_y,
            name="Quality Score",
            marker_color="steelblue",
            text=[f"{score:.3f}" for score in quality_means],
            textposition="outside",
            hovertemplate=(
                "<b>%{x}</b><br>"
                + "Quality: %{y:.4f}<br>"
                + "Samples: %{customdata}<br>"
                + "<extra></extra>"
            ),
            customdata=sample_counts,
        ),
        row=1,
        col=1,
    )

    # Row 2: BLEU scores
    fig.add_trace(
        go.Bar(
            x=pairs,
            y=bleu_means,
            name="BLEU Score",
            marker_color="coral",
            text=[f"{score:.1f}" for score in bleu_means],
            textposition="outside",
        ),
        row=2,
        col=1,
    )

    fig.update_layout(
        title=f"🔬 Detailed Analysis: {model_name} - {track_info['name']}",
        height=900,
        showlegend=False,
        margin=dict(l=50, r=50, t=100, b=150),
    )

    # Rotate the language-pair labels on both rows
    fig.update_xaxes(tickangle=45, row=1, col=1)
    fig.update_xaxes(tickangle=45, row=2, col=1)

    return fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|