Spaces:
Running
Running
File size: 4,269 Bytes
1abade4 b78ec70 1abade4 b78ec70 1abade4 b78ec70 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# src/utils.py
import re
import datetime
import pandas as pd
from typing import Dict, List, Tuple, Set
from config import ALL_UG40_LANGUAGES, LANGUAGE_NAMES, GOOGLE_SUPPORTED_LANGUAGES
def get_all_language_pairs() -> List[Tuple[str, str]]:
    """Return every ordered (source, target) pair of distinct UG40 languages.

    Pairs are grouped by source language, then by target language, both in
    the iteration order of ``ALL_UG40_LANGUAGES``.
    """
    return [
        (source, target)
        for source in ALL_UG40_LANGUAGES
        for target in ALL_UG40_LANGUAGES
        if source != target
    ]
def get_google_comparable_pairs() -> List[Tuple[str, str]]:
    """Return the ordered language pairs drawn from ``GOOGLE_SUPPORTED_LANGUAGES``.

    Both members of each pair come from ``GOOGLE_SUPPORTED_LANGUAGES`` and a
    language is never paired with itself.
    """
    return [
        (source, target)
        for source in GOOGLE_SUPPORTED_LANGUAGES
        for target in GOOGLE_SUPPORTED_LANGUAGES
        if source != target
    ]
def format_language_pair(src: str, tgt: str) -> str:
    """Build the display label for a source/target language pair.

    Each code is looked up in ``LANGUAGE_NAMES``; a code with no entry is
    shown as-is.
    """
    source_label, target_label = (
        LANGUAGE_NAMES.get(code, code) for code in (src, tgt)
    )
    return f"{source_label} → {target_label}"
def validate_language_code(lang: str) -> bool:
    """Report whether ``lang`` appears in ``ALL_UG40_LANGUAGES``."""
    is_supported = lang in ALL_UG40_LANGUAGES
    return is_supported
def create_submission_id() -> str:
    """Build a submission ID from the current local time.

    The ID has the form ``YYYYMMDD_HHMMSS_mmm``: the six-digit microsecond
    field is cut down to its first three digits (milliseconds).
    """
    now = datetime.datetime.now()
    stamp_with_micros = now.strftime("%Y%m%d_%H%M%S_%f")
    # Drop the last three microsecond digits, leaving millisecond precision.
    return stamp_with_micros[:-3]
def sanitize_model_name(name: str) -> str:
    """Sanitize a user-supplied model name for display.

    Surrounding whitespace is stripped, every character other than a word
    character, hyphen or dot is replaced with an underscore, and the result
    is truncated to 50 characters.

    Args:
        name: Raw model name; may be empty or ``None``.

    Returns:
        The sanitized name, or ``"Anonymous Model"`` when no usable name was
        supplied (empty, ``None`` or whitespace only).
    """
    # Strip before the emptiness check so a whitespace-only name falls back
    # to the placeholder instead of turning into an empty display string.
    stripped = name.strip() if name else ""
    if not stripped:
        return "Anonymous Model"
    # Replace special characters, then limit the length.
    return re.sub(r'[^\w\-.]', '_', stripped)[:50]
def format_metric_value(value: float, metric: str) -> str:
    """Render a metric value as a fixed-precision string for display.

    ``bleu`` is shown with two decimals. Every other metric is shown with
    four decimals, and ``cer``/``wer`` values above 1 are capped at 1.0.
    """
    if metric in ('bleu',):
        return f"{value:.2f}"

    is_error_rate = metric in ('cer', 'wer')
    # Cap error rates at 1.0
    shown = min(value, 1.0) if is_error_rate and value > 1 else value
    return f"{shown:.4f}"
def get_language_pair_stats(test_data: pd.DataFrame) -> Dict[str, Dict]:
    """Summarise test-data coverage for every ordered UG40 language pair.

    The result is keyed by ``"<src>_<tgt>"``. Each value holds the number of
    rows in ``test_data`` whose ``source_language``/``target_language`` match
    the pair (``count``), whether both languages are in
    ``GOOGLE_SUPPORTED_LANGUAGES`` (``google_comparable``), and a display
    label (``display_name``). Pairs with no rows are still reported, with a
    count of zero.
    """
    summary = {}
    for source in ALL_UG40_LANGUAGES:
        for target in ALL_UG40_LANGUAGES:
            if source == target:
                continue  # a language is never paired with itself

            matches_source = test_data['source_language'] == source
            matches_target = test_data['target_language'] == target
            rows_for_pair = test_data[matches_source & matches_target]

            summary[f"{source}_{target}"] = {
                'count': len(rows_for_pair),
                'google_comparable': (
                    source in GOOGLE_SUPPORTED_LANGUAGES
                    and target in GOOGLE_SUPPORTED_LANGUAGES
                ),
                'display_name': format_language_pair(source, target),
            }
    return summary
def validate_submission_completeness(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
    """Validate that a submission covers all required test samples.

    Sample IDs from both frames are compared as strings, so an integer ID in
    one frame matches the same ID stored as text in the other.

    Args:
        predictions: Submitted rows; must contain a ``sample_id`` column.
        test_set: Reference rows; must contain a ``sample_id`` column.

    Returns:
        A dict with:
            ``is_complete``: True when no required ID is missing.
            ``missing_count``: number of required IDs not in the submission.
            ``extra_count``: number of submitted IDs not in the test set.
            ``missing_ids``: up to 10 missing IDs, in sorted order.
            ``coverage``: fraction of required IDs present in the submission
            (1.0 when the test set has no IDs).
    """
    required_ids = set(test_set['sample_id'].astype(str))
    provided_ids = set(predictions['sample_id'].astype(str))

    missing_ids = required_ids - provided_ids
    extra_ids = provided_ids - required_ids
    matched_count = len(required_ids) - len(missing_ids)

    # An empty test set has nothing left to cover: report full coverage,
    # consistent with is_complete, instead of dividing by zero.
    coverage = matched_count / len(required_ids) if required_ids else 1.0

    return {
        'is_complete': len(missing_ids) == 0,
        'missing_count': len(missing_ids),
        'extra_count': len(extra_ids),
        # Sort before truncating so the displayed sample is the same on
        # every run; set iteration order is not stable for strings.
        'missing_ids': sorted(missing_ids)[:10],
        'coverage': coverage,
    }
def calculate_language_pair_coverage(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
    """Calculate prediction coverage for each language pair in the test set.

    Args:
        predictions: Submitted rows; must contain ``sample_id`` and
            ``prediction`` columns.
        test_set: Reference rows; must contain ``sample_id``,
            ``source_language`` and ``target_language`` columns.

    Returns:
        A dict keyed by ``"<src>_<tgt>"`` for every ordered pair of distinct
        languages from ``ALL_UG40_LANGUAGES`` that has at least one test row.
        Each value holds ``total`` (test rows for the pair), ``predicted``
        (how many of those rows have a non-null prediction) and ``coverage``
        (``predicted / total``), all as plain Python numbers.
    """
    # Compare sample IDs as strings, matching validate_submission_completeness;
    # merging an integer key against a text key would otherwise raise.
    reference = test_set[['sample_id', 'source_language', 'target_language']].assign(
        sample_id=test_set['sample_id'].astype(str)
    )

    # Keep only rows that actually carry a prediction, one per sample, so a
    # duplicated submission row cannot inflate the per-pair totals. Taking
    # just the two needed columns also avoids name clashes in the merge.
    answered = predictions.loc[
        predictions['prediction'].notna(), ['sample_id', 'prediction']
    ]
    answered = answered.assign(sample_id=answered['sample_id'].astype(str))
    answered = answered.drop_duplicates(subset='sample_id')

    # Left merge: every test row is kept; unanswered rows get a null prediction.
    merged = reference.merge(answered, on='sample_id', how='left')

    coverage = {}
    for src in ALL_UG40_LANGUAGES:
        for tgt in ALL_UG40_LANGUAGES:
            if src == tgt:
                continue
            pair_rows = merged[
                (merged['source_language'] == src) &
                (merged['target_language'] == tgt)
            ]
            total = len(pair_rows)
            if total == 0:
                continue  # pairs absent from the test set are not reported
            # int() turns the NumPy integer into a plain int so the result
            # can be serialized (e.g. with json) without a custom encoder.
            predicted = int(pair_rows['prediction'].notna().sum())
            coverage[f"{src}_{tgt}"] = {
                'total': total,
                'predicted': predicted,
                'coverage': predicted / total,
            }
    return coverage