Spaces:

akera
/

leaderboard

Sleeping

App Files Files Community

akera committed on Jun 12

Commit

b78ec70

verified ·

1 Parent(s): aa99a22

Update src/utils.py

Browse files

Files changed (1) hide show

src/utils.py +110 -50

src/utils.py CHANGED Viewed

@@ -1,57 +1,117 @@
 # src/utils.py
 import re
 import datetime
-from typing import Dict, List, Any
-import salt.constants
-def get_language_name(lang_code: str) -> str:
-    """Get full language name from ISO code."""
-    if lang_code is None:
-        return "Unknown"
-    return salt.constants.SALT_LANGUAGE_NAMES.get(lang_code, str(lang_code))
-def format_model_name(model_path: str) -> str:
-    """Format model name for display in leaderboard."""
-    if model_path == 'google-translate':
-        return 'Google Translate'
-    # Extract model name from HuggingFace path
-    if '/' in model_path:
-        return model_path.split('/')[-1]
-    return model_path
-def validate_model_path(model_path: str) -> bool:
-    """Validate if model path is supported."""
-    if model_path == 'google-translate':
-        return True
-    # Check if it's a valid HuggingFace model path format
-    pattern = r'^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$'
-    return bool(re.match(pattern, model_path)) or not '/' in model_path
-def get_model_type(model_path: str) -> str:
-    """Determine model type from path."""
-    model_path_lower = model_path.lower()
-    if model_path == 'google-translate':
-        return 'google-translate'
-    elif 'gemma' in model_path_lower:
-        return 'gemma'
-    elif 'qwen' in model_path_lower:
-        return 'qwen'
-    elif 'llama' in model_path_lower:
-        return 'llama'
-    elif 'nllb' in model_path_lower:
-        return 'nllb'
-    else:
-        return 'other'
 def create_submission_id() -> str:
     """Create unique submission ID."""
-    return datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-def sanitize_input(text: str) -> str:
-    """Sanitize user input."""
-    if not text:
-        return ""
-    return re.sub(r'[^\w\-./]', '', text.strip())

 # src/utils.py
 import re
 import datetime
+import pandas as pd
+from typing import Dict, List, Tuple, Set
+from config import ALL_UG40_LANGUAGES, LANGUAGE_NAMES, GOOGLE_SUPPORTED_LANGUAGES
def get_all_language_pairs() -> List[Tuple[str, str]]:
    """Return every ordered (source, target) pair of distinct UG40 languages.

    Both directions of a pair are listed, so ``(a, b)`` and ``(b, a)`` each
    appear; a language is never paired with itself.
    """
    return [
        (source, target)
        for source in ALL_UG40_LANGUAGES
        for target in ALL_UG40_LANGUAGES
        if source != target
    ]
def get_google_comparable_pairs() -> List[Tuple[str, str]]:
    """Return the ordered language pairs usable for a Google Translate comparison.

    A pair qualifies when both its source and its target are listed in
    ``GOOGLE_SUPPORTED_LANGUAGES``; a language is never paired with itself.
    """
    return [
        (source, target)
        for source in GOOGLE_SUPPORTED_LANGUAGES
        for target in GOOGLE_SUPPORTED_LANGUAGES
        if source != target
    ]
def format_language_pair(src: str, tgt: str) -> str:
    """Render a language pair as ``"<source name> → <target name>"``.

    Codes that have no entry in ``LANGUAGE_NAMES`` are shown as the raw code.
    """
    def display_name(code: str) -> str:
        # Fall back to the code itself when no display name is registered.
        return LANGUAGE_NAMES.get(code, code)

    return f"{display_name(src)} → {display_name(tgt)}"
def validate_language_code(lang: str) -> bool:
    """Report whether ``lang`` is one of the codes in ``ALL_UG40_LANGUAGES``."""
    is_known_code = lang in ALL_UG40_LANGUAGES
    return is_known_code
 def create_submission_id() -> str:
     """Create unique submission ID."""
+    return datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
def sanitize_model_name(name: str) -> str:
    """Sanitize a user-supplied model name for display.

    Surrounding whitespace is stripped, every character other than a word
    character, ``-`` or ``.`` is replaced with ``_``, and the result is
    truncated to 50 characters.

    Args:
        name: Raw model name; may be ``None`` or empty.

    Returns:
        The sanitized name, or ``"Anonymous Model"`` when ``name`` is
        ``None``, empty, or consists only of whitespace.
    """
    # Strip before the emptiness check so a whitespace-only name falls back to
    # the placeholder instead of producing an empty display name.
    stripped = name.strip() if name else ""
    if not stripped:
        return "Anonymous Model"
    return re.sub(r'[^\w\-.]', '_', stripped)[:50]
def format_metric_value(value: float, metric: str) -> str:
    """Format a metric value as a display string.

    ``bleu`` is shown with two decimals. Every other metric is shown with
    four decimals, and the error rates ``cer`` and ``wer`` are capped at 1.0
    before formatting.
    """
    if metric == 'bleu':
        return f"{value:.2f}"
    if metric in ('cer', 'wer'):
        # Error rates above 1.0 are displayed as exactly 1.0.
        value = min(value, 1.0)
    return f"{value:.4f}"
def get_language_pair_stats(test_data: pd.DataFrame) -> Dict[str, Dict]:
    """Summarise test-data coverage for every ordered UG40 language pair.

    Args:
        test_data: Frame with ``source_language`` and ``target_language``
            columns.

    Returns:
        A dict keyed by ``"<src>_<tgt>"``. Each value holds ``count`` (number
        of matching rows, 0 when the pair is absent), ``google_comparable``
        (both codes are in ``GOOGLE_SUPPORTED_LANGUAGES``) and
        ``display_name`` (from ``format_language_pair``).
    """
    summary = {}
    for source in ALL_UG40_LANGUAGES:
        for target in ALL_UG40_LANGUAGES:
            if source == target:
                continue
            in_pair = (
                (test_data['source_language'] == source)
                & (test_data['target_language'] == target)
            )
            matching_rows = test_data[in_pair]
            on_google = (
                source in GOOGLE_SUPPORTED_LANGUAGES
                and target in GOOGLE_SUPPORTED_LANGUAGES
            )
            summary[f"{source}_{target}"] = {
                'count': len(matching_rows),
                'google_comparable': on_google,
                'display_name': format_language_pair(source, target),
            }
    return summary
def validate_submission_completeness(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
    """Check that a submission covers every sample in the test set.

    Sample IDs from both frames are compared as strings, so an integer ID in
    one frame matches the same ID stored as text in the other.

    Args:
        predictions: Submitted predictions; must have a ``sample_id`` column.
        test_set: Reference test set; must have a ``sample_id`` column.

    Returns:
        A dict with:
            ``is_complete``: True when no required ID is missing.
            ``missing_count``: number of required IDs absent from the submission.
            ``extra_count``: number of submitted IDs not in the test set.
            ``missing_ids``: up to 10 missing IDs, in sorted order, for display.
            ``coverage``: fraction of required IDs that were provided; 1.0
                when the test set is empty (nothing is missing).
    """
    required_ids = set(test_set['sample_id'].astype(str))
    provided_ids = set(predictions['sample_id'].astype(str))
    missing_ids = required_ids - provided_ids
    extra_ids = provided_ids - required_ids

    if required_ids:
        coverage = len(provided_ids & required_ids) / len(required_ids)
    else:
        # An empty test set would otherwise divide by zero; report full
        # coverage so the value agrees with is_complete being True.
        coverage = 1.0

    return {
        'is_complete': not missing_ids,
        'missing_count': len(missing_ids),
        'extra_count': len(extra_ids),
        # Sort so the preview is stable between runs; set order is arbitrary.
        'missing_ids': sorted(missing_ids)[:10],
        'coverage': coverage,
    }
def calculate_language_pair_coverage(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
    """Calculate prediction coverage for each UG40 language pair in the test set.

    Args:
        predictions: Submitted predictions with ``sample_id`` and
            ``prediction`` columns.
        test_set: Reference test set with ``sample_id``, ``source_language``
            and ``target_language`` columns.

    Returns:
        A dict keyed by ``"<src>_<tgt>"`` for every ordered pair of distinct
        ``ALL_UG40_LANGUAGES`` codes that has at least one test row. Each
        value holds ``total`` (test rows for the pair), ``predicted`` (rows
        with a non-null prediction) and ``coverage`` (``predicted / total``),
        as plain Python ``int``/``float`` values.
    """
    # Compare IDs as strings, as validate_submission_completeness does, so an
    # int/str dtype mismatch between the two frames cannot break the merge.
    reference = test_set.assign(sample_id=test_set['sample_id'].astype(str))
    submitted = predictions.assign(sample_id=predictions['sample_id'].astype(str))
    # Keep one prediction per sample (the first); duplicate IDs would
    # otherwise multiply test rows in the left join and inflate the totals.
    submitted = submitted.drop_duplicates(subset='sample_id', keep='first')

    merged = reference.merge(submitted, on='sample_id', how='left', suffixes=('', '_pred'))

    # Count rows per (source, target) in a single pass over the merged frame.
    has_prediction = merged['prediction'].notna()
    pair_counts = {}
    grouped = has_prediction.groupby(
        [merged['source_language'], merged['target_language']]
    )
    for pair, flags in grouped:
        pair_counts[pair] = (len(flags), int(flags.sum()))

    coverage = {}
    for src in ALL_UG40_LANGUAGES:
        for tgt in ALL_UG40_LANGUAGES:
            if src == tgt:
                continue
            total, predicted = pair_counts.get((src, tgt), (0, 0))
            if total == 0:
                # Pairs with no test rows are left out of the result.
                continue
            coverage[f"{src}_{tgt}"] = {
                'total': total,
                'predicted': predicted,
                'coverage': predicted / total,
            }
    return coverage