Spaces:

akera
/

leaderboard

Sleeping

App Files Files Community

akera committed on Jun 12

Commit

4fa2f10

verified ·

1 Parent(s): c1926c2

Rename src/evaluation.py to src/validation.py

Browse files

Files changed (2) hide show

src/evaluation.py +0 -413
src/validation.py +274 -0

src/evaluation.py DELETED Viewed

@@ -1,413 +0,0 @@
-# src/evaluation.py
-import torch
-import numpy as np
-from tqdm.auto import tqdm
-from sacrebleu.metrics import BLEU, CHRF
-from rouge_score import rouge_scorer
-import Levenshtein
-from collections import defaultdict
-from transformers.models.whisper.english_normalizer import BasicTextNormalizer
-import salt.constants
-import datetime
-import os
-from google.cloud import translate_v3
-from config import GOOGLE_LANG_MAP
-def setup_google_translate():
-    """Setup Google Cloud Translation client if credentials available."""
-    try:
-        # Check if running in HF Space with credentials
-        if os.getenv("GOOGLE_APPLICATION_CREDENTIALS") or os.getenv("GOOGLE_CLOUD_PROJECT"):
-            client = translate_v3.TranslationServiceClient()
-            project_id = os.getenv("GOOGLE_CLOUD_PROJECT", "sb-gcp-project-01")
-            parent = f"projects/{project_id}/locations/global"
-            return client, parent
-        else:
-            print("Google Cloud credentials not found. Google Translate will not be available.")
-            return None, None
-    except Exception as e:
-        print(f"Error setting up Google Translate: {e}")
-        return None, None
-def google_translate_batch(texts, source_langs, target_langs, client, parent):
-    """Translate using Google Cloud Translation API."""
-    translations = []
-    for text, src_lang, tgt_lang in tqdm(zip(texts, source_langs, target_langs),
-                                       total=len(texts), desc="Google Translate"):
-        try:
-            # Map SALT language codes to Google's format
-            src_google = GOOGLE_LANG_MAP.get(src_lang, src_lang)
-            tgt_google = GOOGLE_LANG_MAP.get(tgt_lang, tgt_lang)
-            # Check if language pair is supported
-            supported_langs = ['lg', 'ach', 'sw', 'en']
-            if src_google not in supported_langs or tgt_google not in supported_langs:
-                translations.append(f"[UNSUPPORTED: {src_lang}->{tgt_lang}]")
-                continue
-            # Make translation request
-            request = {
-                "parent": parent,
-                "contents": [text],
-                "mime_type": "text/plain",
-                "source_language_code": src_google,
-                "target_language_code": tgt_google,
-            }
-            response = client.translate_text(request=request)
-            translation = response.translations[0].translated_text
-            translations.append(translation)
-        except Exception as e:
-            print(f"Error translating '{text}': {e}")
-            translations.append(f"[ERROR: {str(e)[:50]}]")
-    return translations
-def get_translation_function(model, tokenizer, model_path):
-    """Get appropriate translation function based on model type."""
-    if model_path == 'google-translate':
-        client, parent = setup_google_translate()
-        if client is None:
-            raise Exception("Google Translate credentials not available")
-        def translation_fn(texts, from_langs, to_langs):
-            return google_translate_batch(texts, from_langs, to_langs, client, parent)
-        return translation_fn
-    elif 'gemma' in str(type(model)).lower() or 'gemma' in model_path.lower():
-        return get_gemma_translation_fn(model, tokenizer)
-    elif hasattr(model, 'base_model') and hasattr(model.base_model, 'model') and 'Qwen2ForCausalLM' in str(type(model.base_model.model)):
-        return get_qwen_translation_fn(model, tokenizer)
-    elif 'm2m_100' in str(type(model)).lower():
-        return get_nllb_translation_fn(model, tokenizer)
-    elif hasattr(model, 'base_model') and hasattr(model.base_model, 'model') and 'LlamaForCausalLM' in str(type(model.base_model.model)):
-        return get_llama_translation_fn(model, tokenizer)
-    else:
-        # Generic function for other models
-        return get_generic_translation_fn(model, tokenizer)
-def get_gemma_translation_fn(model, tokenizer):
-    """Translation function for Gemma models."""
-    def translation_fn(texts, from_langs, to_langs):
-        SYSTEM_MESSAGE = 'You are a linguist and translation assistant specialising in Ugandan languages.'
-        translations = []
-        batch_size = 4
-        device = next(model.parameters()).device
-        instructions = [
-            f'Translate from {salt.constants.SALT_LANGUAGE_NAMES[from_lang]} '
-            f'to {salt.constants.SALT_LANGUAGE_NAMES[to_lang]}: {text}'
-            for text, from_lang, to_lang in zip(texts, from_langs, to_langs)
-        ]
-        for i in tqdm(range(0, len(instructions), batch_size), desc="Generating translations"):
-            batch_instructions = instructions[i:i + batch_size]
-            messages_list = [
-                [
-                    {"role": "system", "content": SYSTEM_MESSAGE},
-                    {"role": "user", "content": instruction}
-                ] for instruction in batch_instructions
-            ]
-            prompts = [
-                tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True
-                ) for messages in messages_list
-            ]
-            inputs = tokenizer(
-                prompts, return_tensors="pt",
-                padding=True, padding_side='left',
-                max_length=512, truncation=True
-            ).to(device)
-            with torch.no_grad():
-                outputs = model.generate(
-                    **inputs,
-                    max_new_tokens=100,
-                    temperature=0.5,
-                    num_beams=5,
-                    do_sample=True,
-                    no_repeat_ngram_size=5,
-                    pad_token_id=tokenizer.eos_token_id
-                )
-            for j in range(len(outputs)):
-                translation = tokenizer.decode(
-                    outputs[j, inputs['input_ids'].shape[1]:],
-                    skip_special_tokens=True
-                )
-                translations.append(translation)
-        return translations
-    return translation_fn
-def get_qwen_translation_fn(model, tokenizer):
-    """Translation function for Qwen models."""
-    def translation_fn(texts, from_langs, to_langs):
-        SYSTEM_MESSAGE = 'You are a Ugandan language assistant.'
-        translations = []
-        batch_size = 8
-        device = next(model.parameters()).device
-        instructions = [
-            f'Translate from {salt.constants.SALT_LANGUAGE_NAMES.get(from_lang, from_lang)} '
-            f'to {salt.constants.SALT_LANGUAGE_NAMES.get(to_lang, to_lang)}: {text}'
-            for text, from_lang, to_lang in zip(texts, from_langs, to_langs)
-        ]
-        for i in tqdm(range(0, len(instructions), batch_size), desc="Generating translations"):
-            batch_instructions = instructions[i:i + batch_size]
-            messages_list = [
-                [
-                    {"role": "system", "content": SYSTEM_MESSAGE},
-                    {"role": "user", "content": instruction}
-                ] for instruction in batch_instructions
-            ]
-            prompts = [
-                tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True
-                ) for messages in messages_list
-            ]
-            inputs = tokenizer(
-                prompts, return_tensors="pt",
-                padding=True, padding_side='left', truncation=True
-            ).to(device)
-            with torch.no_grad():
-                outputs = model.generate(
-                    **inputs, max_new_tokens=100,
-                    temperature=0.01,
-                    pad_token_id=tokenizer.eos_token_id
-                )
-            for j in range(len(outputs)):
-                translation = tokenizer.decode(
-                    outputs[j, inputs['input_ids'].shape[1]:],
-                    skip_special_tokens=True
-                )
-                translations.append(translation)
-        return translations
-    return translation_fn
-def get_nllb_translation_fn(model, tokenizer):
-    """Translation function for NLLB models."""
-    def translation_fn(texts, source_langs, target_langs):
-        translations = []
-        language_tokens = salt.constants.SALT_LANGUAGE_TOKENS_NLLB_TRANSLATION
-        device = next(model.parameters()).device
-        for text, source_language, target_language in tqdm(
-            zip(texts, source_langs, target_langs), total=len(texts), desc="NLLB Translation"):
-            inputs = tokenizer(text, return_tensors="pt").to(device)
-            inputs['input_ids'][0][0] = language_tokens[source_language]
-            with torch.no_grad():
-                translated_tokens = model.generate(
-                    **inputs,
-                    forced_bos_token_id=language_tokens[target_language],
-                    max_length=100,
-                    num_beams=5,
-                )
-            result = tokenizer.batch_decode(
-                translated_tokens, skip_special_tokens=True)[0]
-            translations.append(result)
-        return translations
-    return translation_fn
-def get_llama_translation_fn(model, tokenizer):
-    """Translation function for Llama models."""
-    def translation_fn(texts, from_langs, to_langs):
-        DATE_TODAY = datetime.datetime.now().strftime("%d %b %Y")
-        SYSTEM_MESSAGE = ''
-        translations = []
-        batch_size = 8
-        device = next(model.parameters()).device
-        instructions = [
-            f'Translate from {salt.constants.SALT_LANGUAGE_NAMES.get(from_lang, from_lang)} '
-            f'to {salt.constants.SALT_LANGUAGE_NAMES.get(to_lang, to_lang)}: {text}'
-            for text, from_lang, to_lang in zip(texts, from_langs, to_langs)
-        ]
-        for i in tqdm(range(0, len(instructions), batch_size), desc="Llama Translation"):
-            batch_instructions = instructions[i:i + batch_size]
-            messages_list = [
-                [
-                    {"role": "system", "content": SYSTEM_MESSAGE},
-                    {"role": "user", "content": instruction}
-                ] for instruction in batch_instructions
-            ]
-            prompts = [
-                tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True,
-                    date_string=DATE_TODAY,
-                ) for messages in messages_list
-            ]
-            inputs = tokenizer(
-                prompts, return_tensors="pt",
-                padding=True, padding_side='left',
-            ).to(device)
-            with torch.no_grad():
-                outputs = model.generate(
-                    **inputs, max_new_tokens=100,
-                    temperature=0.01,
-                    pad_token_id=tokenizer.eos_token_id
-                )
-            for j in range(len(outputs)):
-                translation = tokenizer.decode(
-                    outputs[j, inputs['input_ids'].shape[1]:],
-                    skip_special_tokens=True
-                )
-                translations.append(translation)
-        return translations
-    return translation_fn
-def get_generic_translation_fn(model, tokenizer):
-    """Generic translation function for unknown model types."""
-    def translation_fn(texts, from_langs, to_langs):
-        translations = []
-        device = next(model.parameters()).device
-        for text, from_lang, to_lang in tqdm(zip(texts, from_langs, to_langs),
-                                           desc="Generic Translation"):
-            prompt = f"Translate from {from_lang} to {to_lang}: {text}"
-            inputs = tokenizer(prompt, return_tensors="pt").to(device)
-            with torch.no_grad():
-                outputs = model.generate(
-                    **inputs,
-                    max_new_tokens=100,
-                    temperature=0.7,
-                    pad_token_id=tokenizer.eos_token_id
-                )
-            translation = tokenizer.decode(
-                outputs[0, inputs['input_ids'].shape[1]:],
-                skip_special_tokens=True
-            )
-            translations.append(translation)
-        return translations
-    return translation_fn
-def calculate_metrics(reference: str, prediction: str) -> dict:
-    """Calculate multiple translation quality metrics."""
-    bleu = BLEU(effective_order=True)
-    bleu_score = bleu.sentence_score(prediction, [reference]).score
-    chrf = CHRF()
-    chrf_score = chrf.sentence_score(prediction, [reference]).score / 100.0
-    cer = Levenshtein.distance(reference, prediction) / max(len(reference), 1)
-    ref_words = reference.split()
-    pred_words = prediction.split()
-    wer = Levenshtein.distance(ref_words, pred_words) / max(len(ref_words), 1)
-    len_ratio = len(prediction) / max(len(reference), 1)
-    metrics = {
-        "bleu": bleu_score,
-        "chrf": chrf_score,
-        "cer": cer,
-        "wer": wer,
-        "len_ratio": len_ratio,
-    }
-    try:
-        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
-        rouge_scores = scorer.score(reference, prediction)
-        metrics["rouge1"] = rouge_scores['rouge1'].fmeasure
-        metrics["rouge2"] = rouge_scores['rouge2'].fmeasure
-        metrics["rougeL"] = rouge_scores['rougeL'].fmeasure
-        metrics["quality_score"] = (
-            bleu_score/100 +
-            chrf_score +
-            (1-cer) +
-            (1-wer) +
-            rouge_scores['rouge1'].fmeasure +
-            rouge_scores['rougeL'].fmeasure
-        ) / 6
-    except Exception as e:
-        print(f"Error calculating ROUGE metrics: {e}")
-        metrics["quality_score"] = (bleu_score/100 + chrf_score + (1-cer) + (1-wer)) / 4
-    return metrics
-def evaluate_model_full(model, tokenizer, model_path: str, test_data) -> dict:
-    """Complete model evaluation pipeline."""
-    # Get translation function
-    translation_fn = get_translation_function(model, tokenizer, model_path)
-    # Generate predictions
-    print("Generating translations...")
-    predictions = translation_fn(
-        list(test_data['source']),
-        list(test_data['source.language']),
-        list(test_data['target.language']),
-    )
-    # Calculate metrics by language pair
-    print("Calculating metrics...")
-    translation_subsets = defaultdict(list)
-    for idx, row in test_data.iterrows():
-        direction = row['source.language'] + '_to_' + row['target.language']
-        row_dict = dict(row)
-        row_dict['prediction'] = predictions[idx]
-        translation_subsets[direction].append(row_dict)
-    normalizer = BasicTextNormalizer()
-    grouped_metrics = defaultdict(dict)
-    for subset in translation_subsets.keys():
-        subset_metrics = defaultdict(list)
-        for example in translation_subsets[subset]:
-            prediction = normalizer(str(example['prediction']))
-            reference = normalizer(example['target'])
-            metrics = calculate_metrics(reference, prediction)
-            for m in metrics.keys():
-                subset_metrics[m].append(metrics[m])
-        for m in subset_metrics.keys():
-            if subset_metrics[m]:  # Check if list is not empty
-                grouped_metrics[subset][m] = float(np.mean(subset_metrics[m]))
-    # Calculate overall averages
-    all_metrics = list(grouped_metrics.values())[0].keys() if grouped_metrics else []
-    for m in all_metrics:
-        metric_values = []
-        for subset in translation_subsets.keys():
-            if m in grouped_metrics[subset]:
-                metric_values.append(grouped_metrics[subset][m])
-        if metric_values:
-            grouped_metrics['averages'][m] = float(np.mean(metric_values))
-    return dict(grouped_metrics)

src/validation.py ADDED Viewed

	@@ -0,0 +1,274 @@

+# src/validation.py
+import pandas as pd
+import numpy as np
+from typing import Dict, List, Tuple, Optional
+import json
+import io
+from config import PREDICTION_FORMAT
+def validate_file_format(file_content: bytes, filename: str) -> Dict:
+    """Validate uploaded file format and structure."""
+    try:
+        # Determine file type
+        if filename.endswith('.csv'):
+            df = pd.read_csv(io.BytesIO(file_content))
+        elif filename.endswith('.tsv'):
+            df = pd.read_csv(io.BytesIO(file_content), sep='\t')
+        elif filename.endswith('.json'):
+            data = json.loads(file_content.decode('utf-8'))
+            df = pd.DataFrame(data)
+        else:
+            return {
+                'valid': False,
+                'error': f"Unsupported file type. Use: {', '.join(PREDICTION_FORMAT['file_types'])}"
+            }
+        # Check required columns
+        missing_cols = set(PREDICTION_FORMAT['required_columns']) - set(df.columns)
+        if missing_cols:
+            return {
+                'valid': False,
+                'error': f"Missing required columns: {', '.join(missing_cols)}"
+            }
+        # Basic data validation
+        if len(df) == 0:
+            return {
+                'valid': False,
+                'error': "File is empty"
+            }
+        # Check for required data
+        if df['sample_id'].isna().any():
+            return {
+                'valid': False,
+                'error': "Missing sample_id values found"
+            }
+        if df['prediction'].isna().any():
+            na_count = df['prediction'].isna().sum()
+            return {
+                'valid': False,
+                'error': f"Missing prediction values found ({na_count} empty predictions)"
+            }
+        # Check for duplicates
+        duplicates = df['sample_id'].duplicated()
+        if duplicates.any():
+            dup_count = duplicates.sum()
+            return {
+                'valid': False,
+                'error': f"Duplicate sample_id values found ({dup_count} duplicates)"
+            }
+        return {
+            'valid': True,
+            'dataframe': df,
+            'row_count': len(df),
+            'columns': list(df.columns)
+        }
+    except Exception as e:
+        return {
+            'valid': False,
+            'error': f"Error parsing file: {str(e)}"
+        }
+def validate_predictions_content(predictions: pd.DataFrame) -> Dict:
+    """Validate prediction content quality."""
+    issues = []
+    warnings = []
+    # Check prediction text quality
+    empty_predictions = predictions['prediction'].str.strip().eq('').sum()
+    if empty_predictions > 0:
+        issues.append(f"{empty_predictions} empty predictions found")
+    # Check for suspiciously short predictions
+    short_predictions = (predictions['prediction'].str.len() < 3).sum()
+    if short_predictions > len(predictions) * 0.1:  # More than 10%
+        warnings.append(f"{short_predictions} very short predictions (< 3 characters)")
+    # Check for suspiciously long predictions
+    long_predictions = (predictions['prediction'].str.len() > 500).sum()
+    if long_predictions > 0:
+        warnings.append(f"{long_predictions} very long predictions (> 500 characters)")
+    # Check for repeated predictions
+    duplicate_predictions = predictions['prediction'].duplicated().sum()
+    if duplicate_predictions > len(predictions) * 0.5:  # More than 50%
+        warnings.append(f"{duplicate_predictions} duplicate prediction texts")
+    # Check for non-text content
+    non_text_pattern = r'^[A-Za-z\s\'".,!?;:()\-]+$'
+    non_text_predictions = ~predictions['prediction'].str.match(non_text_pattern, na=False)
+    if non_text_predictions.sum() > 0:
+        warnings.append(f"{non_text_predictions.sum()} predictions contain unusual characters")
+    return {
+        'has_issues': len(issues) > 0,
+        'issues': issues,
+        'warnings': warnings,
+        'quality_score': max(0, 1.0 - len(issues) * 0.2 - len(warnings) * 0.1)
+    }
+def validate_against_test_set(predictions: pd.DataFrame, test_set: pd.DataFrame) -> Dict:
+    """Validate predictions against the official test set."""
+    # Convert IDs to string for comparison
+    pred_ids = set(predictions['sample_id'].astype(str))
+    test_ids = set(test_set['sample_id'].astype(str))
+    # Check coverage
+    missing_ids = test_ids - pred_ids
+    extra_ids = pred_ids - test_ids
+    matching_ids = pred_ids & test_ids
+    coverage = len(matching_ids) / len(test_ids)
+    # Detailed coverage by language pair
+    pair_coverage = {}
+    for _, row in test_set.iterrows():
+        pair_key = f"{row['source_language']}_{row['target_language']}"
+        if pair_key not in pair_coverage:
+            pair_coverage[pair_key] = {'total': 0, 'covered': 0}
+        pair_coverage[pair_key]['total'] += 1
+        if str(row['sample_id']) in pred_ids:
+            pair_coverage[pair_key]['covered'] += 1
+    # Calculate pair-wise coverage rates
+    for pair_key in pair_coverage:
+        pair_info = pair_coverage[pair_key]
+        pair_info['coverage_rate'] = pair_info['covered'] / pair_info['total']
+    return {
+        'overall_coverage': coverage,
+        'missing_count': len(missing_ids),
+        'extra_count': len(extra_ids),
+        'matching_count': len(matching_ids),
+        'is_complete': coverage == 1.0,
+        'pair_coverage': pair_coverage,
+        'missing_ids_sample': list(missing_ids)[:10],  # First 10 for display
+        'extra_ids_sample': list(extra_ids)[:10]
+    }
+def generate_validation_report(
+    format_result: Dict,
+    content_result: Dict,
+    test_set_result: Dict,
+    model_name: str = ""
+) -> str:
+    """Generate human-readable validation report."""
+    report = []
+    # Header
+    report.append(f"# Validation Report: {model_name or 'Submission'}")
+    report.append(f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    report.append("")
+    # File format validation
+    if format_result['valid']:
+        report.append("✅ **File Format**: Valid")
+        report.append(f"   - Rows: {format_result['row_count']:,}")
+        report.append(f"   - Columns: {', '.join(format_result['columns'])}")
+    else:
+        report.append("❌ **File Format**: Invalid")
+        report.append(f"   - Error: {format_result['error']}")
+        return "\n".join(report)
+    # Content validation
+    if content_result['has_issues']:
+        report.append("⚠️ **Content Quality**: Issues Found")
+        for issue in content_result['issues']:
+            report.append(f"   - ❌ {issue}")
+    else:
+        report.append("✅ **Content Quality**: Good")
+    if content_result['warnings']:
+        for warning in content_result['warnings']:
+            report.append(f"   - ⚠️ {warning}")
+    # Test set validation
+    coverage = test_set_result['overall_coverage']
+    if coverage == 1.0:
+        report.append("✅ **Test Set Coverage**: Complete")
+    elif coverage >= 0.95:
+        report.append("⚠️ **Test Set Coverage**: Nearly Complete")
+    else:
+        report.append("❌ **Test Set Coverage**: Incomplete")
+    report.append(f"   - Coverage: {coverage:.1%} ({test_set_result['matching_count']:,} / {test_set_result['matching_count'] + test_set_result['missing_count']:,})")
+    if test_set_result['missing_count'] > 0:
+        report.append(f"   - Missing: {test_set_result['missing_count']:,} samples")
+    if test_set_result['extra_count'] > 0:
+        report.append(f"   - Extra: {test_set_result['extra_count']:,} samples")
+    # Language pair coverage
+    pair_cov = test_set_result['pair_coverage']
+    incomplete_pairs = [k for k, v in pair_cov.items() if v['coverage_rate'] < 1.0]
+    if incomplete_pairs:
+        report.append("")
+        report.append("**Incomplete Language Pairs:**")
+        for pair in incomplete_pairs[:5]:  # Show first 5
+            info = pair_cov[pair]
+            src, tgt = pair.split('_')
+            report.append(f"   - {src}→{tgt}: {info['covered']}/{info['total']} ({info['coverage_rate']:.1%})")
+        if len(incomplete_pairs) > 5:
+            report.append(f"   - ... and {len(incomplete_pairs) - 5} more pairs")
+    # Final verdict
+    report.append("")
+    if format_result['valid'] and coverage >= 0.95 and not content_result['has_issues']:
+        report.append("🎉 **Overall**: Ready for evaluation!")
+    elif format_result['valid'] and coverage >= 0.8:
+        report.append("⚠️ **Overall**: Can be evaluated with warnings")
+    else:
+        report.append("❌ **Overall**: Please fix issues before submission")
+    return "\n".join(report)
+def validate_submission_complete(file_content: bytes, filename: str, test_set: pd.DataFrame, model_name: str = "") -> Dict:
+    """Complete validation pipeline for a submission."""
+    # Step 1: File format validation
+    format_result = validate_file_format(file_content, filename)
+    if not format_result['valid']:
+        return {
+            'valid': False,
+            'report': generate_validation_report(format_result, {}, {}, model_name),
+            'predictions': None
+        }
+    predictions = format_result['dataframe']
+    # Step 2: Content validation
+    content_result = validate_predictions_content(predictions)
+    # Step 3: Test set validation
+    test_set_result = validate_against_test_set(predictions, test_set)
+    # Step 4: Generate report
+    report = generate_validation_report(format_result, content_result, test_set_result, model_name)
+    # Overall validity
+    is_valid = (
+        format_result['valid'] and
+        not content_result['has_issues'] and
+        test_set_result['overall_coverage'] >= 0.95
+    )
+    return {
+        'valid': is_valid,
+        'coverage': test_set_result['overall_coverage'],
+        'report': report,
+        'predictions': predictions,
+        'pair_coverage': test_set_result['pair_coverage']
+    }