Spaces:

akera
/

leaderboard

Sleeping

App Files Community

akera committed on Jun 12

Commit

57c7739

verified ·

1 Parent(s): 7a9f2cb

Update src/test_set.py

Browse files

Files changed (1) hide show

src/test_set.py +164 -162

src/test_set.py CHANGED Viewed

@@ -1,19 +1,30 @@
-# src/test_set.py
 import pandas as pd
 import yaml
-from datasets import Dataset, load_dataset
-from typing import Dict, Tuple
-from config import *
 import salt.dataset
 from src.utils import get_all_language_pairs
 def generate_test_set(max_samples_per_pair: int = MAX_TEST_SAMPLES) -> pd.DataFrame:
-    """Generate standardized test set from SALT dataset."""
-    print("Generating SALT test set...")
-    # Load full SALT dataset
     dataset_config = f'''
     huggingface_load:
       path: {SALT_DATASET}
@@ -27,178 +38,169 @@ def generate_test_set(max_samples_per_pair: int = MAX_TEST_SAMPLES) -> pd.DataFr
       language: {ALL_UG40_LANGUAGES}
     allow_same_src_and_tgt_language: False
     '''
     config = yaml.safe_load(dataset_config)
     full_data = pd.DataFrame(salt.dataset.create(config))
-    # Sample data for each language pair
     test_samples = []
     sample_id_counter = 1
     for src_lang in ALL_UG40_LANGUAGES:
         for tgt_lang in ALL_UG40_LANGUAGES:
-            if src_lang != tgt_lang:
-                # Filter for this language pair
-                pair_data = full_data[
-                    (full_data['source.language'] == src_lang) &
-                    (full_data['target.language'] == tgt_lang)
-                ].copy()
-                if len(pair_data) > 0:
-                    # Sample up to max_samples_per_pair
-                    n_samples = min(len(pair_data), max_samples_per_pair)
-                    sampled = pair_data.sample(n=n_samples, random_state=42)
-                    # Add to test set with unique IDs
-                    for _, row in sampled.iterrows():
-                        test_samples.append({
-                            'sample_id': f"salt_{sample_id_counter:06d}",
-                            'source_text': row['source'],
-                            'target_text': row['target'],  # Hidden from public test set
-                            'source_language': src_lang,
-                            'target_language': tgt_lang,
-                            'domain': row.get('domain', 'general'),
-                            'google_comparable': (src_lang in GOOGLE_SUPPORTED_LANGUAGES and
-                                                tgt_lang in GOOGLE_SUPPORTED_LANGUAGES)
-                        })
-                        sample_id_counter += 1
     test_df = pd.DataFrame(test_samples)
-    print(f"Generated test set with {len(test_df)} samples across {len(get_all_language_pairs())} language pairs")
     return test_df
 def get_public_test_set() -> pd.DataFrame:
-    """Get public test set (sources only, no targets)."""
     try:
-        # Try to load existing test set
-        print(f"Loading test set from: {TEST_SET_DATASET}")
-        dataset = load_dataset(TEST_SET_DATASET, split='train')
-        test_df = dataset.to_pandas()
-        print(f"Loaded existing test set with {len(test_df)} samples")
     except Exception as e:
-        print(f"Could not load existing test set: {e}")
-        print("This is expected for first run. Generating new test set...")
-        # Generate new test set
-        test_df = generate_test_set()
-        # Save complete test set (with targets) privately
-        print("Saving test set for future use...")
         try:
-            save_complete_test_set(test_df)
-        except Exception as save_error:
-            print(f"Warning: Could not save test set: {save_error}")
-            print("Continuing with generated test set...")
-    # Return public version (without targets)
-    public_columns = [
-        'sample_id', 'source_text', 'source_language',
-        'target_language', 'domain', 'google_comparable'
-    ]
-    return test_df[public_columns].copy()
 def get_complete_test_set() -> pd.DataFrame:
-    """Get complete test set with targets (for evaluation)."""
     try:
-        # Load from private storage or regenerate
-        dataset = load_dataset(TEST_SET_DATASET + "-private", split='train')
-        return dataset.to_pandas()
     except Exception as e:
-        print(f"Regenerating complete test set: {e}")
-        return generate_test_set()
-def save_complete_test_set(test_df: pd.DataFrame) -> bool:
-    """Save complete test set to HuggingFace dataset."""
-    try:
-        # Save public version (no targets)
-        public_df = test_df[[
-            'sample_id', 'source_text', 'source_language',
-            'target_language', 'domain', 'google_comparable'
-        ]].copy()
-        public_dataset = Dataset.from_pandas(public_df)
-        public_dataset.push_to_hub(
-            TEST_SET_DATASET,
-            token=HF_TOKEN,
-            commit_message="Update public test set"
-        )
-        # Save private version (with targets)
-        private_dataset = Dataset.from_pandas(test_df)
-        private_dataset.push_to_hub(
-            TEST_SET_DATASET + "-private",
-            token=HF_TOKEN,
-            private=True,
-            commit_message="Update private test set with targets"
-        )
-        print("Test sets saved successfully!")
-        return True
-    except Exception as e:
-        print(f"Error saving test sets: {e}")
-        return False
-def create_test_set_download() -> Tuple[str, Dict]:
-    """Create downloadable test set file and statistics."""
-    public_test = get_public_test_set()
-    # Create download file
-    download_path = "salt_test_set.csv"
-    public_test.to_csv(download_path, index=False)
-    # Generate statistics
     stats = {
-        'total_samples': len(public_test),
-        'language_pairs': len(public_test.groupby(['source_language', 'target_language'])),
-        'google_comparable_samples': len(public_test[public_test['google_comparable'] == True]),
-        'languages': list(set(public_test['source_language'].unique()) | set(public_test['target_language'].unique())),
-        'domains': list(public_test['domain'].unique()) if 'domain' in public_test.columns else ['general']
     }
     return download_path, stats
-def validate_test_set_integrity() -> Dict:
-    """Validate test set integrity and coverage."""
-    try:
-        public_test = get_public_test_set()
-        complete_test = get_complete_test_set()
-        # Check alignment
-        public_ids = set(public_test['sample_id'])
-        private_ids = set(complete_test['sample_id'])
-        coverage_by_pair = {}
-        for src in ALL_UG40_LANGUAGES:
-            for tgt in ALL_UG40_LANGUAGES:
-                if src != tgt:
-                    pair_samples = public_test[
-                        (public_test['source_language'] == src) &
-                        (public_test['target_language'] == tgt)
-                    ]
-                    coverage_by_pair[f"{src}_{tgt}"] = {
-                        'count': len(pair_samples),
-                        'has_samples': len(pair_samples) >= MIN_SAMPLES_PER_PAIR
-                    }
-        return {
-            'alignment_check': len(public_ids - private_ids) == 0,
-            'total_samples': len(public_test),
-            'coverage_by_pair': coverage_by_pair,
-            'missing_pairs': [k for k, v in coverage_by_pair.items() if not v['has_samples']]
-        }
-    except Exception as e:
-        return {'error': str(e)}

+import os
 import pandas as pd
 import yaml
+from datasets import load_dataset
+from config import (
+    TEST_SET_DATASET,
+    SALT_DATASET,
+    MAX_TEST_SAMPLES,
+    HF_TOKEN,
+    MIN_SAMPLES_PER_PAIR,
+    ALL_UG40_LANGUAGES,
+    GOOGLE_SUPPORTED_LANGUAGES
+)
 import salt.dataset
 from src.utils import get_all_language_pairs
+# Local CSV filenames for persistence
+LOCAL_PUBLIC_CSV = "salt_test_set.csv"
+LOCAL_COMPLETE_CSV = "salt_complete_test_set.csv"
 def generate_test_set(max_samples_per_pair: int = MAX_TEST_SAMPLES) -> pd.DataFrame:
+    """
+    Generate standardized test set from the SALT dataset.
+    """
+    print("🔄 Generating SALT test set from source dataset...")
+    # Build SALT dataset config
     dataset_config = f'''
     huggingface_load:
       path: {SALT_DATASET}
       language: {ALL_UG40_LANGUAGES}
     allow_same_src_and_tgt_language: False
     '''
     config = yaml.safe_load(dataset_config)
     full_data = pd.DataFrame(salt.dataset.create(config))
     test_samples = []
     sample_id_counter = 1
     for src_lang in ALL_UG40_LANGUAGES:
         for tgt_lang in ALL_UG40_LANGUAGES:
+            if src_lang == tgt_lang:
+                continue
+            pair_data = full_data[
+                (full_data['source.language'] == src_lang) &
+                (full_data['target.language'] == tgt_lang)
+            ]
+            if pair_data.empty:
+                continue
+            # Sample up to max_samples_per_pair
+            n_samples = min(len(pair_data), max_samples_per_pair)
+            sampled = pair_data.sample(n=n_samples, random_state=42)
+            for _, row in sampled.iterrows():
+                test_samples.append({
+                    'sample_id': f"salt_{sample_id_counter:06d}",
+                    'source_text': row['source'],
+                    'target_text': row['target'],
+                    'source_language': src_lang,
+                    'target_language': tgt_lang,
+                    'domain': row.get('domain', 'general'),
+                    'google_comparable': (
+                        src_lang in GOOGLE_SUPPORTED_LANGUAGES and
+                        tgt_lang in GOOGLE_SUPPORTED_LANGUAGES
+                    )
+                })
+                sample_id_counter += 1
     test_df = pd.DataFrame(test_samples)
+    print(f"✅ Generated test set: {len(test_df):,} samples across {len(get_all_language_pairs()):,} pairs")
     return test_df
+def _generate_and_save_test_set() -> (pd.DataFrame, pd.DataFrame):
+    """
+    Generate the full test set and persist both public and complete CSV files.
+    """
+    full_df = generate_test_set()
+    # Public version (no target_text)
+    public_df = full_df[[
+        'sample_id', 'source_text', 'source_language',
+        'target_language', 'domain', 'google_comparable'
+    ]]
+    public_df.to_csv(LOCAL_PUBLIC_CSV, index=False)
+    # Complete version (with target_text)
+    full_df.to_csv(LOCAL_COMPLETE_CSV, index=False)
+    print(f"✅ Saved local CSVs: {LOCAL_PUBLIC_CSV}, {LOCAL_COMPLETE_CSV}")
+    return public_df, full_df
 def get_public_test_set() -> pd.DataFrame:
+    """
+    Load the public test set (without targets).
+    Tries HF Hub → local CSV → regenerate.
+    """
+    # 1) Try HF Hub
     try:
+        ds = load_dataset(TEST_SET_DATASET, split="train", token=HF_TOKEN)
+        df = ds.to_pandas()
+        print(f"✅ Loaded public test set from HF Hub ({len(df):,} samples)")
+        return df
     except Exception as e:
+        print("⚠️ HF Hub load failed, falling back to local CSV:", e)
+    # 2) Try local CSV
+    if os.path.exists(LOCAL_PUBLIC_CSV):
         try:
+            df = pd.read_csv(LOCAL_PUBLIC_CSV)
+            print(f"✅ Loaded public test set from local CSV ({len(df):,} samples)")
+            return df
+        except Exception as e:
+            print("⚠️ Failed to read local CSV, regenerating:", e)
+    # 3) Regenerate & save
+    print("🔄 Generating new public test set and saving to CSV...")
+    public_df, _ = _generate_and_save_test_set()
+    return public_df
 def get_complete_test_set() -> pd.DataFrame:
+    """
+    Load the complete test set (with targets).
+    Tries HF Hub-private → local CSV → regenerate.
+    """
+    # 1) Try HF Hub private
     try:
+        ds = load_dataset(TEST_SET_DATASET + "-private", split="train", token=HF_TOKEN)
+        df = ds.to_pandas()
+        print(f"✅ Loaded complete test set from HF Hub-private ({len(df):,} samples)")
+        return df
     except Exception as e:
+        print("⚠️ HF Hub-private load failed, falling back to local CSV:", e)
+    # 2) Try local CSV
+    if os.path.exists(LOCAL_COMPLETE_CSV):
+        try:
+            df = pd.read_csv(LOCAL_COMPLETE_CSV)
+            print(f"✅ Loaded complete test set from local CSV ({len(df):,} samples)")
+            return df
+        except Exception as e:
+            print("⚠️ Failed to read local complete CSV, regenerating:", e)
+    # 3) Regenerate & save
+    print("🔄 Generating new complete test set and saving to CSV...")
+    _, complete_df = _generate_and_save_test_set()
+    return complete_df
+def create_test_set_download() -> (str, dict):
+    """
+    Create a CSV download of the public test set and return its path + stats.
+    """
+    public_df = get_public_test_set()
+    download_path = LOCAL_PUBLIC_CSV
+    # Ensure the CSV is up-to-date
+    public_df.to_csv(download_path, index=False)
     stats = {
+        'total_samples': len(public_df),
+        'language_pairs': len(public_df.groupby(['source_language', 'target_language'])),
+        'google_comparable_samples': int(public_df['google_comparable'].sum()),
+        'languages': list(set(public_df['source_language']).union(public_df['target_language'])),
+        'domains': public_df['domain'].unique().tolist()
     }
     return download_path, stats
+def validate_test_set_integrity() -> dict:
+    """
+    Validate test set coverage and integrity.
+    """
+    public_df = get_public_test_set()
+    complete_df = get_complete_test_set()
+    public_ids = set(public_df['sample_id'])
+    private_ids = set(complete_df['sample_id'])
+    coverage_by_pair = {}
+    for src in ALL_UG40_LANGUAGES:
+        for tgt in ALL_UG40_LANGUAGES:
+            if src == tgt:
+                continue
+            subset = public_df[
+                (public_df['source_language'] == src) &
+                (public_df['target_language'] == tgt)
+            ]
+            count = len(subset)
+            coverage_by_pair[f"{src}_{tgt}"] = {
+                'count': count,
+                'has_samples': count >= MIN_SAMPLES_PER_PAIR
+            }
+    return {
+        'alignment_check': public_ids <= private_ids,
+        'total_samples': len(public_df),
+        'coverage_by_pair': coverage_by_pair,
+        'missing_pairs': [k for k, v in coverage_by_pair.items() if not v['has_samples']]
+    }