"""Validation helpers for uploaded CSV prediction submissions."""

import io

import gradio as gr
import pandas as pd

from constants import (
    REQUIRED_COLUMNS,
    ASSAY_LIST,
    CV_COLUMN,
    EXAMPLE_FILE_DICT,
    ANTIBODY_NAMES_DICT,
)


def _format_names(names) -> str:
    """Render a collection of names as a sorted, comma-separated string.

    Every item is passed through ``str`` first so that values pandas parsed
    as numbers cannot make ``str.join`` raise ``TypeError``; sorting keeps
    the resulting error message stable across runs (sets have no order).
    """
    return ", ".join(sorted(str(name) for name in names))


def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Validate that the CSV file can be read and parsed.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error: If CSV cannot be read or parsed
    """
    try:
        return pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError as e:
        raise gr.Error("❌ CSV file is empty or contains no valid data") from e
    except pd.errors.ParserError as e:
        raise gr.Error(f"❌ Invalid CSV format\n\n" f"Error: {e}") from e
    except UnicodeDecodeError as e:
        # NOTE(review): with a ``str`` input wrapped in StringIO this branch
        # looks unreachable -- confirm whether callers ever pass bytes.
        raise gr.Error(
            "❌ File encoding error\n\n"
            "Your file appears to have an unsupported encoding.\n"
            "Please save your CSV file with UTF-8 encoding and try again."
        ) from e
    except Exception as e:
        # Boundary handler: surface any other failure as a user-facing error.
        raise gr.Error(f"❌ Unexpected error reading CSV file: {e}") from e


def validate_cv_submission(
    df: pd.DataFrame, submission_type: str = "GDPa1_cross_validation"
) -> None:
    """
    Validate a cross-validation submission.

    Parameters
    ----------
    df: pd.DataFrame
        The submitted data; must contain "antibody_name" and CV_COLUMN.
    submission_type: str
        Key into EXAMPLE_FILE_DICT selecting the file that holds the
        canonical fold assignments.

    Raises
    ------
    gr.Error: If CV_COLUMN is missing or any fold assignment differs from
        the canonical one.
    """
    # Must have CV_COLUMN for CV submissions
    if CV_COLUMN not in df.columns:
        raise gr.Error(f"❌ CV submissions must include a '{CV_COLUMN}' column")

    # Load canonical fold assignments
    expected_cv_df = pd.read_csv(EXAMPLE_FILE_DICT[submission_type])[
        ["antibody_name", CV_COLUMN]
    ]
    # Left merge keeps every canonical antibody; a name absent from the
    # submission yields NaN in the submitted column and so counts as a mismatch.
    antibody_check = expected_cv_df.merge(
        df[["antibody_name", CV_COLUMN]],
        on="antibody_name",
        how="left",
        suffixes=("_expected", "_submitted"),
    )
    expected_col = f"{CV_COLUMN}_expected"
    submitted_col = f"{CV_COLUMN}_submitted"

    # CV fold assignments should match
    fold_mismatches = antibody_check[
        antibody_check[expected_col] != antibody_check[submitted_col]
    ]
    if not fold_mismatches.empty:
        # Only the first three mismatches are shown to keep the message short.
        examples = [
            f"{row['antibody_name']} (expected fold {row[expected_col]}, got {row[submitted_col]})"
            for _, row in fold_mismatches.head(3).iterrows()
        ]
        raise gr.Error(
            f"❌ Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
        )


def validate_full_dataset_submission(df: pd.DataFrame) -> None:
    """
    Validate a full dataset submission.

    Raises
    ------
    gr.Error: If the submission contains the CV_COLUMN column.
    """
    if CV_COLUMN in df.columns:
        raise gr.Error(
            f"❌ Your submission contains a '{CV_COLUMN}' column. "
            "Please select 'Cross-Validation Predictions' if you want to submit CV results."
        )


def get_assay_columns(df: pd.DataFrame) -> list[str]:
    """Get all assay columns from the DataFrame, in the DataFrame's column order."""
    return [col for col in df.columns if col in ASSAY_LIST]


def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None:
    """
    Validate the DataFrame content and structure.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    ValueError: If submission_type is not a key of EXAMPLE_FILE_DICT
    gr.Error: If validation fails
    """
    if submission_type not in EXAMPLE_FILE_DICT:
        raise ValueError(f"Invalid submission type: {submission_type}")

    # Required columns should be present
    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_columns:
        raise gr.Error(
            f"❌ Missing required columns: {_format_names(missing_columns)}"
        )

    # Should include at least 1 assay column
    assay_columns = get_assay_columns(df)
    if not assay_columns:
        raise gr.Error(
            "❌ CSV should include at least one of the following assay columns: "
            + ", ".join(ASSAY_LIST)
        )

    # Submission columns are the required columns plus the assay columns found
    submission_columns = REQUIRED_COLUMNS + assay_columns

    # Data should not be empty
    if df.empty:
        raise gr.Error("❌ CSV file is empty")

    # No missing values in submission columns
    for col in submission_columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")

    # All names should be unique
    n_duplicates = df["antibody_name"].duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )

    expected_names = set(ANTIBODY_NAMES_DICT[submission_type])
    submitted_names = set(df["antibody_name"])

    # All antibody names should be recognizable
    unrecognized_antibodies = submitted_names - expected_names
    if unrecognized_antibodies:
        raise gr.Error(
            f"❌ Found unrecognized antibody names: {_format_names(unrecognized_antibodies)}"
        )

    # All antibody names should be present
    missing_antibodies = expected_names - submitted_names
    if missing_antibodies:
        raise gr.Error(
            f"❌ Missing predictions for {len(missing_antibodies)} antibodies: {_format_names(missing_antibodies)}"
        )

    # Submission-type specific validation
    if submission_type.endswith("_cross_validation"):
        validate_cv_submission(df, submission_type)
    else:  # full_dataset
        validate_full_dataset_submission(df)


def validate_csv_file(file_content: str, submission_type: str = "GDPa1") -> None:
    """
    Validate the uploaded CSV file.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    gr.Error: If validation fails
    """
    df = validate_csv_can_be_read(file_content)
    validate_dataframe(df, submission_type)