import io

import gradio as gr
import pandas as pd

from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS, ANTIBODY_NAMES


def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Validate that the CSV file can be read and parsed.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error: If CSV cannot be read or parsed
    """
    try:
        # Parse the in-memory text exactly as pandas would parse a file.
        return pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError as e:
        raise gr.Error("❌ CSV file is empty or contains no valid data") from e
    except pd.errors.ParserError as e:
        raise gr.Error(f"❌ Invalid CSV format\n\nError: {str(e)}") from e
    except UnicodeDecodeError as e:
        # file_content is already a str, so decoding normally happened before
        # this point; the handler is kept as a safeguard.
        raise gr.Error(
            "❌ File encoding error\n\n"
            "Your file appears to have an unsupported encoding.\n"
            "Please save your CSV file with UTF-8 encoding and try again."
        ) from e


def validate_dataframe(df: pd.DataFrame) -> None:
    """
    Validate the DataFrame content and structure.

    The checks run in a fixed order and the first failing check raises:
    required columns present, non-empty data, no missing values in required
    columns, minimal row count, unique antibody names, recognized antibody
    names.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.

    Raises
    ------
    gr.Error: If validation fails
    """
    # Required columns should be present
    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_columns:
        # Sort (and stringify) so the message is deterministic; iterating a
        # set directly yields an arbitrary order.
        missing_list = ", ".join(sorted(map(str, missing_columns)))
        raise gr.Error(f"❌ Missing required columns: {missing_list}")

    # Data should not be empty
    if df.empty:
        raise gr.Error("❌ CSV file is empty")

    # No missing values in required columns
    for col in REQUIRED_COLUMNS:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(
                f"❌ Column '{col}' contains {missing_count} missing values"
            )

    # Above minimal number of rows
    if len(df) < MINIMAL_NUMBER_OF_ROWS:
        raise gr.Error(
            f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows"
        )

    # All names should be unique
    # NOTE(review): assumes "antibody_name" is one of REQUIRED_COLUMNS;
    # otherwise this lookup raises KeyError instead of gr.Error.
    antibody_names = df["antibody_name"]
    n_duplicates = antibody_names.duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )

    # All antibody names should be recognizable
    unrecognized_antibodies = set(antibody_names) - set(ANTIBODY_NAMES)
    if unrecognized_antibodies:
        # The column may have been parsed as a non-string dtype (e.g. purely
        # numeric names); str.join would raise TypeError on those values and
        # hide the real validation error, so convert before joining.
        unrecognized_list = ", ".join(sorted(map(str, unrecognized_antibodies)))
        raise gr.Error(
            f"❌ Found unrecognized antibody names: {unrecognized_list}"
        )


def validate_csv_file(file_content: str) -> None:
    """
    Validate the uploaded CSV file.

    Parses the content first and then validates the resulting DataFrame.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Raises
    ------
    gr.Error: If validation fails
    """
    df = validate_csv_can_be_read(file_content)
    validate_dataframe(df)