abdev-leaderboard / validation.py
pquintero's picture
validate antibody names
4d9df8e
raw
history blame
2.95 kB
import pandas as pd
import io
import gradio as gr
from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS, ANTIBODY_NAMES
def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Parse the uploaded CSV text into a DataFrame.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error: If the CSV is empty, malformed, or hits an encoding error.
        The underlying exception is chained as ``__cause__`` so the original
        failure stays visible in tracebacks and logs.
    """
    try:
        return pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError as err:
        raise gr.Error("❌ CSV file is empty or contains no valid data") from err
    except pd.errors.ParserError as err:
        raise gr.Error(f"❌ Invalid CSV format<br><br>Error: {err}") from err
    except UnicodeDecodeError as err:
        # NOTE(review): `file_content` is annotated as `str` and is wrapped in
        # `io.StringIO`, so decoding presumably happens in the caller; this
        # handler looks like a safeguard -- confirm whether it is reachable.
        raise gr.Error(
            "❌ File encoding error<br><br>"
            "Your file appears to have an unsupported encoding.<br>"
            "Please save your CSV file with UTF-8 encoding and try again."
        ) from err
def validate_dataframe(df: pd.DataFrame) -> None:
    """
    Validate the DataFrame content and structure.

    The checks run in this order and stop at the first failure:
    required columns present, DataFrame not empty, no missing values in
    required columns, at least ``MINIMAL_NUMBER_OF_ROWS`` rows, unique
    ``antibody_name`` values, and every ``antibody_name`` listed in
    ``ANTIBODY_NAMES``.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.

    Raises
    ------
    gr.Error: If validation fails
    """
    # Required columns should be present.
    # Sorted so the error message is stable across runs (set order is not).
    missing_columns = sorted(set(REQUIRED_COLUMNS) - set(df.columns))
    if missing_columns:
        raise gr.Error(f"❌ Missing required columns: {', '.join(missing_columns)}")

    # Data should not be empty
    if df.empty:
        raise gr.Error("❌ CSV file is empty")

    # No missing values in required columns
    for col in REQUIRED_COLUMNS:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")

    # Above minimal number of rows
    if len(df) < MINIMAL_NUMBER_OF_ROWS:
        raise gr.Error(f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")

    # All names should be unique
    # NOTE(review): assumes "antibody_name" is one of REQUIRED_COLUMNS, so the
    # column is known to exist at this point -- confirm against constants.
    n_duplicates = df["antibody_name"].duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )

    # All antibody names should be recognizable.
    # Names are stringified before joining: a column parsed as numeric would
    # otherwise make `str.join` raise TypeError instead of the intended
    # gr.Error. Sorting keeps the message deterministic.
    unrecognized_antibodies = sorted(
        str(name) for name in set(df["antibody_name"]) - set(ANTIBODY_NAMES)
    )
    if unrecognized_antibodies:
        raise gr.Error(
            f"❌ Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}"
        )
def validate_csv_file(file_content: str) -> None:
    """
    Run the full validation pipeline on an uploaded CSV.

    The text is first parsed with ``validate_csv_can_be_read`` and the
    resulting DataFrame is then checked with ``validate_dataframe``.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Raises
    ------
    gr.Error: If parsing or any validation check fails
    """
    # Parsing must succeed before any content checks can run.
    validate_dataframe(validate_csv_can_be_read(file_content))