|
import pandas as pd |
|
import io |
|
import gradio as gr |
|
from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS, ANTIBODY_NAMES |
|
|
|
|
|
def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Validate that the CSV file can be read and parsed.

    The text is wrapped in an in-memory buffer and handed to
    ``pd.read_csv``. Known parsing failures are translated into
    ``gr.Error`` with a user-facing message; the original exception is
    kept as the cause so it still shows up in tracebacks.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error: If CSV cannot be read or parsed
    """
    try:
        # Only the parsing call lives in the try body, so the handlers
        # below cannot mask errors coming from anywhere else.
        df = pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError as exc:
        raise gr.Error("β CSV file is empty or contains no valid data") from exc
    except pd.errors.ParserError as exc:
        raise gr.Error(f"β Invalid CSV format<br><br>Error: {exc!s}") from exc
    except UnicodeDecodeError as exc:
        raise gr.Error(
            "β File encoding error<br><br>"
            "Your file appears to have an unsupported encoding.<br>"
            "Please save your CSV file with UTF-8 encoding and try again."
        ) from exc
    return df
|
|
|
|
|
def validate_dataframe(df: pd.DataFrame) -> None:
    """
    Validate the DataFrame content and structure.

    Checks run in the order below and stop at the first failure:

    1. every column in ``REQUIRED_COLUMNS`` is present;
    2. the frame is not empty;
    3. no required column contains missing values;
    4. there are at least ``MINIMAL_NUMBER_OF_ROWS`` rows;
    5. ``antibody_name`` has no duplicated values;
    6. every ``antibody_name`` value is listed in ``ANTIBODY_NAMES``.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.

    Raises
    ------
    gr.Error: If validation fails
    """
    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_columns:
        # Sorted so the message is stable from run to run (set order is not).
        missing_listing = ", ".join(sorted(map(str, missing_columns)))
        raise gr.Error(f"β Missing required columns: {missing_listing}")

    if df.empty:
        raise gr.Error("β CSV file is empty")

    # Report the first required column that has any missing values.
    for col in REQUIRED_COLUMNS:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(f"β Column '{col}' contains {missing_count} missing values")

    if len(df) < MINIMAL_NUMBER_OF_ROWS:
        raise gr.Error(f"β CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")

    # NOTE(review): the lookups below assume "antibody_name" is one of
    # REQUIRED_COLUMNS; otherwise a missing column raises KeyError here.
    n_duplicates = df["antibody_name"].duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"β CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )

    unrecognized_antibodies = set(df["antibody_name"]) - set(ANTIBODY_NAMES)
    if unrecognized_antibodies:
        # Values come straight from the CSV and may have been parsed as
        # numbers; str.join only accepts strings, so convert first to make
        # sure the user gets this message rather than a TypeError.
        unrecognized_listing = ", ".join(sorted(map(str, unrecognized_antibodies)))
        raise gr.Error(f"β Found unrecognized antibody names: {unrecognized_listing}")
|
|
|
|
|
def validate_csv_file(file_content: str) -> None:
    """
    Run the full validation pipeline on an uploaded CSV file.

    The content is first parsed with ``validate_csv_can_be_read`` and the
    resulting DataFrame is then checked with ``validate_dataframe``.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Raises
    ------
    gr.Error: If validation fails
    """
    # Parsing must succeed before any content checks can run.
    validate_dataframe(validate_csv_can_be_read(file_content))
|
|