pquintero committed on
Commit
4d9df8e
·
1 Parent(s): 58d937b

validate antibody names

Browse files
constants.py CHANGED
@@ -4,7 +4,7 @@ Constants for the Antibody Developability Benchmark
4
 
5
  import os
6
  from huggingface_hub import HfApi
7
-
8
 
9
  ASSAY_LIST = ["AC-SINS_pH7.4", "PR_CHO", "HIC", "Tm2", "Titer"]
10
  ASSAY_RENAME = {
@@ -32,11 +32,11 @@ ASSAY_EMOJIS = {
32
  # Input CSV file requirements
33
  MINIMAL_NUMBER_OF_ROWS: int = 50
34
  REQUIRED_COLUMNS: list[str] = [
35
- "antibody_id",
36
  "antibody_name",
37
  "vh_protein_sequence",
38
  "vl_protein_sequence",
39
  ] + ASSAY_LIST
 
40
 
41
  # Huggingface API
42
  TOKEN = os.environ.get("HF_TOKEN")
 
4
 
5
  import os
6
  from huggingface_hub import HfApi
7
+ import pandas as pd
8
 
9
  ASSAY_LIST = ["AC-SINS_pH7.4", "PR_CHO", "HIC", "Tm2", "Titer"]
10
  ASSAY_RENAME = {
 
32
  # Input CSV file requirements
33
  MINIMAL_NUMBER_OF_ROWS: int = 50
34
  REQUIRED_COLUMNS: list[str] = [
 
35
  "antibody_name",
36
  "vh_protein_sequence",
37
  "vl_protein_sequence",
38
  ] + ASSAY_LIST
39
+ ANTIBODY_NAMES = pd.read_csv("data/antibody_names.csv")["antibody_name"].tolist()
40
 
41
  # Huggingface API
42
  TOKEN = os.environ.get("HF_TOKEN")
data/antibody_names.csv ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ antibody_name
2
+ abituzumab
3
+ abrilumab
4
+ adalimumab
5
+ alemtuzumab
6
+ alirocumab
7
+ anifrolumab
8
+ atezolizumab
9
+ bapineuzumab
10
+ basiliximab
11
+ bavituximab
12
+ belimumab
13
+ benralizumab
14
+ bevacizumab
15
+ bimagrumab
16
+ blosozumab
17
+ bococizumab
18
+ brentuximab
19
+ briakinumab
20
+ brodalumab
21
+ canakinumab
22
+ carlumab
23
+ certolizumab
24
+ cetuximab
25
+ cixutumumab
26
+ clazakizumab
27
+ codrituzumab
28
+ crenezumab
29
+ dacetuzumab
30
+ daclizumab
31
+ dalotuzumab
32
+ daratumumab
33
+ denosumab
34
+ dinutuximab
35
+ drozitumab
36
+ duligotuzumab
37
+ dupilumab
38
+ eculizumab
39
+ efalizumab
40
+ eldelumab
41
+ elotuzumab
42
+ emibetuzumab
43
+ enokizumab
44
+ epratuzumab
45
+ etrolizumab
46
+ evolocumab
47
+ farletuzumab
48
+ fasinumab
49
+ fezakinumab
50
+ ficlatuzumab
51
+ figitumumab
52
+ fletikumab
53
+ foralumab
54
+ fresolimumab
55
+ fulranumab
56
+ galiximab
57
+ ganitumab
58
+ gantenerumab
59
+ gemtuzumab
60
+ gevokizumab
61
+ girentuximab
62
+ glembatumumab
63
+ golimumab
64
+ guselkumab
65
+ ibalizumab
66
+ imgatuzumab
67
+ infliximab
68
+ inotuzumab
69
+ ipilimumab
70
+ ixekizumab
71
+ lampalizumab
72
+ lebrikizumab
73
+ lenzilumab
74
+ lintuzumab
75
+ lirilumab
76
+ lumiliximab
77
+ matuzumab
78
+ mavrilimumab
79
+ mepolizumab
80
+ mogamulizumab
81
+ motavizumab
82
+ muromonab
83
+ natalizumab
84
+ necitumumab
85
+ nimotuzumab
86
+ nivolumab
87
+ obinutuzumab
88
+ ocrelizumab
89
+ ofatumumab
90
+ olaratumab
91
+ olokizumab
92
+ omalizumab
93
+ onartuzumab
94
+ otelixizumab
95
+ otlertuzumab
96
+ ozanezumab
97
+ palivizumab
98
+ panitumumab
99
+ panobacumab
100
+ parsatuzumab
101
+ patritumab
102
+ pembrolizumab
103
+ pertuzumab
104
+ pinatuzumab
105
+ polatuzumab
106
+ ponezumab
107
+ radretumab
108
+ ramucirumab
109
+ ranibizumab
110
+ reslizumab
111
+ rilotumumab
112
+ rituximab
113
+ robatumumab
114
+ romosozumab
115
+ sarilumab
116
+ secukinumab
117
+ seribantumab
118
+ sifalimumab
119
+ siltuximab
120
+ simtuzumab
121
+ sirukumab
122
+ tabalumab
123
+ tanezumab
124
+ teplizumab
125
+ tigatuzumab
126
+ tildrakizumab
127
+ tocilizumab
128
+ tovetumab
129
+ tralokinumab
130
+ trastuzumab
131
+ tremelimumab
132
+ urelumab
133
+ ustekinumab
134
+ vedolizumab
135
+ veltuzumab
136
+ visilizumab
137
+ zalutumumab
138
+ zanolimumab
test/conftest.py CHANGED
@@ -1,14 +1,12 @@
1
  import pytest
2
  import pandas as pd
3
- from constants import MINIMAL_NUMBER_OF_ROWS, ASSAY_LIST
4
 
5
 
6
  @pytest.fixture
7
  def valid_csv_data():
8
- """Fixture providing valid CSV data with all required columns"""
9
  return {
10
- "antibody_id": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
11
- "antibody_name": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
12
  "vh_protein_sequence": [
13
  "EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS"
14
  ]
@@ -23,11 +21,9 @@ def valid_csv_data():
23
 
24
  @pytest.fixture
25
  def valid_input_dataframe(valid_csv_data):
26
- """Fixture providing a valid input dataframe"""
27
  return pd.DataFrame(valid_csv_data)
28
 
29
 
30
  @pytest.fixture
31
  def valid_csv_content(valid_input_dataframe):
32
- """Fixture providing valid CSV content as string"""
33
  return valid_input_dataframe.to_csv(index=False)
 
1
  import pytest
2
  import pandas as pd
3
+ from constants import MINIMAL_NUMBER_OF_ROWS, ASSAY_LIST, ANTIBODY_NAMES
4
 
5
 
6
  @pytest.fixture
7
  def valid_csv_data():
 
8
  return {
9
+ "antibody_name": ANTIBODY_NAMES[:MINIMAL_NUMBER_OF_ROWS],
 
10
  "vh_protein_sequence": [
11
  "EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS"
12
  ]
 
21
 
22
  @pytest.fixture
23
  def valid_input_dataframe(valid_csv_data):
 
24
  return pd.DataFrame(valid_csv_data)
25
 
26
 
27
  @pytest.fixture
28
  def valid_csv_content(valid_input_dataframe):
 
29
  return valid_input_dataframe.to_csv(index=False)
test/test_validation.py CHANGED
@@ -9,14 +9,12 @@ class TestValidateCsvCanBeRead:
9
  """Test cases for validate_csv_can_be_read function"""
10
 
11
  def test_valid_csv_can_be_read(self, valid_csv_content):
12
- """Test that valid CSV content can be read"""
13
  df = validate_csv_can_be_read(valid_csv_content)
14
  assert isinstance(df, pd.DataFrame)
15
  assert len(df) == MINIMAL_NUMBER_OF_ROWS
16
  assert list(df.columns) == list(REQUIRED_COLUMNS)
17
 
18
  def test_empty_csv_raises_error(self):
19
- """Test that empty CSV raises an error"""
20
  empty_csv = ""
21
 
22
  with pytest.raises(gr.Error) as exc_info:
@@ -25,7 +23,6 @@ class TestValidateCsvCanBeRead:
25
  assert "empty or contains no valid data" in str(exc_info.value)
26
 
27
  def test_invalid_csv_format_raises_error(self):
28
- """Test that invalid CSV format raises an error"""
29
  # Create a CSV with malformed structure that pandas cannot parse
30
  malformed_csv = 'column1,column2\nvalue1,"unclosed quote\nvalue4,value5'
31
 
@@ -35,10 +32,9 @@ class TestValidateCsvCanBeRead:
35
  assert "Invalid CSV format" in str(exc_info.value)
36
 
37
  def test_csv_with_quoted_fields_can_be_read(self):
38
- """Test that CSV with quoted fields can be read"""
39
  # Create CSV with quoted fields and enough rows
40
- base_row = 'AB001,"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS","DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK",95.2,0.85,0.92,0.78,0.81,72.5'
41
- csv_content = "antibody_id,vh_protein_sequence,vl_protein_sequence,SEC %Monomer,HIC,PR_CHO,AC-SINS_pH6.0,AC-SINS_pH7.4,Tm\n"
42
  csv_content += "\n".join([base_row] * MINIMAL_NUMBER_OF_ROWS)
43
 
44
  df = validate_csv_can_be_read(csv_content)
@@ -47,14 +43,10 @@ class TestValidateCsvCanBeRead:
47
 
48
 
49
  class TestValidateDataframe:
50
- """Test cases for validate_dataframe function"""
51
-
52
  def test_valid_dataframe_passes(self, valid_input_dataframe):
53
- """Test that valid DataFrame passes validation"""
54
  validate_dataframe(valid_input_dataframe)
55
 
56
  def test_missing_columns_raises_error(self, valid_input_dataframe):
57
- """Test that DataFrame with missing columns raises an error"""
58
  missing_column = REQUIRED_COLUMNS[0]
59
  df = valid_input_dataframe.copy()
60
  df.drop(columns=[missing_column], inplace=True)
@@ -65,7 +57,6 @@ class TestValidateDataframe:
65
  assert f"Missing required columns: {missing_column}" in str(exc_info.value)
66
 
67
  def test_empty_dataframe_raises_error(self, valid_input_dataframe):
68
- """Test that empty DataFrame raises an error"""
69
  empty_df = valid_input_dataframe.head(0)
70
 
71
  with pytest.raises(gr.Error) as exc_info:
@@ -74,7 +65,6 @@ class TestValidateDataframe:
74
  assert "CSV file is empty" in str(exc_info.value)
75
 
76
  def test_insufficient_rows_raises_error(self, valid_input_dataframe):
77
- """Test that DataFrame with insufficient rows raises an error"""
78
  df = valid_input_dataframe.head(MINIMAL_NUMBER_OF_ROWS - 1)
79
  with pytest.raises(gr.Error) as exc_info:
80
  validate_dataframe(df)
@@ -84,7 +74,6 @@ class TestValidateDataframe:
84
  )
85
 
86
  def test_missing_values_raises_error(self, valid_input_dataframe):
87
- """Test that DataFrame with missing values raises an error"""
88
  bad_column = REQUIRED_COLUMNS[0]
89
  df = valid_input_dataframe.copy()
90
  df[bad_column] = [None] * len(df)
@@ -94,17 +83,31 @@ class TestValidateDataframe:
94
  assert f"contains {len(df)} missing values" in str(exc_info.value)
95
 
96
  def test_csv_with_extra_columns_passes(self, valid_input_dataframe):
97
- """Test that DataFrame with extra columns passes validation"""
98
  extra_column = "extra_column_1"
99
  df = valid_input_dataframe.copy()
100
  df[extra_column] = ["extra1"] * len(df)
101
  df[extra_column] = ["extra2"] * len(df)
102
  validate_dataframe(df)
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- class TestValidateCsvFile:
106
- """Test cases for the combined validate_csv_file function"""
107
 
 
108
  def test_valid_csv_passes(self, valid_csv_content):
109
- """Test that a valid CSV with all required columns passes validation"""
110
  validate_csv_file(valid_csv_content)
 
9
  """Test cases for validate_csv_can_be_read function"""
10
 
11
  def test_valid_csv_can_be_read(self, valid_csv_content):
 
12
  df = validate_csv_can_be_read(valid_csv_content)
13
  assert isinstance(df, pd.DataFrame)
14
  assert len(df) == MINIMAL_NUMBER_OF_ROWS
15
  assert list(df.columns) == list(REQUIRED_COLUMNS)
16
 
17
  def test_empty_csv_raises_error(self):
 
18
  empty_csv = ""
19
 
20
  with pytest.raises(gr.Error) as exc_info:
 
23
  assert "empty or contains no valid data" in str(exc_info.value)
24
 
25
  def test_invalid_csv_format_raises_error(self):
 
26
  # Create a CSV with malformed structure that pandas cannot parse
27
  malformed_csv = 'column1,column2\nvalue1,"unclosed quote\nvalue4,value5'
28
 
 
32
  assert "Invalid CSV format" in str(exc_info.value)
33
 
34
  def test_csv_with_quoted_fields_can_be_read(self):
 
35
  # Create CSV with quoted fields and enough rows
36
+ base_row = 'test_antibody,"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS","DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK",95.2,0.85,0.92,0.78,0.81,72.5'
37
+ csv_content = "antibody_name,vh_protein_sequence,vl_protein_sequence,SEC %Monomer,HIC,PR_CHO,AC-SINS_pH6.0,AC-SINS_pH7.4,Tm\n"
38
  csv_content += "\n".join([base_row] * MINIMAL_NUMBER_OF_ROWS)
39
 
40
  df = validate_csv_can_be_read(csv_content)
 
43
 
44
 
45
  class TestValidateDataframe:
 
 
46
  def test_valid_dataframe_passes(self, valid_input_dataframe):
 
47
  validate_dataframe(valid_input_dataframe)
48
 
49
  def test_missing_columns_raises_error(self, valid_input_dataframe):
 
50
  missing_column = REQUIRED_COLUMNS[0]
51
  df = valid_input_dataframe.copy()
52
  df.drop(columns=[missing_column], inplace=True)
 
57
  assert f"Missing required columns: {missing_column}" in str(exc_info.value)
58
 
59
  def test_empty_dataframe_raises_error(self, valid_input_dataframe):
 
60
  empty_df = valid_input_dataframe.head(0)
61
 
62
  with pytest.raises(gr.Error) as exc_info:
 
65
  assert "CSV file is empty" in str(exc_info.value)
66
 
67
  def test_insufficient_rows_raises_error(self, valid_input_dataframe):
 
68
  df = valid_input_dataframe.head(MINIMAL_NUMBER_OF_ROWS - 1)
69
  with pytest.raises(gr.Error) as exc_info:
70
  validate_dataframe(df)
 
74
  )
75
 
76
  def test_missing_values_raises_error(self, valid_input_dataframe):
 
77
  bad_column = REQUIRED_COLUMNS[0]
78
  df = valid_input_dataframe.copy()
79
  df[bad_column] = [None] * len(df)
 
83
  assert f"contains {len(df)} missing values" in str(exc_info.value)
84
 
85
  def test_csv_with_extra_columns_passes(self, valid_input_dataframe):
 
86
  extra_column = "extra_column_1"
87
  df = valid_input_dataframe.copy()
88
  df[extra_column] = ["extra1"] * len(df)
89
  df[extra_column] = ["extra2"] * len(df)
90
  validate_dataframe(df)
91
 
92
+ def test_duplicate_antibody_names_raises_error(self, valid_input_dataframe):
93
+ df = valid_input_dataframe.copy()
94
+ df = pd.concat([df, df.head(1)], ignore_index=True)
95
+ with pytest.raises(gr.Error) as exc_info:
96
+ validate_dataframe(df)
97
+ assert "CSV should have only one row per antibody. Found 1 duplicates." in str(
98
+ exc_info.value
99
+ )
100
+
101
+ def test_unrecognized_antibody_names_raises_error(self, valid_input_dataframe):
102
+ df = valid_input_dataframe.copy()
103
+ df.loc[0, "antibody_name"] = "unrecognized_antibody"
104
+ with pytest.raises(gr.Error) as exc_info:
105
+ validate_dataframe(df)
106
+ assert f"Found unrecognized antibody names: {'unrecognized_antibody'}" in str(
107
+ exc_info.value
108
+ )
109
 
 
 
110
 
111
+ class TestValidateCsvFile:
112
  def test_valid_csv_passes(self, valid_csv_content):
 
113
  validate_csv_file(valid_csv_content)
validation.py CHANGED
@@ -1,7 +1,7 @@
1
  import pandas as pd
2
  import io
3
  import gradio as gr
4
- from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
5
 
6
 
7
  def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
@@ -61,19 +61,29 @@ def validate_dataframe(df: pd.DataFrame) -> None:
61
  if df.empty:
62
  raise gr.Error("❌ CSV file is empty")
63
 
64
- # Check for missing values in required columns
65
  for col in REQUIRED_COLUMNS:
66
  missing_count = df[col].isnull().sum()
67
  if missing_count > 0:
68
  raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")
69
 
70
- # Check for reasonable number of rows
71
  if len(df) < MINIMAL_NUMBER_OF_ROWS:
72
  raise gr.Error(f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")
73
 
74
- print(
75
- f"βœ… CSV validation passed! Found {len(df)} rows with columns: {', '.join(df.columns)}"
76
- )
 
 
 
 
 
 
 
 
 
 
77
 
78
 
79
  def validate_csv_file(file_content: str) -> None:
 
1
  import pandas as pd
2
  import io
3
  import gradio as gr
4
+ from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS, ANTIBODY_NAMES
5
 
6
 
7
  def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
 
61
  if df.empty:
62
  raise gr.Error("❌ CSV file is empty")
63
 
64
+ # No missing values in required columns
65
  for col in REQUIRED_COLUMNS:
66
  missing_count = df[col].isnull().sum()
67
  if missing_count > 0:
68
  raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")
69
 
70
+ # Above minimal number of rows
71
  if len(df) < MINIMAL_NUMBER_OF_ROWS:
72
  raise gr.Error(f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")
73
 
74
+ # All names should be unique
75
+ n_duplicates = df["antibody_name"].duplicated().sum()
76
+ if n_duplicates > 0:
77
+ raise gr.Error(
78
+ f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
79
+ )
80
+
81
+ # All antibody names should be recognizable
82
+ unrecognized_antibodies = set(df["antibody_name"]) - set(ANTIBODY_NAMES)
83
+ if unrecognized_antibodies:
84
+ raise gr.Error(
85
+ f"❌ Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}"
86
+ )
87
 
88
 
89
  def validate_csv_file(file_content: str) -> None: