File size: 4,787 Bytes
eb50e2e
 
 
 
21e1c3f
eb50e2e
 
 
 
8f9985e
eb50e2e
 
 
8f9985e
eb50e2e
 
8f9985e
eb50e2e
 
8f9985e
eb50e2e
8f9985e
eb50e2e
 
8f9985e
 
eb50e2e
 
8f9985e
eb50e2e
8f9985e
eb50e2e
 
4d9df8e
 
21e1c3f
8f9985e
eb50e2e
 
 
 
 
 
 
8f9985e
eb50e2e
 
 
 
8f9985e
eb50e2e
 
8f9985e
eb50e2e
8f9985e
0f3e1b5
 
 
 
 
 
 
 
 
eb50e2e
 
8f9985e
eb50e2e
 
8f9985e
eb50e2e
8f9985e
f412a50
 
eb50e2e
 
f412a50
8f9985e
eb50e2e
 
 
 
 
 
8f9985e
eb50e2e
8f9985e
eb50e2e
 
 
 
 
 
8f9985e
4d9df8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb50e2e
8f9985e
4d9df8e
eb50e2e
8f9985e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import pytest
import pandas as pd
import gradio as gr
from validation import validate_csv_file, validate_csv_can_be_read, validate_dataframe
from constants import REQUIRED_COLUMNS, ASSAY_LIST


class TestValidateCsvCanBeRead:
    """Test cases for validate_csv_can_be_read function"""

    def test_valid_csv_can_be_read(self, valid_csv_content):
        """Valid CSV content comes back as a pandas DataFrame."""
        parsed = validate_csv_can_be_read(valid_csv_content)
        assert isinstance(parsed, pd.DataFrame)

    def test_empty_csv_raises_error(self):
        """An empty string is rejected with the 'empty' error message."""
        with pytest.raises(gr.Error) as raised:
            validate_csv_can_be_read("")

        message = str(raised.value)
        assert "empty or contains no valid data" in message

    def test_invalid_csv_format_raises_error(self):
        """A CSV with an unterminated quoted field is reported as invalid."""
        # The opening quote on the second line is never closed, which is
        # intended to make the content unparseable as CSV.
        broken_csv = 'column1,column2\nvalue1,"unclosed quote\nvalue4,value5'

        with pytest.raises(gr.Error) as raised:
            validate_csv_can_be_read(broken_csv)

        message = str(raised.value)
        assert "Invalid CSV format" in message

    def test_csv_with_quoted_fields_can_be_read(self):
        """Rows whose sequence fields are double-quoted still parse."""
        header = "antibody_name,vh_protein_sequence,vl_protein_sequence,SEC %Monomer,HIC,PR_CHO,AC-SINS_pH6.0,AC-SINS_pH7.4,Tm\n"
        quoted_row = 'test_antibody,"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS","DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK",95.2,0.85,0.92,0.78,0.81,72.5'
        # Header line followed by ten identical data rows.
        csv_text = header + "\n".join(quoted_row for _ in range(10))

        parsed = validate_csv_can_be_read(csv_text)
        assert isinstance(parsed, pd.DataFrame)


class TestValidateDataframe:
    """Test cases for validate_dataframe function

    Every test starts from the ``valid_input_dataframe`` fixture (defined
    outside this module), changes one aspect of it, and checks that
    ``validate_dataframe`` either accepts the frame or raises ``gr.Error``
    with the expected message fragment.
    """

    def test_valid_dataframe_passes(self, valid_input_dataframe):
        """The unmodified fixture frame validates without raising."""
        validate_dataframe(valid_input_dataframe)

    def test_missing_columns_raises_error(self, valid_input_dataframe):
        """Dropping a required column is reported with that column's name."""
        missing_column = REQUIRED_COLUMNS[0]
        # drop() returns a new frame, so the fixture itself is not mutated.
        df = valid_input_dataframe.drop(columns=[missing_column])

        with pytest.raises(gr.Error) as exc_info:
            validate_dataframe(df)

        assert f"Missing required columns: {missing_column}" in str(exc_info.value)

    def test_at_least_one_assay_column_raises_error(self, valid_input_dataframe):
        """A frame with none of the assay columns is rejected."""
        # errors="ignore" because the fixture may not contain every assay.
        df = valid_input_dataframe.drop(columns=ASSAY_LIST, errors="ignore")

        with pytest.raises(gr.Error) as exc_info:
            validate_dataframe(df)

        assert "CSV should include at least one of the following assay columns" in str(
            exc_info.value
        )

    def test_empty_dataframe_raises_error(self, valid_input_dataframe):
        """A frame with the right columns but zero rows is rejected."""
        empty_df = valid_input_dataframe.head(0)

        with pytest.raises(gr.Error) as exc_info:
            validate_dataframe(empty_df)

        assert "CSV file is empty" in str(exc_info.value)

    def test_missing_antibodies_raises_error(self, valid_input_dataframe):
        """Submitting only the first 50 rows is reported as missing predictions."""
        # NOTE(review): assumes the fixture holds more than 50 antibodies -- confirm.
        df = valid_input_dataframe.head(50)

        with pytest.raises(gr.Error) as exc_info:
            validate_dataframe(df)

        assert "Missing predictions for" in str(exc_info.value)

    def test_missing_values_raises_error(self, valid_input_dataframe):
        """A required column filled with None is reported with the null count."""
        bad_column = REQUIRED_COLUMNS[0]
        df = valid_input_dataframe.copy()
        df[bad_column] = [None] * len(df)

        with pytest.raises(gr.Error) as exc_info:
            validate_dataframe(df)

        assert f"contains {len(df)} missing values" in str(exc_info.value)

    def test_csv_with_extra_columns_passes(self, valid_input_dataframe):
        """Columns beyond the expected ones do not cause a validation error."""
        df = valid_input_dataframe.copy()
        # Two *distinct* extra columns. The earlier version assigned the same
        # column name twice, so only a single extra column was ever exercised.
        df["extra_column_1"] = ["extra1"] * len(df)
        df["extra_column_2"] = ["extra2"] * len(df)

        validate_dataframe(df)

    def test_duplicate_antibody_names_raises_error(self, valid_input_dataframe):
        """Repeating one row is reported as exactly one duplicate."""
        df = pd.concat(
            [valid_input_dataframe, valid_input_dataframe.head(1)],
            ignore_index=True,
        )

        with pytest.raises(gr.Error) as exc_info:
            validate_dataframe(df)

        assert "CSV should have only one row per antibody. Found 1 duplicates." in str(
            exc_info.value
        )

    def test_unrecognized_antibody_names_raises_error(self, valid_input_dataframe):
        """An antibody name outside the expected set is reported by name."""
        unknown_name = "unrecognized_antibody"
        df = valid_input_dataframe.copy()
        df.loc[0, "antibody_name"] = unknown_name

        with pytest.raises(gr.Error) as exc_info:
            validate_dataframe(df)

        # Expected fragment is the bare name (no quotes or braces around it).
        assert f"Found unrecognized antibody names: {unknown_name}" in str(
            exc_info.value
        )


class TestValidateCsvFile:
    """Test cases for validate_csv_file function"""

    def test_valid_csv_passes(self, valid_csv_content):
        """Valid CSV content is accepted without raising."""
        csv_text = valid_csv_content
        validate_csv_file(csv_text)