{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "7c6c914c", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "from scipy.stats import spearmanr\n", "import seaborn as sns\n", "from sklearn.linear_model import Ridge\n", "from sklearn.model_selection import train_test_split\n", "import torch\n", "from tqdm.auto import tqdm\n", "from transformers import AutoModelForCausalLM, AutoTokenizer" ] }, { "cell_type": "code", "execution_count": 10, "id": "00cfd012", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['antibody_id', 'antibody_name', 'Titer', 'Purity', 'SEC %Monomer',\n", " 'SMAC', 'HIC', 'HAC', 'PR_CHO', 'PR_Ova', 'AC-SINS_pH6.0',\n", " 'AC-SINS_pH7.4', 'Tonset', 'Tm1', 'Tm2', 'hc_subtype', 'lc_subtype',\n", " 'highest_clinical_trial_asof_feb2025', 'est_status_asof_feb2025',\n", " 'vh_protein_sequence', 'hc_protein_sequence', 'hc_dna_sequence',\n", " 'vl_protein_sequence', 'lc_protein_sequence', 'lc_dna_sequence',\n", " 'hierarchical_cluster_fold', 'random_fold',\n", " 'hierarchical_cluster_IgG_isotype_stratified_fold', 'light_aligned_aho',\n", " 'heavy_aligned_aho'],\n", " dtype='object')\n", "Titer 7\n", "HIC 4\n", "PR_CHO 49\n", "Tm2 53\n", "AC-SINS_pH7.4 4\n", "dtype: int64\n" ] }, { "data": { "text/html": [ "
\n", " | antibody_id | \n", "antibody_name | \n", "Titer | \n", "Purity | \n", "SEC %Monomer | \n", "SMAC | \n", "HIC | \n", "HAC | \n", "PR_CHO | \n", "PR_Ova | \n", "... | \n", "hc_protein_sequence | \n", "hc_dna_sequence | \n", "vl_protein_sequence | \n", "lc_protein_sequence | \n", "lc_dna_sequence | \n", "hierarchical_cluster_fold | \n", "random_fold | \n", "hierarchical_cluster_IgG_isotype_stratified_fold | \n", "light_aligned_aho | \n", "heavy_aligned_aho | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "GDPa1-001 | \n", "abagovomab | \n", "140.25 | \n", "98.530 | \n", "97.010 | \n", "2.730 | \n", "2.590 | \n", "NaN | \n", "0.337837 | \n", "0.263108 | \n", "... | \n", "MRAWIFFLLCLAGRALAQVKLQESGAELARPGASVKLSCKASGYTF... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL... | \n", "MRAWIFFLLCLAGRALADIELTQSPASLSASVGETVTITCQASENI... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "1 | \n", "2 | \n", "2 | \n", "DIELTQSPASLSASVGETVTITCQAS--ENIY------SYLAWHQQ... | \n", "QVKLQES-GAELARPGASVKLSCKASG-YTFTN-----YWMQWVKQ... | \n", "
1 | \n", "GDPa1-002 | \n", "abituzumab | \n", "193.31 | \n", "99.825 | \n", "97.620 | \n", "2.745 | \n", "2.545 | \n", "3.690 | \n", "0.205246 | \n", "0.100155 | \n", "... | \n", "MRAWIFFLLCLAGRALAQVQLQQSGGELAKPGASVKVSCKASGYTF... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL... | \n", "MRAWIFFLLCLAGRALADIQMTQSPSSLSASVGDRVTITCRASQDI... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "1 | \n", "4 | \n", "0 | \n", "DIQMTQSPSSLSASVGDRVTITCRAS--QDIS------NYLAWYQQ... | \n", "QVQLQQS-GGELAKPGASVKVSCKASG-YTFSS-----FWMHWVRQ... | \n", "
2 | \n", "GDPa1-003 | \n", "abrezekimab | \n", "114.75 | \n", "98.350 | \n", "89.055 | \n", "2.740 | \n", "2.705 | \n", "NaN | \n", "0.138773 | \n", "0.101180 | \n", "... | \n", "MRAWIFFLLCLAGRALAQVTLKESGPVLVKPTETLTLTCTVSGFSL... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "DIQMTQSPSSLSASVGDRVTITCLASEDISNYLAWYQQKPGKAPKL... | \n", "MRAWIFFLLCLAGRALADIQMTQSPSSLSASVGDRVTITCLASEDI... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "2 | \n", "2 | \n", "2 | \n", "DIQMTQSPSSLSASVGDRVTITCLAS--EDIS------NYLAWYQQ... | \n", "QVTLKES-GPVLVKPTETLTLTCTVSG-FSLTN-----YHVQWIRQ... | \n", "
3 | \n", "GDPa1-004 | \n", "abrilumab | \n", "327.32 | \n", "98.575 | \n", "98.605 | \n", "2.715 | \n", "2.565 | \n", "1.005 | \n", "0.000000 | \n", "0.054971 | \n", "... | \n", "MRAWIFFLLCLAGRALAQVQLVQSGAEVKKPGASVKVSCKVSGYTL... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL... | \n", "MRAWIFFLLCLAGRALADIQMTQSPSSVSASVGDRVTITCRASQGI... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "1 | \n", "3 | \n", "0 | \n", "DIQMTQSPSSVSASVGDRVTITCRAS--QGIS------SWLAWYQQ... | \n", "QVQLVQS-GAEVKKPGASVKVSCKVSG-YTLSD-----LSIHWVRQ... | \n", "
4 | \n", "GDPa1-005 | \n", "adalimumab | \n", "313.39 | \n", "99.300 | \n", "96.120 | \n", "2.705 | \n", "2.495 | \n", "NaN | \n", "0.183387 | \n", "0.085628 | \n", "... | \n", "MRAWIFFLLCLAGRALAEVQLVESGGGLVQPGRSLRLSCAASGFTF... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL... | \n", "MRAWIFFLLCLAGRALADIQMTQSPSSLSASVGDRVTITCRASQGI... | \n", "GCCGCCACCATGAGAGCCTGGATCTTTTTCCTGCTGTGCCTGGCTG... | \n", "0 | \n", "2 | \n", "0 | \n", "DIQMTQSPSSLSASVGDRVTITCRAS--QGIR------NYLAWYQQ... | \n", "EVQLVES-GGGLVQPGRSLRLSCAASG-FTFDD-----YAMHWVRQ... | \n", "
5 rows × 30 columns
\n", "