saketh11 committed on
Commit 6e9b5dc · 1 Parent(s): 2d634e1

Add local CodonTransformer modules for custom ColiFormer functionality


- Removed CodonTransformer PyPI package dependency
- Added local CodonTransformer/ directory with custom modifications
- Includes the enhanced ColiFormer-specific functionality
- App now uses the custom CodonTransformer implementation instead of the standard package
- Fixes ModuleNotFoundError: No module named 'CodonTransformer'

CodonTransformer/CodonData.py ADDED
@@ -0,0 +1,682 @@
"""
File: CodonData.py
---------------------
Includes helper functions for preprocessing NCBI or Kazusa databases and
preparing the data for training and inference of the CodonTransformer model.
"""

import json
import os
import random
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd
import python_codon_tables as pct
from Bio import SeqIO
from Bio.Seq import Seq
from sklearn.utils import shuffle as sk_shuffle
from tqdm import tqdm

from CodonTransformer.CodonUtils import (
    AMBIGUOUS_AMINOACID_MAP,
    AMINO2CODON_TYPE,
    AMINO_ACIDS,
    ORGANISM2ID,
    START_CODONS,
    STOP_CODONS,
    STOP_SYMBOL,
    STOP_SYMBOLS,
    ProteinConfig,
    find_pattern_in_fasta,
    get_taxonomy_id,
    sort_amino2codon_skeleton,
)


def prepare_training_data(
    dataset: Union[str, pd.DataFrame], output_file: str, shuffle: bool = True
) -> None:
    """
    Prepare a JSON dataset for training the CodonTransformer model.

    The input dataset must have the following columns:
        - dna: str (DNA sequence)
        - protein: str (Protein sequence)
        - organism: Union[int, str] (ID or Name of the organism)

    The output JSON dataset will have the following format:
        {"idx": 0, "codons": "M_ATG R_AGG L_TTG L_CTA R_CGA __TAG", "organism": 51}
        {"idx": 1, "codons": "M_ATG K_AAG C_TGC F_TTT F_TTC __TAA", "organism": 59}

    Args:
        dataset (Union[str, pd.DataFrame]): Input dataset in CSV or DataFrame format.
        output_file (str): Path to save the output JSON dataset.
        shuffle (bool, optional): Whether to shuffle the dataset before saving.
            Defaults to True.

    Returns:
        None
    """
    if isinstance(dataset, str):
        dataset = pd.read_csv(dataset)

    required_columns = {"dna", "protein", "organism"}
    if not required_columns.issubset(dataset.columns):
        raise ValueError(f"Input dataset must have columns: {required_columns}")

    # Prepare the dataset for finetuning
    dataset["codons"] = dataset.apply(
        lambda row: get_merged_seq(row["protein"], row["dna"], separator="_"), axis=1
    )

    # Replace organism names with organism IDs using ORGANISM2ID
    dataset["organism"] = dataset["organism"].apply(
        lambda org: process_organism(org, ORGANISM2ID)
    )

    # Save the dataset to a JSON file
    dataframe_to_json(dataset[["codons", "organism"]], output_file, shuffle=shuffle)
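
# Usage sketch: build a tiny in-memory dataset and write it out as JSON Lines.
# Assumes the organism names below are keys of ORGANISM2ID; the output path is
# hypothetical.
#
#     toy = pd.DataFrame(
#         {
#             "dna": ["ATGGCTGTGTAA", "ATGAAGTAA"],
#             "protein": ["MAV_", "MK_"],
#             "organism": ["Escherichia coli general", "Homo sapiens"],
#         }
#     )
#     prepare_training_data(toy, "train_data.json", shuffle=False)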


def dataframe_to_json(df: pd.DataFrame, output_file: str, shuffle: bool = True) -> None:
    """
    Convert a pandas DataFrame to the JSON file format used for training CodonTransformer.

    This function takes a preprocessed DataFrame and writes it to a JSON file
    where each line is a JSON object representing a single record.

    Args:
        df (pd.DataFrame): The input DataFrame with 'codons' and 'organism' columns.
        output_file (str): Path to the output JSON file.
        shuffle (bool, optional): Whether to shuffle the dataset before saving.
            Defaults to True.

    Returns:
        None

    Raises:
        ValueError: If the required columns are not present in the DataFrame.
    """
    required_columns = {"codons", "organism"}
    if not required_columns.issubset(df.columns):
        raise ValueError(f"DataFrame must contain columns: {required_columns}")

    print(f"\nStarted writing to {output_file}...")

    # Shuffle the DataFrame if requested
    if shuffle:
        df = sk_shuffle(df)

    # Write the DataFrame to a JSON file
    with open(output_file, "w") as f:
        for idx, row in tqdm(
            df.iterrows(), total=len(df), desc="Writing JSON...", unit=" records"
        ):
            doc = {"idx": idx, "codons": row["codons"], "organism": row["organism"]}
            f.write(json.dumps(doc) + "\n")

    print(f"\nTotal Entries Saved: {len(df)}, JSON data saved to {output_file}")


def process_organism(organism: Union[str, int], organism_to_id: Dict[str, int]) -> int:
    """
    Process and validate the organism input, converting it to a valid organism ID.

    This function handles both string (organism name) and integer (organism ID) inputs.
    It validates the input against a provided mapping of organism names to IDs.

    Args:
        organism (Union[str, int]): Input organism, either as a name (str) or ID (int).
        organism_to_id (Dict[str, int]): Dictionary mapping organism names to their
            corresponding IDs.

    Returns:
        int: The validated organism ID.

    Raises:
        ValueError: If the input is an invalid organism name or ID.
        TypeError: If the input is neither a string nor an integer.
    """
    if isinstance(organism, str):
        if organism not in organism_to_id:
            raise ValueError(f"Invalid organism name: {organism}")
        return organism_to_id[organism]

    elif isinstance(organism, int):
        if organism not in organism_to_id.values():
            raise ValueError(f"Invalid organism ID: {organism}")
        return organism

    raise TypeError(
        f"Organism must be a string or integer, not {type(organism).__name__}"
    )


def preprocess_protein_sequence(protein: str) -> str:
    """
    Preprocess a protein sequence by cleaning, standardizing, and handling
    ambiguous amino acids.

    Args:
        protein (str): The input protein sequence.

    Returns:
        str: The preprocessed protein sequence.

    Raises:
        ValueError: If the protein sequence is invalid or if the configuration is invalid.
    """
    if not protein:
        raise ValueError("Protein sequence is empty.")

    # Clean and standardize the protein sequence
    protein = (
        protein.upper().strip().replace("\n", "").replace(" ", "").replace("\t", "")
    )

    # Handle ambiguous amino acids based on the specified behavior
    config = ProteinConfig()
    ambiguous_aminoacid_map_override = config.get("ambiguous_aminoacid_map_override")
    ambiguous_aminoacid_behavior = config.get("ambiguous_aminoacid_behavior")
    ambiguous_aminoacid_map = AMBIGUOUS_AMINOACID_MAP.copy()

    for aminoacid, standard_aminoacids in ambiguous_aminoacid_map_override.items():
        ambiguous_aminoacid_map[aminoacid] = standard_aminoacids

    if ambiguous_aminoacid_behavior == "raise_error":
        if any(aminoacid in ambiguous_aminoacid_map for aminoacid in protein):
            raise ValueError("Ambiguous amino acids found in protein sequence.")
    elif ambiguous_aminoacid_behavior == "standardize_deterministic":
        protein = "".join(
            ambiguous_aminoacid_map.get(aminoacid, [aminoacid])[0]
            for aminoacid in protein
        )
    elif ambiguous_aminoacid_behavior == "standardize_random":
        protein = "".join(
            random.choice(ambiguous_aminoacid_map.get(aminoacid, [aminoacid]))
            for aminoacid in protein
        )
    else:
        raise ValueError(
            f"Invalid ambiguous_aminoacid_behavior: {ambiguous_aminoacid_behavior}."
        )

    # Check for sequence validity
    if any(aminoacid not in AMINO_ACIDS + STOP_SYMBOLS for aminoacid in protein):
        raise ValueError("Invalid characters in protein sequence.")

    if protein[-1] not in AMINO_ACIDS + STOP_SYMBOLS:
        raise ValueError(
            "Protein sequence must end with `*`, or `_`, or an amino acid."
        )

    # Replace '*' at the end of the protein with STOP_SYMBOL if present
    if protein[-1] == "*":
        protein = protein[:-1] + STOP_SYMBOL

    # Add a stop symbol to the end of the protein if missing
    if protein[-1] != STOP_SYMBOL:
        protein += STOP_SYMBOL

    return protein


def replace_ambiguous_codons(dna: str) -> str:
    """
    Replace ambiguous codons in a DNA sequence with "UNK".

    Args:
        dna (str): The DNA sequence to process.

    Returns:
        str: The processed DNA sequence with ambiguous codons replaced by "UNK".
    """
    result = []
    dna = dna.upper()

    # Check codons in the DNA sequence
    for i in range(0, len(dna), 3):
        codon = dna[i : i + 3]

        if len(codon) == 3 and all(nucleotide in "ATCG" for nucleotide in codon):
            result.append(codon)
        else:
            result.append("UNK")

    return "".join(result)
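
# Usage sketch: any codon containing a non-ACGT symbol collapses to "UNK".
#
#     replace_ambiguous_codons("ATGGNTTAA")  # -> "ATGUNKTAA"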


def preprocess_dna_sequence(dna: str) -> str:
    """
    Clean and preprocess a DNA sequence by standardizing it and replacing
    ambiguous codons.

    Args:
        dna (str): The DNA sequence to preprocess.

    Returns:
        str: The cleaned and preprocessed DNA sequence.
    """
    if not dna:
        return ""

    # Clean and standardize the DNA sequence
    dna = dna.upper().strip().replace("\n", "").replace(" ", "").replace("\t", "")

    # Replace codons with ambiguous nucleotides with "UNK"
    dna = replace_ambiguous_codons(dna)

    # Append an unknown stop codon ("UNK") to the DNA sequence if no stop codon is present
    if dna[-3:] not in STOP_CODONS:
        dna += "UNK"

    return dna


def get_merged_seq(protein: str, dna: str = "", separator: str = "_") -> str:
    """
    Return the merged sequence of protein amino acids and DNA codons in the form
    of tokens separated by space, where each token is composed of an amino acid +
    separator + codon.

    Args:
        protein (str): Protein sequence.
        dna (str): DNA sequence.
        separator (str): Separator between amino acid and codon.

    Returns:
        str: Merged sequence.

    Example:
        >>> get_merged_seq(protein="MAV_", dna="ATGGCTGTGTAA", separator="_")
        'M_ATG A_GCT V_GTG __TAA'

        >>> get_merged_seq(protein="QHH_", dna="", separator="_")
        'Q_UNK H_UNK H_UNK __UNK'
    """
    merged_seq = ""

    # Prepare protein and dna sequences
    dna = preprocess_dna_sequence(dna)
    protein = preprocess_protein_sequence(protein)

    # Check that the protein and dna sequences have matching lengths
    if len(dna) > 0 and len(protein) != len(dna) / 3:
        raise ValueError(
            'Length of protein (including stop symbol such as "_") and '
            "the number of codons in DNA sequence (including stop codon) "
            "must be equal."
        )

    # Merge protein and DNA sequences into tokens
    for i, aminoacid in enumerate(protein):
        merged_seq += f'{aminoacid}{separator}{dna[i * 3:i * 3 + 3] if dna else "UNK"} '

    return merged_seq.strip()


def is_correct_seq(dna: str, protein: str, stop_symbol: str = STOP_SYMBOL) -> bool:
    """
    Check if the given DNA and protein pair is correct, that is:
        1. The length of dna is divisible by 3
        2. There is an initiator codon at the beginning of dna
        3. There is only one stop codon in the sequence
        4. The only stop codon is the last codon

    Note: since in Codon Table 3 'TGA' is translated as Tryptophan (W), a
    separate check is needed to make sure those sequences are considered correct.

    Args:
        dna (str): DNA sequence.
        protein (str): Protein sequence.
        stop_symbol (str): Stop symbol.

    Returns:
        bool: True if the sequence is correct, False otherwise.
    """
    return (
        len(dna) % 3 == 0  # Check if DNA length is divisible by 3
        and dna[:3].upper() in START_CODONS  # Check for initiator codon
        and protein[-1] == stop_symbol  # Check if the last protein symbol is the stop symbol
        and protein.count(stop_symbol) == 1  # Check if there is only one stop symbol
        and len(set(dna)) == 4  # Check if DNA consists of 4 unique nucleotides (A, T, C, G)
    )


def get_amino_acid_sequence(
    dna: str,
    stop_symbol: str = "_",
    codon_table: int = 1,
    return_correct_seq: bool = False,
) -> Union[str, Tuple[str, bool]]:
    """
    Return the translated protein sequence given a DNA sequence and codon table.

    Args:
        dna (str): DNA sequence.
        stop_symbol (str): Stop symbol.
        codon_table (int): Codon table number.
        return_correct_seq (bool): Whether to also return whether the sequence is correct.

    Returns:
        Union[str, Tuple[str, bool]]: Protein sequence and correctness flag if
            return_correct_seq is True, otherwise just the protein sequence.
    """
    dna_seq = Seq(dna).strip()

    # Translate the DNA sequence to a protein sequence
    protein_seq = str(
        dna_seq.translate(
            stop_symbol=stop_symbol,  # Symbol to use for stop codons
            to_stop=False,  # Translate the entire sequence, including any stop codons
            cds=False,  # Do not assume the input is a coding sequence
            table=codon_table,  # Codon table to use for translation
        )
    ).strip()

    return (
        protein_seq
        if not return_correct_seq
        else (protein_seq, is_correct_seq(dna_seq, protein_seq, stop_symbol))
    )


def read_fasta_file(
    input_file: str,
    save_to_file: Optional[str] = None,
    organism: str = "",
    buffer_size: int = 50000,
) -> pd.DataFrame:
    """
    Read a FASTA file of DNA sequences and convert it to a pandas DataFrame.
    Optionally, save the DataFrame to a CSV file.

    Args:
        input_file (str): Path to the input FASTA file.
        save_to_file (Optional[str]): Path to save the output DataFrame. If None,
            data is only returned.
        organism (str): Name of the organism. If empty, it will be extracted from
            the FASTA description.
        buffer_size (int): Number of records to process before writing to file.

    Returns:
        pd.DataFrame: DataFrame containing the processed DNA sequences.

    Raises:
        FileNotFoundError: If the input file does not exist.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    buffer = []
    columns = [
        "dna",
        "protein",
        "correct_seq",
        "organism",
        "GeneID",
        "description",
        "tokenized",
    ]

    # Initialize DataFrame to accumulate all processed records
    all_data = pd.DataFrame(columns=columns)

    with open(input_file, "r") as fasta_file:
        for record in tqdm(
            SeqIO.parse(fasta_file, "fasta"),
            desc=f"Processing {organism}",
            unit=" Records",
        ):
            dna = str(record.seq).strip().upper()  # Ensure uppercase DNA sequence

            # Determine the organism from the record if not provided
            current_organism = organism or find_pattern_in_fasta(
                "organism", record.description
            )
            gene_id = find_pattern_in_fasta("GeneID", record.description)

            # Get the appropriate codon table for the organism
            codon_table = get_codon_table(current_organism)

            # Translate DNA to protein sequence
            protein, correct_seq = get_amino_acid_sequence(
                dna,
                stop_symbol=STOP_SYMBOL,
                codon_table=codon_table,
                return_correct_seq=True,
            )
            description = record.description.split("[", 1)[0].strip()
            tokenized = get_merged_seq(protein, dna, separator=STOP_SYMBOL)

            # Create a data row for the current sequence
            data_row = {
                "dna": dna,
                "protein": protein,
                "correct_seq": correct_seq,
                "organism": current_organism,
                "GeneID": gene_id,
                "description": description,
                "tokenized": tokenized,
            }
            buffer.append(data_row)

            # Write buffer to CSV file when buffer size is reached
            if save_to_file and len(buffer) >= buffer_size:
                write_buffer_to_csv(buffer, save_to_file, columns)
                buffer = []

            all_data = pd.concat(
                [all_data, pd.DataFrame([data_row])], ignore_index=True
            )

    # Write the remaining buffer to the CSV file
    if save_to_file and buffer:
        write_buffer_to_csv(buffer, save_to_file, columns)

    return all_data


def write_buffer_to_csv(buffer: List[Dict], output_path: str, columns: List[str]):
    """Helper function to append a buffer of records to a CSV file."""
    buffer_df = pd.DataFrame(buffer, columns=columns)
    buffer_df.to_csv(
        output_path,
        mode="a",
        header=(not os.path.exists(output_path)),
        index=True,
    )


def download_codon_frequencies_from_kazusa(
    taxonomy_id: Optional[int] = None,
    organism: Optional[str] = None,
    taxonomy_reference: Optional[str] = None,
    return_original_format: bool = False,
) -> AMINO2CODON_TYPE:
    """
    Return the codon table of the given taxonomy ID from the Kazusa Database.

    Args:
        taxonomy_id (Optional[int]): Taxonomy ID.
        organism (Optional[str]): Name of the organism.
        taxonomy_reference (Optional[str]): Taxonomy reference used to look up
            the taxonomy ID via get_taxonomy_id.
        return_original_format (bool): Whether to return the table in its
            original Kazusa format.

    Returns:
        AMINO2CODON_TYPE: Codon table.
    """
    if taxonomy_reference:
        taxonomy_id = get_taxonomy_id(taxonomy_reference, organism=organism)

    kazusa_amino2codon = pct.get_codons_table(table_name=taxonomy_id)

    if return_original_format:
        return kazusa_amino2codon

    # Replace "*" with STOP_SYMBOL in the codon table
    kazusa_amino2codon[STOP_SYMBOL] = kazusa_amino2codon.pop("*")

    # Create the amino2codon dictionary
    amino2codon = {
        aminoacid: (list(codon2freq.keys()), list(codon2freq.values()))
        for aminoacid, codon2freq in kazusa_amino2codon.items()
    }

    return sort_amino2codon_skeleton(amino2codon)


def build_amino2codon_skeleton(organism: str) -> AMINO2CODON_TYPE:
    """
    Return the empty skeleton of the amino2codon dictionary, needed for
    get_codon_frequencies.

    Args:
        organism (str): Name of the organism.

    Returns:
        AMINO2CODON_TYPE: Empty amino2codon dictionary.
    """
    amino2codon = {}
    possible_codons = [f"{i}{j}{k}" for i in "ACGT" for j in "ACGT" for k in "ACGT"]
    possible_aminoacids = get_amino_acid_sequence(
        dna="".join(possible_codons),
        codon_table=get_codon_table(organism),
        return_correct_seq=False,
    )

    # Initialize the amino2codon skeleton with all possible codons and set their
    # frequencies to 0
    for codon, amino in zip(possible_codons, possible_aminoacids):
        if amino not in amino2codon:
            amino2codon[amino] = ([], [])

        amino2codon[amino][0].append(codon)
        amino2codon[amino][1].append(0)

    # Sort the dictionary and each list of codon frequencies alphabetically
    amino2codon = sort_amino2codon_skeleton(amino2codon)

    return amino2codon


def get_codon_frequencies(
    dna_sequences: List[str],
    protein_sequences: Optional[List[str]] = None,
    organism: Optional[str] = None,
) -> AMINO2CODON_TYPE:
    """
    Return a dictionary mapping each codon to its respective frequency based on
    the collection of DNA sequences and protein sequences.

    Args:
        dna_sequences (List[str]): List of DNA sequences.
        protein_sequences (Optional[List[str]]): List of protein sequences.
        organism (Optional[str]): Name of the organism.

    Returns:
        AMINO2CODON_TYPE: Dictionary mapping each amino acid to a tuple of codons
            and frequencies.
    """
    if organism:
        codon_table = get_codon_table(organism)
        protein_sequences = [
            get_amino_acid_sequence(
                dna, codon_table=codon_table, return_correct_seq=False
            )
            for dna in dna_sequences
        ]

    amino2codon = build_amino2codon_skeleton(organism)

    # Count the frequency of each codon for each amino acid
    for dna, protein in zip(dna_sequences, protein_sequences):
        for i, amino in enumerate(protein):
            codon = dna[i * 3 : (i + 1) * 3]
            codon_loc = amino2codon[amino][0].index(codon)
            amino2codon[amino][1][codon_loc] += 1

    # Normalize codon frequencies per amino acid so they sum to 1
    amino2codon = {
        amino: (codons, [freq / (sum(frequencies) + 1e-100) for freq in frequencies])
        for amino, (codons, frequencies) in amino2codon.items()
    }

    return amino2codon
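
# Usage sketch: derive a per-amino-acid usage table from two toy coding
# sequences. Passing an organism name lets the function translate the DNA
# itself; "Escherichia coli general" is an assumed valid organism name.
#
#     freqs = get_codon_frequencies(
#         dna_sequences=["ATGGCTGCTTAA", "ATGGCCTAA"],
#         organism="Escherichia coli general",
#     )
#     freqs["A"]  # -> (['GCA', 'GCC', 'GCG', 'GCT'], [0.0, 0.33..., 0.0, 0.66...])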


def get_organism_to_codon_frequencies(
    dataset: pd.DataFrame, organisms: List[str]
) -> Dict[str, AMINO2CODON_TYPE]:
    """
    Return a dictionary mapping each organism to its codon frequency distribution.

    Args:
        dataset (pd.DataFrame): DataFrame containing DNA sequences.
        organisms (List[str]): List of organisms.

    Returns:
        Dict[str, AMINO2CODON_TYPE]: Dictionary mapping each organism to its codon
            frequency distribution.
    """
    organism2frequencies = {}

    # Calculate codon frequencies for each organism in the dataset
    for organism in tqdm(
        organisms, desc="Calculating Codon Frequencies: ", unit="Organism"
    ):
        organism_data = dataset.loc[dataset["organism"] == organism]

        dna_sequences = organism_data["dna"].to_list()
        protein_sequences = organism_data["protein"].to_list()

        codon_frequencies = get_codon_frequencies(dna_sequences, protein_sequences)
        organism2frequencies[organism] = codon_frequencies

    return organism2frequencies


def get_codon_table(organism: str) -> int:
    """
    Return the appropriate NCBI codon table for a given organism.

    Args:
        organism (str): Name of the organism.

    Returns:
        int: Codon table number.
    """
    # Common codon table (Table 1) for many model organisms
    if organism in [
        "Arabidopsis thaliana",
        "Caenorhabditis elegans",
        "Chlamydomonas reinhardtii",
        "Saccharomyces cerevisiae",
        "Danio rerio",
        "Drosophila melanogaster",
        "Homo sapiens",
        "Mus musculus",
        "Nicotiana tabacum",
        "Solanum tuberosum",
        "Solanum lycopersicum",
        "Oryza sativa",
        "Glycine max",
        "Zea mays",
    ]:
        codon_table = 1

    # Chloroplast codon table (Table 11)
    elif organism in [
        "Chlamydomonas reinhardtii chloroplast",
        "Nicotiana tabacum chloroplast",
    ]:
        codon_table = 11

    # Default to Table 11 for other bacteria and archaea
    else:
        codon_table = 11

    return codon_table
CodonTransformer/CodonEvaluation.py ADDED
@@ -0,0 +1,575 @@
"""
File: CodonEvaluation.py
---------------------------
Includes functions to calculate various evaluation metrics along with helper
functions.
"""

from typing import Dict, List, Tuple, Optional

import pandas as pd
from CAI import CAI, relative_adaptiveness
from tqdm import tqdm
import math
import numpy as np
from collections import Counter
from itertools import chain
from statistics import mean
import sys
import os
from io import StringIO


def get_CSI_weights(sequences: List[str]) -> Dict[str, float]:
    """
    Calculate the Codon Similarity Index (CSI) weights for a list of DNA sequences.

    Args:
        sequences (List[str]): List of DNA sequences.

    Returns:
        dict: The CSI weights.
    """
    return relative_adaptiveness(sequences=sequences)


def get_CSI_value(dna: str, weights: Dict[str, float]) -> float:
    """
    Calculate the Codon Similarity Index (CSI) for a DNA sequence.

    Args:
        dna (str): The DNA sequence.
        weights (dict): The CSI weights from get_CSI_weights.

    Returns:
        float: The CSI value.
    """
    return CAI(dna, weights)
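
# Usage sketch: derive CSI weights from a small reference set, then score a
# query CDS. The two reference sequences are toy stand-ins for a genome-scale
# collection of host coding sequences.
#
#     reference = ["ATGGCTGCTGAAGCTTAA", "ATGGAAGCTGCTTAA"]
#     weights = get_CSI_weights(reference)
#     get_CSI_value("ATGGCTGAATAA", weights)  # -> float in (0, 1]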


def get_organism_to_CSI_weights(
    dataset: pd.DataFrame, organisms: List[str]
) -> Dict[str, dict]:
    """
    Calculate the Codon Similarity Index (CSI) weights for a list of organisms.

    Args:
        dataset (pd.DataFrame): Dataset containing organism and DNA sequence info.
        organisms (List[str]): List of organism names.

    Returns:
        Dict[str, dict]: A dictionary mapping each organism to its CSI weights.
    """
    organism2weights = {}

    # Iterate through each organism to calculate its CSI weights
    for organism in tqdm(organisms, desc="Calculating CSI Weights: ", unit="Organism"):
        organism_data = dataset.loc[dataset["organism"] == organism]
        sequences = organism_data["dna"].to_list()
        weights = get_CSI_weights(sequences)
        organism2weights[organism] = weights

    return organism2weights


def get_GC_content(dna: str) -> float:
    """
    Calculate the GC content of a DNA sequence.

    Args:
        dna (str): The DNA sequence.

    Returns:
        float: The GC content as a percentage.
    """
    dna = dna.upper()
    if not dna:
        return 0.0
    return (dna.count("G") + dna.count("C")) / len(dna) * 100


def get_cfd(
    dna: str,
    codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
    threshold: float = 0.3,
) -> float:
    """
    Calculate the codon frequency distribution (CFD) metric for a DNA sequence,
    i.e. the percentage of codons whose within-amino-acid relative frequency
    falls below the given threshold.

    Args:
        dna (str): The DNA sequence.
        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
            frequency distribution per amino acid.
        threshold (float): Frequency threshold for counting rare codons.

    Returns:
        float: The CFD metric as a percentage.
    """
    # Get a dictionary mapping each codon to its normalized frequency
    codon2frequency = {
        codon: freq / max(frequencies)
        for amino, (codons, frequencies) in codon_frequencies.items()
        for codon, freq in zip(codons, frequencies)
    }

    cfd = 0

    # Iterate through the DNA sequence in steps of 3 to process each codon
    for i in range(0, len(dna), 3):
        codon = dna[i : i + 3]
        codon_frequency = codon2frequency[codon]

        if codon_frequency < threshold:
            cfd += 1

    return cfd / (len(dna) / 3) * 100


def get_min_max_percentage(
    dna: str,
    codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
    window_size: int = 18,
) -> List[float]:
    """
    Calculate the %MinMax metric for a DNA sequence.

    Args:
        dna (str): The DNA sequence.
        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
            frequency distribution per amino acid.
        window_size (int): Size of the window to calculate %MinMax.

    Returns:
        List[float]: List of %MinMax values for the sequence.

    Credit: https://github.com/chowington/minmax
    """
    # Get a dictionary mapping each codon to its respective amino acid
    codon2amino = {
        codon: amino
        for amino, (codons, frequencies) in codon_frequencies.items()
        for codon in codons
    }

    min_max_values = []
    codons = [dna[i : i + 3] for i in range(0, len(dna), 3)]  # Split DNA into codons

    # Iterate through the DNA sequence using the specified window size
    for i in range(len(codons) - window_size + 1):
        codon_window = codons[i : i + window_size]  # Codons in the current window

        Actual = 0.0  # Average of the actual codon frequencies
        Max = 0.0  # Average of the max codon frequencies
        Min = 0.0  # Average of the min codon frequencies
        Avg = 0.0  # Average of the averages of all frequencies for each amino acid

        # Sum the frequencies for codons in the current window
        for codon in codon_window:
            aminoacid = codon2amino[codon]
            frequencies = codon_frequencies[aminoacid][1]
            codon_index = codon_frequencies[aminoacid][0].index(codon)
            codon_frequency = codon_frequencies[aminoacid][1][codon_index]

            Actual += codon_frequency
            Max += max(frequencies)
            Min += min(frequencies)
            Avg += sum(frequencies) / len(frequencies)

        # Divide by the window size to get the averages
        Actual = Actual / window_size
        Max = Max / window_size
        Min = Min / window_size
        Avg = Avg / window_size

        # Calculate %MinMax
        percentMax = ((Actual - Avg) / (Max - Avg)) * 100
        percentMin = ((Avg - Actual) / (Avg - Min)) * 100

        # Append the appropriate %MinMax value
        if percentMax >= 0:
            min_max_values.append(percentMax)
        else:
            min_max_values.append(-percentMin)

    # Pad the last floor(window_size / 2) entries of min_max_values with None
    for i in range(int(window_size / 2)):
        min_max_values.append(None)

    return min_max_values


def get_sequence_complexity(dna: str) -> float:
    """
    Calculate the sequence complexity score of a DNA sequence.

    Args:
        dna (str): The DNA sequence.

    Returns:
        float: The sequence complexity score.
    """

    def sum_up_to(x):
        """Recursively calculate the sum of integers from 1 to x."""
        if x <= 1:
            return 1
        else:
            return x + sum_up_to(x - 1)

    def f(x):
        """Return 4 if x is greater than or equal to 4, else return x."""
        if x >= 4:
            return 4
        elif x < 4:
            return x

    unique_subseq_length = []

    # Calculate the number of unique subsequences of each length
    for i in range(1, len(dna) + 1):
        unique_subseq = set()
        for j in range(len(dna) - (i - 1)):
            unique_subseq.add(dna[j : (j + i)])
        unique_subseq_length.append(len(unique_subseq))

    # Calculate complexity score
    complexity_score = (
        sum(unique_subseq_length) / (sum_up_to(len(dna) - 1) + f(len(dna)))
    ) * 100

    return complexity_score


def get_sequence_similarity(
    original: str, predicted: str, truncate: bool = True, window_length: int = 1
) -> float:
    """
    Calculate the sequence similarity between two sequences.

    Args:
        original (str): The original sequence.
        predicted (str): The predicted sequence.
        truncate (bool): If True, truncate the original sequence to match the length
            of the predicted sequence.
        window_length (int): Length of the window for comparison (1 for amino acids,
            3 for codons).

    Returns:
        float: The sequence similarity as a percentage.

    Preconditions:
        len(predicted) <= len(original).
    """
    if not truncate and len(original) != len(predicted):
        raise ValueError(
            "Set truncate to True if the lengths of the sequences do not match."
        )

    identity = 0.0
    original = original.strip()
    predicted = predicted.strip()

    if truncate:
        original = original[: len(predicted)]

    if window_length == 1:
        # Simple per-character comparison for amino acids
        for i in range(len(predicted)):
            if original[i] == predicted[i]:
                identity += 1
    else:
        # Comparison of substrings based on window_length
        for i in range(0, len(original) - window_length + 1, window_length):
            if original[i : i + window_length] == predicted[i : i + window_length]:
                identity += 1

    return (identity / (len(predicted) / window_length)) * 100
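
# Usage sketch: codon-level identity between a natural and a predicted CDS;
# two of the three codons match.
#
#     get_sequence_similarity("ATGGCTGTG", "ATGGCAGTG", window_length=3)
#     # -> 66.66...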


def scan_for_restriction_sites(seq: str, sites: List[str] = ['GAATTC', 'GGATCC', 'AAGCTT']) -> int:
    """
    Scan for a list of restriction enzyme sites in a DNA sequence and return
    the total number of occurrences (defaults: EcoRI, BamHI, HindIII).
    """
    return sum(seq.upper().count(site.upper()) for site in sites)


def count_negative_cis_elements(seq: str, motifs: List[str] = ['TATAAT', 'TTGACA', 'AGCTAGT']) -> int:
    """
    Count occurrences of negative cis-regulatory elements in a DNA sequence.
    """
    return sum(seq.upper().count(m.upper()) for m in motifs)


def calculate_homopolymer_runs(seq: str, max_len: int = 8) -> int:
    """
    Calculate the number of homopolymer runs longer than a given length.
    """
    import re
    min_len = max_len + 1
    return len(re.findall(r'(A{%d,}|T{%d,}|G{%d,}|C{%d,})' % (min_len, min_len, min_len, min_len), seq.upper()))
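
# Usage sketch: a quick synthesis screen of one toy CDS, which contains an
# EcoRI site, a BamHI site, and a 9-bp poly-A run.
#
#     seq = "ATGGAATTCAAAAAAAAAGGATCCTAA"
#     scan_for_restriction_sites(seq)    # -> 2 (GAATTC + GGATCC)
#     count_negative_cis_elements(seq)   # -> 0
#     calculate_homopolymer_runs(seq)    # -> 1 (one run longer than 8 bases)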


def get_min_max_profile(
    dna: str,
    codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
    window_size: int = 18,
) -> List[float]:
    """
    Calculate the %MinMax profile for a DNA sequence, i.e. the list of
    %MinMax values for sliding windows across the sequence.

    Args:
        dna (str): The DNA sequence.
        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
            frequency distribution per amino acid.
        window_size (int): Size of the window to calculate %MinMax.

    Returns:
        List[float]: List of %MinMax values for the sequence.
    """
    return get_min_max_percentage(dna, codon_frequencies, window_size)


def calculate_dtw_distance(profile1: List[float], profile2: List[float]) -> float:
    """
    Calculate the Dynamic Time Warping (DTW) distance between two profiles.

    Args:
        profile1 (List[float]): The first profile (e.g., %MinMax of the generated sequence).
        profile2 (List[float]): The second profile (e.g., %MinMax of the natural sequence).

    Returns:
        float: The DTW distance between the two profiles.
    """
    from dtw import dtw
    import numpy as np

    # Ensure profiles are numpy arrays and drop any None and NaN values
    p1 = np.array([v for v in profile1 if v is not None and not np.isnan(v)]).reshape(
        -1, 1
    )
    p2 = np.array([v for v in profile2 if v is not None and not np.isnan(v)]).reshape(
        -1, 1
    )

    if len(p1) == 0 or len(p2) == 0:
        return np.inf  # Return infinity if one of the profiles is empty

    alignment = dtw(p1, p2, keep_internals=True)
    return alignment.distance  # type: ignore


def get_ecoli_tai_weights():
    """
    Return a dictionary of tAI weights for E. coli based on tRNA gene copy numbers.
    These weights are pre-calculated based on the relative adaptiveness of each
    codon; ATG and the stop codons are not included in the table.
    """
    codons = [
        "TTT", "TTC", "TTA", "TTG", "TCT", "TCC", "TCA", "TCG", "TAT", "TAC",
        "TGT", "TGC", "TGG", "CTT", "CTC", "CTA", "CTG", "CCT", "CCC", "CCA",
        "CCG", "CAT", "CAC", "CAA", "CAG", "CGT", "CGC", "CGA", "CGG", "ATT",
        "ATC", "ATA", "ACT", "ACC", "ACA", "ACG", "AAT", "AAC", "AAA", "AAG",
        "AGT", "AGC", "AGA", "AGG", "GTT", "GTC", "GTA", "GTG", "GCT", "GCC",
        "GCA", "GCG", "GAT", "GAC", "GAA", "GAG", "GGT", "GGC", "GGA", "GGG"
    ]
    weights = [
        0.1966667, 0.3333333, 0.1666667, 0.2200000, 0.1966667, 0.3333333,
        0.1666667, 0.2200000, 0.2950000, 0.5000000, 0.09833333, 0.1666667,
        0.2200000, 0.09833333, 0.1666667, 0.1666667, 0.7200000, 0.09833333,
        0.1666667, 0.1666667, 0.2200000, 0.09833333, 0.1666667, 0.3333333,
        0.4400000, 0.6666667, 0.4800000, 0.00006666667, 0.1666667, 0.2950000,
        0.5000000, 0.01833333, 0.1966667, 0.3333333, 0.1666667, 0.3866667,
        0.3933333, 0.6666667, 1.0000000, 0.3200000, 0.09833333, 0.1666667,
        0.1666667, 0.2200000, 0.1966667, 0.3333333, 0.8333333, 0.2666667,
        0.1966667, 0.3333333, 0.5000000, 0.1600000, 0.2950000, 0.5000000,
        0.6666667, 0.2133333, 0.3933333, 0.6666667, 0.1666667, 0.2200000
    ]
    return dict(zip(codons, weights))


def calculate_tAI(sequence: str, tai_weights: Dict[str, float]) -> float:
    """
    Calculate the tRNA Adaptation Index (tAI) for a given DNA sequence.

    Args:
        sequence (str): The DNA sequence to analyze.
        tai_weights (Dict[str, float]): A dictionary of tAI weights for each codon.

    Returns:
        float: The tAI value for the sequence.
    """
    from scipy.stats.mstats import gmean

    codons = [sequence[i:i+3] for i in range(0, len(sequence), 3)]

    # Filter out stop codons and codons not in the weights table
    weights = [tai_weights[codon] for codon in codons if codon in tai_weights and tai_weights[codon] > 0]

    if not weights:
        return 0.0

    return gmean(weights)
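
# Usage sketch: tAI of a toy CDS under the bundled E. coli weights. ATG and
# the stop codons are absent from the weight table, so they are skipped.
#
#     tai_weights = get_ecoli_tai_weights()
#     calculate_tAI("ATGGCTGAAAAACTGTAA", tai_weights)  # geometric mean of 4 weights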


def calculate_ENC(sequence: str) -> float:
    """
    Calculate the Effective Number of Codons (ENC) for a DNA sequence.
    Uses the codonbias library implementation based on Wright (1990).

    Args:
        sequence (str): The DNA sequence.

    Returns:
        float: The ENC value for the sequence.
    """
    try:
        from codonbias.scores import EffectiveNumberOfCodons

        # Initialize the ENC calculator
        enc_calculator = EffectiveNumberOfCodons(
            k_mer=1,  # Standard codon analysis
            bg_correction=True,  # Use background correction
            robust=True,  # Use robust calculation
            genetic_code=1  # Standard genetic code
        )

        # Calculate ENC for the sequence
        enc_value = enc_calculator.get_score(sequence)

        return float(enc_value)

    except ImportError:
        raise ImportError("The codonbias library is required for ENC calculation. Install with: pip install codonbias")
    except Exception as e:
        # Fall back to a simple approximation if the library fails
        print(f"Warning: ENC calculation failed with error: {e}. Using approximation.")
        return 45.0  # Typical E. coli ENC value as fallback


def calculate_CPB(sequence: str, reference_sequences: Optional[List[str]] = None) -> float:
    """
    Calculate the Codon Pair Bias (CPB) for a DNA sequence.
    Uses the codonbias library implementation based on Coleman et al. (2008).

    Args:
        sequence (str): The DNA sequence.
        reference_sequences (List[str]): Reference sequences for calculating expected
            values. If None, the input sequence itself is used as the reference.

    Returns:
        float: The CPB value for the sequence.
    """
    try:
        from codonbias.scores import CodonPairBias

        # Use the provided reference sequences, or fall back to the input sequence
        if reference_sequences is None:
            reference_sequences = [sequence]

        # Initialize the CPB calculator with the reference sequences
        cpb_calculator = CodonPairBias(
            ref_seq=reference_sequences,
            k_mer=2,  # Codon pairs
            genetic_code=1,  # Standard genetic code
            ignore_stop=True,  # Ignore stop codons
            pseudocount=1  # Pseudocount for unseen pairs
        )

        # Calculate CPB for the sequence
        cpb_value = cpb_calculator.get_score(sequence)

        return float(cpb_value)

    except ImportError:
        raise ImportError("The codonbias library is required for CPB calculation. Install with: pip install codonbias")
    except Exception as e:
        # Fallback if the library fails
        print(f"Warning: CPB calculation failed with error: {e}. Using approximation.")
        return 0.0  # Neutral CPB as fallback


def calculate_SCUO(sequence: str) -> float:
    """
    Calculate the Synonymous Codon Usage Order (SCUO) for a DNA sequence,
    using a self-contained implementation (no external GCUA dependency) based
    on the information-theoretic definition of Wan et al. (2004).

    Args:
        sequence (str): The DNA sequence.

    Returns:
        float: The SCUO value (0-1, where 1 indicates maximum bias).
    """
    from math import log2  # local import to avoid global cost

    try:
        # Build the standard genetic code mapping using Biopython if available;
        # fall back to a hard-coded table otherwise.
        try:
            from Bio.Data import CodonTable  # type: ignore
            codon_to_aa = CodonTable.unambiguous_dna_by_id[1].forward_table
        except Exception:
            codon_to_aa = {
                'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
                'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
                'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
                'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
                'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
                'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
                'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
                'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
                'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
                'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
                'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
                'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
                'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
                'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
                'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
                'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
            }

        # Group codons by amino acid, excluding stop codons (they appear only
        # in the fallback table; Biopython's forward_table omits them)
        aa_to_codons = {}
        for codon, aa in codon_to_aa.items():
            if aa == '*':
                continue
            aa_to_codons.setdefault(aa, []).append(codon)

        # Count codon occurrences in the input sequence, skipping stop codons
        # so both code paths count identically
        seq = sequence.upper().replace('U', 'T')
        codon_counts = {}
        for i in range(0, len(seq) - len(seq) % 3, 3):
            codon = seq[i:i+3]
            if codon in codon_to_aa and codon_to_aa[codon] != '*':
                codon_counts[codon] = codon_counts.get(codon, 0) + 1

        total_codons = sum(codon_counts.values())
        if total_codons == 0:
            return 0.0

        scuo_sum = 0.0

        for aa, codons in aa_to_codons.items():
            n_codons = len(codons)
            if n_codons == 1:
                continue  # SCUO is undefined for Met/Trp

            counts = [codon_counts.get(c, 0) for c in codons]
            total_aa = sum(counts)
            if total_aa == 0:
                continue

            probs = [c / total_aa for c in counts if c]
            H_obs = -sum(p * log2(p) for p in probs)
            H_max = log2(n_codons)
            O_i = (H_max - H_obs) / H_max if H_max else 0.0
            F_i = total_aa / total_codons
            scuo_sum += F_i * O_i

        return scuo_sum

    except Exception as exc:
        print(f"Warning: internal SCUO computation failed ({exc}). Returning 0.5.")
        return 0.5
CodonTransformer/CodonJupyter.py ADDED
@@ -0,0 +1,311 @@
"""
File: CodonJupyter.py
---------------------
Includes Jupyter-specific functions for displaying interactive widgets.
"""

from typing import Dict, List, Tuple

import ipywidgets as widgets
from IPython.display import HTML, display

from CodonTransformer.CodonUtils import (
    COMMON_ORGANISMS,
    ID2ORGANISM,
    ORGANISM2ID,
    DNASequencePrediction,
)


class UserContainer:
    """
    A container class to store user inputs for organism and protein sequence.

    Attributes:
        organism (int): The selected organism id.
        protein (str): The input protein sequence.
    """

    def __init__(self) -> None:
        self.organism: int = -1
        self.protein: str = ""


def create_styled_options(
    organisms: list, organism2id: Dict[str, int], is_fine_tuned: bool = False
) -> list:
    """
    Create styled options for the dropdown widget, right-aligning the organism
    IDs and prefixing fine-tuned entries with a zero-width space (used as a
    hook by the CSS styling below).

    Args:
        organisms (list): List of organism names.
        organism2id (Dict[str, int]): Dictionary mapping organism names to their IDs.
        is_fine_tuned (bool): Whether these are fine-tuned organisms.

    Returns:
        list: Styled options for the dropdown widget.
    """
    styled_options = []
    for organism in organisms:
        organism_id = organism2id[organism]
        if is_fine_tuned:
            if organism_id < 10:
                styled_options.append(f"\u200b{organism_id:>6}. {organism}")
            elif organism_id < 100:
                styled_options.append(f"\u200b{organism_id:>5}. {organism}")
            else:
                styled_options.append(f"\u200b{organism_id:>4}. {organism}")
        else:
            if organism_id < 10:
                styled_options.append(f"{organism_id:>6}. {organism}")
            elif organism_id < 100:
                styled_options.append(f"{organism_id:>5}. {organism}")
            else:
                styled_options.append(f"{organism_id:>4}. {organism}")
    return styled_options


def create_dropdown_options(organism2id: Dict[str, int]) -> list:
    """
    Create the full list of dropdown options, including section headers.

    Args:
        organism2id (Dict[str, int]): Dictionary mapping organism names to their IDs.

    Returns:
        list: Full list of dropdown options.
    """
    fine_tuned_organisms = sorted(
        [org for org in organism2id.keys() if org in COMMON_ORGANISMS]
    )
    all_organisms = sorted(organism2id.keys())

    fine_tuned_options = create_styled_options(
        fine_tuned_organisms, organism2id, is_fine_tuned=True
    )
    all_organisms_options = create_styled_options(
        all_organisms, organism2id, is_fine_tuned=False
    )

    return (
        [""]
        + ["Selected Organisms"]
        + fine_tuned_options
        + [""]
        + ["All Organisms"]
        + all_organisms_options
    )


def create_organism_dropdown(container: UserContainer) -> widgets.Dropdown:
    """
    Create and configure the organism dropdown widget.

    Args:
        container (UserContainer): Container to store the selected organism.

    Returns:
        widgets.Dropdown: Configured dropdown widget.
    """
    dropdown = widgets.Dropdown(
        options=create_dropdown_options(ORGANISM2ID),
        description="",
        layout=widgets.Layout(width="40%", margin="0 0 10px 0"),
        style={"description_width": "initial"},
    )

    def show_organism(change: Dict[str, str]) -> None:
        """
        Update the container with the selected organism.

        Args:
            change (Dict[str, str]): Information about the change in dropdown value.
        """
        dropdown_choice = change["new"]
        if dropdown_choice and dropdown_choice not in [
            "Selected Organisms",
            "All Organisms",
        ]:
            # Extract the numeric ID from the styled option label
            organism = "".join(filter(str.isdigit, dropdown_choice))
            organism_id = ID2ORGANISM[int(organism)]
            container.organism = organism_id
        else:
            container.organism = None

    dropdown.observe(show_organism, names="value")
    return dropdown


def get_dropdown_style() -> str:
    """
    Return the custom CSS style for the dropdown widget.

    Returns:
        str: CSS style string.
    """
    return """
    <style>
        .widget-dropdown > select {
            font-size: 16px;
            font-weight: normal;
            background-color: #f0f0f0;
            border-radius: 5px;
            padding: 5px;
        }
        .widget-label {
            font-size: 18px;
            font-weight: bold;
        }
        .custom-container {
            display: flex;
            flex-direction: column;
            align-items: flex-start;
        }
        .widget-dropdown option[value^="\u200b"] {
            font-family: sans-serif;
            font-weight: bold;
            font-size: 18px;
            padding: 5px 10px;
        }
        .widget-dropdown option[value*="Selected Organisms"],
        .widget-dropdown option[value*="All Organisms"] {
            text-align: center;
            font-family: Arial, sans-serif;
            font-weight: bold;
            font-size: 20px;
            color: #6900A1;
            background-color: #00D8A1;
        }
    </style>
    """


def display_organism_dropdown(container: UserContainer) -> None:
    """
    Display the organism dropdown widget and apply custom styles.

    Args:
        container (UserContainer): Container to store the selected organism.
    """
    dropdown = create_organism_dropdown(container)
    header = widgets.HTML(
        '<b style="font-size:20px;">Select Organism:</b>'
        '<div style="height:10px;"></div>'
    )
    container_widget = widgets.VBox(
        [header, dropdown],
        layout=widgets.Layout(padding="12px 0 12px 25px"),
    )
    display(container_widget)
    display(HTML(get_dropdown_style()))


def display_protein_input(container: UserContainer) -> None:
    """
    Display a widget for entering a protein sequence and save it to the container.

    Args:
        container (UserContainer): A container to store the entered protein sequence.
    """
    protein_input = widgets.Textarea(
        value="",
        placeholder="Enter here...",
        description="",
        layout=widgets.Layout(width="100%", height="100px", margin="0 0 10px 0"),
        style={"description_width": "initial"},
    )

    # Custom CSS for the input widget
    input_style = """
    <style>
        .widget-textarea > textarea {
            font-size: 12px;
            font-family: Arial, sans-serif;
            font-weight: normal;
            background-color: #f0f0f0;
            border-radius: 5px;
            padding: 10px;
        }
        .widget-label {
            font-size: 18px;
            font-weight: bold;
        }
        .custom-container {
            display: flex;
            flex-direction: column;
            align-items: flex-start;
        }
    </style>
    """

    # Function to save the input protein sequence to the container
    def save_protein(change: Dict[str, str]) -> None:
        """
        Save the input protein sequence to the container.

        Args:
            change (Dict[str, str]): A dictionary containing information about
                the change in textarea value.
        """
        container.protein = (
            change["new"]
            .upper()
            .strip()
            .replace("\n", "")
            .replace(" ", "")
            .replace("\t", "")
        )

    # Attach the function to the input widget
    protein_input.observe(save_protein, names="value")

    # Display the input widget
    header = widgets.HTML(
        '<b style="font-size:20px;">Enter Protein Sequence:</b>'
        '<div style="height:18px;"></div>'
    )
    container_widget = widgets.VBox(
        [header, protein_input], layout=widgets.Layout(padding="12px 12px 0 25px")
    )

    display(container_widget)
    display(widgets.HTML(input_style))


def format_model_output(output: DNASequencePrediction) -> str:
    """
    Format DNA sequence prediction output in an appealing and easy-to-read manner.

    This function takes the prediction output and formats it into
    a structured string with clear section headers and separators.

    Args:
        output (DNASequencePrediction): Object containing the prediction output.
            Expected attributes:
            - organism (str): The organism name.
            - protein (str): The input protein sequence.
            - processed_input (str): The processed input sequence.
            - predicted_dna (str): The predicted DNA sequence.

    Returns:
        str: A formatted string containing the organized output.
    """

    def format_section(title: str, content: str) -> str:
        """Helper function to format an individual section."""
        separator = "-" * 29
        title_line = f"| {title.center(25)} |"
        return f"{separator}\n{title_line}\n{separator}\n{content}\n\n"

    sections: List[Tuple[str, str]] = [
        ("Organism", output.organism),
        ("Input Protein", output.protein),
        ("Processed Input", output.processed_input),
        ("Predicted DNA", output.predicted_dna),
    ]

    formatted_output = ""
    for title, content in sections:
        formatted_output += format_section(title, content)

    # Remove the trailing newlines to avoid extra space at the end
    return formatted_output.rstrip()
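
# Usage sketch: formatting a prediction result. Assumes DNASequencePrediction
# is a simple container exposing the four attributes listed in the docstring.
#
#     prediction = DNASequencePrediction(
#         organism="Escherichia coli general",
#         protein="MAV",
#         processed_input="M_UNK A_UNK V_UNK __UNK",
#         predicted_dna="ATGGCTGTGTAA",
#     )
#     print(format_model_output(prediction))
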
CodonTransformer/CodonPostProcessing.py ADDED
@@ -0,0 +1,83 @@
1
+ """
2
+ File: CodonPostProcessing.py
3
+ ---------------------------
4
+ Post-processing utilities for codon optimization using DNAChisel.
5
+ This module provides sequence polishing capabilities to fix restriction sites,
6
+ homopolymers, and other constraints while preserving CAI and GC content.
7
+ """
8
+
9
+ import warnings
10
+ import numpy as np
11
+
12
+ try:
13
+ from dnachisel import (
14
+ DnaOptimizationProblem,
15
+ AvoidPattern,
16
+ EnforceGCContent,
17
+ EnforceTranslation,
18
+ CodonOptimize,
19
+ )
20
+ DNACHISEL_AVAILABLE = True
21
+ except ImportError:
22
+ DNACHISEL_AVAILABLE = False
23
+ # This warning will be shown when the module is first imported.
24
+ warnings.warn(
25
+ "DNAChisel is not installed. Post-processing features will be disabled."
26
+ )
27
+
28
+ def polish_sequence_with_dnachisel(
29
+ dna_sequence: str,
30
+ protein_sequence: str,
31
+ gc_bounds: tuple = (45.0, 55.0),
32
+ cai_species: str = "e_coli",
33
+ avoid_homopolymers_length: int = 6,
34
+ enzymes_to_avoid: Optional[list] = None
35
+ ):
36
+ """
37
+ Polishes a DNA sequence using DNAChisel to meet lab synthesis constraints.
38
+ """
39
+ if not DNACHISEL_AVAILABLE:
40
+ warnings.warn("DNAChisel not available, skipping post-processing.")
41
+ return dna_sequence
42
+
43
+ if enzymes_to_avoid is None:
44
+ # Common cloning enzymes
45
+ enzymes_to_avoid = ["EcoRI", "XbaI", "SpeI", "PstI", "NotI"]
46
+
47
+ try:
48
+ # Start with the basic, essential constraints
49
+ constraints = [
50
+ EnforceTranslation(translation=protein_sequence),
51
+ EnforceGCContent(mini=gc_bounds[0] / 100.0, maxi=gc_bounds[1] / 100.0),
52
+ ]
53
+
54
+ # Add enzyme avoidance constraints safely
55
+ for enzyme in enzymes_to_avoid:
56
+ try:
57
+ # This is the modern way to avoid enzyme sites
58
+ constraints.append(AvoidPattern.from_enzyme_name(enzyme))
59
+ except Exception:
60
+ warnings.warn(f"Could not find enzyme '{enzyme}' in DNAChisel library.")
61
+
62
+ # Add homopolymer avoidance constraints
63
+ for base in "ATGC":
64
+ constraints.append(AvoidPattern(base * avoid_homopolymers_length))
65
+
66
+ # Define the optimization problem
67
+ problem = DnaOptimizationProblem(
68
+ sequence=dna_sequence,
69
+ constraints=constraints,
70
+ objectives=[CodonOptimize(species=cai_species, method="match_codon_usage")]
71
+ )
72
+
73
+ # Solve the problem
74
+ problem.resolve_constraints()
75
+ problem.optimize()
76
+
77
+ # Return the polished sequence
78
+ return problem.sequence
79
+
80
+ except Exception as e:
81
+ warnings.warn(f"DNAChisel post-processing failed with an error: {e}")
82
+ # Return the original sequence if polishing fails
83
+ return dna_sequence
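A usage sketch for the polisher (assumes DNAChisel is installed; the toy sequences are chosen so the DNA translates to the protein):

    polished = polish_sequence_with_dnachisel(
        dna_sequence="ATGGCTGCTAAAGAA",   # translates to MAAKE
        protein_sequence="MAAKE",
        gc_bounds=(40.0, 60.0),
        enzymes_to_avoid=["EcoRI"],
    )
    # On any DNAChisel failure the function warns and returns the input unchanged.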
CodonTransformer/CodonPrediction.py ADDED
@@ -0,0 +1,1374 @@
1
+ """
2
+ File: CodonPrediction.py
3
+ ---------------------------
4
+ Includes functions to tokenize input, load models, infer predicted DNA sequences, and
5
+ helper functions related to processing data for passing to the model.
6
+ """
7
+
8
+ import warnings
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+ import heapq
11
+ from dataclasses import dataclass
12
+
13
+ import numpy as np
14
+ import onnxruntime as rt
15
+ import torch
16
+ import transformers
17
+ from transformers import (
18
+ AutoTokenizer,
19
+ BatchEncoding,
20
+ BigBirdConfig,
21
+ BigBirdForMaskedLM,
22
+ PreTrainedTokenizerFast,
23
+ )
24
+
25
+ from CodonTransformer.CodonData import get_merged_seq
26
+ from CodonTransformer.CodonUtils import (
27
+ AMINO_ACID_TO_INDEX,
28
+ INDEX2TOKEN,
29
+ NUM_ORGANISMS,
30
+ ORGANISM2ID,
31
+ TOKEN2INDEX,
32
+ DNASequencePrediction,
33
+ GC_COUNTS_PER_TOKEN,
34
+ CODON_GC_CONTENT,
35
+ AA_MIN_GC,
36
+ AA_MAX_GC,
37
+ )
38
+
39
+
40
+ def predict_dna_sequence(
41
+ protein: str,
42
+ organism: Union[int, str],
43
+ device: torch.device,
44
+ tokenizer: Optional[Union[str, PreTrainedTokenizerFast]] = None,
45
+ model: Optional[Union[str, torch.nn.Module]] = None,
46
+ attention_type: str = "original_full",
47
+ deterministic: bool = True,
48
+ temperature: float = 0.2,
49
+ top_p: float = 0.95,
50
+ num_sequences: int = 1,
51
+ match_protein: bool = False,
52
+ use_constrained_search: bool = False,
53
+ gc_bounds: Tuple[float, float] = (0.30, 0.70),
54
+ beam_size: int = 5,
55
+ length_penalty: float = 1.0,
56
+ diversity_penalty: float = 0.0,
57
+ ) -> Union[DNASequencePrediction, List[DNASequencePrediction]]:
58
+ """
59
+ Predict the DNA sequence(s) for a given protein using the CodonTransformer model.
60
+
61
+ This function takes a protein sequence and an organism (as ID or name) as input
62
+ and returns the predicted DNA sequence(s) using the CodonTransformer model. It can use
63
+ either provided tokenizer and model objects or load them from specified paths.
64
+
65
+ Args:
66
+ protein (str): The input protein sequence for which to predict the DNA sequence.
67
+ organism (Union[int, str]): Either the ID of the organism or its name (e.g.,
68
+ "Escherichia coli general"). If a string is provided, it will be converted
69
+ to the corresponding ID using ORGANISM2ID.
70
+ device (torch.device): The device (CPU or GPU) to run the model on.
71
+ tokenizer (Union[str, PreTrainedTokenizerFast, None], optional): Either a file
72
+ path to load the tokenizer from, a pre-loaded tokenizer object, or None. If
73
+ None, it will be loaded from HuggingFace. Defaults to None.
74
+ model (Union[str, torch.nn.Module, None], optional): Either a file path to load
75
+ the model from, a pre-loaded model object, or None. If None, it will be
76
+ loaded from HuggingFace. Defaults to None.
77
+ attention_type (str, optional): The type of attention mechanism to use in the
78
+ model. Can be either 'block_sparse' or 'original_full'. Defaults to
79
+ "original_full".
80
+ deterministic (bool, optional): Whether to use deterministic decoding (most
81
+ likely tokens). If False, samples tokens according to their probabilities
82
+ adjusted by the temperature. Defaults to True.
83
+ temperature (float, optional): A value controlling the randomness of predictions
84
+ during non-deterministic decoding. Lower values (e.g., 0.2) make the model
85
+ more conservative, while higher values (e.g., 0.8) increase randomness.
86
+ Using high temperatures may result in prediction of DNA sequences that
87
+ do not translate to the input protein.
88
+ Recommended values are:
89
+ - Low randomness: 0.2
90
+ - Medium randomness: 0.5
91
+ - High randomness: 0.8
92
+ The temperature must be a positive float. Defaults to 0.2.
93
+ top_p (float, optional): The cumulative probability threshold for nucleus sampling.
94
+ Tokens with cumulative probability up to top_p are considered for sampling.
95
+ This parameter helps balance diversity and coherence in the predicted DNA sequences.
96
+ The value must be a float between 0 and 1. Defaults to 0.95.
97
+ num_sequences (int, optional): The number of DNA sequences to generate. Only applicable
98
+ when deterministic is False. Defaults to 1.
99
+ match_protein (bool, optional): Ensures the predicted DNA sequence is translated
100
+ to the input protein sequence by sampling from only the respective codons of
101
+ given amino acids. Defaults to False.
102
+ use_constrained_search (bool, optional): Whether to use constrained beam search
103
+ with GC content bounds. Defaults to False.
104
+ gc_bounds (Tuple[float, float], optional): GC content bounds (min, max) for
105
+ constrained search. Defaults to (0.30, 0.70).
106
+ beam_size (int, optional): Beam size for constrained search. Defaults to 5.
107
+ length_penalty (float, optional): Length penalty for beam search scoring.
108
+ Defaults to 1.0.
109
+ diversity_penalty (float, optional): Diversity penalty to reduce repetitive
110
+ sequences. Defaults to 0.0.
111
+
112
+ Returns:
113
+ Union[DNASequencePrediction, List[DNASequencePrediction]]: An object or list of objects
114
+ containing the prediction results:
115
+ - organism (str): Name of the organism used for prediction.
116
+ - protein (str): Input protein sequence for which DNA sequence is predicted.
117
+ - processed_input (str): Processed input sequence (merged protein and DNA).
118
+ - predicted_dna (str): Predicted DNA sequence.
119
+
120
+ Raises:
121
+ ValueError: If the protein sequence is empty, if the organism is invalid,
122
+ if the temperature is not a positive float, if top_p is not between 0 and 1,
123
+ or if num_sequences is less than 1 or used with deterministic mode.
124
+
125
+ Note:
126
+ This function uses ORGANISM2ID, INDEX2TOKEN, and AMINO_ACID_TO_INDEX dictionaries
127
+ imported from CodonTransformer.CodonUtils. ORGANISM2ID maps organism names to their
128
+ corresponding IDs. INDEX2TOKEN maps model output indices (token IDs) to
129
+ respective codons. AMINO_ACID_TO_INDEX maps each amino acid and stop symbol to indices
130
+ of codon tokens that translate to it.
131
+
132
+ Example:
133
+ >>> import torch
134
+ >>> from transformers import AutoTokenizer, BigBirdForMaskedLM
135
+ >>> from CodonTransformer.CodonPrediction import predict_dna_sequence
136
+ >>> from CodonTransformer.CodonJupyter import format_model_output
137
+ >>>
138
+ >>> # Set up device
139
+ >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
140
+ >>>
141
+ >>> # Load tokenizer and model
142
+ >>> tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
143
+ >>> model = BigBirdForMaskedLM.from_pretrained("adibvafa/CodonTransformer")
144
+ >>> model = model.to(device)
145
+ >>>
146
+ >>> # Define protein sequence and organism
147
+ >>> protein = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLA"
148
+ >>> organism = "Escherichia coli general"
149
+ >>>
150
+ >>> # Predict DNA sequence with deterministic decoding (single sequence)
151
+ >>> output = predict_dna_sequence(
152
+ ... protein=protein,
153
+ ... organism=organism,
154
+ ... device=device,
155
+ ... tokenizer=tokenizer,
156
+ ... model=model,
157
+ ... attention_type="original_full",
158
+ ... deterministic=True
159
+ ... )
160
+ >>>
161
+ >>> # Predict DNA sequence with constrained beam search
162
+ >>> output_constrained = predict_dna_sequence(
163
+ ... protein=protein,
164
+ ... organism=organism,
165
+ ... device=device,
166
+ ... tokenizer=tokenizer,
167
+ ... model=model,
168
+ ... use_constrained_search=True,
169
+ ... gc_bounds=(0.40, 0.60),
170
+ ... beam_size=10,
171
+ ... length_penalty=1.2,
172
+ ... diversity_penalty=0.1
173
+ ... )
174
+ >>>
175
+ >>> # Predict multiple DNA sequences with low randomness and top_p sampling
176
+ >>> output_random = predict_dna_sequence(
177
+ ... protein=protein,
178
+ ... organism=organism,
179
+ ... device=device,
180
+ ... tokenizer=tokenizer,
181
+ ... model=model,
182
+ ... attention_type="original_full",
183
+ ... deterministic=False,
184
+ ... temperature=0.2,
185
+ ... top_p=0.95,
186
+ ... num_sequences=3
187
+ ... )
188
+ >>>
189
+ >>> print(format_model_output(output))
190
+ >>> for i, seq in enumerate(output_random, 1):
191
+ ... print(f"Sequence {i}:")
192
+ ... print(format_model_output(seq))
193
+ ... print()
194
+ """
195
+ if not protein:
196
+ raise ValueError("Protein sequence cannot be empty.")
197
+
198
+ if not isinstance(temperature, (float, int)) or temperature <= 0:
199
+ raise ValueError("Temperature must be a positive float.")
200
+
201
+ if not isinstance(top_p, (float, int)) or not 0 < top_p <= 1.0:
202
+ raise ValueError("top_p must be a float between 0 and 1.")
203
+
204
+ if not isinstance(num_sequences, int) or num_sequences < 1:
205
+ raise ValueError("num_sequences must be a positive integer.")
206
+
207
+ if use_constrained_search:
208
+ if not isinstance(gc_bounds, tuple) or len(gc_bounds) != 2:
209
+ raise ValueError("gc_bounds must be a tuple of (min_gc, max_gc).")
210
+
211
+ if not (0.0 <= gc_bounds[0] <= gc_bounds[1] <= 1.0):
212
+ raise ValueError("gc_bounds must be between 0.0 and 1.0 with min <= max.")
213
+
214
+ if not isinstance(beam_size, int) or beam_size < 1:
215
+ raise ValueError("beam_size must be a positive integer.")
216
+
217
+ if deterministic and num_sequences > 1 and not use_constrained_search:
218
+ raise ValueError(
219
+ "Multiple sequences can only be generated in non-deterministic mode "
220
+ "(unless using constrained search)."
221
+ )
222
+
223
+ if use_constrained_search and num_sequences > 1:
224
+ raise ValueError(
225
+ "Constrained beam search currently supports only single sequence generation."
226
+ )
227
+
228
+ # Load tokenizer
229
+ if not isinstance(tokenizer, PreTrainedTokenizerFast):
230
+ tokenizer = load_tokenizer(tokenizer)
231
+
232
+ # Load model
233
+ if not isinstance(model, torch.nn.Module):
234
+ model = load_model(model_path=model, device=device, attention_type=attention_type)
235
+ else:
236
+ model.eval()
237
+ model.bert.set_attention_type(attention_type)
238
+ model.to(device)
239
+
240
+ # Validate organism and convert to organism_id and organism_name
241
+ organism_id, organism_name = validate_and_convert_organism(organism)
242
+
243
+ # Inference loop
244
+ with torch.no_grad():
245
+ # Tokenize the input sequence
246
+ merged_seq = get_merged_seq(protein=protein, dna="")
247
+ input_dict = {
248
+ "idx": 0, # sample index
249
+ "codons": merged_seq,
250
+ "organism": organism_id,
251
+ }
252
+ tokenized_input = tokenize([input_dict], tokenizer=tokenizer).to(device)
253
+
254
+ # Get the model predictions
255
+ output_dict = model(**tokenized_input, return_dict=True)
256
+ logits = output_dict.logits.detach().cpu()
257
+ logits = logits[:, 1:-1, :] # Remove [CLS] and [SEP] tokens
258
+
259
+ # Mask the logits of codons that do not correspond to the input protein sequence
260
+ if match_protein:
261
+ possible_tokens_per_position = [
262
+ AMINO_ACID_TO_INDEX[token[0]] for token in merged_seq.split(" ")
263
+ ]
264
+ seq_len = logits.shape[1]
265
+ if len(possible_tokens_per_position) > seq_len:
266
+ possible_tokens_per_position = possible_tokens_per_position[:seq_len]
267
+
268
+ mask = torch.full_like(logits, float("-inf"))
269
+
270
+ for pos, possible_tokens in enumerate(possible_tokens_per_position):
271
+ mask[:, pos, possible_tokens] = 0
272
+
273
+ logits = mask + logits
274
+
275
+ predictions = []
276
+ for _ in range(num_sequences):
277
+ # Decode the predicted DNA sequence from the model output
278
+ if use_constrained_search:
279
+ # Use constrained beam search with GC bounds
280
+ predicted_indices = constrained_beam_search_simple(
281
+ logits=logits.squeeze(0),
282
+ protein_sequence=protein,
283
+ gc_bounds=gc_bounds,
284
+ max_attempts=50,
285
+ )
286
+ elif deterministic:
287
+ predicted_indices = logits.argmax(dim=-1).squeeze().tolist()
288
+ else:
289
+ predicted_indices = sample_non_deterministic(
290
+ logits=logits, temperature=temperature, top_p=top_p
291
+ )
292
+
293
+ predicted_dna = list(map(INDEX2TOKEN.__getitem__, predicted_indices))
294
+ predicted_dna = (
295
+ "".join([token[-3:] for token in predicted_dna]).strip().upper()
296
+ )
297
+
298
+ predictions.append(
299
+ DNASequencePrediction(
300
+ organism=organism_name,
301
+ protein=protein,
302
+ processed_input=merged_seq,
303
+ predicted_dna=predicted_dna,
304
+ )
305
+ )
306
+
307
+ return predictions[0] if num_sequences == 1 else predictions
308
+
309
+
310
+ @dataclass
311
+ class BeamCandidate:
312
+ """Represents a candidate sequence in the beam search."""
313
+ tokens: List[int]
314
+ score: float
315
+ gc_count: int
316
+ length: int
317
+
318
+ def __post_init__(self):
319
+ self.gc_ratio = self.gc_count / max(self.length, 1)
320
+
321
+ def __lt__(self, other):
322
+ return self.score < other.score
323
+
324
+
325
+ def _calculate_true_future_gc_range(
326
+ current_pos: int,
327
+ protein_sequence: str,
328
+ current_gc_count: int,
329
+ current_length: int
330
+ ) -> Tuple[float, float]:
331
+ """
332
+ Calculate the true minimum and maximum possible final GC content
333
+ given current state and remaining amino acids (perfect foresight).
334
+
335
+ Args:
336
+ current_pos: Current position in protein sequence
337
+ protein_sequence: Full protein sequence
338
+ current_gc_count: Current GC count in partial sequence
339
+ current_length: Current length in nucleotides
340
+
341
+ Returns:
342
+ Tuple of (min_possible_final_gc_ratio, max_possible_final_gc_ratio)
343
+ """
344
+ if current_pos >= len(protein_sequence):
345
+ # Already at end, return current ratio
346
+ final_ratio = current_gc_count / max(current_length, 1)
347
+ return final_ratio, final_ratio
348
+
349
+ # Calculate remaining amino acids
350
+ remaining_aas = protein_sequence[current_pos:]
351
+
352
+ # Calculate min/max possible GC from remaining amino acids
353
+ min_future_gc = 0
354
+ max_future_gc = 0
355
+
356
+ for aa in remaining_aas:
357
+ if aa.upper() in AA_MIN_GC and aa.upper() in AA_MAX_GC:
358
+ min_future_gc += AA_MIN_GC[aa.upper()]
359
+ max_future_gc += AA_MAX_GC[aa.upper()]
360
+ else:
361
+ # If amino acid not found, assume moderate GC (1-2 range)
362
+ min_future_gc += 1
363
+ max_future_gc += 2
364
+
365
+ # Calculate final sequence length
366
+ final_length = current_length + len(remaining_aas) * 3
367
+
368
+ # Calculate min/max possible final GC ratios
369
+ min_final_gc_ratio = (current_gc_count + min_future_gc) / final_length
370
+ max_final_gc_ratio = (current_gc_count + max_future_gc) / final_length
371
+
372
+ return min_final_gc_ratio, max_final_gc_ratio
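A hypothetical worked example of this foresight bound: with 2 of 5 residues decoded, 3 G/C in the 6 nt so far, and "GGG" remaining (glycine codons carry 2-3 G/C each, per AA_MIN_GC/AA_MAX_GC):

    lo, hi = _calculate_true_future_gc_range(2, "MKGGG", 3, 6)
    # min_future_gc = 6, max_future_gc = 9, final_length = 15,
    # so lo = (3 + 6) / 15 = 0.60 and hi = (3 + 9) / 15 = 0.80;
    # a candidate survives only if [lo, hi] overlaps the target GC bounds.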
373
+
374
+
375
+ def constrained_beam_search_simple(
376
+ logits: torch.Tensor,
377
+ protein_sequence: str,
378
+ gc_bounds: Tuple[float, float] = (0.30, 0.70),
379
+ max_attempts: int = 100,
380
+ ) -> List[int]:
381
+ """
382
+ Simple constrained search: draw one greedy sample and several stochastic ones, then return the best sequence whose final GC content falls within the given bounds.
383
+ """
384
+ min_gc, max_gc = gc_bounds
385
+ seq_len = min(logits.shape[0], len(protein_sequence))
386
+
387
+ # Convert to probabilities
388
+ probs = torch.softmax(logits, dim=-1)
389
+
390
+ valid_sequences = []
391
+
392
+ for attempt in range(max_attempts):
393
+ tokens = []
394
+ total_gc = 0
395
+
396
+ # Generate sequence position by position
397
+ for pos in range(seq_len):
398
+ aa = protein_sequence[pos]
399
+ possible_tokens = AMINO_ACID_TO_INDEX.get(aa, [])
400
+
401
+ if not possible_tokens:
402
+ continue
403
+
404
+ # Filter tokens by current constraints and get probabilities
405
+ candidates = []
406
+ for token_idx in possible_tokens:
407
+ if token_idx < len(probs[pos]) and token_idx < len(GC_COUNTS_PER_TOKEN):
408
+ prob = probs[pos][token_idx].item()
409
+ gc_contribution = int(GC_COUNTS_PER_TOKEN[token_idx].item())
410
+
411
+ # Check if this token could still lead to a valid final sequence (perfect foresight)
412
+ new_gc_total = total_gc + gc_contribution
413
+ new_length = (pos + 1) * 3
414
+
415
+ # Calculate what's possible for the final sequence given this choice
416
+ min_final_gc, max_final_gc = _calculate_true_future_gc_range(
417
+ pos + 1, protein_sequence, new_gc_total, new_length
418
+ )
419
+
420
+ # Only prune if there's NO OVERLAP between possible final range and target bounds
421
+ if max_final_gc >= min_gc and min_final_gc <= max_gc:
422
+ # Calculate gentle GC penalty to steer toward target center
423
+ target_gc = (min_gc + max_gc) / 2 # Target center (e.g., 0.50 for bounds 0.45-0.55)
424
+ current_projected_gc = (min_final_gc + max_final_gc) / 2 # Projected center
425
+
426
+ # Only apply penalty if we're significantly off-target AND late in sequence
427
+ sequence_progress = (pos + 1) / seq_len
428
+ if sequence_progress > 0.3: # Only apply penalty after 30% of sequence
429
+ gc_deviation = abs(current_projected_gc - target_gc)
430
+ if gc_deviation > 0.05: # Only if >5% deviation from target
431
+ # Gentle penalty: reduce probability by small factor
432
+ penalty_factor = max(0.7, 1.0 - 0.3 * gc_deviation) # 0.7-1.0 range
433
+ prob = prob * penalty_factor
434
+
435
+ candidates.append((token_idx, prob, gc_contribution))
436
+
437
+ if not candidates:
438
+ # If no valid candidates, break and try next attempt
439
+ break
440
+
441
+ # Sample from valid candidates (with temperature)
442
+ if attempt == 0:
443
+ # First attempt: greedy (highest probability)
444
+ best_token = max(candidates, key=lambda x: x[1])
445
+ else:
446
+ # Other attempts: sample with some randomness
447
+ probs_list = [c[1] for c in candidates]
448
+ if sum(probs_list) > 0:
449
+ # Normalize probabilities
450
+ probs_array = np.array(probs_list)
451
+ probs_array = probs_array / probs_array.sum()
452
+ # Sample
453
+ chosen_idx = np.random.choice(len(candidates), p=probs_array)
454
+ best_token = candidates[chosen_idx]
455
+ else:
456
+ best_token = candidates[0]
457
+
458
+ tokens.append(best_token[0])
459
+ total_gc += best_token[2]
460
+
461
+ # Check if we got a complete sequence
462
+ if len(tokens) == seq_len:
463
+ final_gc_ratio = total_gc / (seq_len * 3)
464
+ if min_gc <= final_gc_ratio <= max_gc:
465
+ # Calculate sequence score (sum of log probabilities)
466
+ score = sum(np.log(probs[i][tokens[i]].item() + 1e-8) for i in range(len(tokens)))
467
+ valid_sequences.append((tokens, score, final_gc_ratio))
468
+
469
+ if not valid_sequences:
470
+ raise ValueError(f"Could not generate valid sequence within GC bounds {gc_bounds} after {max_attempts} attempts")
471
+
472
+ # Return the sequence with highest score
473
+ best_sequence = max(valid_sequences, key=lambda x: x[1])
474
+ return best_sequence[0]
475
+
476
+
477
+ def constrained_beam_search(
478
+ logits: torch.Tensor,
479
+ protein_sequence: str,
480
+ gc_bounds: Tuple[float, float] = (0.30, 0.70),
481
+ beam_size: int = 5,
482
+ length_penalty: float = 1.0,
483
+ diversity_penalty: float = 0.0,
484
+ temperature: float = 1.0,
485
+ max_candidates: int = 100,
486
+ position_aware_gc_penalty: bool = True,
487
+ gc_penalty_strength: float = 2.0,
488
+ ) -> List[int]:
489
+ """
490
+ Constrained beam search with exact per-residue GC bounds tracking.
491
+
492
+ Priority #1: Exact per-residue GC bounds tracking
493
+ - Tracks cumulative GC content after each codon selection
494
+ - Prunes candidates that would violate GC bounds
495
+ - Maintains beam of valid candidates
496
+
497
+ Priority #2: Position-aware GC penalty mechanism
498
+ - Applies variable penalty weights based on sequence position
499
+ - Preserves flexibility early, applies pressure when necessary
500
+ - Uses progressive penalty scaling based on deviation severity
501
+
502
+ Args:
503
+ logits (torch.Tensor): Model logits of shape [seq_len, vocab_size]
504
+ protein_sequence (str): Input protein sequence
505
+ gc_bounds (Tuple[float, float]): (min_gc, max_gc) bounds
506
+ beam_size (int): Number of candidates to maintain
507
+ length_penalty (float): Length penalty for scoring
508
+ diversity_penalty (float): Diversity penalty for scoring
509
+ temperature (float): Temperature for probability scaling
510
+ max_candidates (int): Maximum candidates to consider per position
511
+ position_aware_gc_penalty (bool): Whether to use position-aware GC penalties
512
+ gc_penalty_strength (float): Strength of GC penalty adjustment
513
+
514
+ Returns:
515
+ List[int]: Best sequence token indices
516
+ """
517
+ min_gc, max_gc = gc_bounds
518
+ seq_len = logits.shape[0]
519
+ protein_len = len(protein_sequence)
520
+
521
+ # Ensure we don't go beyond the protein sequence
522
+ if seq_len > protein_len:
523
+ print(f"Warning: logits length ({seq_len}) > protein length ({protein_len}). Truncating to protein length.")
524
+ seq_len = protein_len
525
+ logits = logits[:protein_len]
526
+
527
+ # Initialize beam with empty candidate
528
+ beam = [BeamCandidate(tokens=[], score=0.0, gc_count=0, length=0)]
529
+
530
+ # Apply temperature scaling
531
+ if temperature != 1.0:
532
+ logits = logits / temperature
533
+
534
+ # Convert to probabilities
535
+ probs = torch.softmax(logits, dim=-1)
536
+
537
+ for pos in range(min(seq_len, len(protein_sequence))):
538
+ # Get possible tokens for current amino acid
539
+ aa = protein_sequence[pos]
540
+ possible_tokens = AMINO_ACID_TO_INDEX.get(aa, [])
541
+
542
+ if not possible_tokens:
543
+ # Fallback to all tokens if amino acid not found
544
+ possible_tokens = list(range(probs.shape[1]))
545
+
546
+ # Get top candidates for this position
547
+ pos_probs = probs[pos]
548
+ top_candidates = []
549
+
550
+ for token_idx in possible_tokens:
551
+ if token_idx < len(pos_probs) and token_idx < len(GC_COUNTS_PER_TOKEN):
552
+ prob = pos_probs[token_idx].item()
553
+ gc_contribution = int(GC_COUNTS_PER_TOKEN[token_idx].item())
554
+ # Only include tokens with valid probabilities
555
+ if prob > 1e-10: # Avoid extremely low probabilities
556
+ top_candidates.append((token_idx, prob, gc_contribution))
557
+
558
+ # Sort by probability and take top max_candidates
559
+ top_candidates.sort(key=lambda x: x[1], reverse=True)
560
+ top_candidates = top_candidates[:max_candidates]
561
+
562
+ # If no valid candidates found, fallback to all possible tokens for this amino acid
563
+ if not top_candidates:
564
+ for token_idx in possible_tokens[:min(len(possible_tokens), max_candidates)]:
565
+ if token_idx < len(pos_probs) and token_idx < len(GC_COUNTS_PER_TOKEN):
566
+ prob = max(pos_probs[token_idx].item(), 1e-10) # Ensure minimum probability
567
+ gc_contribution = int(GC_COUNTS_PER_TOKEN[token_idx].item())
568
+ top_candidates.append((token_idx, prob, gc_contribution))
569
+
570
+ # Generate new beam candidates
571
+ new_beam = []
572
+
573
+ for candidate in beam:
574
+ for token_idx, prob, gc_contribution in top_candidates:
575
+ # Calculate new GC stats
576
+ new_gc_count = candidate.gc_count + gc_contribution
577
+ new_length = candidate.length + 3 # Each codon is 3 nucleotides
578
+ new_gc_ratio = new_gc_count / new_length
579
+
580
+ # Priority #2: Position-aware GC penalty mechanism
581
+ gc_penalty = 0.0
582
+ if position_aware_gc_penalty:
583
+ # Calculate position weight (more penalty towards end of sequence)
584
+ position_weight = (pos + 1) / seq_len
585
+
586
+ # Calculate GC deviation severity
587
+ target_gc = (min_gc + max_gc) / 2
588
+ gc_deviation = abs(new_gc_ratio - target_gc)
589
+ deviation_severity = gc_deviation / ((max_gc - min_gc) / 2)
590
+
591
+ # Apply progressive penalty
592
+ if deviation_severity > 0.5: # Soft penalty zone
593
+ gc_penalty = gc_penalty_strength * position_weight * (deviation_severity - 0.5) ** 2
594
+
595
+ # Hard constraint: still prune sequences that exceed bounds
596
+ if new_gc_ratio < min_gc or new_gc_ratio > max_gc:
597
+ continue # Prune invalid candidates
598
+ else:
599
+ # Priority #1: Hard GC bounds only
600
+ if new_gc_ratio < min_gc or new_gc_ratio > max_gc:
601
+ continue # Prune invalid candidates
602
+
603
+ # Calculate score with GC penalty
604
+ new_score = candidate.score + np.log(prob + 1e-8) - gc_penalty
605
+
606
+ # Apply length penalty
607
+ if length_penalty != 1.0:
608
+ length_norm = ((pos + 1) ** length_penalty)
609
+ normalized_score = new_score / length_norm
610
+ else:
611
+ normalized_score = new_score
612
+
613
+ # Create new candidate
614
+ new_candidate = BeamCandidate(
615
+ tokens=candidate.tokens + [token_idx],
616
+ score=normalized_score,
617
+ gc_count=new_gc_count,
618
+ length=new_length
619
+ )
620
+
621
+ new_beam.append(new_candidate)
622
+
623
+ # Apply diversity penalty if specified
624
+ if diversity_penalty > 0.0:
625
+ new_beam = _apply_diversity_penalty(new_beam, diversity_penalty)
626
+
627
+ # Keep top beam_size candidates
628
+ beam = sorted(new_beam, key=lambda x: x.score, reverse=True)[:beam_size]
629
+
630
+ # Priority #3: Adaptive beam rescue for difficult sequences
631
+ if not beam:
632
+ # Attempt beam rescue by relaxing constraints progressively
633
+ rescue_attempts = 0
634
+ max_rescue_attempts = 3
635
+
636
+ while not beam and rescue_attempts < max_rescue_attempts:
637
+ rescue_attempts += 1
638
+
639
+ # Progressive relaxation strategy
640
+ if rescue_attempts == 1:
641
+ # First attempt: increase beam size and relax GC bounds slightly
642
+ temp_beam_size = min(beam_size * 2, max_candidates)
643
+ temp_gc_bounds = (min_gc * 0.95, max_gc * 1.05)
644
+ elif rescue_attempts == 2:
645
+ # Second attempt: further relax GC bounds and increase candidates
646
+ temp_beam_size = min(beam_size * 3, max_candidates)
647
+ temp_gc_bounds = (min_gc * 0.9, max_gc * 1.1)
648
+ else:
649
+ # Final attempt: maximum relaxation
650
+ temp_beam_size = max_candidates
651
+ temp_gc_bounds = (min_gc * 0.85, max_gc * 1.15)
652
+
653
+ # Retry beam generation with relaxed parameters
654
+ rescue_beam = []
655
+ # Use previous beam state or start fresh if this is the first position with no beam
656
+ previous_beam = beam if beam else [BeamCandidate(tokens=[], score=0.0, gc_count=0, length=0)]
657
+ for candidate in previous_beam:
658
+ for token_idx, prob, gc_contribution in top_candidates:
659
+ new_gc_count = candidate.gc_count + gc_contribution
660
+ new_length = candidate.length + 3
661
+ new_gc_ratio = new_gc_count / new_length
662
+
663
+ # Check relaxed bounds
664
+ if temp_gc_bounds[0] <= new_gc_ratio <= temp_gc_bounds[1]:
665
+ # Apply reduced GC penalty for rescue
666
+ gc_penalty = 0.0
667
+ if position_aware_gc_penalty:
668
+ position_weight = (pos + 1) / seq_len
669
+ target_gc = (min_gc + max_gc) / 2
670
+ gc_deviation = abs(new_gc_ratio - target_gc)
671
+ deviation_severity = gc_deviation / ((max_gc - min_gc) / 2)
672
+
673
+ # Reduced penalty for rescue
674
+ if deviation_severity > 0.7:
675
+ gc_penalty = (gc_penalty_strength * 0.5) * position_weight * (deviation_severity - 0.7) ** 2
676
+
677
+ new_score = candidate.score + np.log(prob + 1e-8) - gc_penalty
678
+
679
+ if length_penalty != 1.0:
680
+ length_norm = ((pos + 1) ** length_penalty)
681
+ normalized_score = new_score / length_norm
682
+ else:
683
+ normalized_score = new_score
684
+
685
+ rescue_candidate = BeamCandidate(
686
+ tokens=candidate.tokens + [token_idx],
687
+ score=normalized_score,
688
+ gc_count=new_gc_count,
689
+ length=new_length
690
+ )
691
+ rescue_beam.append(rescue_candidate)
692
+
693
+ # Keep top candidates from rescue attempt
694
+ if rescue_beam:
695
+ beam = sorted(rescue_beam, key=lambda x: x.score, reverse=True)[:temp_beam_size]
696
+ break
697
+
698
+ # If all rescue attempts failed, raise error
699
+ if not beam:
700
+ raise ValueError(
701
+ f"Beam rescue failed at position {pos} after {max_rescue_attempts} attempts. "
702
+ f"The GC constraints {gc_bounds} may be too restrictive for this protein sequence. "
703
+ f"Consider relaxing constraints or using a different approach."
704
+ )
705
+
706
+ # Return best candidate
707
+ best_candidate = max(beam, key=lambda x: x.score)
708
+ return best_candidate.tokens
709
+
710
+
711
+ # Wrapper function that tries simple approach first
712
+ def constrained_beam_search_wrapper(
713
+ logits: torch.Tensor,
714
+ protein_sequence: str,
715
+ gc_bounds: Tuple[float, float] = (0.30, 0.70),
716
+ **kwargs
717
+ ) -> List[int]:
718
+ """Wrapper that tries simple approach first, falls back to complex beam search."""
719
+ try:
720
+ # Try simple approach first
721
+ return constrained_beam_search_simple(logits, protein_sequence, gc_bounds)
722
+ except ValueError:
723
+ # Fall back to complex beam search
724
+ return constrained_beam_search(logits, protein_sequence, gc_bounds, **kwargs)
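The wrapper keeps the cheap sampler on the happy path and only pays for the full beam search when the sampler cannot satisfy the bounds. A call sketch (inputs as in predict_dna_sequence; the keyword arguments are forwarded only on fallback):

    tokens = constrained_beam_search_wrapper(
        logits=logits.squeeze(0),     # [seq_len, vocab_size]
        protein_sequence=protein,
        gc_bounds=(0.45, 0.55),
        beam_size=10,
    )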
725
+
726
+
727
+ def _apply_diversity_penalty(candidates: List[BeamCandidate], penalty: float) -> List[BeamCandidate]:
728
+ """
729
+ Apply diversity penalty to reduce repetitive sequences.
730
+
731
+ Args:
732
+ candidates (List[BeamCandidate]): List of candidates
733
+ penalty (float): Diversity penalty strength
734
+
735
+ Returns:
736
+ List[BeamCandidate]: Candidates with diversity penalty applied
737
+ """
738
+ if not candidates:
739
+ return candidates
740
+
741
+ # Count token occurrences
742
+ token_counts = {}
743
+ for candidate in candidates:
744
+ for token in candidate.tokens:
745
+ token_counts[token] = token_counts.get(token, 0) + 1
746
+
747
+ # Apply penalty
748
+ for candidate in candidates:
749
+ diversity_score = 0.0
750
+ for token in candidate.tokens:
751
+ if token_counts[token] > 1:
752
+ diversity_score += penalty * np.log(token_counts[token])
753
+ candidate.score -= diversity_score
754
+
755
+ return candidates
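A worked example of the penalty arithmetic above: a token that occurs 3 times across the beam contributes penalty * ln(3) per occurrence, so with penalty = 0.1 a candidate containing it twice loses 2 * 0.1 * ln(3) ≈ 0.22 from its score.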
756
+
757
+
758
+ def sample_non_deterministic(
759
+ logits: torch.Tensor,
760
+ temperature: float = 0.2,
761
+ top_p: float = 0.95,
762
+ ) -> List[int]:
763
+ """
764
+ Sample token indices from logits using temperature scaling and nucleus (top-p) sampling.
765
+
766
+ This function applies temperature scaling to the logits, computes probabilities,
767
+ and then performs nucleus sampling to select token indices. It is used for
768
+ non-deterministic decoding in language models to introduce randomness while
769
+ maintaining coherence in the generated sequences.
770
+
771
+ Args:
772
+ logits (torch.Tensor): The logits output from the model of shape
773
+ [seq_len, vocab_size] or [batch_size, seq_len, vocab_size].
774
+ temperature (float, optional): Temperature value for scaling logits.
775
+ Must be a positive float. Defaults to 0.2.
776
+ top_p (float, optional): Cumulative probability threshold for nucleus sampling.
777
+ Must be a float between 0 and 1. Tokens with cumulative probability up to
778
+ `top_p` are considered for sampling. Defaults to 0.95.
779
+
780
+ Returns:
781
+ List[int]: A list of sampled token indices corresponding to the predicted tokens.
782
+
783
+ Raises:
784
+ ValueError: If `temperature` is not a positive float or if `top_p` is not between 0 and 1.
785
+
786
+ Example:
787
+ >>> logits = model_output.logits # Assume logits is a tensor of shape [seq_len, vocab_size]
788
+ >>> predicted_indices = sample_non_deterministic(logits, temperature=0.7, top_p=0.9)
789
+ """
790
+ if not isinstance(temperature, (float, int)) or temperature <= 0:
791
+ raise ValueError("Temperature must be a positive float.")
792
+
793
+ if not isinstance(top_p, (float, int)) or not 0 < top_p <= 1.0:
794
+ raise ValueError("top_p must be a float between 0 and 1.")
795
+
796
+ # Compute probabilities using temperature scaling
797
+ probs = torch.softmax(logits / temperature, dim=-1)
798
+
799
+
800
+ # Remove batch dimension if present
801
+ if probs.dim() == 3:
802
+ probs = probs.squeeze(0) # Shape: [seq_len, vocab_size]
803
+
804
+ # Sort probabilities in descending order
805
+ probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
806
+ probs_sum = torch.cumsum(probs_sort, dim=-1)
807
+ mask = probs_sum - probs_sort > top_p
808
+
809
+ # Zero out probabilities for tokens beyond the top-p threshold
810
+ probs_sort[mask] = 0.0
811
+
812
+ # Renormalize the probabilities
813
+ probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
814
+ next_token = torch.multinomial(probs_sort, num_samples=1)
815
+ predicted_indices = torch.gather(probs_idx, -1, next_token).squeeze(-1)
816
+
817
+ return predicted_indices.tolist()
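A toy illustration of the top-p filter (probabilities are made up): with sorted probabilities [0.5, 0.3, 0.15, 0.05] the cumulative sums are [0.5, 0.8, 0.95, 1.0], and the mask drops a token only once the cumulative mass before it exceeds top_p, so top_p = 0.8 keeps the first three tokens and renormalizes them to about [0.53, 0.32, 0.16]:

    import torch
    logits = torch.log(torch.tensor([[0.5, 0.3, 0.15, 0.05]]))  # one position, four tokens
    idx = sample_non_deterministic(logits, temperature=1.0, top_p=0.8)
    # idx is a one-element list drawn from tokens {0, 1, 2}; token 3 is never sampled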
818
+
819
+
820
+ def load_model(
821
+ model_path: Optional[str] = None,
822
+ device: torch.device = None,
823
+ attention_type: str = "original_full",
824
+ num_organisms: int = None,
825
+ remove_prefix: bool = True,
826
+ ) -> torch.nn.Module:
827
+ """
828
+ Load a BigBirdForMaskedLM model from a model file, checkpoint, or HuggingFace.
829
+
830
+ Args:
831
+ model_path (Optional[str]): Path to the model file or checkpoint. If None,
832
+ load from HuggingFace.
833
+ device (torch.device, optional): The device to load the model onto.
834
+ attention_type (str, optional): The type of attention, 'block_sparse'
835
+ or 'original_full'.
836
+ num_organisms (int, optional): Number of organisms, needed if loading from a
837
+ checkpoint that requires this.
838
+ remove_prefix (bool, optional): Whether to remove the "model." prefix from the
839
+ keys in the state dict.
840
+
841
+ Returns:
842
+ torch.nn.Module: The loaded model.
843
+ """
844
+ if not model_path:
845
+ warnings.warn("Model path not provided. Loading from HuggingFace.", UserWarning)
846
+ model = BigBirdForMaskedLM.from_pretrained("adibvafa/CodonTransformer")
847
+ elif model_path.endswith(".ckpt"):
848
+ checkpoint = torch.load(model_path, map_location="cpu")
849
+
850
+ # Detect Lightning checkpoint vs raw state dict
851
+ if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
852
+ state_dict = checkpoint["state_dict"]
853
+ if remove_prefix:
854
+ state_dict = {
855
+ k.replace("model.", ""): v for k, v in state_dict.items()
856
+ }
857
+ else:
858
+ # assume checkpoint itself is state_dict
859
+ state_dict = checkpoint
860
+
861
+ if num_organisms is None:
862
+ num_organisms = NUM_ORGANISMS
863
+
864
+ # Load model configuration and instantiate the model
865
+ config = load_bigbird_config(num_organisms)
866
+ model = BigBirdForMaskedLM(config=config)
867
+ model.load_state_dict(state_dict, strict=False)
868
+
869
+ elif model_path.endswith(".pt"):
870
+ state_dict = torch.load(model_path)
871
+ config = state_dict.pop("self.config")
872
+ model = BigBirdForMaskedLM(config=config)
873
+ model.load_state_dict(state_dict, strict=False)
874
+
875
+ else:
876
+ raise ValueError(
877
+ "Unsupported file type. Please provide a .ckpt or .pt file, "
878
+ "or None to load from HuggingFace."
879
+ )
880
+
881
+ # Prepare model for evaluation
882
+ model.bert.set_attention_type(attention_type)
883
+ model.eval()
884
+ if device:
885
+ model.to(device)
886
+
887
+ return model
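A loading sketch (the checkpoint filename is a placeholder):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = load_model(device=device)                       # pulls from HuggingFace
    # model = load_model("coliformer.ckpt", device=device)  # or a local Lightning checkpoint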
888
+
889
+
890
+ def load_bigbird_config(num_organisms: int) -> BigBirdConfig:
891
+ """
892
+ Load the config object used to train the BigBird transformer.
893
+
894
+ Args:
895
+ num_organisms (int): The number of organisms.
896
+
897
+ Returns:
898
+ BigBirdConfig: The configuration object for BigBird.
899
+ """
900
+ config = transformers.BigBirdConfig(
901
+ vocab_size=len(TOKEN2INDEX), # Equal to len(tokenizer)
902
+ type_vocab_size=num_organisms,
903
+ sep_token_id=2,
904
+ )
905
+ return config
906
+
907
+
908
+ def create_model_from_checkpoint(
909
+ checkpoint_dir: str, output_model_dir: str, num_organisms: int
910
+ ) -> None:
911
+ """
912
+ Save a model to disk using a previous checkpoint.
913
+
914
+ Args:
915
+ checkpoint_dir (str): Directory where the checkpoint is stored.
916
+ output_model_dir (str): Directory where the model will be saved.
917
+ num_organisms (int): Number of organisms.
918
+ """
919
+ checkpoint = load_model(model_path=checkpoint_dir, num_organisms=num_organisms)
920
+ state_dict = checkpoint.state_dict()
921
+ state_dict["self.config"] = load_bigbird_config(num_organisms=num_organisms)
922
+
923
+ # Save the model state dict to the output directory
924
+ torch.save(state_dict, output_model_dir)
925
+
926
+
927
+ def load_tokenizer(tokenizer_path: Optional[Union[str, PreTrainedTokenizerFast]] = None) -> PreTrainedTokenizerFast:
928
+ """
929
+ Create and return a tokenizer object from tokenizer path or HuggingFace.
930
+
931
+ Args:
932
+ tokenizer_path (Optional[Union[str, PreTrainedTokenizerFast]]): Path to the tokenizer file,
933
+ a pre-loaded tokenizer object, or None. If None, load from HuggingFace.
934
+
935
+ Returns:
936
+ PreTrainedTokenizerFast: The tokenizer object.
937
+ """
938
+ # If a tokenizer object is already provided, return it
939
+ if isinstance(tokenizer_path, PreTrainedTokenizerFast):
940
+ return tokenizer_path
941
+
942
+ # If no path is provided, load from HuggingFace
943
+ if not tokenizer_path:
944
+ warnings.warn(
945
+ "Tokenizer path not provided. Loading from HuggingFace.", UserWarning
946
+ )
947
+ return AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
948
+
949
+ # Load from file path
950
+ return transformers.PreTrainedTokenizerFast(
951
+ tokenizer_file=tokenizer_path,
952
+ bos_token="[CLS]",
953
+ eos_token="[SEP]",
954
+ unk_token="[UNK]",
955
+ sep_token="[SEP]",
956
+ pad_token="[PAD]",
957
+ cls_token="[CLS]",
958
+ mask_token="[MASK]",
959
+ )
960
+
961
+
962
+ def tokenize(
963
+ batch: List[Dict[str, Any]],
964
+ tokenizer: Union[PreTrainedTokenizerFast, str] = None,
965
+ max_len: int = 2048,
966
+ ) -> BatchEncoding:
967
+ """
968
+ Return the tokenized sequences given a batch of input data.
969
+ Each data in the batch is expected to be a dictionary with "codons" and
970
+ "organism" keys.
971
+
972
+ Args:
973
+ batch (List[Dict[str, Any]]): A list of dictionaries with "codons" and
974
+ "organism" keys.
975
+ tokenizer (PreTrainedTokenizerFast, str, optional): The tokenizer object or
976
+ path to the tokenizer file.
977
+ max_len (int, optional): Maximum length of the tokenized sequence.
978
+
979
+ Returns:
980
+ BatchEncoding: The tokenized batch.
981
+ """
982
+ if not isinstance(tokenizer, PreTrainedTokenizerFast):
983
+ tokenizer = load_tokenizer(tokenizer)
984
+
985
+ tokenized = tokenizer(
986
+ [data["codons"] for data in batch],
987
+ return_attention_mask=True,
988
+ return_token_type_ids=True,
989
+ truncation=True,
990
+ padding=True,
991
+ max_length=max_len,
992
+ return_tensors="pt",
993
+ )
994
+
995
+ # Add token type IDs for species
996
+ seq_len = tokenized["input_ids"].shape[-1]
997
+ species_index = torch.tensor([[data["organism"]] for data in batch])
998
+ tokenized["token_type_ids"] = species_index.repeat(1, seq_len)
999
+
1000
+ return tokenized
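A single-sample sketch (the organism ID and the merged-codon string are illustrative; real inputs come from get_merged_seq and ORGANISM2ID):

    batch = [{"idx": 0, "codons": "M_UNK K_UNK T_UNK", "organism": 51}]
    enc = tokenize(batch, tokenizer=tokenizer)
    # enc["input_ids"] has shape [1, seq_len];
    # enc["token_type_ids"] repeats the organism ID across every position.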
1001
+
1002
+
1003
+ def validate_and_convert_organism(organism: Union[int, str]) -> Tuple[int, str]:
1004
+ """
1005
+ Validate and convert the organism input to both ID and name.
1006
+
1007
+ This function takes either an organism ID or name as input and returns both
1008
+ the ID and name. It performs validation to ensure the input corresponds to
1009
+ a valid organism in the ORGANISM2ID dictionary.
1010
+
1011
+ Args:
1012
+ organism (Union[int, str]): Either the ID of the organism (int) or its
1013
+ name (str).
1014
+
1015
+ Returns:
1016
+ Tuple[int, str]: A tuple containing the organism ID (int) and name (str).
1017
+
1018
+ Raises:
1019
+ ValueError: If the input is neither a string nor an integer, if the
1020
+ organism name is not found in ORGANISM2ID, if the organism ID is not a
1021
+ value in ORGANISM2ID, or if no name is found for a given ID.
1022
+
1023
+ Note:
1024
+ This function relies on the ORGANISM2ID dictionary imported from
1025
+ CodonTransformer.CodonUtils, which maps organism names to their
1026
+ corresponding IDs.
1027
+ """
1028
+ if isinstance(organism, str):
1029
+ if organism not in ORGANISM2ID:
1030
+ raise ValueError(
1031
+ f"Invalid organism name: {organism}. "
1032
+ "Please use a valid organism name or ID."
1033
+ )
1034
+ organism_id = ORGANISM2ID[organism]
1035
+ organism_name = organism
1036
+
1037
+ elif isinstance(organism, int):
1038
+ if organism not in ORGANISM2ID.values():
1039
+ raise ValueError(
1040
+ f"Invalid organism ID: {organism}. "
1041
+ "Please use a valid organism name or ID."
1042
+ )
1043
+
1044
+ organism_id = organism
1045
+ organism_name = next(
1046
+ (name for name, org_id in ORGANISM2ID.items() if org_id == organism), None
1047
+ )
1048
+ if organism_name is None:
1049
+ raise ValueError(f"No organism name found for ID: {organism}")
1050
+
1051
+ return organism_id, organism_name
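Both spellings resolve to the same pair (the ID shown is illustrative):

    validate_and_convert_organism("Escherichia coli general")  # -> (51, "Escherichia coli general")
    validate_and_convert_organism(51)                          # -> (51, "Escherichia coli general")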
1052
+
1053
+
1054
+ def get_high_frequency_choice_sequence(
1055
+ protein: str, codon_frequencies: Dict[str, Tuple[List[str], List[float]]]
1056
+ ) -> str:
1057
+ """
1058
+ Return the DNA sequence optimized using High Frequency Choice (HFC) approach
1059
+ in which the most frequent codon for a given amino acid is always chosen.
1060
+
1061
+ Args:
1062
+ protein (str): The protein sequence.
1063
+ codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
1064
+ frequencies for each amino acid.
1065
+
1066
+ Returns:
1067
+ str: The optimized DNA sequence.
1068
+ """
1069
+ # Select the most frequent codon for each amino acid in the protein sequence
1070
+ dna_codons = [
1071
+ codon_frequencies[aminoacid][0][np.argmax(codon_frequencies[aminoacid][1])]
1072
+ for aminoacid in protein
1073
+ ]
1074
+ return "".join(dna_codons)
1075
+
1076
+
1077
+ def precompute_most_frequent_codons(
1078
+ codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
1079
+ ) -> Dict[str, str]:
1080
+ """
1081
+ Precompute the most frequent codon for each amino acid.
1082
+
1083
+ Args:
1084
+ codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
1085
+ frequencies for each amino acid.
1086
+
1087
+ Returns:
1088
+ Dict[str, str]: The most frequent codon for each amino acid.
1089
+ """
1090
+ # Create a dictionary mapping each amino acid to its most frequent codon
1091
+ return {
1092
+ aminoacid: codons[np.argmax(frequencies)]
1093
+ for aminoacid, (codons, frequencies) in codon_frequencies.items()
1094
+ }
1095
+
1096
+
1097
+ def get_high_frequency_choice_sequence_optimized(
1098
+ protein: str, codon_frequencies: Dict[str, Tuple[List[str], List[float]]]
1099
+ ) -> str:
1100
+ """
1101
+ Efficient implementation of get_high_frequency_choice_sequence that uses
1102
+ vectorized operations and helper functions, achieving up to x10 faster speed.
1103
+
1104
+ Args:
1105
+ protein (str): The protein sequence.
1106
+ codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
1107
+ frequencies for each amino acid.
1108
+
1109
+ Returns:
1110
+ str: The optimized DNA sequence.
1111
+ """
1112
+ # Precompute the most frequent codons for each amino acid
1113
+ most_frequent_codons = precompute_most_frequent_codons(codon_frequencies)
1114
+
1115
+ return "".join(most_frequent_codons[aminoacid] for aminoacid in protein)
1116
+
1117
+
1118
+ def get_background_frequency_choice_sequence(
1119
+ protein: str, codon_frequencies: Dict[str, Tuple[List[str], List[float]]]
1120
+ ) -> str:
1121
+ """
1122
+ Return the DNA sequence optimized using Background Frequency Choice (BFC)
1123
+ approach in which a random codon for a given amino acid is chosen using
1124
+ the codon frequencies probability distribution.
1125
+
1126
+ Args:
1127
+ protein (str): The protein sequence.
1128
+ codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
1129
+ frequencies for each amino acid.
1130
+
1131
+ Returns:
1132
+ str: The optimized DNA sequence.
1133
+ """
1134
+ # Select a random codon for each amino acid based on the codon frequencies
1135
+ # probability distribution
1136
+ dna_codons = [
1137
+ np.random.choice(
1138
+ codon_frequencies[aminoacid][0], p=codon_frequencies[aminoacid][1]
1139
+ )
1140
+ for aminoacid in protein
1141
+ ]
1142
+ return "".join(dna_codons)
1143
+
1144
+
1145
+ def precompute_cdf(
1146
+ codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
1147
+ ) -> Dict[str, Tuple[List[str], Any]]:
1148
+ """
1149
+ Precompute the cumulative distribution function (CDF) for each amino acid.
1150
+
1151
+ Args:
1152
+ codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
1153
+ frequencies for each amino acid.
1154
+
1155
+ Returns:
1156
+ Dict[str, Tuple[List[str], Any]]: CDFs for each amino acid.
1157
+ """
1158
+ cdf = {}
1159
+
1160
+ # Calculate the cumulative distribution function for each amino acid
1161
+ for aminoacid, (codons, frequencies) in codon_frequencies.items():
1162
+ cdf[aminoacid] = (codons, np.cumsum(frequencies))
1163
+
1164
+ return cdf
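The inverse-CDF trick in miniature: with frequencies [0.75, 0.25] the CDF is [0.75, 1.0], so a uniform draw below 0.75 selects the first codon and anything above it the second, which is exactly what np.searchsorted does in the optimized sampler below:

    import numpy as np
    assert np.searchsorted([0.75, 1.0], 0.4) == 0
    assert np.searchsorted([0.75, 1.0], 0.9) == 1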
1165
+
1166
+
1167
+ def get_background_frequency_choice_sequence_optimized(
1168
+ protein: str, codon_frequencies: Dict[str, Tuple[List[str], List[float]]]
1169
+ ) -> str:
1170
+ """
1171
+ Efficient implementation of get_background_frequency_choice_sequence that uses
1172
+ vectorized operations and helper functions, achieving up to x8 faster speed.
1173
+
1174
+ Args:
1175
+ protein (str): The protein sequence.
1176
+ codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
1177
+ frequencies for each amino acid.
1178
+
1179
+ Returns:
1180
+ str: The optimized DNA sequence.
1181
+ """
1182
+ dna_codons = []
1183
+ cdf = precompute_cdf(codon_frequencies)
1184
+
1185
+ # Select a random codon for each amino acid using the precomputed CDFs
1186
+ for aminoacid in protein:
1187
+ codons, cumulative_prob = cdf[aminoacid]
1188
+ selected_codon_index = np.searchsorted(cumulative_prob, np.random.rand())
1189
+ dna_codons.append(codons[selected_codon_index])
1190
+
1191
+ return "".join(dna_codons)
1192
+
1193
+
1194
+ def get_uniform_random_choice_sequence(
1195
+ protein: str, codon_frequencies: Dict[str, Tuple[List[str], List[float]]]
1196
+ ) -> str:
1197
+ """
1198
+ Return the DNA sequence optimized using Uniform Random Choice (URC) approach
1199
+ in which a random codon for a given amino acid is chosen using a uniform
1200
+ prior.
1201
+
1202
+ Args:
1203
+ protein (str): The protein sequence.
1204
+ codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
1205
+ frequencies for each amino acid.
1206
+
1207
+ Returns:
1208
+ str: The optimized DNA sequence.
1209
+ """
1210
+ # Select a random codon for each amino acid using a uniform prior distribution
1211
+ dna_codons = []
1212
+ for aminoacid in protein:
1213
+ codons = codon_frequencies[aminoacid][0]
1214
+ random_index = np.random.randint(0, len(codons))
1215
+ dna_codons.append(codons[random_index])
1216
+ return "".join(dna_codons)
1217
+
1218
+
1219
+ def get_icor_prediction(input_seq: str, model_path: str, stop_symbol: str) -> str:
1220
+ """
1221
+ Return the optimized codon sequence for the given protein sequence using ICOR.
1222
+
1223
+ Credit: ICOR: improving codon optimization with recurrent neural networks
1224
+ Rishab Jain, Aditya Jain, Elizabeth Mauro, Kevin LeShane, Douglas
1225
+ Densmore
1226
+
1227
+ Args:
1228
+ input_seq (str): The input protein sequence.
1229
+ model_path (str): The path to the ICOR model.
1230
+ stop_symbol (str): The symbol representing stop codons in the sequence.
1231
+
1232
+ Returns:
1233
+ str: The optimized DNA sequence.
1234
+ """
1235
+ input_seq = input_seq.strip().upper()
1236
+ input_seq = input_seq.replace(stop_symbol, "*")
1237
+
1238
+ # Define categorical labels from when model was trained.
1239
+ labels = [
1240
+ "AAA",
1241
+ "AAC",
1242
+ "AAG",
1243
+ "AAT",
1244
+ "ACA",
1245
+ "ACG",
1246
+ "ACT",
1247
+ "AGC",
1248
+ "ATA",
1249
+ "ATC",
1250
+ "ATG",
1251
+ "ATT",
1252
+ "CAA",
1253
+ "CAC",
1254
+ "CAG",
1255
+ "CCG",
1256
+ "CCT",
1257
+ "CTA",
1258
+ "CTC",
1259
+ "CTG",
1260
+ "CTT",
1261
+ "GAA",
1262
+ "GAT",
1263
+ "GCA",
1264
+ "GCC",
1265
+ "GCG",
1266
+ "GCT",
1267
+ "GGA",
1268
+ "GGC",
1269
+ "GTC",
1270
+ "GTG",
1271
+ "GTT",
1272
+ "TAA",
1273
+ "TAT",
1274
+ "TCA",
1275
+ "TCG",
1276
+ "TCT",
1277
+ "TGG",
1278
+ "TGT",
1279
+ "TTA",
1280
+ "TTC",
1281
+ "TTG",
1282
+ "TTT",
1283
+ "ACC",
1284
+ "CAT",
1285
+ "CCA",
1286
+ "CGG",
1287
+ "CGT",
1288
+ "GAC",
1289
+ "GAG",
1290
+ "GGT",
1291
+ "AGT",
1292
+ "GGG",
1293
+ "GTA",
1294
+ "TGC",
1295
+ "CCC",
1296
+ "CGA",
1297
+ "CGC",
1298
+ "TAC",
1299
+ "TAG",
1300
+ "TCC",
1301
+ "AGA",
1302
+ "AGG",
1303
+ "TGA",
1304
+ ]
1305
+
1306
+ # Define aa to integer table
1307
+ def aa2int(seq: str) -> List[int]:
1308
+ _aa2int = {
1309
+ "A": 1,
1310
+ "R": 2,
1311
+ "N": 3,
1312
+ "D": 4,
1313
+ "C": 5,
1314
+ "Q": 6,
1315
+ "E": 7,
1316
+ "G": 8,
1317
+ "H": 9,
1318
+ "I": 10,
1319
+ "L": 11,
1320
+ "K": 12,
1321
+ "M": 13,
1322
+ "F": 14,
1323
+ "P": 15,
1324
+ "S": 16,
1325
+ "T": 17,
1326
+ "W": 18,
1327
+ "Y": 19,
1328
+ "V": 20,
1329
+ "B": 21,
1330
+ "Z": 22,
1331
+ "X": 23,
1332
+ "*": 24,
1333
+ "-": 25,
1334
+ "?": 26,
1335
+ }
1336
+ return [_aa2int[i] for i in seq]
1337
+
1338
+ # Create empty array to fill
1339
+ oh_array = np.zeros(shape=(26, len(input_seq)))
1340
+
1341
+ # Load placements from aa2int
1342
+ aa_placement = aa2int(input_seq)
1343
+
1344
+ # One-hot encode the amino acid sequence:
1345
+
1346
+ for i, aa_idx in enumerate(aa_placement):
1347
+ oh_array[aa_idx, i] = 1
1350
+
1351
+ oh_array = [oh_array]
1352
+ x = np.array(np.transpose(oh_array))
1353
+
1354
+ y = x.astype(np.float32)
1355
+
1356
+ y = np.reshape(y, (y.shape[0], 1, 26))
1357
+
1358
+ # Start ICOR session using model.
1359
+ sess = rt.InferenceSession(model_path)
1360
+ input_name = sess.get_inputs()[0].name
1361
+
1362
+ # Get prediction:
1363
+ pred_onx = sess.run(None, {input_name: y})
1364
+
1365
+ # Get the index of the highest probability from softmax output:
1366
+ pred_indices = []
1367
+ for pred in pred_onx[0]:
1368
+ pred_indices.append(np.argmax(pred))
1369
+
1370
+ out_str = ""
1371
+ for index in pred_indices:
1372
+ out_str += labels[index]
1373
+
1374
+ return out_str
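A call sketch (the ONNX path is a placeholder for wherever the exported ICOR model lives):

    optimized_dna = get_icor_prediction(
        "MKT_", model_path="models/icor.onnx", stop_symbol="_"
    )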
CodonTransformer/CodonUtils.py ADDED
@@ -0,0 +1,871 @@
+ """
+ File: CodonUtils.py
+ ---------------------
+ Includes constants and helper functions used by other Python scripts.
+ """
+
+ import itertools
+ import json
+ import os
+ import pickle
+ import re
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+ import pandas as pd
+ import requests
+ import torch
+
+ # List of all amino acids
+ AMINO_ACIDS: List[str] = [
+     "A",  # Alanine
+     "C",  # Cysteine
+     "D",  # Aspartic acid
+     "E",  # Glutamic acid
+     "F",  # Phenylalanine
+     "G",  # Glycine
+     "H",  # Histidine
+     "I",  # Isoleucine
+     "K",  # Lysine
+     "L",  # Leucine
+     "M",  # Methionine
+     "N",  # Asparagine
+     "P",  # Proline
+     "Q",  # Glutamine
+     "R",  # Arginine
+     "S",  # Serine
+     "T",  # Threonine
+     "V",  # Valine
+     "W",  # Tryptophan
+     "Y",  # Tyrosine
+ ]
+ STOP_SYMBOLS = ["_", "*"]  # Stop codon symbols
+
+ # Dictionary mapping ambiguous amino acids to standard amino acids
+ AMBIGUOUS_AMINOACID_MAP: Dict[str, List[str]] = {
+     "B": ["N", "D"],  # Asparagine (N) or Aspartic acid (D)
+     "Z": ["Q", "E"],  # Glutamine (Q) or Glutamic acid (E)
+     "X": ["A"],  # Any amino acid (typically replaced with Alanine)
+     "J": ["L", "I"],  # Leucine (L) or Isoleucine (I)
+     "U": ["C"],  # Selenocysteine (typically replaced with Cysteine)
+     "O": ["K"],  # Pyrrolysine (typically replaced with Lysine)
+ }
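This map backs the `standardize_deterministic` and `standardize_random` behaviors configured through `ProteinConfig` further down. A minimal sketch of how it can be applied; the `standardize` helper here is illustrative, not a function in this module:

```python
# Sketch: replace ambiguous residues deterministically (first candidate) or randomly.
import random
from typing import Dict, List

AMBIGUOUS_AMINOACID_MAP: Dict[str, List[str]] = {"B": ["N", "D"], "Z": ["Q", "E"], "X": ["A"]}

def standardize(seq: str, deterministic: bool = True) -> str:
    out = []
    for aa in seq:
        candidates = AMBIGUOUS_AMINOACID_MAP.get(aa)
        if candidates is None:
            out.append(aa)  # already a standard residue
        else:
            out.append(candidates[0] if deterministic else random.choice(candidates))
    return "".join(out)

print(standardize("MKBZ"))  # -> "MKNQ" with deterministic replacement
```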
+
+ # List of all possible start and stop codons
+ START_CODONS: List[str] = ["ATG", "TTG", "CTG", "GTG"]
+ STOP_CODONS: List[str] = ["TAA", "TAG", "TGA"]
+
+ # Token-to-index mapping for amino acids and special tokens
+ TOKEN2INDEX: Dict[str, int] = {
+     "[UNK]": 0,
+     "[CLS]": 1,
+     "[SEP]": 2,
+     "[PAD]": 3,
+     "[MASK]": 4,
+     "a_unk": 5,
+     "c_unk": 6,
+     "d_unk": 7,
+     "e_unk": 8,
+     "f_unk": 9,
+     "g_unk": 10,
+     "h_unk": 11,
+     "i_unk": 12,
+     "k_unk": 13,
+     "l_unk": 14,
+     "m_unk": 15,
+     "n_unk": 16,
+     "p_unk": 17,
+     "q_unk": 18,
+     "r_unk": 19,
+     "s_unk": 20,
+     "t_unk": 21,
+     "v_unk": 22,
+     "w_unk": 23,
+     "y_unk": 24,
+     "__unk": 25,
+     "k_aaa": 26,
+     "n_aac": 27,
+     "k_aag": 28,
+     "n_aat": 29,
+     "t_aca": 30,
+     "t_acc": 31,
+     "t_acg": 32,
+     "t_act": 33,
+     "r_aga": 34,
+     "s_agc": 35,
+     "r_agg": 36,
+     "s_agt": 37,
+     "i_ata": 38,
+     "i_atc": 39,
+     "m_atg": 40,
+     "i_att": 41,
+     "q_caa": 42,
+     "h_cac": 43,
+     "q_cag": 44,
+     "h_cat": 45,
+     "p_cca": 46,
+     "p_ccc": 47,
+     "p_ccg": 48,
+     "p_cct": 49,
+     "r_cga": 50,
+     "r_cgc": 51,
+     "r_cgg": 52,
+     "r_cgt": 53,
+     "l_cta": 54,
+     "l_ctc": 55,
+     "l_ctg": 56,
+     "l_ctt": 57,
+     "e_gaa": 58,
+     "d_gac": 59,
+     "e_gag": 60,
+     "d_gat": 61,
+     "a_gca": 62,
+     "a_gcc": 63,
+     "a_gcg": 64,
+     "a_gct": 65,
+     "g_gga": 66,
+     "g_ggc": 67,
+     "g_ggg": 68,
+     "g_ggt": 69,
+     "v_gta": 70,
+     "v_gtc": 71,
+     "v_gtg": 72,
+     "v_gtt": 73,
+     "__taa": 74,
+     "y_tac": 75,
+     "__tag": 76,
+     "y_tat": 77,
+     "s_tca": 78,
+     "s_tcc": 79,
+     "s_tcg": 80,
+     "s_tct": 81,
+     "__tga": 82,
+     "c_tgc": 83,
+     "w_tgg": 84,
+     "c_tgt": 85,
+     "l_tta": 86,
+     "f_ttc": 87,
+     "l_ttg": 88,
+     "f_ttt": 89,
+ }
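Each codon token has the form `<amino acid>_<codon>` in lowercase (stop positions use the `_` stop symbol), so a protein/DNA pair tokenizes by simple pairing. A small sketch with the mapping abbreviated:

```python
# Sketch: tokenize a protein/DNA pair into `aa_codon` tokens and look them up.
TOKEN2INDEX = {"m_atg": 40, "k_aaa": 26, "__taa": 74}  # abbreviated

protein, dna = "MK_", "ATGAAATAA"
tokens = [
    f"{protein[i].lower()}_{dna[3 * i : 3 * i + 3].lower()}"
    for i in range(len(protein))
]
print(tokens)                            # ['m_atg', 'k_aaa', '__taa']
print([TOKEN2INDEX[t] for t in tokens])  # [40, 26, 74]
```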
+
+ # Index-to-token mapping, reverse of TOKEN2INDEX
+ INDEX2TOKEN: Dict[int, str] = {i: c for c, i in TOKEN2INDEX.items()}
+
+ # Dictionary mapping each codon to its GC content
+ CODON_GC_CONTENT: Dict[str, int] = {
+     token.split("_")[1]: token.split("_")[1].upper().count("G")
+     + token.split("_")[1].upper().count("C")
+     for token in TOKEN2INDEX
+     if "_" in token and len(token.split("_")[1]) == 3
+ }
+
+ # Tensor with GC counts for each token in the vocabulary
+ GC_COUNTS_PER_TOKEN = torch.zeros(len(TOKEN2INDEX))
+ for token, index in TOKEN2INDEX.items():
+     if "_" in token and len(token.split("_")[1]) == 3:
+         codon = token.split("_")[1].upper()
+         gc_count = codon.count("G") + codon.count("C")
+         GC_COUNTS_PER_TOKEN[index] = gc_count
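With `GC_COUNTS_PER_TOKEN`, the GC fraction of a predicted sequence can be computed directly from token ids. Note that stop tokens such as `__taa` yield an empty string from `split("_")[1]`, so they fail the length check and stay at zero. A sketch with an abbreviated vocabulary:

```python
# Sketch: per-sequence GC fraction from codon token ids.
import torch

TOKEN2INDEX = {"m_atg": 40, "k_aaa": 26, "g_ggc": 67}  # abbreviated
GC_COUNTS_PER_TOKEN = torch.zeros(90)
for token, index in TOKEN2INDEX.items():
    codon = token.split("_")[1].upper()
    GC_COUNTS_PER_TOKEN[index] = codon.count("G") + codon.count("C")

token_ids = torch.tensor([40, 26, 67])           # m_atg, k_aaa, g_ggc
gc = GC_COUNTS_PER_TOKEN[token_ids].sum().item() # 1 + 0 + 3 = 4 G/C bases
print(gc / (3 * len(token_ids)))                 # 4/9 ≈ 0.444 GC fraction
```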
+
+ G_indices = [idx for token, idx in TOKEN2INDEX.items() if "g" in token.split("_")[-1]]
+ C_indices = [idx for token, idx in TOKEN2INDEX.items() if "c" in token.split("_")[-1]]
+
+ # Dictionary mapping each amino acid and stop symbol to indices of codon tokens that translate to it
+ AMINO_ACID_TO_INDEX = {
+     aa: sorted(
+         [i for t, i in TOKEN2INDEX.items() if t[0].upper() == aa and t[-3:] != "unk"]
+     )
+     for aa in (AMINO_ACIDS + STOP_SYMBOLS)
+ }
+
+ # Dictionary mapping each amino acid to min/max GC content across all possible codons
+ AA_MIN_GC: Dict[str, int] = {}
+ AA_MAX_GC: Dict[str, int] = {}
+
+ for aa, token_indices in AMINO_ACID_TO_INDEX.items():
+     if token_indices:  # Skip if no tokens for this amino acid
+         gc_counts = []
+         for token_idx in token_indices:
+             token = INDEX2TOKEN[token_idx]
+             if "_" in token and len(token.split("_")[1]) == 3:
+                 codon = token.split("_")[1]
+                 if codon in CODON_GC_CONTENT:
+                     gc_counts.append(CODON_GC_CONTENT[codon])
+
+         if gc_counts:
+             AA_MIN_GC[aa] = min(gc_counts)
+             AA_MAX_GC[aa] = max(gc_counts)
+
+ # Mask token mapping
+ TOKEN2MASK: Dict[int, int] = {
+     0: 0,
+     1: 1,
+     2: 2,
+     3: 3,
+     4: 4,
+     5: 5,
+     6: 6,
+     7: 7,
+     8: 8,
+     9: 9,
+     10: 10,
+     11: 11,
+     12: 12,
+     13: 13,
+     14: 14,
+     15: 15,
+     16: 16,
+     17: 17,
+     18: 18,
+     19: 19,
+     20: 20,
+     21: 21,
+     22: 22,
+     23: 23,
+     24: 24,
+     25: 25,
+     26: 13,
+     27: 16,
+     28: 13,
+     29: 16,
+     30: 21,
+     31: 21,
+     32: 21,
+     33: 21,
+     34: 19,
+     35: 20,
+     36: 19,
+     37: 20,
+     38: 12,
+     39: 12,
+     40: 15,
+     41: 12,
+     42: 18,
+     43: 11,
+     44: 18,
+     45: 11,
+     46: 17,
+     47: 17,
+     48: 17,
+     49: 17,
+     50: 19,
+     51: 19,
+     52: 19,
+     53: 19,
+     54: 14,
+     55: 14,
+     56: 14,
+     57: 14,
+     58: 8,
+     59: 7,
+     60: 8,
+     61: 7,
+     62: 5,
+     63: 5,
+     64: 5,
+     65: 5,
+     66: 10,
+     67: 10,
+     68: 10,
+     69: 10,
+     70: 22,
+     71: 22,
+     72: 22,
+     73: 22,
+     74: 25,
+     75: 24,
+     76: 25,
+     77: 24,
+     78: 20,
+     79: 20,
+     80: 20,
+     81: 20,
+     82: 25,
+     83: 6,
+     84: 23,
+     85: 6,
+     86: 14,
+     87: 9,
+     88: 14,
+     89: 9,
+ }
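`TOKEN2MASK` maps every codon token index to the `*_unk` token of the same amino acid (e.g. 26 → 13 is `k_aaa` → `k_unk`), so masking hides codon identity while keeping amino acid identity. A sketch with an abbreviated vocabulary:

```python
# Sketch: amino-acid-preserving masking with TOKEN2MASK (abbreviated here).
TOKEN2INDEX = {"k_unk": 13, "m_unk": 15, "k_aaa": 26, "m_atg": 40}
INDEX2TOKEN = {i: t for t, i in TOKEN2INDEX.items()}
TOKEN2MASK = {26: 13, 40: 15}

token_ids = [40, 26]                     # m_atg, k_aaa
masked = [TOKEN2MASK[i] for i in token_ids]
print([INDEX2TOKEN[i] for i in masked])  # ['m_unk', 'k_unk'] — codons hidden, amino acids kept
```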
+
+ # List of organisms used for fine-tuning
+ FINE_TUNE_ORGANISMS: List[str] = [
+     "Arabidopsis thaliana",
+     "Bacillus subtilis",
+     "Caenorhabditis elegans",
+     "Chlamydomonas reinhardtii",
+     "Chlamydomonas reinhardtii chloroplast",
+     "Danio rerio",
+     "Drosophila melanogaster",
+     "Homo sapiens",
+     "Mus musculus",
+     "Nicotiana tabacum",
+     "Nicotiana tabacum chloroplast",
+     "Pseudomonas putida",
+     "Saccharomyces cerevisiae",
+     "Escherichia coli O157-H7 str. Sakai",
+     "Escherichia coli general",
+     "Escherichia coli str. K-12 substr. MG1655",
+     "Thermococcus barophilus MPT",
+ ]
+
+ # List of organisms most commonly used for codon optimization
+ COMMON_ORGANISMS: List[str] = [
+     "Arabidopsis thaliana",
+     "Bacillus subtilis",
+     "Caenorhabditis elegans",
+     "Chlamydomonas reinhardtii",
+     "Danio rerio",
+     "Drosophila melanogaster",
+     "Homo sapiens",
+     "Mus musculus",
+     "Nicotiana tabacum",
+     "Pseudomonas putida",
+     "Saccharomyces cerevisiae",
+     "Escherichia coli general",
+ ]
+
+ # Dictionary mapping each organism name to respective organism id
+ ORGANISM2ID: Dict[str, int] = {
+     "Arabidopsis thaliana": 0,
+     "Atlantibacter hermannii": 1,
+     "Bacillus subtilis": 2,
+     "Brenneria goodwinii": 3,
+     "Buchnera aphidicola (Schizaphis graminum)": 4,
+     "Caenorhabditis elegans": 5,
+     "Candidatus Erwinia haradaeae": 6,
+     "Candidatus Hamiltonella defensa 5AT (Acyrthosiphon pisum)": 7,
+     "Chlamydomonas reinhardtii": 8,
+     "Chlamydomonas reinhardtii chloroplast": 9,
+     "Citrobacter amalonaticus": 10,
+     "Citrobacter braakii": 11,
+     "Citrobacter cronae": 12,
+     "Citrobacter europaeus": 13,
+     "Citrobacter farmeri": 14,
+     "Citrobacter freundii": 15,
+     "Citrobacter koseri ATCC BAA-895": 16,
+     "Citrobacter portucalensis": 17,
+     "Citrobacter werkmanii": 18,
+     "Citrobacter youngae": 19,
+     "Cronobacter dublinensis subsp. dublinensis LMG 23823": 20,
+     "Cronobacter malonaticus LMG 23826": 21,
+     "Cronobacter sakazakii": 22,
+     "Cronobacter turicensis": 23,
+     "Danio rerio": 24,
+     "Dickeya dadantii 3937": 25,
+     "Dickeya dianthicola": 26,
+     "Dickeya fangzhongdai": 27,
+     "Dickeya solani": 28,
+     "Dickeya zeae": 29,
+     "Drosophila melanogaster": 30,
+     "Edwardsiella anguillarum ET080813": 31,
+     "Edwardsiella ictaluri": 32,
+     "Edwardsiella piscicida": 33,
+     "Edwardsiella tarda": 34,
+     "Enterobacter asburiae": 35,
+     "Enterobacter bugandensis": 36,
+     "Enterobacter cancerogenus": 37,
+     "Enterobacter chengduensis": 38,
+     "Enterobacter cloacae": 39,
+     "Enterobacter hormaechei": 40,
+     "Enterobacter kobei": 41,
+     "Enterobacter ludwigii": 42,
+     "Enterobacter mori": 43,
+     "Enterobacter quasiroggenkampii": 44,
+     "Enterobacter roggenkampii": 45,
+     "Enterobacter sichuanensis": 46,
+     "Erwinia amylovora CFBP1430": 47,
+     "Erwinia persicina": 48,
+     "Escherichia albertii": 49,
+     "Escherichia coli O157-H7 str. Sakai": 50,
+     "Escherichia coli general": 51,
+     "Escherichia coli str. K-12 substr. MG1655": 52,
+     "Escherichia fergusonii": 53,
+     "Escherichia marmotae": 54,
+     "Escherichia ruysiae": 55,
+     "Ewingella americana": 56,
+     "Hafnia alvei": 57,
+     "Hafnia paralvei": 58,
+     "Homo sapiens": 59,
+     "Kalamiella piersonii": 60,
+     "Klebsiella aerogenes": 61,
+     "Klebsiella grimontii": 62,
+     "Klebsiella michiganensis": 63,
+     "Klebsiella oxytoca": 64,
+     "Klebsiella pasteurii": 65,
+     "Klebsiella pneumoniae subsp. pneumoniae HS11286": 66,
+     "Klebsiella quasipneumoniae": 67,
+     "Klebsiella quasivariicola": 68,
+     "Klebsiella variicola": 69,
+     "Kosakonia cowanii": 70,
+     "Kosakonia radicincitans": 71,
+     "Leclercia adecarboxylata": 72,
+     "Lelliottia amnigena": 73,
+     "Lonsdalea populi": 74,
+     "Moellerella wisconsensis": 75,
+     "Morganella morganii": 76,
+     "Mus musculus": 77,
+     "Nicotiana tabacum": 78,
+     "Nicotiana tabacum chloroplast": 79,
+     "Obesumbacterium proteus": 80,
+     "Pantoea agglomerans": 81,
+     "Pantoea allii": 82,
+     "Pantoea ananatis PA13": 83,
+     "Pantoea dispersa": 84,
+     "Pantoea stewartii": 85,
+     "Pantoea vagans": 86,
+     "Pectobacterium aroidearum": 87,
+     "Pectobacterium atrosepticum": 88,
+     "Pectobacterium brasiliense": 89,
+     "Pectobacterium carotovorum": 90,
+     "Pectobacterium odoriferum": 91,
+     "Pectobacterium parmentieri": 92,
+     "Pectobacterium polaris": 93,
+     "Pectobacterium versatile": 94,
+     "Photorhabdus laumondii subsp. laumondii TTO1": 95,
+     "Plesiomonas shigelloides": 96,
+     "Pluralibacter gergoviae": 97,
+     "Proteus faecis": 98,
+     "Proteus mirabilis HI4320": 99,
+     "Proteus penneri": 100,
+     "Proteus terrae subsp. cibarius": 101,
+     "Proteus vulgaris": 102,
+     "Providencia alcalifaciens": 103,
+     "Providencia heimbachae": 104,
+     "Providencia rettgeri": 105,
+     "Providencia rustigianii": 106,
+     "Providencia stuartii": 107,
+     "Providencia thailandensis": 108,
+     "Pseudomonas putida": 109,
+     "Pyrococcus furiosus": 110,
+     "Pyrococcus horikoshii": 111,
+     "Pyrococcus yayanosii": 112,
+     "Rahnella aquatilis CIP 78.65 = ATCC 33071": 113,
+     "Raoultella ornithinolytica": 114,
+     "Raoultella planticola": 115,
+     "Raoultella terrigena": 116,
+     "Rosenbergiella epipactidis": 117,
+     "Rouxiella badensis": 118,
+     "Saccharolobus solfataricus": 119,
+     "Saccharomyces cerevisiae": 120,
+     "Salmonella bongori N268-08": 121,
+     "Salmonella enterica subsp. enterica serovar Typhimurium str. LT2": 122,
+     "Serratia bockelmannii": 123,
+     "Serratia entomophila": 124,
+     "Serratia ficaria": 125,
+     "Serratia fonticola": 126,
+     "Serratia grimesii": 127,
+     "Serratia liquefaciens": 128,
+     "Serratia marcescens": 129,
+     "Serratia nevei": 130,
+     "Serratia plymuthica AS9": 131,
+     "Serratia proteamaculans": 132,
+     "Serratia quinivorans": 133,
+     "Serratia rubidaea": 134,
+     "Serratia ureilytica": 135,
+     "Shigella boydii": 136,
+     "Shigella dysenteriae": 137,
+     "Shigella flexneri 2a str. 301": 138,
+     "Shigella sonnei": 139,
+     "Thermoccoccus kodakarensis": 140,
+     "Thermococcus barophilus MPT": 141,
+     "Thermococcus chitonophagus": 142,
+     "Thermococcus gammatolerans": 143,
+     "Thermococcus litoralis": 144,
+     "Thermococcus onnurineus": 145,
+     "Thermococcus sibiricus": 146,
+     "Xenorhabdus bovienii str. feltiae Florida": 147,
+     "Yersinia aldovae 670-83": 148,
+     "Yersinia aleksiciae": 149,
+     "Yersinia alsatica": 150,
+     "Yersinia enterocolitica": 151,
+     "Yersinia frederiksenii ATCC 33641": 152,
+     "Yersinia intermedia": 153,
+     "Yersinia kristensenii": 154,
+     "Yersinia massiliensis CCUG 53443": 155,
+     "Yersinia mollaretii ATCC 43969": 156,
+     "Yersinia pestis A1122": 157,
+     "Yersinia proxima": 158,
+     "Yersinia pseudotuberculosis IP 32953": 159,
+     "Yersinia rochesterensis": 160,
+     "Yersinia rohdei": 161,
+     "Yersinia alsatica": 150,
+     "Yersinia ruckeri": 162,
+     "Yokenella regensburgei": 163,
+ }
+
+ # Dictionary mapping each organism id to respective organism name
+ ID2ORGANISM = {v: k for k, v in ORGANISM2ID.items()}
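These two mappings resolve organism names and the integer ids the model was trained with in either direction. A quick sketch with the mapping abbreviated:

```python
# Sketch: resolving organism names/ids (abbreviated mapping).
ORGANISM2ID = {"Escherichia coli general": 51, "Homo sapiens": 59}
ID2ORGANISM = {v: k for k, v in ORGANISM2ID.items()}

print(ORGANISM2ID["Escherichia coli general"])  # 51
print(ID2ORGANISM[59])                          # Homo sapiens
```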
+
+ # Type alias for amino acid to codon mapping
+ AMINO2CODON_TYPE = Dict[str, Tuple[List[str], List[float]]]
+
+ # Constants for the number of organisms and sequence lengths
+ NUM_ORGANISMS = 164
+ MAX_LEN = 2048
+ MAX_AMINO_ACIDS = MAX_LEN - 2  # Without special tokens [CLS] and [SEP]
+ STOP_SYMBOL = "_"
+
+
+ @dataclass
+ class DNASequencePrediction:
+     """
+     A class to hold the output of the DNA sequence prediction.
+
+     Attributes:
+         organism (str): Name of the organism used for prediction.
+         protein (str): Input protein sequence for which DNA sequence is predicted.
+         processed_input (str): Processed input sequence (merged protein and DNA).
+         predicted_dna (str): Predicted DNA sequence.
+     """
+
+     organism: str
+     protein: str
+     processed_input: str
+     predicted_dna: str
+
+
+ class IterableData(torch.utils.data.IterableDataset):
+     """
+     Defines the logic for iterable datasets (working over streams of
+     data) in parallel multi-processing environments, e.g., multi-GPU.
+
+     Args:
+         dist_env (Optional[str]): The distribution environment identifier
+             (e.g., "slurm").
+
+     Credit: Guillaume Filion
+     """
+
+     def __init__(self, dist_env: Optional[str] = None):
+         super().__init__()
+         if dist_env is None:
+             self.world_size_handle, self.rank_handle = ("WORLD_SIZE", "LOCAL_RANK")
+         else:
+             self.world_size_handle, self.rank_handle = {
+                 "slurm": ("SLURM_NTASKS", "SLURM_PROCID")
+             }.get(dist_env, ("WORLD_SIZE", "LOCAL_RANK"))
+
+     @property
+     def iterator(self) -> Iterator:
+         """Define the stream logic for the dataset. Implement in subclasses."""
+         raise NotImplementedError
+
+     def __iter__(self) -> Iterator:
+         """
+         Create an iterator for the dataset, handling multi-processing contexts.
+
+         Returns:
+             Iterator: The iterator for the dataset.
+         """
+         worker_info = torch.utils.data.get_worker_info()
+         if worker_info is None:
+             return self.iterator
+
+         # In multi-processing context, use 'os.environ' to
+         # find global worker rank. Then use 'islice' to allocate
+         # the items of the stream to the workers.
+         world_size = int(os.environ.get(self.world_size_handle, "1"))
+         global_rank = int(os.environ.get(self.rank_handle, "0"))
+         local_rank = worker_info.id
+         local_num_workers = worker_info.num_workers
+
+         # Assume that each process has the same number of local workers.
+         worker_rk = global_rank * local_num_workers + local_rank
+         worker_nb = world_size * local_num_workers
+         return itertools.islice(self.iterator, worker_rk, None, worker_nb)
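The `islice` call hands worker k every (world_size × num_workers)-th item of the stream starting at offset k, so each item is seen by exactly one worker and none are duplicated. A self-contained sketch of the same scheme:

```python
# Sketch: round-robin stream sharding as done by IterableData.__iter__ above.
import itertools

stream = range(10)              # stands in for the data stream
worker_nb = 3                   # total workers across all processes
assignments = {
    worker_rk: list(itertools.islice(iter(stream), worker_rk, None, worker_nb))
    for worker_rk in range(worker_nb)
}
print(assignments)  # {0: [0, 3, 6, 9], 1: [1, 4, 7], 2: [2, 5, 8]}
```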
+
+
+ class IterableJSONData(IterableData):
+     """
+     Iterate over the lines of a JSON file and uncompress if needed.
+
+     Args:
+         data_path (str): The path to the JSON data file.
+         train (bool): Flag indicating if the dataset is for training.
+         **kwargs: Additional keyword arguments for the base class.
+     """
+
+     def __init__(self, data_path: str, train: bool = True, **kwargs):
+         super().__init__(**kwargs)
+         self.data_path = data_path
+         self.train = train
+         with open(os.path.join(self.data_path, "finetune_set.json"), "r") as f:
+             self.records = [json.loads(line) for line in f]
+
+     def __len__(self):
+         return len(self.records)
+
+     @property
+     def iterator(self) -> Iterator:
+         """Define the stream logic for the dataset."""
+         for record in self.records:
+             yield record
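A minimal usage sketch, assuming a `data/` directory containing the `finetune_set.json` JSONL file this class expects, with records in the `{"idx", "codons", "organism"}` format produced by `prepare_training_data`:

```python
# Sketch: streaming the fine-tuning set through a PyTorch DataLoader.
from torch.utils.data import DataLoader

dataset = IterableJSONData("data/", train=True)   # expects data/finetune_set.json
loader = DataLoader(dataset, batch_size=None, num_workers=2)
for record in loader:
    print(record["codons"], record["organism"])
    break
```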
+
+
+ class ConfigManager(ABC):
+     """
+     Abstract base class for managing configuration settings.
+     """
+
+     _config: Dict[str, Any]
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         if exc_type is not None:
+             print(f"Exception occurred: {exc_type}, {exc_value}, {traceback}")
+         self.reset_config()
+
+     @abstractmethod
+     def reset_config(self) -> None:
+         """Reset the configuration to default values."""
+         pass
+
+     def get(self, key: str) -> Any:
+         """
+         Get the value of a configuration key.
+
+         Args:
+             key (str): The key to retrieve the value for.
+
+         Returns:
+             Any: The value of the configuration key.
+         """
+         return self._config.get(key)
+
+     def set(self, key: str, value: Any) -> None:
+         """
+         Set the value of a configuration key.
+
+         Args:
+             key (str): The key to set the value for.
+             value (Any): The value to set for the key.
+         """
+         self.validate_inputs(key, value)
+         self._config[key] = value
+
+     def update(self, config_dict: dict) -> None:
+         """
+         Update the configuration with a dictionary of key-value pairs after
+         validating them.
+
+         Args:
+             config_dict (dict): A dictionary of key-value pairs to update the
+                 configuration.
+         """
+         for key, value in config_dict.items():
+             self.validate_inputs(key, value)
+         self._config.update(config_dict)
+
+     @abstractmethod
+     def validate_inputs(self, key: str, value: Any) -> None:
+         """Validate the inputs for the configuration."""
+         pass
+
+
+ class ProteinConfig(ConfigManager):
+     """
+     A class to manage configuration settings for protein sequences.
+
+     This class ensures that the configuration is a singleton.
+     It provides methods to get, set, and update configuration values.
+
+     Attributes:
+         _instance (Optional[ConfigManager]): The singleton instance of the ConfigManager.
+         _config (Dict[str, Any]): The configuration dictionary.
+     """
+
+     _instance = None
+
+     def __new__(cls):
+         """
+         Create a new instance of the ProteinConfig class.
+
+         Returns:
+             ProteinConfig: The singleton instance of the ProteinConfig.
+         """
+         if cls._instance is None:
+             cls._instance = super(ProteinConfig, cls).__new__(cls)
+             cls._instance.reset_config()
+         return cls._instance
+
+     def validate_inputs(self, key: str, value: Any) -> None:
+         """
+         Validate the inputs for the configuration.
+
+         Args:
+             key (str): The key to validate.
+             value (Any): The value to validate.
+
+         Raises:
+             ValueError: If the value is invalid.
+             TypeError: If the value is of the wrong type.
+         """
+         if key == "ambiguous_aminoacid_behavior":
+             if value not in [
+                 "raise_error",
+                 "standardize_deterministic",
+                 "standardize_random",
+             ]:
+                 raise ValueError(
+                     f"Invalid value for ambiguous_aminoacid_behavior: {value}."
+                 )
+         elif key == "ambiguous_aminoacid_map_override":
+             if not isinstance(value, dict):
+                 raise TypeError(
+                     f"Invalid type for ambiguous_aminoacid_map_override: {value}."
+                 )
+             for ambiguous_aminoacid, aminoacids in value.items():
+                 if not isinstance(aminoacids, list):
+                     raise TypeError(f"Invalid type for aminoacids: {aminoacids}.")
+                 if not aminoacids:
+                     raise ValueError(
+                         f"Override for aminoacid '{ambiguous_aminoacid}' cannot be empty list."
+                     )
+                 if ambiguous_aminoacid not in AMBIGUOUS_AMINOACID_MAP:
+                     raise ValueError(
+                         f"Invalid amino acid in ambiguous_aminoacid_map_override: {ambiguous_aminoacid}"
+                     )
+         else:
+             raise ValueError(f"Invalid configuration key: {key}")
+
+     def reset_config(self) -> None:
+         """
+         Reset the configuration to the default values.
+         """
+         self._config = {
+             "ambiguous_aminoacid_behavior": "standardize_random",
+             "ambiguous_aminoacid_map_override": {},
+         }
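Because `ProteinConfig` is a singleton, settings apply process-wide; used as a context manager, `__exit__` calls `reset_config()` so the defaults are restored afterwards. A usage sketch:

```python
# Sketch: temporarily forcing deterministic handling of ambiguous residues.
with ProteinConfig() as config:
    config.update({
        "ambiguous_aminoacid_behavior": "standardize_deterministic",
        "ambiguous_aminoacid_map_override": {"X": ["M"]},
    })
    # ... run optimization here with the temporary settings ...
# on exit, reset_config() has restored the defaults
```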
+
+
+ def load_python_object_from_disk(file_path: str) -> Any:
+     """
+     Load a Pickle object from disk and return it as a Python object.
+
+     Args:
+         file_path (str): The path to the Pickle file.
+
+     Returns:
+         Any: The loaded Python object.
+     """
+     with open(file_path, "rb") as file:
+         return pickle.load(file)
+
+
+ def save_python_object_to_disk(input_object: Any, file_path: str) -> None:
+     """
+     Save a Python object to disk using Pickle.
+
+     Args:
+         input_object (Any): The Python object to save.
+         file_path (str): The path where the object will be saved.
+     """
+     with open(file_path, "wb") as file:
+         pickle.dump(input_object, file)
+
+
+ def find_pattern_in_fasta(keyword: str, text: str) -> str:
+     """
+     Find a specific keyword pattern in text. Helpful for identifying parts
+     of a FASTA sequence.
+
+     Args:
+         keyword (str): The keyword pattern to search for.
+         text (str): The text to search within.
+
+     Returns:
+         str: The found pattern or an empty string if not found.
+     """
+     # Search for the keyword pattern in the text using regex
+     result = re.search(keyword + r"=(.*?)]", text)
+     return result.group(1) if result else ""
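The regex targets `[key=value]`-style attributes as they appear in NCBI CDS FASTA headers. A small self-contained sketch with an illustrative header:

```python
# Sketch: pulling bracketed attributes out of an NCBI-style FASTA description.
import re

def find_pattern_in_fasta(keyword: str, text: str) -> str:
    result = re.search(keyword + r"=(.*?)]", text)
    return result.group(1) if result else ""

header = ">lcl|NC_000913.3_cds_1 [gene=thrA] [protein_id=NP_414543.1]"
print(find_pattern_in_fasta("gene", header))        # thrA
print(find_pattern_in_fasta("protein_id", header))  # NP_414543.1
```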
+
+
+ def get_organism2id_dict(organism_reference: str) -> Dict[str, int]:
+     """
+     Return a dictionary mapping each organism in training data to an index
+     used for training.
+
+     Args:
+         organism_reference (str): Path to a CSV file containing a list of
+             all organisms. The format of the CSV file should be as follows:
+
+             0,Escherichia coli
+             1,Homo sapiens
+             2,Mus musculus
+
+     Returns:
+         Dict[str, int]: Dictionary mapping organism names to their respective indices.
+     """
+     # Read the CSV file and create a dictionary mapping organisms to their indices
+     organisms = pd.read_csv(organism_reference, index_col=0, header=None)
+     organism2id = {organisms.iloc[i].values[0]: i for i in organisms.index}
+
+     return organism2id
+
+
+ def get_taxonomy_id(
+     taxonomy_reference: str, organism: Optional[str] = None, return_dict: bool = False
+ ) -> Any:
+     """
+     Return the taxonomy id of a given organism using a reference file.
+     Optionally, return the whole dictionary instead if return_dict is True.
+
+     Args:
+         taxonomy_reference (str): Path to the taxonomy reference file.
+         organism (Optional[str]): The name of the organism to look up.
+         return_dict (bool): Whether to return the entire dictionary.
+
+     Returns:
+         Any: The taxonomy id of the organism or the entire dictionary.
+     """
+     # Load the organism-to-taxonomy mapping from a Pickle file
+     organism2taxonomy = load_python_object_from_disk(taxonomy_reference)
+
+     if return_dict:
+         return dict(sorted(organism2taxonomy.items()))
+
+     return organism2taxonomy[organism]
+
+
+ def sort_amino2codon_skeleton(amino2codon: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Sort the amino2codon dictionary alphabetically by amino acid and by codon name.
+
+     Args:
+         amino2codon (Dict[str, Any]): The amino2codon dictionary to sort.
+
+     Returns:
+         Dict[str, Any]: The sorted amino2codon dictionary.
+     """
+     # Sort the dictionary by amino acid and then by codon name
+     amino2codon = dict(sorted(amino2codon.items()))
+     amino2codon = {
+         amino: (
+             [codon for codon, _ in sorted(zip(codons, frequencies))],
+             [freq for _, freq in sorted(zip(codons, frequencies))],
+         )
+         for amino, (codons, frequencies) in amino2codon.items()
+     }
+
+     return amino2codon
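Both list comprehensions sort `zip(codons, frequencies)` by the same key, so each frequency stays attached to its codon after reordering. A quick sketch of that co-sort:

```python
# Sketch: co-sorting codons and frequencies as sort_amino2codon_skeleton does.
codons, frequencies = ["TTG", "CTA", "CTG"], [0.1, 0.2, 0.7]
sorted_codons = [c for c, _ in sorted(zip(codons, frequencies))]
sorted_freqs = [f for _, f in sorted(zip(codons, frequencies))]
print(sorted_codons)  # ['CTA', 'CTG', 'TTG']
print(sorted_freqs)   # [0.2, 0.7, 0.1] — each frequency still matches its codon
```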
+
+
+ def load_pkl_from_url(url: str) -> Any:
+     """
+     Download a Pickle file from a URL and return the loaded object.
+
+     Args:
+         url (str): The URL to download the Pickle file from.
+
+     Returns:
+         Any: The loaded Python object from the Pickle file.
+     """
+     response = requests.get(url)
+     response.raise_for_status()  # Ensure the request was successful
+
+     # Load the Pickle object from the response content
+     return pickle.loads(response.content)
CodonTransformer/__init__.py ADDED
@@ -0,0 +1 @@
+ """CodonTransformer package."""