"""
File: CodonEvaluation.py
---------------------------
Includes functions to calculate various evaluation metrics along with helper
functions.
"""
from typing import Dict, List, Tuple, Optional | |
import pandas as pd | |
from CAI import CAI, relative_adaptiveness | |
from tqdm import tqdm | |
import math | |
import numpy as np | |
from collections import Counter | |
from itertools import chain | |
from statistics import mean | |
import sys | |
import os | |
from io import StringIO | |
def get_CSI_weights(sequences: List[str]) -> Dict[str, float]:
    """
    Build the weight table used by the Codon Similarity Index (CSI).

    This is a thin wrapper: the sequences are handed, as the reference set,
    to ``relative_adaptiveness`` from the ``CAI`` package and its result is
    returned unchanged.

    Args:
        sequences (List[str]): Reference DNA sequences.

    Returns:
        Dict[str, float]: The weights produced by ``relative_adaptiveness``.
    """
    csi_weights = relative_adaptiveness(sequences=sequences)
    return csi_weights
def get_CSI_value(dna: str, weights: Dict[str, float]) -> float:
    """
    Score a DNA sequence with the Codon Similarity Index (CSI).

    This is a thin wrapper around ``CAI`` from the ``CAI`` package; the score
    it computes is returned unchanged.

    Args:
        dna (str): The DNA sequence to score.
        weights (Dict[str, float]): Weight table, as returned by
            ``get_CSI_weights``.

    Returns:
        float: The CSI value reported by ``CAI``.
    """
    csi_score = CAI(dna, weights)
    return csi_score
def get_organism_to_CSI_weights(
    dataset: pd.DataFrame, organisms: List[str]
) -> Dict[str, dict]:
    """
    Compute one CSI weight table per organism.

    For every requested organism, the rows of ``dataset`` whose ``"organism"``
    column equals that name are selected, their ``"dna"`` column is collected
    into a list, and that list is passed to ``get_CSI_weights``. Progress is
    displayed with ``tqdm``.

    Args:
        dataset (pd.DataFrame): Table with at least the columns
            ``"organism"`` and ``"dna"``.
        organisms (List[str]): Names of the organisms to process.

    Returns:
        Dict[str, dict]: Mapping from organism name to its CSI weights.
    """
    weights_by_organism = {}
    progress = tqdm(organisms, desc="Calculating CSI Weights: ", unit="Organism")

    for name in progress:
        # Boolean mask selecting only the rows that belong to this organism
        belongs_to_organism = dataset["organism"] == name
        reference_sequences = dataset.loc[belongs_to_organism]["dna"].to_list()
        weights_by_organism[name] = get_CSI_weights(reference_sequences)

    return weights_by_organism
def get_GC_content(dna: str) -> float:
    """
    Compute the percentage of G and C bases in a DNA sequence.

    The sequence is upper-cased first, so the count is case-insensitive.

    Args:
        dna (str): The DNA sequence.

    Returns:
        float: GC content in percent (0-100); 0.0 for an empty sequence.
    """
    sequence = dna.upper()
    total_bases = len(sequence)
    if total_bases == 0:
        return 0.0

    gc_bases = sum(sequence.count(base) for base in "GC")
    return gc_bases / total_bases * 100
def get_cfd(
    dna: str,
    codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
    threshold: float = 0.3,
) -> float:
    """
    Calculate the codon frequency distribution (CFD) metric for a DNA sequence.

    Each codon's frequency is normalized by the highest frequency among the
    codons of the same amino acid. The metric is the percentage of codons in
    ``dna`` whose normalized frequency is strictly below ``threshold``.

    Args:
        dna (str): The DNA sequence. It is split into consecutive 3-character
            chunks starting at position 0.
        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
            frequency distribution per amino acid, as parallel lists of
            codons and frequencies.
        threshold (float): Frequency threshold for counting rare codons.

    Returns:
        float: The CFD metric as a percentage; 0.0 for an empty sequence.

    Raises:
        KeyError: If a chunk of ``dna`` (including a trailing partial codon)
            is not listed in ``codon_frequencies``.
    """
    # An empty sequence has no codons; avoid dividing by zero below.
    if not dna:
        return 0.0

    # Map each codon to its frequency normalized by the amino acid's maximum
    codon2frequency = {}
    for codons, frequencies in codon_frequencies.values():
        # Computed once per amino acid rather than once per codon
        peak = max(frequencies, default=0)
        for codon, freq in zip(codons, frequencies):
            # If every codon of this amino acid has frequency 0, there is
            # nothing to normalize by; treat them all as maximally rare.
            codon2frequency[codon] = freq / peak if peak else 0.0

    # Count codons whose normalized frequency falls below the threshold
    rare_codons = sum(
        1
        for start in range(0, len(dna), 3)
        if codon2frequency[dna[start : start + 3]] < threshold
    )
    return rare_codons / (len(dna) / 3) * 100
def get_min_max_percentage(
    dna: str,
    codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
    window_size: int = 18,
) -> List[Optional[float]]:
    """
    Calculate the %MinMax metric for a DNA sequence.

    The sequence is split into codons and scanned with a sliding window of
    ``window_size`` codons. For each window, the average actual codon
    frequency is compared with the averages of the maximum, minimum and mean
    frequencies of the corresponding amino acids. A window above the mean
    yields a positive %Max value; a window below it yields a negative %Min
    value.

    Args:
        dna (str): The DNA sequence.
        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
            frequency distribution per amino acid, as parallel lists of
            codons and frequencies.
        window_size (int): Size of the window (in codons) to calculate %MinMax.

    Returns:
        List[Optional[float]]: One %MinMax value per window, followed by
        ``window_size // 2`` trailing ``None`` entries. A window whose amino
        acids offer no synonymous-codon spread (max == mean, e.g. only
        single-codon amino acids) gets 0.0.

    Raises:
        KeyError: If a codon inside a window is not in ``codon_frequencies``.

    Credit: https://github.com/chowington/minmax
    """
    # Pre-compute, per codon: (its own frequency, and the max / min / mean
    # frequency of its amino acid), so the window loop is a plain lookup.
    codon_stats = {}
    for synonymous_codons, frequencies in codon_frequencies.values():
        if not synonymous_codons:
            continue
        highest = max(frequencies)
        lowest = min(frequencies)
        mean_frequency = sum(frequencies) / len(frequencies)
        for codon in synonymous_codons:
            own_frequency = frequencies[synonymous_codons.index(codon)]
            codon_stats[codon] = (own_frequency, highest, lowest, mean_frequency)

    codons = [dna[i : i + 3] for i in range(0, len(dna), 3)]  # Split DNA into codons
    min_max_values: List[Optional[float]] = []

    # Slide the window across the codon list
    for start in range(len(codons) - window_size + 1):
        actual = 0.0  # Average of the actual codon frequencies
        maximum = 0.0  # Average of the max codon frequencies
        minimum = 0.0  # Average of the min codon frequencies
        average = 0.0  # Average of the mean frequencies of each amino acid

        for codon in codons[start : start + window_size]:
            own_frequency, highest, lowest, mean_frequency = codon_stats[codon]
            actual += own_frequency
            maximum += highest
            minimum += lowest
            average += mean_frequency

        # Divide by the window size to get the averages
        actual = actual / window_size
        maximum = maximum / window_size
        minimum = minimum / window_size
        average = average / window_size

        # A zero span means the window has no codon choice at all; report a
        # neutral 0.0 instead of dividing by zero.
        upper_span = maximum - average
        percent_max = ((actual - average) / upper_span) * 100 if upper_span else 0.0

        if percent_max >= 0:
            min_max_values.append(percent_max)
        else:
            lower_span = average - minimum
            percent_min = (
                ((average - actual) / lower_span) * 100 if lower_span else 0.0
            )
            min_max_values.append(-percent_min)

    # Pad the end with floor(window_size / 2) None entries
    min_max_values.extend([None] * (window_size // 2))
    return min_max_values
def get_sequence_complexity(dna: str) -> float:
    """
    Calculate the sequence complexity score of a DNA sequence.

    For every substring length from 1 to ``len(dna)``, the number of distinct
    substrings of that length is counted. The score is the total of those
    counts divided by ``T(len(dna) - 1) + min(len(dna), 4)``, expressed as a
    percentage, where ``T(x)`` is ``x * (x + 1) / 2`` for ``x > 1`` and 1
    otherwise.

    ``T`` is evaluated with the closed-form expression instead of recursion,
    so sequences longer than the interpreter's recursion limit no longer
    raise ``RecursionError``.

    Note: every substring of ``dna`` is materialized, so the cost grows
    quickly with sequence length.

    Args:
        dna (str): The DNA sequence.

    Returns:
        float: The sequence complexity score; 0.0 for an empty sequence.
    """
    length = len(dna)

    # Total number of distinct substrings, summed over every substring length
    distinct_total = sum(
        len({dna[start : start + size] for start in range(length - size + 1)})
        for size in range(1, length + 1)
    )

    # Closed form of the former recursive helper: 1 for x <= 1, else 1 + 2 + ... + x
    x = length - 1
    triangular = x * (x + 1) // 2 if x > 1 else 1

    denominator = triangular + min(length, 4)
    return (distinct_total / denominator) * 100
def get_sequence_similarity(
    original: str, predicted: str, truncate: bool = True, window_length: int = 1
) -> float:
    """
    Calculate the sequence similarity between two sequences.

    Both sequences are stripped of surrounding whitespace, then compared in
    consecutive, non-overlapping windows of ``window_length`` characters. The
    result is the number of identical windows divided by
    ``len(predicted) / window_length``, as a percentage.

    Args:
        original (str): The original sequence.
        predicted (str): The predicted sequence.
        truncate (bool): If True, truncate the original sequence to match the
            length of the predicted sequence.
        window_length (int): Length of the window for comparison (1 for amino
            acids, 3 for codons).

    Returns:
        float: The sequence similarity as a percentage; 0.0 when ``predicted``
        is empty.

    Raises:
        ValueError: If ``truncate`` is False and the stripped sequences differ
            in length.

    Preconditions:
        len(predicted) <= len(original). If the original is shorter, the
        positions it does not cover count as mismatches.
    """
    # Strip first so the length check sees the same text that is compared.
    original = original.strip()
    predicted = predicted.strip()

    if not truncate and len(original) != len(predicted):
        raise ValueError(
            "Set truncate to True if the length of sequences do not match."
        )

    # Nothing to compare; avoid dividing by zero below.
    if not predicted:
        return 0.0

    if truncate:
        original = original[: len(predicted)]

    # One loop covers both the per-character (window_length == 1) and the
    # per-codon case: slices of length 1 compare exactly like single indices.
    matches = sum(
        1
        for start in range(0, len(original) - window_length + 1, window_length)
        if original[start : start + window_length]
        == predicted[start : start + window_length]
    )
    return (matches / (len(predicted) / window_length)) * 100
def scan_for_restriction_sites(seq: str, sites: List[str] = ['GAATTC', 'GGATCC', 'AAGCTT']) -> int:
    """
    Count occurrences of restriction enzyme sites in a DNA sequence.

    Matching is case-insensitive (both the sequence and each site are
    upper-cased) and uses ``str.count``, i.e. non-overlapping occurrences on
    the given strand only. The default list is only read, never modified.

    Args:
        seq (str): The DNA sequence to scan.
        sites (List[str]): Recognition sequences to look for.

    Returns:
        int: Total number of occurrences across all sites.
    """
    haystack = seq.upper()
    hits = 0
    for site in sites:
        hits += haystack.count(site.upper())
    return hits
def count_negative_cis_elements(seq: str, motifs: List[str] = ['TATAAT', 'TTGACA', 'AGCTAGT']) -> int:
    """
    Count occurrences of negative cis-regulatory motifs in a DNA sequence.

    Matching is case-insensitive (both the sequence and each motif are
    upper-cased) and uses ``str.count``, i.e. non-overlapping occurrences.
    The default list is only read, never modified.

    Args:
        seq (str): The DNA sequence to scan.
        motifs (List[str]): Motif sequences to look for.

    Returns:
        int: Total number of occurrences across all motifs.
    """
    haystack = seq.upper()
    hits = 0
    for motif in motifs:
        hits += haystack.count(motif.upper())
    return hits
def calculate_homopolymer_runs(seq: str, max_len: int = 8) -> int:
    """
    Count homopolymer runs that are longer than ``max_len``.

    A run is a stretch of one repeated base (A, T, G or C) of at least
    ``max_len + 1`` characters. The sequence is upper-cased before matching.

    Args:
        seq (str): The DNA sequence.
        max_len (int): Longest run length that is still tolerated.

    Returns:
        int: Number of runs exceeding ``max_len``.
    """
    import re

    min_len = max_len + 1
    # Builds e.g. "(A{9,}|T{9,}|G{9,}|C{9,})" for the default max_len of 8
    alternatives = "|".join("%s{%d,}" % (base, min_len) for base in "ATGC")
    pattern = "(" + alternatives + ")"

    return sum(1 for _ in re.finditer(pattern, seq.upper()))
def get_min_max_profile(
    dna: str,
    codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
    window_size: int = 18,
) -> List[float]:
    """
    Return the %MinMax profile of a DNA sequence.

    This is an alias for ``get_min_max_percentage``: the arguments are
    forwarded unchanged and its result (one value per sliding window across
    the sequence) is returned as-is.

    Args:
        dna (str): The DNA sequence.
        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
            frequency distribution per amino acid.
        window_size (int): Size of the window to calculate %MinMax.

    Returns:
        List[float]: The list produced by ``get_min_max_percentage``.
    """
    profile = get_min_max_percentage(dna, codon_frequencies, window_size)
    return profile
def calculate_dtw_distance(profile1: List[float], profile2: List[float]) -> float:
    """
    Compute the Dynamic Time Warping (DTW) distance between two profiles.

    Entries that are ``None`` or NaN are dropped from each profile, and the
    remaining values are reshaped into a single-column array before being
    passed to ``dtw`` from the ``dtw`` package.

    Args:
        profile1 (List[float]): The first profile (e.g., %MinMax of a
            generated sequence).
        profile2 (List[float]): The second profile (e.g., %MinMax of a
            natural sequence).

    Returns:
        float: The ``distance`` attribute of the alignment, or ``np.inf`` if
        either profile has no usable values.
    """
    from dtw import dtw
    import numpy as np

    def _as_column(profile):
        """Drop None/NaN entries and reshape the rest to shape (n, 1)."""
        usable = [v for v in profile if v is not None and not np.isnan(v)]
        return np.array(usable).reshape(-1, 1)

    first = _as_column(profile1)
    second = _as_column(profile2)

    # No distance can be computed against an empty profile
    if len(first) == 0 or len(second) == 0:
        return np.inf

    alignment = dtw(first, second, keep_internals=True)
    return alignment.distance  # type: ignore
def get_ecoli_tai_weights():
    """
    Return a hard-coded codon -> tAI weight table.

    Per the original author's note, these are pre-calculated E. coli weights
    derived from tRNA gene copy numbers (not verifiable from this file). The
    table holds 60 codons: the stop codons (TAA, TAG, TGA) and ATG are absent.

    Returns:
        dict: Mapping from codon (upper-case DNA triplet) to its weight.
    """
    codon_order = (
        "TTT", "TTC", "TTA", "TTG", "TCT", "TCC", "TCA", "TCG", "TAT", "TAC",
        "TGT", "TGC", "TGG", "CTT", "CTC", "CTA", "CTG", "CCT", "CCC", "CCA",
        "CCG", "CAT", "CAC", "CAA", "CAG", "CGT", "CGC", "CGA", "CGG", "ATT",
        "ATC", "ATA", "ACT", "ACC", "ACA", "ACG", "AAT", "AAC", "AAA", "AAG",
        "AGT", "AGC", "AGA", "AGG", "GTT", "GTC", "GTA", "GTG", "GCT", "GCC",
        "GCA", "GCG", "GAT", "GAC", "GAA", "GAG", "GGT", "GGC", "GGA", "GGG",
    )
    # Listed in the same order as codon_order above
    weight_values = (
        0.1966667, 0.3333333, 0.1666667, 0.2200000, 0.1966667, 0.3333333,
        0.1666667, 0.2200000, 0.2950000, 0.5000000, 0.09833333, 0.1666667,
        0.2200000, 0.09833333, 0.1666667, 0.1666667, 0.7200000, 0.09833333,
        0.1666667, 0.1666667, 0.2200000, 0.09833333, 0.1666667, 0.3333333,
        0.4400000, 0.6666667, 0.4800000, 0.00006666667, 0.1666667, 0.2950000,
        0.5000000, 0.01833333, 0.1966667, 0.3333333, 0.1666667, 0.3866667,
        0.3933333, 0.6666667, 1.0000000, 0.3200000, 0.09833333, 0.1666667,
        0.1666667, 0.2200000, 0.1966667, 0.3333333, 0.8333333, 0.2666667,
        0.1966667, 0.3333333, 0.5000000, 0.1600000, 0.2950000, 0.5000000,
        0.6666667, 0.2133333, 0.3933333, 0.6666667, 0.1666667, 0.2200000,
    )
    return {codon: weight for codon, weight in zip(codon_order, weight_values)}
def calculate_tAI(sequence: str, tai_weights: Dict[str, float]) -> float:
    """
    Calculates the tRNA Adaptation Index (tAI) for a given DNA sequence.

    The sequence is normalized (upper-cased, ``U`` replaced by ``T``) so that
    lowercase or RNA input is scored instead of silently yielding 0.0, then
    split into consecutive 3-character codons. The tAI is the geometric mean
    of the weights of all codons that are present in ``tai_weights`` with a
    strictly positive weight; every other codon (stop codons, unknown or
    partial triplets, zero weights) is ignored.

    Args:
        sequence (str): The DNA sequence to analyze.
        tai_weights (Dict[str, float]): A dictionary of tAI weights keyed by
            upper-case DNA codon.

    Returns:
        float: The tAI value for the sequence, or 0.0 if no codon contributes
        a weight.
    """
    from scipy.stats.mstats import gmean

    normalized = sequence.upper().replace("U", "T")

    # Collect the usable weights, skipping codons without a positive weight
    weights = []
    for start in range(0, len(normalized), 3):
        weight = tai_weights.get(normalized[start : start + 3])
        if weight is not None and weight > 0:
            weights.append(weight)

    if not weights:
        return 0.0
    # gmean returns a NumPy scalar; convert to match the annotated return type
    return float(gmean(weights))
def calculate_ENC(sequence: str) -> float:
    """
    Calculate the Effective Number of Codons (ENC) for a DNA sequence.

    Delegates to ``codonbias.scores.EffectiveNumberOfCodons`` (per the
    original author's note, an implementation based on Wright (1990)).

    NOTE(review): if the score computation itself fails, a warning is printed
    and the constant 45.0 is returned instead of raising. Callers cannot
    distinguish that fallback from a real score.

    Args:
        sequence (str): The DNA sequence.

    Returns:
        float: The ENC value for the sequence, or 45.0 if the computation
        failed.

    Raises:
        ImportError: If the ``codonbias`` package is not installed.
    """
    # Only the import is guarded here, so an ImportError raised later during
    # scoring is not misreported as "library missing".
    try:
        from codonbias.scores import EffectiveNumberOfCodons
    except ImportError as err:
        raise ImportError(
            "codonbias library is required for ENC calculation. Install with: pip install codonbias"
        ) from err

    try:
        enc_calculator = EffectiveNumberOfCodons(
            k_mer=1,  # Standard codon analysis
            bg_correction=True,  # Use background correction
            robust=True,  # Use robust calculation
            genetic_code=1,  # Standard genetic code
        )
        return float(enc_calculator.get_score(sequence))
    except Exception as e:
        # Deliberate best-effort fallback kept from the original implementation
        print(f"Warning: ENC calculation failed with error: {e}. Using approximation.")
        return 45.0  # Typical E. coli ENC value as fallback
def calculate_CPB(sequence: str, reference_sequences: Optional[List[str]] = None) -> float:
    """
    Calculate the Codon Pair Bias (CPB) for a DNA sequence.

    Delegates to ``codonbias.scores.CodonPairBias`` (per the original
    author's note, an implementation based on Coleman et al. (2008)).

    NOTE(review): if the score computation itself fails, a warning is printed
    and the constant 0.0 is returned instead of raising. Callers cannot
    distinguish that fallback from a real score.

    Args:
        sequence (str): The DNA sequence.
        reference_sequences (List[str]): Reference sequences for calculating
            expected values. If None, the input sequence itself is used as
            the only reference (there is no built-in organism reference).

    Returns:
        float: The CPB value for the sequence, or 0.0 if the computation
        failed.

    Raises:
        ImportError: If the ``codonbias`` package is not installed.
    """
    # Only the import is guarded here, so an ImportError raised later during
    # scoring is not misreported as "library missing".
    try:
        from codonbias.scores import CodonPairBias
    except ImportError as err:
        raise ImportError(
            "codonbias library is required for CPB calculation. Install with: pip install codonbias"
        ) from err

    try:
        # Fall back to the input sequence as reference if none was provided
        if reference_sequences is None:
            reference_sequences = [sequence]

        cpb_calculator = CodonPairBias(
            ref_seq=reference_sequences,
            k_mer=2,  # Codon pairs
            genetic_code=1,  # Standard genetic code
            ignore_stop=True,  # Ignore stop codons
            pseudocount=1,  # Pseudocount for unseen pairs
        )
        return float(cpb_calculator.get_score(sequence))
    except Exception as e:
        # Deliberate best-effort fallback kept from the original implementation
        print(f"Warning: CPB calculation failed with error: {e}. Using approximation.")
        return 0.0  # Neutral CPB as fallback
def calculate_SCUO(sequence: str) -> float:
    """
    Calculate the Synonymous Codon Usage Order (SCUO) for a DNA sequence.

    Self-contained, entropy-based implementation (per the original author's
    note, following the definition of Wan et al., 2004); no external SCUO
    library is used. For each amino acid with more than one codon, the
    normalized entropy deficit ``(H_max - H_obs) / H_max`` of its observed
    codon usage is weighted by that amino acid's share of all counted codons,
    and the weighted values are summed.

    The sequence is upper-cased and ``U`` is replaced by ``T``. Stop codons
    and triplets that are not standard sense codons are ignored, whether or
    not Biopython is installed; a trailing partial codon is dropped.

    NOTE(review): if the computation raises, a warning is printed and 0.5 is
    returned instead of propagating the error.

    Args:
        sequence (str): The DNA (or RNA) sequence.

    Returns:
        float: The SCUO value (0-1, where 1 indicates maximum bias); 0.0 if
        the sequence contains no sense codons.
    """
    from math import log2  # local import to avoid global cost

    try:
        # Prefer Biopython's standard table; deliberately broad so that any
        # Biopython problem falls back to the built-in table.
        try:
            from Bio.Data import CodonTable  # type: ignore
            raw_table = CodonTable.unambiguous_dna_by_id[1].forward_table
        except Exception:
            raw_table = {
                'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
                'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
                'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
                'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
                'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
                'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
                'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
                'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
                'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
                'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
                'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
                'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
                'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
                'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
                'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
                'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
            }

        # Drop stop codons so both table sources give the same result
        # (the built-in table lists them as '*').
        codon_to_aa = {
            codon: aa for codon, aa in raw_table.items() if aa != '*'
        }

        # Group codons by amino acid
        aa_to_codons = {}
        for codon, aa in codon_to_aa.items():
            aa_to_codons.setdefault(aa, []).append(codon)

        # Count sense-codon occurrences in the input sequence
        seq = sequence.upper().replace('U', 'T')
        codon_counts = {}
        for i in range(0, len(seq) - len(seq) % 3, 3):
            codon = seq[i:i + 3]
            if codon in codon_to_aa:
                codon_counts[codon] = codon_counts.get(codon, 0) + 1

        total_codons = sum(codon_counts.values())
        if total_codons == 0:
            return 0.0

        scuo_sum = 0.0
        for aa, codons in aa_to_codons.items():
            n_codons = len(codons)
            if n_codons == 1:
                continue  # SCUO undefined for Met/Trp
            counts = [codon_counts.get(c, 0) for c in codons]
            total_aa = sum(counts)
            if total_aa == 0:
                continue
            # Observed entropy of this amino acid's codon usage
            probs = [c / total_aa for c in counts if c]
            H_obs = -sum(p * log2(p) for p in probs)
            H_max = log2(n_codons)
            O_i = (H_max - H_obs) / H_max if H_max else 0.0
            # Weight by the amino acid's share of all counted codons
            F_i = total_aa / total_codons
            scuo_sum += F_i * O_i
        return scuo_sum
    except Exception as exc:
        print(f"Warning: internal SCUO computation failed ({exc}). Returning 0.5.")
        return 0.5