MoraxCheng committed
Commit e809d91 · Parent: 9957ccd

Fix resource management and memory leaks


- Add try-finally blocks for proper model cleanup even on errors (see the sketch after this list)
- Fix matplotlib memory leak with proper figure cleanup (close, clf, cla)
- Limit figure size to prevent excessive memory usage
- Remove unused imports (tempfile, atexit, threading)
- Remove unused global variables (active_inferences, inference_lock)
- Add better error handling for file cleanup operations
- Add error handling for initial repository setup
- Ensure model is always deleted from memory after inference
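
The commit boils down to two defensive patterns: wrap inference in try/finally so the model is released on every exit path, and close matplotlib figures in a finally block while capping their size. The sketch below is a minimal, self-contained illustration of both patterns, not the actual app.py code; the small linear model and random matrix are stand-ins for the Tranception checkpoint and the scoring grid.

# Minimal sketch of the two cleanup patterns (stand-in model and data, not app.py code)
import gc
import torch
import matplotlib
matplotlib.use("Agg")  # headless backend, as on a Space
import matplotlib.pyplot as plt

def score_with_cleanup(inputs):
    model = torch.nn.Linear(8, 1)  # stand-in for loading the Tranception checkpoint
    try:
        model.eval()
        with torch.no_grad():
            return model(inputs)  # stand-in for model.score_mutants(...)
    finally:
        # Runs on success and on error: drop the reference, then reclaim memory
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

def save_heatmap(matrix, path):
    # Cap the figure size so a long mutation range cannot allocate a huge canvas
    fig, ax = plt.subplots(figsize=(min(50, matrix.shape[1] * 0.8), min(matrix.shape[0], 50)))
    try:
        ax.imshow(matrix, aspect="auto")
        plt.tight_layout()
        fig.savefig(path, dpi=100)
        return path
    finally:
        plt.close(fig)  # the commit closes all figures and additionally calls plt.clf()/plt.cla()

if __name__ == "__main__":
    print(score_with_cleanup(torch.zeros(4, 8)).shape)
    print(save_heatmap(torch.rand(20, 20).numpy(), "demo_heatmap.png"))

In app.py the same structure appears in score_and_create_matrix_all_singles and create_scoring_matrix_visual, as shown in the diff below.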

Files changed (1)
  1. app.py +86 -69
app.py CHANGED
@@ -16,9 +16,6 @@ from huggingface_hub import hf_hub_download
 import zipfile
 import shutil
 import uuid
-import tempfile
-import atexit
-import threading
 import gc
 
 # Add current directory to path
@@ -27,12 +24,20 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 # Check if we need to download and extract the tranception module
 if not os.path.exists("tranception"):
     print("Downloading Tranception repository...")
-    # Clone the repository structure
-    os.system("git clone https://github.com/OATML-Markslab/Tranception.git temp_tranception")
-    # Move the tranception module to current directory
-    shutil.move("temp_tranception/tranception", "tranception")
-    # Clean up
-    shutil.rmtree("temp_tranception")
+    try:
+        # Clone the repository structure
+        result = os.system("git clone https://github.com/OATML-Markslab/Tranception.git temp_tranception")
+        if result != 0:
+            raise Exception("Failed to clone Tranception repository")
+        # Move the tranception module to current directory
+        shutil.move("temp_tranception/tranception", "tranception")
+        # Clean up
+        shutil.rmtree("temp_tranception")
+    except Exception as e:
+        print(f"Error setting up Tranception: {e}")
+        if os.path.exists("temp_tranception"):
+            shutil.rmtree("temp_tranception")
+        raise
 
 import tranception
 from tranception import config, model_pytorch
@@ -110,7 +115,10 @@ def create_scoring_matrix_visual(scores,sequence,image_index=0,mutation_range_st
 
     # Continue with visualization
     mutation_range_len = mutation_range_end - mutation_range_start + 1
-    fig, ax = plt.subplots(figsize=(50,mutation_range_len))
+    # Limit figure size to prevent memory issues
+    fig_width = min(50, len(AA_vocab) * 0.8)
+    fig_height = min(mutation_range_len, 50)
+    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
     scores_dict = {}
     valid_mutant_set=set(filtered_scores.mutant)
     ax.tick_params(bottom=True, top=True, left=True, right=True)
@@ -144,11 +152,15 @@ def create_scoring_matrix_visual(scores,sequence,image_index=0,mutation_range_st
 
     # Set x-axis labels (amino acids) - ensuring correct number
     heat.set_xticklabels(list(AA_vocab), fontsize=fontsize)
-    plt.tight_layout()
-    image_path = 'fitness_scoring_substitution_matrix_{}_{}.png'.format(unique_id, image_index)
-    plt.savefig(image_path,dpi=100)
-    plt.close()
-    return image_path, csv_path
+    try:
+        plt.tight_layout()
+        image_path = 'fitness_scoring_substitution_matrix_{}_{}.png'.format(unique_id, image_index)
+        plt.savefig(image_path,dpi=100)
+        return image_path, csv_path
+    finally:
+        plt.close('all')  # Ensure all figures are closed
+        plt.clf()  # Clear the current figure
+        plt.cla()  # Clear the current axes
 
 def suggest_mutations(scores):
     intro_message = "The following mutations may be sensible options to improve fitness: \n\n"
@@ -181,10 +193,6 @@ def check_valid_mutant(sequence,mutant,AA_vocab=AA_vocab):
         if to_AA not in AA_vocab: valid=False
     return valid
 
-# Global variable to track active inference threads
-active_inferences = {}
-inference_lock = threading.Lock()
-
 def cleanup_old_files(max_age_minutes=30):
     """Clean up old inference files"""
     import glob
@@ -194,14 +202,20 @@ def cleanup_old_files(max_age_minutes=30):
                 "fitness_scoring_substitution_matrix_*.csv",
                 "all_mutations_fitness_scores_*.csv"]
 
+    cleaned_count = 0
     for pattern in patterns:
        for file_path in glob.glob(pattern):
             try:
                 file_age = current_time - os.path.getmtime(file_path)
                 if file_age > max_age_minutes * 60:
                     os.remove(file_path)
-            except:
-                pass
+                    cleaned_count += 1
+            except Exception as e:
+                # Log error but continue cleaning other files
+                print(f"Warning: Could not remove {file_path}: {e}")
+
+    if cleaned_count > 0:
+        print(f"Cleaned up {cleaned_count} old files")
 
 def get_mutated_protein(sequence,mutant):
     if not check_valid_mutant(sequence,mutant):
@@ -257,55 +271,58 @@ def score_and_create_matrix_all_singles(sequence,mutation_range_start=None,mutat
     # Reduce batch size for CPU inference
     batch_size_inference = min(batch_size_inference, 10)
 
-    model.eval()
-    model.config.tokenizer = tokenizer
-
-    all_single_mutants = create_all_single_mutants(sequence,AA_vocab,mutation_range_start,mutation_range_end)
-
-    with torch.no_grad():
-        scores = model.score_mutants(DMS_data=all_single_mutants,
-                                     target_seq=sequence,
-                                     scoring_mirror=scoring_mirror,
-                                     batch_size_inference=batch_size_inference,
-                                     num_workers=num_workers,
-                                     indel_mode=False
-                                     )
-
-    scores = pd.merge(scores,all_single_mutants,on="mutated_sequence",how="left")
-    scores["position"]=scores["mutant"].map(lambda x: int(x[1:-1]))
-    scores["target_AA"] = scores["mutant"].map(lambda x: x[-1])
-
-    score_heatmaps = []
-    csv_files = []
-    mutation_range = mutation_range_end - mutation_range_start + 1
-    number_heatmaps = int((mutation_range - 1) / max_number_positions_per_heatmap) + 1
-    image_index = 0
-    window_start = mutation_range_start
-    window_end = min(mutation_range_end,mutation_range_start+max_number_positions_per_heatmap-1)
-
-    for image_index in range(number_heatmaps):
-        image_path, csv_path = create_scoring_matrix_visual(scores,sequence,image_index,window_start,window_end,AA_vocab,unique_id=unique_id)
-        score_heatmaps.append(image_path)
-        csv_files.append(csv_path)
-        window_start += max_number_positions_per_heatmap
-        window_end = min(mutation_range_end,window_start+max_number_positions_per_heatmap-1)
-
-    # Also save a comprehensive CSV with all mutations
-    comprehensive_csv_path = 'all_mutations_fitness_scores_{}.csv'.format(unique_id)
-    scores_export = scores[['mutant', 'position', 'target_AA', 'avg_score', 'mutated_sequence']].copy()
-    scores_export['original_AA'] = scores_export['mutant'].str[0]
-    scores_export = scores_export.rename(columns={'avg_score': 'fitness_score'})
-    scores_export = scores_export[['position', 'original_AA', 'target_AA', 'mutant', 'fitness_score', 'mutated_sequence']]
-    scores_export.to_csv(comprehensive_csv_path, index=False)
-    csv_files.append(comprehensive_csv_path)
-
-    # Clean up model from memory after inference
-    del model
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
+    try:
+        model.eval()
+        model.config.tokenizer = tokenizer
+
+        all_single_mutants = create_all_single_mutants(sequence,AA_vocab,mutation_range_start,mutation_range_end)
+
+        with torch.no_grad():
+            scores = model.score_mutants(DMS_data=all_single_mutants,
+                                         target_seq=sequence,
+                                         scoring_mirror=scoring_mirror,
+                                         batch_size_inference=batch_size_inference,
+                                         num_workers=num_workers,
+                                         indel_mode=False
+                                         )
+
+        scores = pd.merge(scores,all_single_mutants,on="mutated_sequence",how="left")
+        scores["position"]=scores["mutant"].map(lambda x: int(x[1:-1]))
+        scores["target_AA"] = scores["mutant"].map(lambda x: x[-1])
+
+        score_heatmaps = []
+        csv_files = []
+        mutation_range = mutation_range_end - mutation_range_start + 1
+        number_heatmaps = int((mutation_range - 1) / max_number_positions_per_heatmap) + 1
+        image_index = 0
+        window_start = mutation_range_start
+        window_end = min(mutation_range_end,mutation_range_start+max_number_positions_per_heatmap-1)
+
+        for image_index in range(number_heatmaps):
+            image_path, csv_path = create_scoring_matrix_visual(scores,sequence,image_index,window_start,window_end,AA_vocab,unique_id=unique_id)
+            score_heatmaps.append(image_path)
+            csv_files.append(csv_path)
+            window_start += max_number_positions_per_heatmap
+            window_end = min(mutation_range_end,window_start+max_number_positions_per_heatmap-1)
+
+        # Also save a comprehensive CSV with all mutations
+        comprehensive_csv_path = 'all_mutations_fitness_scores_{}.csv'.format(unique_id)
+        scores_export = scores[['mutant', 'position', 'target_AA', 'avg_score', 'mutated_sequence']].copy()
+        scores_export['original_AA'] = scores_export['mutant'].str[0]
+        scores_export = scores_export.rename(columns={'avg_score': 'fitness_score'})
+        scores_export = scores_export[['position', 'original_AA', 'target_AA', 'mutant', 'fitness_score', 'mutated_sequence']]
+        scores_export.to_csv(comprehensive_csv_path, index=False)
+        csv_files.append(comprehensive_csv_path)
+
+        return score_heatmaps, suggest_mutations(scores), csv_files
 
-    return score_heatmaps, suggest_mutations(scores), csv_files
+    finally:
+        # Always clean up model from memory
+        if 'model' in locals():
+            del model
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
 
 def extract_sequence(protein_id, taxon, sequence):
     return sequence
 
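A quick way to confirm the fix is to watch process memory around repeated scoring calls; with the finally blocks in place, usage should return to roughly its pre-call level even when a call raises. The helper below is not part of the commit, only a hedged sketch that assumes psutil is available in the Space; it can wrap any app.py call, for example score_and_create_matrix_all_singles.

# Hypothetical verification helper (not in this commit): report process and CUDA memory around a call
import gc
import psutil
import torch

def rss_mb():
    # Resident set size of the current process, in megabytes
    return psutil.Process().memory_info().rss / 1e6

def check_memory(fn, *args, **kwargs):
    gc.collect()
    before = rss_mb()
    try:
        return fn(*args, **kwargs)
    finally:
        gc.collect()
        print(f"RSS {before:.0f} MB -> {rss_mb():.0f} MB")
        if torch.cuda.is_available():
            print(f"CUDA allocated: {torch.cuda.memory_allocated()} bytes")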