pbordesinstadeep committed · Commit afac2a3 · verified · 1 Parent(s): f3db296

Update isoformer_tokenizer.py

Files changed (1):
  1. isoformer_tokenizer.py (+25 −11)
isoformer_tokenizer.py CHANGED
@@ -14,6 +14,7 @@
 # limitations under the License.
 """Tokenization classes for ESM."""
 import os
+from huggingface_hub import hf_hub_download
 from typing import List, Optional
 
 #from transformers.models.esm.tokenization_esm import PreTrainedTokenizer
@@ -38,17 +39,30 @@ class IsoformerTokenizer(PreTrainedTokenizer):
         self,
         **kwargs
     ):
-        print(f"\n>>> DEBUG: IsoformerTokenizer __init__ received kwargs: {kwargs}")
-        pretrained_model_path = kwargs.get("pretrained_model_name_or_path", "")
-        print(f">>> DEBUG: Determined pretrained_model_path: '{pretrained_model_path}'")
-
-        dna_vocab_path = os.path.join(pretrained_model_path, "dna_vocab_list.txt")
-        rna_vocab_path = os.path.join(pretrained_model_path, "rna_vocab_list.txt")
-        protein_vocab_path = os.path.join(pretrained_model_path, "protein_vocab_list.txt")
-        print(f">>> DEBUG: dna_vocab_path will be: '{dna_vocab_path}'") # Add this
-        print(f">>> DEBUG: Checking if dna_vocab_path exists: {os.path.exists(dna_vocab_path)}")
-        print(f">>> DEBUG: Checking if dna_vocab_list.txt exists in CWD: {os.path.exists('dna_vocab_list.txt')}")
-
+        # Get the model ID from kwargs
+        model_id = kwargs.get("name_or_path", None)  # This will be "InstaDeepAI/isoformer"
+
+        # Use hf_hub_download to get the local path to each vocabulary file.
+        # This function intelligently uses the local cache if the file is already downloaded.
+        if model_id:
+            try:
+                dna_vocab_path = hf_hub_download(repo_id=model_id, filename="dna_vocab_list.txt")
+                rna_vocab_path = hf_hub_download(repo_id=model_id, filename="rna_vocab_list.txt")
+                protein_vocab_path = hf_hub_download(repo_id=model_id, filename="protein_vocab_list.txt")
+            except Exception as e:
+                # Fallback in case hf_hub_download fails (e.g., if model_id was a local path, not a Hub ID).
+                # This fallback might not cover every edge case, but handles the common local-loading path.
+                print(f"Warning: Failed to resolve model files via hf_hub_download. Attempting local fallback. Error: {e}")
+                dna_vocab_path = os.path.join(model_id, "dna_vocab_list.txt")
+                rna_vocab_path = os.path.join(model_id, "rna_vocab_list.txt")
+                protein_vocab_path = os.path.join(model_id, "protein_vocab_list.txt")
+        else:
+            # Fallback if model_id is not found (unlikely for AutoTokenizer.from_pretrained)
+            print("Warning: Could not determine model_id from kwargs. Falling back to relative paths.")
+            dna_vocab_path = "dna_vocab_list.txt"
+            rna_vocab_path = "rna_vocab_list.txt"
+            protein_vocab_path = "protein_vocab_list.txt"
+
         dna_hf_tokenizer = EsmTokenizer(dna_vocab_path, model_max_length=196608)
         dna_hf_tokenizer.eos_token = None  # Stops the tokenizer adding an EOS/SEP token at the end
         dna_hf_tokenizer.init_kwargs["eos_token"] = None  # Ensures it doesn't come back when reloading
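
For readers following along, here is a minimal standalone sketch of the resolution pattern this commit adopts: attempt hf_hub_download first (which transparently reuses the local cache), and fall back to a plain filesystem join when the identifier turns out to be a local directory rather than a Hub repo ID. The helper name resolve_vocab_file is illustrative, not part of the committed code.

import os
from huggingface_hub import hf_hub_download

def resolve_vocab_file(name_or_path: str, filename: str) -> str:
    """Return a local path to `filename`, whether `name_or_path` is a Hub
    repo ID (e.g. "InstaDeepAI/isoformer") or a local directory.

    Hypothetical helper mirroring the committed logic; not part of the repo.
    """
    try:
        # Hub case: hf_hub_download serves from the local cache when the
        # file has already been fetched, so repeated calls stay cheap.
        return hf_hub_download(repo_id=name_or_path, filename=filename)
    except Exception:
        # Local case: a directory path is not a valid repo ID, so the call
        # above raises; fall back to joining the path on disk instead.
        return os.path.join(name_or_path, filename)

With the change in place, loading should work the same way from either source. A plausible usage, assuming the repo's custom tokenizer class (hence trust_remote_code=True):

from transformers import AutoTokenizer

# Hub ID: vocab files are fetched (or served from cache) via hf_hub_download.
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/isoformer", trust_remote_code=True)

# Local checkout: the except branch falls back to os.path.join on the directory.
# tokenizer = AutoTokenizer.from_pretrained("./isoformer", trust_remote_code=True)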