Samuel Thomas committed on
Commit
a6c7ab4
·
1 Parent(s): 3127c31

update tokenizer

Browse files
Files changed (1) hide show
  1. tools.py +10 -1
tools.py CHANGED
@@ -27,7 +27,7 @@ from PIL import Image
27
  from bs4 import BeautifulSoup
28
  from duckduckgo_search import DDGS
29
  from sentence_transformers import SentenceTransformer
30
- from transformers import BlipProcessor, BlipForQuestionAnswering, pipeline
31
 
32
  # LangChain Ecosystem
33
  from langchain.docstore.document import Document
@@ -71,9 +71,18 @@ def create_llm_pipeline():
71
  #model_id = "mistralai/Mistral-Small-24B-Base-2501"
72
  model_id = "mistralai/Mistral-7B-Instruct-v0.3"
73
  #model_id = "Qwen/Qwen2-7B-Instruct"
 
 
 
 
 
 
 
 
74
  return pipeline(
75
  "text-generation",
76
  model=model_id,
 
77
  device_map="auto",
78
  torch_dtype=torch.float16,
79
  max_new_tokens=1024,
 
27
  from bs4 import BeautifulSoup
28
  from duckduckgo_search import DDGS
29
  from sentence_transformers import SentenceTransformer
30
+ from transformers import BlipProcessor, BlipForQuestionAnswering, pipeline, AutoTokenizer
31
 
32
  # LangChain Ecosystem
33
  from langchain.docstore.document import Document
 
71
  #model_id = "mistralai/Mistral-Small-24B-Base-2501"
72
  model_id = "mistralai/Mistral-7B-Instruct-v0.3"
73
  #model_id = "Qwen/Qwen2-7B-Instruct"
74
+
75
+ # Load tokenizer explicitly with fast version
76
+ tokenizer = AutoTokenizer.from_pretrained(
77
+ model_id,
78
+ use_fast=True, # Force fast tokenizer
79
+ add_prefix_space=True # Only if actually needed
80
+ )
81
+
82
  return pipeline(
83
  "text-generation",
84
  model=model_id,
85
+ tokenizer = tokenizer,
86
  device_map="auto",
87
  torch_dtype=torch.float16,
88
  max_new_tokens=1024,