DragonProgrammer committed
Commit c6b11f9 · verified · 1 Parent(s): f7a97c8

Update app.py

Files changed (1)
  app.py +10 -9
app.py CHANGED
@@ -66,8 +66,7 @@ class LangChainAgentWrapper:
     def __init__(self):
         print("Initializing LangChainAgentWrapper...")
 
-        # We will use the more powerful gemma-2b-it model, but load it in 4-bit.
-        model_id = "google/gemma-2b-it"
+        model_id = "google/gemma-2b-it"
 
         try:
             hf_auth_token = os.getenv("HF_TOKEN")
@@ -76,7 +75,7 @@ class LangChainAgentWrapper:
             else:
                 print("HF_TOKEN secret found.")
 
-            # 1. Create the 4-bit quantization configuration.
+            # 1. Create the 4-bit quantization configuration
             print("Creating 4-bit quantization config...")
             quantization_config = BitsAndBytesConfig(
                 load_in_4bit=True,
@@ -86,28 +85,28 @@ class LangChainAgentWrapper:
             )
             print("Quantization config created.")
 
-            # 2. Load the tokenizer.
+            # 2. Load the tokenizer separately
             print(f"Loading tokenizer for: {model_id}")
             tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth_token)
             print("Tokenizer loaded successfully.")
 
-            # 3. Load the model with the quantization config.
+            # 3. Load the model with the quantization config
             print(f"Loading model '{model_id}' with quantization...")
             model = AutoModelForCausalLM.from_pretrained(
                 model_id,
                 quantization_config=quantization_config,
-                device_map="auto",
+                # device_map="auto", # <<<--- THIS LINE IS REMOVED
                 token=hf_auth_token
             )
             print("Model loaded successfully.")
 
-            # 4. Create the Hugging Face pipeline using the pre-loaded model and tokenizer.
+            # 4. Create the Hugging Face pipeline using the pre-loaded model and tokenizer
             print("Creating text-generation pipeline...")
             llm_pipeline = transformers.pipeline(
-                "text-generation", # Use "text-generation" for Gemma
+                "text-generation",
                 model=model,
                 tokenizer=tokenizer,
-                max_new_tokens=512 # Add max_new_tokens to prevent overly long responses
+                max_new_tokens=512
             )
             print("Model pipeline created successfully.")
 
@@ -169,7 +168,9 @@ class LangChainAgentWrapper:
     def __call__(self, question: str) -> str:
         print(f"\n--- LangChainAgentWrapper received question: {question[:100]}... ---")
         try:
+            # Invoke the agent executor
             response = self.agent_executor.invoke({"input": question})
+            # The answer is in the 'output' key of the response dictionary
             return response.get("output", "No output found.")
         except Exception as e:
             print(f"ERROR: LangChain agent execution failed: {e}")