DragonProgrammer committed · Commit 1cd3b83 · verified · 1 Parent(s): bac1dc6

Update app.py

Files changed (1)
  1. app.py +45 -13
app.py CHANGED
@@ -10,6 +10,8 @@ import re
 import requests
 import traceback
 import sys
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 
 # --- LangChain and new Transformers imports ---
 from langchain.agents import AgentExecutor, create_react_agent
@@ -61,27 +63,57 @@ def safe_calculator_func(expression: str) -> str:
         return f"Error calculating '{expression}': Invalid expression or calculation error ({e})."
 
 
-# --- LangChain Agent Definition ---
 class LangChainAgentWrapper:
     def __init__(self):
         print("Initializing LangChainAgentWrapper...")
 
-        # --- CHANGE 1: Switched to a smaller, CPU-friendly model ---
-        model_id = "google/flan-t5-base"
+        model_id = "google/gemma-2b-it"
 
         try:
-            hf_auth_token = os.getenv("HF_TOKEN") # Good practice to keep, but not needed for FLAN-T5
-
-            # --- CHANGE 2 & 3: Use the correct task for T5 and remove quantization ---
-            # We no longer need to load the tokenizer and model separately,
-            # as we are not applying a custom quantization config.
-            print(f"Loading model pipeline for: {model_id}")
+            hf_auth_token = os.getenv("HF_TOKEN")
+            if not hf_auth_token:
+                raise ValueError("HF_TOKEN secret is missing. It is required for downloading models.")
+            else:
+                print("HF_TOKEN secret found.")
+
+            # --- CORRECTED MODEL LOADING WITH QUANTIZATION ---
+
+            # 1. Create the 4-bit quantization configuration
+            print("Creating 4-bit quantization config...")
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_compute_dtype=torch.bfloat16
+            )
+            print("Quantization config created.")
+
+            # 2. Load the tokenizer separately
+            print(f"Loading tokenizer for: {model_id}")
+            tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth_token)
+            print("Tokenizer loaded successfully.")
+
+            # 3. Load the model with the quantization config
+            print(f"Loading model '{model_id}' with quantization...")
+            model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                quantization_config=quantization_config,
+                device_map="auto",
+                token=hf_auth_token
+            )
+            print("Model loaded successfully.")
+
+            # 4. Create the Hugging Face pipeline using the pre-loaded model and tokenizer
+            print("Creating text-generation pipeline...")
             llm_pipeline = transformers.pipeline(
-                "text2text-generation", # <<< IMPORTANT: Changed task for T5 models
-                model=model_id,
-                device_map="auto"
+                "text-generation",
+                model=model,
+                tokenizer=tokenizer,
+                max_new_tokens=512 # Add max_new_tokens to prevent overly long responses
             )
-            print("Model pipeline loaded successfully.")
+            print("Model pipeline created successfully.")
+
+            # --- END OF CORRECTION ---
 
             # Wrap the pipeline in a LangChain LLM object
             self.llm = HuggingFacePipeline(pipeline=llm_pipeline)
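
The loading sequence introduced by this commit can be exercised on its own as a quick smoke test. The snippet below is a sketch and is not part of the commit: it assumes the same model id and HF_TOKEN environment variable, and it assumes a CUDA-capable machine, since 4-bit loading through bitsandbytes generally requires a GPU.

    # Sketch: standalone check of the same 4-bit loading path used in app.py.
    # Assumes HF_TOKEN is set and a CUDA GPU is available for bitsandbytes.
    import os
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

    model_id = "google/gemma-2b-it"
    hf_auth_token = os.getenv("HF_TOKEN")

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        device_map="auto",
        token=hf_auth_token,
    )

    # Rough size check: the 4-bit weights of a 2B model should land well below
    # the full-precision footprint.
    print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")

    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=32)
    print(generator("Question: What is 12 * 7?\nAnswer:")[0]["generated_text"])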
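
The diff ends where the pipeline is wrapped in a HuggingFacePipeline object; how that self.llm is later combined with create_react_agent and the existing safe_calculator_func is not shown in this commit. The sketch below is one plausible wiring and assumes a generic public ReAct prompt (hwchase17/react) and a single calculator Tool, neither of which is confirmed by the diff.

    # Sketch (not part of this commit): wiring the wrapped LLM into a ReAct agent.
    # Assumes this code lives in app.py, where safe_calculator_func is defined.
    from langchain import hub
    from langchain.agents import AgentExecutor, create_react_agent
    from langchain.tools import Tool

    def build_agent_executor(llm) -> AgentExecutor:
        tools = [
            Tool(
                name="calculator",
                func=safe_calculator_func,  # defined earlier in app.py
                description="Evaluates a simple arithmetic expression and returns the result as text.",
            )
        ]
        # A generic, publicly available ReAct prompt; the prompt actually used
        # by the Space may differ.
        prompt = hub.pull("hwchase17/react")
        agent = create_react_agent(llm=llm, tools=tools, prompt=prompt)
        return AgentExecutor(agent=agent, tools=tools, verbose=True, handle_parsing_errors=True)

Inside LangChainAgentWrapper.__init__, this could be called as self.agent_executor = build_agent_executor(self.llm) right after the HuggingFacePipeline wrapping, though the attribute names used in the rest of app.py are not visible in this diff.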