DragonProgrammer committed
Commit 765e810 · verified · 1 parent: dd964ab

Update app.py

Files changed (1): app.py (+40 -18)
app.py CHANGED
@@ -78,31 +78,55 @@ class LangChainAgentWrapper:
     def __init__(self):
         print("Initializing LangChainAgentWrapper...")

-        # Switched to a smaller, CPU-friendly instruction-tuned model
-        model_id = "google/flan-t5-base"
+        # We will use the more powerful gemma-2b-it model, but load it in 4-bit.
+        model_id = "google/gemma-2b-it"

         try:
             hf_auth_token = os.getenv("HF_TOKEN")
-            print(f"Loading model pipeline for: {model_id}")
-
-            # We load the model and tokenizer objects first
-            tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
-            model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_id)
+            if not hf_auth_token:
+                raise ValueError("HF_TOKEN secret is missing. It is required for downloading models.")
+            else:
+                print("HF_TOKEN secret found.")
+
+            # 1. Create the 4-bit quantization configuration.
+            print("Creating 4-bit quantization config...")
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_compute_dtype=torch.bfloat16
+            )
+            print("Quantization config created.")
+
+            # 2. Load the tokenizer.
+            print(f"Loading tokenizer for: {model_id}")
+            tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth_token)
+            print("Tokenizer loaded successfully.")
+
+            # 3. Load the model with the quantization config.
+            print(f"Loading model '{model_id}' with quantization...")
+            model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                quantization_config=quantization_config,
+                device_map="auto",
+                token=hf_auth_token
+            )
+            print("Model loaded successfully.")

-            # Now we use our custom FlanT5Pipeline class
-            llm_pipeline = FlanT5Pipeline(
-                task="text2text-generation",
+            # 4. Create the Hugging Face pipeline using the pre-loaded model and tokenizer.
+            print("Creating text-generation pipeline...")
+            llm_pipeline = transformers.pipeline(
+                "text-generation",  # Use "text-generation" for Gemma
                 model=model,
                 tokenizer=tokenizer,
-                device_map="auto",
-                max_new_tokens=512
+                max_new_tokens=512  # Add max_new_tokens to prevent overly long responses
             )
-            print("Model pipeline loaded successfully.")
+            print("Model pipeline created successfully.")

             # Wrap the pipeline in a LangChain LLM object
             self.llm = HuggingFacePipeline(pipeline=llm_pipeline)

-            # Define the list of LangChain tools (this part is unchanged and correct)
+            # Define the list of LangChain tools (this part is correct)
             self.tools = [
                 Tool(
                     name="get_current_time_in_timezone",
@@ -118,7 +142,7 @@ class LangChainAgentWrapper:
             ]
             print(f"Tools prepared for agent: {[tool.name for tool in self.tools]}")

-            # Create the ReAct agent prompt (this part is unchanged and correct)
+            # Create the ReAct agent prompt (this part is correct)
             react_prompt = PromptTemplate.from_template(
                 """
                 You are a helpful assistant. Answer the following questions as best you can.
@@ -144,7 +168,7 @@ class LangChainAgentWrapper:
                 """
             )

-            # Create the agent and executor (this part is unchanged and correct)
+            # Create the agent and executor (this part is correct)
             agent = create_react_agent(self.llm, self.tools, react_prompt)
             self.agent_executor = AgentExecutor(agent=agent, tools=self.tools, verbose=True, handle_parsing_errors=True)
             print("LangChain agent created successfully.")
@@ -157,9 +181,7 @@ class LangChainAgentWrapper:
     def __call__(self, question: str) -> str:
        print(f"\n--- LangChainAgentWrapper received question: {question[:100]}... ---")
        try:
-            # Invoke the agent executor
            response = self.agent_executor.invoke({"input": question})
-            # The answer is in the 'output' key of the response dictionary
            return response.get("output", "No output found.")
        except Exception as e:
            print(f"ERROR: LangChain agent execution failed: {e}")
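The new hunk calls torch, BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM, and transformers.pipeline, but the import block sits outside the diff context. Below is a minimal standalone sketch of the loading path this commit switches to, assuming a transformers release with Gemma support plus the accelerate and bitsandbytes packages, and an HF_TOKEN authorized for the gated google/gemma-2b-it checkpoint:

# Standalone sketch of the commit's 4-bit loading path; the imports are
# assumptions inferred from the diff, since app.py's import block is not shown.
import os

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "google/gemma-2b-it"
hf_auth_token = os.getenv("HF_TOKEN")  # must be set and authorized for the gated model

# NF4 4-bit weights with double quantization and bfloat16 compute, matching the diff.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
    token=hf_auth_token,
)

llm_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
)
print(llm_pipeline("Briefly explain 4-bit quantization.")[0]["generated_text"])

One caveat: bitsandbytes 4-bit loading generally expects a CUDA GPU, so this path will likely fail at load time on a CPU-only Space, whereas the flan-t5-base setup it replaces was chosen to be CPU-friendly.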
 
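For reference, a hypothetical smoke test of the updated wrapper; LangChainAgentWrapper and its get_current_time_in_timezone tool come from app.py, while the question itself is illustrative:

# Hypothetical usage of the wrapper defined in app.py.
agent = LangChainAgentWrapper()  # loads gemma-2b-it in 4-bit and builds the ReAct agent
answer = agent("What is the current time in Europe/London?")
print(answer)  # the 'output' value from agent_executor.invoke, or "No output found."

One design note: a "text-generation" pipeline returns the prompt together with the completion by default, which can confuse ReAct output parsing; passing return_full_text=False to transformers.pipeline is a common mitigation, though this commit does not apply it.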