DragonProgrammer committed
Commit c6b11f9 · verified · 1 Parent(s): f7a97c8

Update app.py

Files changed (1)
  app.py +10 -9
app.py CHANGED
@@ -66,8 +66,7 @@ class LangChainAgentWrapper:
     def __init__(self):
         print("Initializing LangChainAgentWrapper...")
 
-        # We will use the more powerful gemma-2b-it model, but load it in 4-bit.
-        model_id = "google/gemma-2b-it"
+        model_id = "google/gemma-2b-it"
 
         try:
             hf_auth_token = os.getenv("HF_TOKEN")
@@ -76,7 +75,7 @@ class LangChainAgentWrapper:
             else:
                 print("HF_TOKEN secret found.")
 
-            # 1. Create the 4-bit quantization configuration.
+            # 1. Create the 4-bit quantization configuration
             print("Creating 4-bit quantization config...")
             quantization_config = BitsAndBytesConfig(
                 load_in_4bit=True,
@@ -86,28 +85,28 @@ class LangChainAgentWrapper:
             )
             print("Quantization config created.")
 
-            # 2. Load the tokenizer.
+            # 2. Load the tokenizer separately
             print(f"Loading tokenizer for: {model_id}")
             tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth_token)
             print("Tokenizer loaded successfully.")
 
-            # 3. Load the model with the quantization config.
+            # 3. Load the model with the quantization config
             print(f"Loading model '{model_id}' with quantization...")
             model = AutoModelForCausalLM.from_pretrained(
                 model_id,
                 quantization_config=quantization_config,
-                device_map="auto",
+                # device_map="auto", # <<<--- THIS LINE IS REMOVED
                 token=hf_auth_token
             )
             print("Model loaded successfully.")
 
-            # 4. Create the Hugging Face pipeline using the pre-loaded model and tokenizer.
+            # 4. Create the Hugging Face pipeline using the pre-loaded model and tokenizer
             print("Creating text-generation pipeline...")
             llm_pipeline = transformers.pipeline(
-                "text-generation", # Use "text-generation" for Gemma
+                "text-generation",
                 model=model,
                 tokenizer=tokenizer,
-                max_new_tokens=512 # Add max_new_tokens to prevent overly long responses
+                max_new_tokens=512
             )
             print("Model pipeline created successfully.")
 
@@ -169,7 +168,9 @@ class LangChainAgentWrapper:
     def __call__(self, question: str) -> str:
         print(f"\n--- LangChainAgentWrapper received question: {question[:100]}... ---")
         try:
+            # Invoke the agent executor
             response = self.agent_executor.invoke({"input": question})
+            # The answer is in the 'output' key of the response dictionary
             return response.get("output", "No output found.")
         except Exception as e:
             print(f"ERROR: LangChain agent execution failed: {e}")