naman1102 commited on
Commit
a14b206
·
1 Parent(s): 1d09500
Files changed (3) hide show
  1. agent.py +4 -2
  2. app.py +7 -5
  3. tools.py +58 -29
agent.py CHANGED
@@ -14,7 +14,8 @@ from tools import (
14
  arxiv_search_tool,
15
  audio_transcriber_tool,
16
  excel_tool,
17
- analyze_code_tool
 
18
  )
19
 
20
  # ─────────────────────────── Configuration ───────────────────────────────
@@ -41,7 +42,8 @@ def build_graph():
41
  arxiv_search_tool,
42
  audio_transcriber_tool,
43
  excel_tool,
44
- analyze_code_tool
 
45
  ]
46
 
47
  # Create the react agent - it will use the system prompt from the messages
 
14
  arxiv_search_tool,
15
  audio_transcriber_tool,
16
  excel_tool,
17
+ analyze_code_tool,
18
+ image_tool
19
  )
20
 
21
  # ─────────────────────────── Configuration ───────────────────────────────
 
42
  arxiv_search_tool,
43
  audio_transcriber_tool,
44
  excel_tool,
45
+ analyze_code_tool,
46
+ image_tool
47
  ]
48
 
49
  # Create the react agent - it will use the system prompt from the messages
app.py CHANGED
@@ -13,14 +13,16 @@ from state import AgentState
13
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
14
 
15
  SYSTEM_PROMPT = """
16
- You are a general AI assistant. I will ask you a question.
17
- Report your thoughts in brief, and finish your answer with the following template:
18
- FINAL ANSWER: [YOUR FINAL ANSWER]
19
 
20
  IMPORTANT: When using tools that require file access (such as audio_transcriber_tool, excel_tool, analyze_code_tool, or image_tool), ALWAYS use the task_id parameter only. Do NOT use any file names mentioned by the user - ignore them completely and only pass the task_id.
21
 
22
- YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
23
- When using tools that require file access (such as audio_transcriber_tool, excel_tool, analyze_code_tool, or image_tool), ALWAYS use the task_id parameter only. Do NOT use any file names mentioned by the user - ignore them completely and only pass the task_id thats provided to you in the state.
 
 
 
 
24
  """
25
 
26
 
 
13
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
14
 
15
  SYSTEM_PROMPT = """
16
+ You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 
 
17
 
18
  IMPORTANT: When using tools that require file access (such as audio_transcriber_tool, excel_tool, analyze_code_tool, or image_tool), ALWAYS use the task_id parameter only. Do NOT use any file names mentioned by the user - ignore them completely and only pass the task_id.
19
 
20
+ SEARCH STRATEGY:
21
+ - If wikipedia_search_tool fails or returns insufficient/irrelevant results, try these fallback strategies:
22
+ 1. Try wikipedia_search_tool again with a broader, more general query (remove specific terms, use synonyms)
23
+ 2. If Wikipedia still doesn't help, try arxiv_search_tool for academic/research topics
24
+ 3. You can use multiple search attempts with different keywords to find better information
25
+ - Always evaluate if the search results are relevant and sufficient before proceeding to your final answer
26
  """
27
 
28
 
tools.py CHANGED
@@ -47,78 +47,107 @@ def image_tool(task_id: str) -> str:
47
  Returns: "OCR text + brief caption or an error message"
48
 
49
  """
50
- print("reached image_tool")
51
- # path_or_id = state.get("ocr_path", "")
 
 
 
52
  for ext in ("png", "jpg", "jpeg"):
53
- candidate = _download_file_for_task(task_id, ext)
54
- if candidate:
55
- local_img = candidate
56
- break
 
 
 
 
57
 
58
  if not local_img or not os.path.exists(local_img):
59
- return {
60
- "ocr_path": None,
61
- "ocr_result": "Error: No image file found (local nonexistent or download failed)."
62
- }
63
 
64
  # 2) Read raw bytes
65
  try:
 
66
  with open(local_img, "rb") as f:
67
  image_bytes = f.read()
 
68
  except Exception as e:
69
- return f"Error reading image file: {e}"
70
-
 
71
 
72
  # 3) Prepare HF Inference headers
73
  hf_token = os.getenv("HF_TOKEN")
74
  if not hf_token:
75
- return "Error: HUGGINGFACE_API_KEY not set in environment."
76
-
 
77
 
78
  headers = {"Authorization": f"Bearer {hf_token}"}
 
79
 
80
- # 4) Call HF's vision-ocr to extract text
81
  ocr_text = ""
82
  try:
 
83
  ocr_resp = requests.post(
84
- "https://api-inference.huggingface.co/models/google/vit-ocr",
85
  headers=headers,
86
  files={"file": image_bytes},
87
  timeout=30
88
  )
 
89
  ocr_resp.raise_for_status()
90
  ocr_json = ocr_resp.json()
91
-
92
- # The JSON has "pages" → list of blocks → "lines" → each line has "text"
93
- lines = []
94
- for page in ocr_json.get("pages", []):
95
- for line in page.get("lines", []):
96
- lines.append(line.get("text", "").strip())
97
- ocr_text = "\n".join(lines).strip() or "(no visible text)"
 
 
 
 
 
98
  except Exception as e:
99
  ocr_text = f"Error during HF OCR: {e}"
 
100
 
101
- # 5) Call HF's image-captioning to get a brief description
102
  caption = ""
103
  try:
 
104
  cap_resp = requests.post(
105
  "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base",
106
  headers=headers,
107
  files={"file": image_bytes},
108
  timeout=30
109
  )
 
110
  cap_resp.raise_for_status()
111
  cap_json = cap_resp.json()
112
- # The response looks like: {"generated_text": "...caption..."}
113
- caption = cap_json.get("generated_text", "").strip()
 
 
 
 
 
 
114
  if not caption:
115
- caption = "(no caption returned)"
 
116
  except Exception as e:
117
  caption = f"Error during HF captioning: {e}"
 
118
 
119
  # 6) Combine OCR + caption
120
  combined = f"OCR text:\n{ocr_text}\n\nImage caption:\n{caption}"
121
- print("combined: ")
122
  return combined
123
 
124
  @tool
@@ -289,7 +318,7 @@ def analyze_code_tool(task_id: str) -> str:
289
  # """
290
  # Expects: state["web_search_query"] is a non‐empty string.
291
  # Returns: {"web_search_query": None, "web_search_result": <string>}.
292
- # Retries up to 5 times on either a DuckDuckGo "202 Ratelimit" response or any exception (e.g. timeout).
293
  # """
294
  # print("reached web_search_tool")
295
  # query = state.get("web_search_query", "")
 
47
  Returns: "OCR text + brief caption or an error message"
48
 
49
  """
50
+ print(f"DEBUG: image_tool called with task_id: {task_id}")
51
+
52
+ local_img = None # Initialize the variable
53
+
54
+ # Try to download image file with different extensions
55
  for ext in ("png", "jpg", "jpeg"):
56
+ print(f"DEBUG: Trying to download {task_id}.{ext}")
57
+ candidate = _download_file_for_task(task_id, ext)
58
+ if candidate:
59
+ local_img = candidate
60
+ print(f"DEBUG: Successfully downloaded image: {local_img}")
61
+ break
62
+ else:
63
+ print(f"DEBUG: Failed to download {task_id}.{ext}")
64
 
65
  if not local_img or not os.path.exists(local_img):
66
+ error_msg = f"Error: No image file found for task_id {task_id} (tried png, jpg, jpeg extensions)"
67
+ print(f"DEBUG: {error_msg}")
68
+ return error_msg
 
69
 
70
  # 2) Read raw bytes
71
  try:
72
+ print(f"DEBUG: Reading image file: {local_img}")
73
  with open(local_img, "rb") as f:
74
  image_bytes = f.read()
75
+ print(f"DEBUG: Successfully read {len(image_bytes)} bytes from image")
76
  except Exception as e:
77
+ error_msg = f"Error reading image file: {e}"
78
+ print(f"DEBUG: {error_msg}")
79
+ return error_msg
80
 
81
  # 3) Prepare HF Inference headers
82
  hf_token = os.getenv("HF_TOKEN")
83
  if not hf_token:
84
+ error_msg = "Error: HF_TOKEN not set in environment."
85
+ print(f"DEBUG: {error_msg}")
86
+ return error_msg
87
 
88
  headers = {"Authorization": f"Bearer {hf_token}"}
89
+ print("DEBUG: HF token found, proceeding with API calls")
90
 
91
+ # 4) Call HF's vision-ocr to extract text
92
  ocr_text = ""
93
  try:
94
+ print("DEBUG: Calling HF OCR API...")
95
  ocr_resp = requests.post(
96
+ "https://api-inference.huggingface.co/models/microsoft/trocr-base-printed",
97
  headers=headers,
98
  files={"file": image_bytes},
99
  timeout=30
100
  )
101
+ print(f"DEBUG: OCR API response status: {ocr_resp.status_code}")
102
  ocr_resp.raise_for_status()
103
  ocr_json = ocr_resp.json()
104
+ print(f"DEBUG: OCR API response: {ocr_json}")
105
+
106
+ # Handle different response formats
107
+ if isinstance(ocr_json, list) and len(ocr_json) > 0:
108
+ # If it's a list, take the first result
109
+ ocr_text = ocr_json[0].get("generated_text", "").strip()
110
+ elif isinstance(ocr_json, dict):
111
+ ocr_text = ocr_json.get("generated_text", "").strip()
112
+
113
+ if not ocr_text:
114
+ ocr_text = "(no visible text detected)"
115
+ print(f"DEBUG: Extracted OCR text: {ocr_text}")
116
  except Exception as e:
117
  ocr_text = f"Error during HF OCR: {e}"
118
+ print(f"DEBUG: OCR failed: {e}")
119
 
120
+ # 5) Call HF's image-captioning to get a brief description
121
  caption = ""
122
  try:
123
+ print("DEBUG: Calling HF Image Captioning API...")
124
  cap_resp = requests.post(
125
  "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base",
126
  headers=headers,
127
  files={"file": image_bytes},
128
  timeout=30
129
  )
130
+ print(f"DEBUG: Captioning API response status: {cap_resp.status_code}")
131
  cap_resp.raise_for_status()
132
  cap_json = cap_resp.json()
133
+ print(f"DEBUG: Captioning API response: {cap_json}")
134
+
135
+ # Handle different response formats
136
+ if isinstance(cap_json, list) and len(cap_json) > 0:
137
+ caption = cap_json[0].get("generated_text", "").strip()
138
+ elif isinstance(cap_json, dict):
139
+ caption = cap_json.get("generated_text", "").strip()
140
+
141
  if not caption:
142
+ caption = "(no caption generated)"
143
+ print(f"DEBUG: Generated caption: {caption}")
144
  except Exception as e:
145
  caption = f"Error during HF captioning: {e}"
146
+ print(f"DEBUG: Captioning failed: {e}")
147
 
148
  # 6) Combine OCR + caption
149
  combined = f"OCR text:\n{ocr_text}\n\nImage caption:\n{caption}"
150
+ print(f"DEBUG: Final result: {combined}")
151
  return combined
152
 
153
  @tool
 
318
  # """
319
  # Expects: state["web_search_query"] is a non‐empty string.
320
  # Returns: {"web_search_query": None, "web_search_result": <string>}.
321
+ # Retries up to 5 times on either a DuckDuckGo "202 Ratelimit" response or any exception (e.g. timeout).
322
  # """
323
  # print("reached web_search_tool")
324
  # query = state.get("web_search_query", "")