Final_Assignment_Template

Sleeping

App Files Files Community

Facelook committed on Apr 29

Commit

771c32e

1 Parent(s): 65a76ae

Added use of NLP to break down question to improve accuracy of web search.

Browse files

Files changed (3) hide show

agent_tools.py +2 -7
app.py +33 -23
requirements.txt +2 -1

agent_tools.py CHANGED Viewed

@@ -51,9 +51,7 @@ def duckduckgo_search(query: str, count: int = 3) -> list:
                 #    "snippet": snippet
                 # })
-                results.append({
-                    "summary": snippet
-                })
                 if len(results) >= count:
                     break
@@ -104,16 +102,13 @@ def langsearch_search(query: str, count: int = 5) -> list:
         response = requests.post(url, headers=headers, data=payload, timeout=30)
         response.raise_for_status()
-        print(f"LangSearch response status code: {response.status_code}")
         if response.status_code != 200:
             print(f"LangSearch API error: {response.text}")
             return []
         response = response.json()
         results = []
         for result in response["data"]["webPages"]["value"]:
-            results.append({
-                "summary": result["summary"]
-            })
         print(f"LangSearch results: {results}")
         return results
     except Exception as e:

                 #    "snippet": snippet
                 # })
+                results.append(snippet)
                 if len(results) >= count:
                     break
         response = requests.post(url, headers=headers, data=payload, timeout=30)
         response.raise_for_status()
         if response.status_code != 200:
             print(f"LangSearch API error: {response.text}")
             return []
         response = response.json()
         results = []
         for result in response["data"]["webPages"]["value"]:
+            results.append(result["summary"])
         print(f"LangSearch results: {results}")
         return results
     except Exception as e:

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import gradio as gr
 import requests
 import json
 import pandas as pd
 from openai import OpenAI
 from agent_tools import duckduckgo_search, langsearch_search, TOOLS_MAPPING, TOOLS_DEFINITION
@@ -12,8 +13,6 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # --- Basic Agent Definition ---
 # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
@@ -23,23 +22,34 @@ class BasicAgent:
         print(f"Agent received question: {question}")
         try:
-            content = "You are an assistant that has access to the following set of tools. Read the question carefully and do not report your thoughts, explanations, reasoning, or conclusion. Always use RAG. If you know the answer, give only YOUR FINAL ANSWER. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. On the other hand, if you really don't know the answer after your best efforts, break down the question and list all search queries in a string array."
             count = 0
-            # Call duckduckgo_search function
-            # search_results = duckduckgo_search(query=question, count=10)
-            # if len(search_results) > 0:
-            #    # Convert search results to a readable text format
-            #    search_results_text = ""
-            #    for i, result in enumerate(search_results, 1):
-            #        count += 1
-            #        search_results_text += f"\n\n---SEARCH RESULT #{count}---\n"
-            #        search_results_text += f"{search_results[i - 1]}"
-            #    content += f"\n\nThe following are the results from the DuckDuckGo API, you may use it as reference on top of your knowledge base: {search_results_text}"
             # Call langsearch_search function
-            search_results = langsearch_search(query=question, count=5)
             if len(search_results) > 0:
                 # Convert search results to a readable text format
                 search_results_text = ""
@@ -47,9 +57,9 @@ class BasicAgent:
                     count += 1
                     search_results_text += f"\n\n---SEARCH RESULT #{count}---\n"
                     search_results_text += f"{search_results[i - 1]}"
-                content += f"\n\nThe following are the results from the LangSearch API, you may use it as reference on top of your knowledge base: {search_results_text}"
-            #print(f"Content for system message: {content}")
             messages = [
                 {
@@ -244,12 +254,12 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         #     'Level': '1',
         #     'file_name': 'f918266a-b3e0-4914-865d-4faa564f1aef.py'
         # },
-        {
-            'task_id': '3f57289b-8c60-48be-bd80-01f8099ca449',
-            'question': 'How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?',
-            'Level': '1',
-            'file_name': ''
-        },
         # {
         #     'task_id': '1f975693-876d-457b-a649-393859e79bf3',
         #     'question': "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",

 import requests
 import json
 import pandas as pd
+import spacy
 from openai import OpenAI
 from agent_tools import duckduckgo_search, langsearch_search, TOOLS_MAPPING, TOOLS_DEFINITION
 # --- Basic Agent Definition ---
 # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
         print(f"Agent received question: {question}")
         try:
             count = 0
+            # content = "You are an assistant that has access to the following set of tools. Read the question carefully and do not report your thoughts, explanations, reasoning, or conclusion. Always use RAG. If you know the answer, give only YOUR FINAL ANSWER. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. On the other hand, if you really don't know the answer after your best efforts, break down the question and list all search queries in a string array."
+            content = "You are an assistant that has access to the following set of tools. Read the question carefully and do not report your thoughts, explanations, reasoning, or conclusion. Give only YOUR FINAL ANSWER. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. If you don't have complete certainty, you must still provide your best answer based on the information available to you. Always provide an answer rather than expressing uncertainty - use your best judgment to determine the most likely correct response."
+            nlp = spacy.load("en_core_web_sm")
+            doc = nlp(question)
+            keywords = [token.text for token in doc if token.pos_ in ['NOUN', 'PROPN']]
+            entities = entities = [ent.text for ent in doc.ents if ent.label_ in ['PRODUCT', 'WORK_OF_ART', 'EVENT', 'ORG', 'FAC', 'GPE', 'NORP', 'LOC', 'LANGUAGE', 'PERSON', 'DATE', 'TIME', 'MONEY', 'LAW']]
+            print("Keywords:", keywords)
+            print("Entities:", entities)
             # Call langsearch_search function
+            #search_results = langsearch_search(query=question, count=10)
+            # Use entities for search if available, otherwise use the original question
+            search_query = ""
+            if entities:
+                search_query = " ".join(entities)
+                print(f"Using entities for search query: '{search_query}'")
+            else:
+                # Fallback: If no specific entities are found, use keywords or the original question
+                if keywords:
+                    search_query = " ".join(keywords)
+                    print(f"No entities found, using keywords for search query: '{search_query}'")
+                else:
+                    search_query = question
+                    print("No entities or keywords found, using original question for search query.")
+            search_results = langsearch_search(query=search_query, count=10)
             if len(search_results) > 0:
                 # Convert search results to a readable text format
                 search_results_text = ""
                     count += 1
                     search_results_text += f"\n\n---SEARCH RESULT #{count}---\n"
                     search_results_text += f"{search_results[i - 1]}"
+                content += f"\n\nThe following are the results from the LangSearch API, use it as reference along with your own knowledge base to provide the most accurate answer: {search_results_text}"
+            # print(f"Content for system message: {content}")
             messages = [
                 {
         #     'Level': '1',
         #     'file_name': 'f918266a-b3e0-4914-865d-4faa564f1aef.py'
         # },
+        # {
+        #    'task_id': '3f57289b-8c60-48be-bd80-01f8099ca449',
+        #    'question': 'How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?',
+        #    'Level': '1',
+        #    'file_name': ''
+        # },
         # {
         #     'task_id': '1f975693-876d-457b-a649-393859e79bf3',
         #     'question': "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",

requirements.txt CHANGED Viewed

@@ -1,7 +1,8 @@
 gradio
 requests
 huggingface_hub
 openai
 bs4
 itsdangerous
-huggingface_hub[cli]

 gradio
 requests
 huggingface_hub
+huggingface_hub[cli]
 openai
 bs4
 itsdangerous
+spacy