Spaces:
Sleeping
Sleeping
Added use of NLP to break down question to improve accuracy of web search.
Browse files
- agent_tools.py +2 -7
- app.py +33 -23
- requirements.txt +2 -1
agent_tools.py
CHANGED
@@ -51,9 +51,7 @@ def duckduckgo_search(query: str, count: int = 3) -> list:
|
|
51 |
# "snippet": snippet
|
52 |
# })
|
53 |
|
54 |
-
results.append(
|
55 |
-
"summary": snippet
|
56 |
-
})
|
57 |
|
58 |
if len(results) >= count:
|
59 |
break
|
@@ -104,16 +102,13 @@ def langsearch_search(query: str, count: int = 5) -> list:
|
|
104 |
|
105 |
response = requests.post(url, headers=headers, data=payload, timeout=30)
|
106 |
response.raise_for_status()
|
107 |
-
print(f"LangSearch response status code: {response.status_code}")
|
108 |
if response.status_code != 200:
|
109 |
print(f"LangSearch API error: {response.text}")
|
110 |
return []
|
111 |
response = response.json()
|
112 |
results = []
|
113 |
for result in response["data"]["webPages"]["value"]:
|
114 |
-
results.append(
|
115 |
-
"summary": result["summary"]
|
116 |
-
})
|
117 |
print(f"LangSearch results: {results}")
|
118 |
return results
|
119 |
except Exception as e:
|
|
|
51 |
# "snippet": snippet
|
52 |
# })
|
53 |
|
54 |
+
results.append(snippet)
|
|
|
|
|
55 |
|
56 |
if len(results) >= count:
|
57 |
break
|
|
|
102 |
|
103 |
response = requests.post(url, headers=headers, data=payload, timeout=30)
|
104 |
response.raise_for_status()
|
|
|
105 |
if response.status_code != 200:
|
106 |
print(f"LangSearch API error: {response.text}")
|
107 |
return []
|
108 |
response = response.json()
|
109 |
results = []
|
110 |
for result in response["data"]["webPages"]["value"]:
|
111 |
+
results.append(result["summary"])
|
|
|
|
|
112 |
print(f"LangSearch results: {results}")
|
113 |
return results
|
114 |
except Exception as e:
|
app.py
CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
|
|
3 |
import requests
|
4 |
import json
|
5 |
import pandas as pd
|
|
|
6 |
from openai import OpenAI
|
7 |
from agent_tools import duckduckgo_search, langsearch_search, TOOLS_MAPPING, TOOLS_DEFINITION
|
8 |
|
@@ -12,8 +13,6 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
12 |
|
13 |
# --- Basic Agent Definition ---
|
14 |
# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
|
15 |
-
|
16 |
-
|
17 |
class BasicAgent:
|
18 |
def __init__(self):
|
19 |
print("BasicAgent initialized.")
|
@@ -23,23 +22,34 @@ class BasicAgent:
|
|
23 |
print(f"Agent received question: {question}")
|
24 |
|
25 |
try:
|
26 |
-
content = "You are an assistant that has access to the following set of tools. Read the question carefully and do not report your thoughts, explanations, reasoning, or conclusion. Always use RAG. If you know the answer, give only YOUR FINAL ANSWER. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. On the other hand, if you really don't know the answer after your best efforts, break down the question and list all search queries in a string array."
|
27 |
-
|
28 |
count = 0
|
29 |
|
30 |
-
#
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
# content += f"\n\nThe following are the results from the DuckDuckGo API, you may use it as reference on top of your knowledge base: {search_results_text}"
|
40 |
|
41 |
# Call langsearch_search function
|
42 |
-
search_results = langsearch_search(query=question, count=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
if len(search_results) > 0:
|
44 |
# Convert search results to a readable text format
|
45 |
search_results_text = ""
|
@@ -47,9 +57,9 @@ class BasicAgent:
|
|
47 |
count += 1
|
48 |
search_results_text += f"\n\n---SEARCH RESULT #{count}---\n"
|
49 |
search_results_text += f"{search_results[i - 1]}"
|
50 |
-
content += f"\n\nThe following are the results from the LangSearch API,
|
51 |
|
52 |
-
#print(f"Content for system message: {content}")
|
53 |
|
54 |
messages = [
|
55 |
{
|
@@ -244,12 +254,12 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
244 |
# 'Level': '1',
|
245 |
# 'file_name': 'f918266a-b3e0-4914-865d-4faa564f1aef.py'
|
246 |
# },
|
247 |
-
{
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
},
|
253 |
# {
|
254 |
# 'task_id': '1f975693-876d-457b-a649-393859e79bf3',
|
255 |
# 'question': "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
|
|
|
3 |
import requests
|
4 |
import json
|
5 |
import pandas as pd
|
6 |
+
import spacy
|
7 |
from openai import OpenAI
|
8 |
from agent_tools import duckduckgo_search, langsearch_search, TOOLS_MAPPING, TOOLS_DEFINITION
|
9 |
|
|
|
13 |
|
14 |
# --- Basic Agent Definition ---
|
15 |
# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
|
|
|
|
|
16 |
class BasicAgent:
|
17 |
def __init__(self):
|
18 |
print("BasicAgent initialized.")
|
|
|
22 |
print(f"Agent received question: {question}")
|
23 |
|
24 |
try:
|
|
|
|
|
25 |
count = 0
|
26 |
|
27 |
+
# content = "You are an assistant that has access to the following set of tools. Read the question carefully and do not report your thoughts, explanations, reasoning, or conclusion. Always use RAG. If you know the answer, give only YOUR FINAL ANSWER. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. On the other hand, if you really don't know the answer after your best efforts, break down the question and list all search queries in a string array."
|
28 |
+
content = "You are an assistant that has access to the following set of tools. Read the question carefully and do not report your thoughts, explanations, reasoning, or conclusion. Give only YOUR FINAL ANSWER. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. If you don't have complete certainty, you must still provide your best answer based on the information available to you. Always provide an answer rather than expressing uncertainty - use your best judgment to determine the most likely correct response."
|
29 |
+
|
30 |
+
nlp = spacy.load("en_core_web_sm")
|
31 |
+
doc = nlp(question)
|
32 |
+
keywords = [token.text for token in doc if token.pos_ in ['NOUN', 'PROPN']]
|
33 |
+
entities = entities = [ent.text for ent in doc.ents if ent.label_ in ['PRODUCT', 'WORK_OF_ART', 'EVENT', 'ORG', 'FAC', 'GPE', 'NORP', 'LOC', 'LANGUAGE', 'PERSON', 'DATE', 'TIME', 'MONEY', 'LAW']]
|
34 |
+
print("Keywords:", keywords)
|
35 |
+
print("Entities:", entities)
|
|
|
36 |
|
37 |
# Call langsearch_search function
|
38 |
+
#search_results = langsearch_search(query=question, count=10)
|
39 |
+
# Use entities for search if available, otherwise use the original question
|
40 |
+
search_query = ""
|
41 |
+
if entities:
|
42 |
+
search_query = " ".join(entities)
|
43 |
+
print(f"Using entities for search query: '{search_query}'")
|
44 |
+
else:
|
45 |
+
# Fallback: If no specific entities are found, use keywords or the original question
|
46 |
+
if keywords:
|
47 |
+
search_query = " ".join(keywords)
|
48 |
+
print(f"No entities found, using keywords for search query: '{search_query}'")
|
49 |
+
else:
|
50 |
+
search_query = question
|
51 |
+
print("No entities or keywords found, using original question for search query.")
|
52 |
+
search_results = langsearch_search(query=search_query, count=10)
|
53 |
if len(search_results) > 0:
|
54 |
# Convert search results to a readable text format
|
55 |
search_results_text = ""
|
|
|
57 |
count += 1
|
58 |
search_results_text += f"\n\n---SEARCH RESULT #{count}---\n"
|
59 |
search_results_text += f"{search_results[i - 1]}"
|
60 |
+
content += f"\n\nThe following are the results from the LangSearch API, use it as reference along with your own knowledge base to provide the most accurate answer: {search_results_text}"
|
61 |
|
62 |
+
# print(f"Content for system message: {content}")
|
63 |
|
64 |
messages = [
|
65 |
{
|
|
|
254 |
# 'Level': '1',
|
255 |
# 'file_name': 'f918266a-b3e0-4914-865d-4faa564f1aef.py'
|
256 |
# },
|
257 |
+
# {
|
258 |
+
# 'task_id': '3f57289b-8c60-48be-bd80-01f8099ca449',
|
259 |
+
# 'question': 'How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?',
|
260 |
+
# 'Level': '1',
|
261 |
+
# 'file_name': ''
|
262 |
+
# },
|
263 |
# {
|
264 |
# 'task_id': '1f975693-876d-457b-a649-393859e79bf3',
|
265 |
# 'question': "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
|
requirements.txt
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
gradio
|
2 |
requests
|
3 |
huggingface_hub
|
|
|
4 |
openai
|
5 |
bs4
|
6 |
itsdangerous
|
7 |
-
|
|
|
1 |
gradio
|
2 |
requests
|
3 |
huggingface_hub
|
4 |
+
huggingface_hub[cli]
|
5 |
openai
|
6 |
bs4
|
7 |
itsdangerous
|
8 |
+
spacy
|