Facelook committed on
Commit
771c32e
·
1 Parent(s): 65a76ae

Added NLP (spaCy) entity and keyword extraction to break down the question and improve the accuracy of web search.

Browse files
Files changed (3) hide show
  1. agent_tools.py +2 -7
  2. app.py +33 -23
  3. requirements.txt +2 -1
agent_tools.py CHANGED
@@ -51,9 +51,7 @@ def duckduckgo_search(query: str, count: int = 3) -> list:
51
  # "snippet": snippet
52
  # })
53
 
54
- results.append({
55
- "summary": snippet
56
- })
57
 
58
  if len(results) >= count:
59
  break
@@ -104,16 +102,13 @@ def langsearch_search(query: str, count: int = 5) -> list:
104
 
105
  response = requests.post(url, headers=headers, data=payload, timeout=30)
106
  response.raise_for_status()
107
- print(f"LangSearch response status code: {response.status_code}")
108
  if response.status_code != 200:
109
  print(f"LangSearch API error: {response.text}")
110
  return []
111
  response = response.json()
112
  results = []
113
  for result in response["data"]["webPages"]["value"]:
114
- results.append({
115
- "summary": result["summary"]
116
- })
117
  print(f"LangSearch results: {results}")
118
  return results
119
  except Exception as e:
 
51
  # "snippet": snippet
52
  # })
53
 
54
+ results.append(snippet)
 
 
55
 
56
  if len(results) >= count:
57
  break
 
102
 
103
  response = requests.post(url, headers=headers, data=payload, timeout=30)
104
  response.raise_for_status()
 
105
  if response.status_code != 200:
106
  print(f"LangSearch API error: {response.text}")
107
  return []
108
  response = response.json()
109
  results = []
110
  for result in response["data"]["webPages"]["value"]:
111
+ results.append(result["summary"])
 
 
112
  print(f"LangSearch results: {results}")
113
  return results
114
  except Exception as e:
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
3
  import requests
4
  import json
5
  import pandas as pd
 
6
  from openai import OpenAI
7
  from agent_tools import duckduckgo_search, langsearch_search, TOOLS_MAPPING, TOOLS_DEFINITION
8
 
@@ -12,8 +13,6 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
 
13
  # --- Basic Agent Definition ---
14
  # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
15
-
16
-
17
  class BasicAgent:
18
  def __init__(self):
19
  print("BasicAgent initialized.")
@@ -23,23 +22,34 @@ class BasicAgent:
23
  print(f"Agent received question: {question}")
24
 
25
  try:
26
- content = "You are an assistant that has access to the following set of tools. Read the question carefully and do not report your thoughts, explanations, reasoning, or conclusion. Always use RAG. If you know the answer, give only YOUR FINAL ANSWER. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. On the other hand, if you really don't know the answer after your best efforts, break down the question and list all search queries in a string array."
27
-
28
  count = 0
29
 
30
- # Call duckduckgo_search function
31
- # search_results = duckduckgo_search(query=question, count=10)
32
- # if len(search_results) > 0:
33
- # # Convert search results to a readable text format
34
- # search_results_text = ""
35
- # for i, result in enumerate(search_results, 1):
36
- # count += 1
37
- # search_results_text += f"\n\n---SEARCH RESULT #{count}---\n"
38
- # search_results_text += f"{search_results[i - 1]}"
39
- # content += f"\n\nThe following are the results from the DuckDuckGo API, you may use it as reference on top of your knowledge base: {search_results_text}"
40
 
41
  # Call langsearch_search function
42
- search_results = langsearch_search(query=question, count=5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  if len(search_results) > 0:
44
  # Convert search results to a readable text format
45
  search_results_text = ""
@@ -47,9 +57,9 @@ class BasicAgent:
47
  count += 1
48
  search_results_text += f"\n\n---SEARCH RESULT #{count}---\n"
49
  search_results_text += f"{search_results[i - 1]}"
50
- content += f"\n\nThe following are the results from the LangSearch API, you may use it as reference on top of your knowledge base: {search_results_text}"
51
 
52
- #print(f"Content for system message: {content}")
53
 
54
  messages = [
55
  {
@@ -244,12 +254,12 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
244
  # 'Level': '1',
245
  # 'file_name': 'f918266a-b3e0-4914-865d-4faa564f1aef.py'
246
  # },
247
- {
248
- 'task_id': '3f57289b-8c60-48be-bd80-01f8099ca449',
249
- 'question': 'How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?',
250
- 'Level': '1',
251
- 'file_name': ''
252
- },
253
  # {
254
  # 'task_id': '1f975693-876d-457b-a649-393859e79bf3',
255
  # 'question': "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
 
3
  import requests
4
  import json
5
  import pandas as pd
6
+ import spacy
7
  from openai import OpenAI
8
  from agent_tools import duckduckgo_search, langsearch_search, TOOLS_MAPPING, TOOLS_DEFINITION
9
 
 
13
 
14
  # --- Basic Agent Definition ---
15
  # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
 
 
16
  class BasicAgent:
17
  def __init__(self):
18
  print("BasicAgent initialized.")
 
22
  print(f"Agent received question: {question}")
23
 
24
  try:
 
 
25
  count = 0
26
 
27
+ # content = "You are an assistant that has access to the following set of tools. Read the question carefully and do not report your thoughts, explanations, reasoning, or conclusion. Always use RAG. If you know the answer, give only YOUR FINAL ANSWER. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. On the other hand, if you really don't know the answer after your best efforts, break down the question and list all search queries in a string array."
28
+ content = "You are an assistant that has access to the following set of tools. Read the question carefully and do not report your thoughts, explanations, reasoning, or conclusion. Give only YOUR FINAL ANSWER. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. If you don't have complete certainty, you must still provide your best answer based on the information available to you. Always provide an answer rather than expressing uncertainty - use your best judgment to determine the most likely correct response."
29
+
30
+ nlp = spacy.load("en_core_web_sm")
31
+ doc = nlp(question)
32
+ keywords = [token.text for token in doc if token.pos_ in ['NOUN', 'PROPN']]
33
+ entities = entities = [ent.text for ent in doc.ents if ent.label_ in ['PRODUCT', 'WORK_OF_ART', 'EVENT', 'ORG', 'FAC', 'GPE', 'NORP', 'LOC', 'LANGUAGE', 'PERSON', 'DATE', 'TIME', 'MONEY', 'LAW']]
34
+ print("Keywords:", keywords)
35
+ print("Entities:", entities)
 
36
 
37
  # Call langsearch_search function
38
+ #search_results = langsearch_search(query=question, count=10)
39
+ # Use entities for search if available, otherwise use the original question
40
+ search_query = ""
41
+ if entities:
42
+ search_query = " ".join(entities)
43
+ print(f"Using entities for search query: '{search_query}'")
44
+ else:
45
+ # Fallback: If no specific entities are found, use keywords or the original question
46
+ if keywords:
47
+ search_query = " ".join(keywords)
48
+ print(f"No entities found, using keywords for search query: '{search_query}'")
49
+ else:
50
+ search_query = question
51
+ print("No entities or keywords found, using original question for search query.")
52
+ search_results = langsearch_search(query=search_query, count=10)
53
  if len(search_results) > 0:
54
  # Convert search results to a readable text format
55
  search_results_text = ""
 
57
  count += 1
58
  search_results_text += f"\n\n---SEARCH RESULT #{count}---\n"
59
  search_results_text += f"{search_results[i - 1]}"
60
+ content += f"\n\nThe following are the results from the LangSearch API, use it as reference along with your own knowledge base to provide the most accurate answer: {search_results_text}"
61
 
62
+ # print(f"Content for system message: {content}")
63
 
64
  messages = [
65
  {
 
254
  # 'Level': '1',
255
  # 'file_name': 'f918266a-b3e0-4914-865d-4faa564f1aef.py'
256
  # },
257
+ # {
258
+ # 'task_id': '3f57289b-8c60-48be-bd80-01f8099ca449',
259
+ # 'question': 'How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?',
260
+ # 'Level': '1',
261
+ # 'file_name': ''
262
+ # },
263
  # {
264
  # 'task_id': '1f975693-876d-457b-a649-393859e79bf3',
265
  # 'question': "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
requirements.txt CHANGED
@@ -1,7 +1,8 @@
1
  gradio
2
  requests
3
  huggingface_hub
 
4
  openai
5
  bs4
6
  itsdangerous
7
- huggingface_hub[cli]
 
1
  gradio
2
  requests
3
  huggingface_hub
4
+ huggingface_hub[cli]
5
  openai
6
  bs4
7
  itsdangerous
8
+ spacy