phucdev committed
Commit 823bd24 · 1 Parent(s): 039dacb

Rework wiki search by directly using Wikimedia API and RetrievalQA chain

Files changed (3)
  1. agent.py +41 -20
  2. requirements.txt +3 -0
  3. tools.py +149 -15
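
The headline change swaps the `wikipedia` package for direct calls to the Wikimedia Core REST API plus a RetrievalQA chain over the fetched article. The two endpoints the new `tools.py` relies on can be exercised on their own; a minimal sketch (placeholder User-Agent, happy path only):

```python
import requests

headers = {"User-Agent": "ExampleAgent (agent@example.com)"}  # placeholder contact, per Wikimedia API etiquette

# 1) Resolve a free-text query to a page key
search = requests.get(
    "https://api.wikimedia.org/core/v1/wikipedia/en/search/page",
    headers=headers,
    params={"q": "Eiffel Tower", "limit": 1},
    timeout=15,
)
page_key = search.json()["pages"][0]["key"]

# 2) Fetch the article body as HTML, ready for cleaning and Markdown conversion
html = requests.get(
    f"https://api.wikimedia.org/core/v1/wikipedia/en/page/{page_key}/html",
    headers=headers,
    timeout=15,
).text
print(page_key, len(html))
```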
agent.py CHANGED
@@ -1,10 +1,12 @@
-from typing import Annotated, Optional, TypedDict
+from typing import Annotated, TypedDict
 
 from dotenv import find_dotenv, load_dotenv
 from langchain.chat_models import init_chat_model
-from langchain_core.messages import AnyMessage, HumanMessage
+from langchain_core.messages import HumanMessage, SystemMessage
+from langfuse.callback import CallbackHandler
 from langgraph.graph.message import add_messages
-from langgraph.prebuilt import create_react_agent
+from langgraph.graph import START, StateGraph
+from langgraph.prebuilt import ToolNode, tools_condition
 
 from tools import (add, ask_about_image, divide, get_current_time_and_date,
                    get_sum, get_weather_info, get_youtube_transcript,
@@ -14,26 +16,27 @@ from tools import (add, ask_about_image, divide, get_current_time_and_date,
 
 
 class AgentState(TypedDict):
-    input_file: Optional[str]  # Contains file path
-    messages: Annotated[list[AnyMessage], add_messages]
+    messages: Annotated[list, add_messages]
 
 
 class BasicAgent:
     def __init__(self):
         load_dotenv(find_dotenv())
-        model = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
+        llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
         system_prompt = (
-            "You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer "
-            "with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR "
-            "as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a "
-            "number, don't use comma to write your number neither use units such as $ or percent sign unless specified "
-            "otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), "
-            "and write the digits in plain text unless specified otherwise. If you are asked for a comma separated "
-            "list, apply the above rules depending of whether the element to be put in the list is a number or a string."
-            "Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find "
-            "the correct answer (the answer does exist). Failure or 'I cannot answer' or 'None found' will not be "
-            "tolerated, success will be rewarded. Run verification steps if that's needed, you must make sure you find "
-            "the correct answer! "
+            "You are a powerful general AI assistant designed to answer challenging questions using reasoning and tools.\n"
+            "Each question has a correct answer, and you are expected to find it.\n"
+            "Use all available tools including calculator, search, or other domain-specific utilities to verify your work or retrieve information.\n"
+            "If a question requires computation or external data, you must call the appropriate tool.\n"
+            "Think through the problem step by step, then clearly state your final answer using this format:\n"
+            "FINAL ANSWER: [YOUR FINAL ANSWER]\n\n"
+            "Your final answer must follow these rules:\n"
+            "- If the answer is a number, do not use commas or units (unless explicitly requested).\n"
+            "- If the answer is a string, use as few words as possible and do not use articles, abbreviations, or numeric digits.\n"
+            "- If the answer is a comma-separated list, follow the above rules for each element.\n"
+            "- If the answer is a string and unless you are asked to provide a list, capitalize the first letter of the final answer.\n"
+            "Do not say “I cannot answer” or “no answer found”. Success is mandatory. "
+            "You have access to everything you need to solve this."
         )
         tools = [
            get_weather_info,
@@ -52,14 +55,32 @@ class BasicAgent:
             get_youtube_video_info,
             get_youtube_transcript,
         ]
+        llm_with_tools = llm.bind_tools(tools)
 
-        self.agent = create_react_agent(model=model, tools=tools, prompt=system_prompt)
+        def assistant(state: AgentState):
+            sys_msg = SystemMessage(content=system_prompt)
+            return {"messages": llm_with_tools.invoke([sys_msg] + state["messages"])}
+
+        graph_builder = StateGraph(AgentState)
+
+        graph_builder.add_node("assistant", assistant)
+        graph_builder.add_node("tools", ToolNode(tools))
+
+        graph_builder.add_edge(START, "assistant")
+        graph_builder.add_conditional_edges(
+            "assistant",
+            tools_condition,
+        )
+        graph_builder.add_edge("tools", "assistant")
+
+        self.agent = graph_builder.compile()
+        self.langfuse_handler = CallbackHandler()
         print("BasicAgent initialized.")
 
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
         messages = [HumanMessage(content=question)]
-        response = self.agent.invoke({"messages": messages})
-        response_string = response["messages"][-1].content
+        state = self.agent.invoke({"messages": messages}, config={"callbacks": [self.langfuse_handler]})
+        response_string = state["messages"][-1].content
         print(f"Agent's response: {response_string}")
         return response_string
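
The agent.py rewrite above replaces `create_react_agent` with an explicit LangGraph loop: an `assistant` node calls the tool-bound LLM, `tools_condition` routes to a `ToolNode` whenever the reply contains tool calls, and the tool output feeds back into `assistant`. A self-contained sketch of the same pattern with a toy tool (same langgraph 0.4 API as the diff; needs a valid `GROQ_API_KEY` to actually run):

```python
from typing import Annotated, TypedDict

from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.tools import tool
from langgraph.graph import START, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition


class State(TypedDict):
    messages: Annotated[list, add_messages]


@tool
def multiply(a: int, b: int) -> int:
    """Multiplies two integers."""
    return a * b


tools = [multiply]
llm_with_tools = init_chat_model(
    "groq:meta-llama/llama-4-scout-17b-16e-instruct"
).bind_tools(tools)


def assistant(state: State):
    sys_msg = SystemMessage(content="Use tools when computation is needed.")
    return {"messages": [llm_with_tools.invoke([sys_msg] + state["messages"])]}


builder = StateGraph(State)
builder.add_node("assistant", assistant)
builder.add_node("tools", ToolNode(tools))
builder.add_edge(START, "assistant")
# Routes to "tools" when the last message contains tool calls, else to END
builder.add_conditional_edges("assistant", tools_condition)
builder.add_edge("tools", "assistant")
graph = builder.compile()

result = graph.invoke({"messages": [HumanMessage(content="What is 6 * 7?")]})
print(result["messages"][-1].content)
```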
requirements.txt CHANGED
@@ -1,7 +1,9 @@
 beautifulsoup4==4.13.4
 datasets==3.5.1
 duckduckgo-search==8.0.1
+faiss-cpu==1.11.0
 gradio==5.29.0
+hf_xet==1.1.2
 huggingface-hub==0.30.2
 langchain==0.3.25
 langchain-community==0.3.23
@@ -9,6 +11,7 @@ langchain-core==0.3.58
 langchain_groq==0.3.2
 langchain-huggingface==0.1.2
 langchain-openai==0.3.16
+langfuse==2.60.5
 langgraph==0.4.1
 numpy==2.2.5
 openai-whisper==20240930
tools.py CHANGED
@@ -1,12 +1,15 @@
 import base64
 import os
-from datetime import datetime
+from typing import Optional
 
 import pandas as pd
 import requests
 import whisper
-import wikipedia
+
+from bs4 import BeautifulSoup
+from datetime import datetime
 from dotenv import find_dotenv, load_dotenv
+from langchain.chains import RetrievalQA
 from langchain.chat_models import init_chat_model
 from langchain_community.document_loaders import (
     UnstructuredPDFLoader, UnstructuredPowerPointLoader,
@@ -14,13 +17,26 @@ from langchain_community.document_loaders import (
 from langchain_community.tools import DuckDuckGoSearchRun
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.tools import tool
+from langchain.schema import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import FAISS
+from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+from markdownify import markdownify as md
 from youtube_transcript_api import YouTubeTranscriptApi
 from yt_dlp import YoutubeDL
 
 
+UNWANTED_SECTIONS = {
+    "references",
+    "external links",
+    "further reading",
+    "see also",
+    "notes",
+}
+
 @tool
 def get_weather_info(location: str) -> str:
-    """Fetches dummy weather information for a given location.
+    """Fetches weather information for a given location.
 
     Usage:
     ```
@@ -127,20 +143,127 @@ def reverse_text(text: str) -> str:
     return text[::-1]
 
 
-@tool
-def wiki_search(query: str) -> str:
-    """Searches Wikipedia for a given query and returns the summary.
+def build_retriever(text: str):
+    """Builds a retriever from the given text.
+
+    Args:
+        text (str): The text to be used for retrieval.
+    """
+    splitter = RecursiveCharacterTextSplitter(
+        separators=["\n### ", "\n## ", "\n# "],
+        chunk_size=1000,
+        chunk_overlap=200,
+    )
+    chunks = splitter.split_text(text)
+    docs = [
+        Document(page_content=chunk)
+        for chunk in chunks
+    ]
+    hf_embed = HuggingFaceEmbeddings(
+        model_name="sentence-transformers/all-MiniLM-L6-v2"
+    )
+    index = FAISS.from_documents(docs, hf_embed)
+    return index.as_retriever(search_kwargs={"k": 3})
+
+
+def get_retrieval_qa(text: str):
+    """Creates a RetrievalQA instance for the given text.
+    Args:
+        text (str): The text to be used for retrieval.
+    """
+    retriever = build_retriever(text)
+    llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
+    return RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=retriever,
+        return_source_documents=True,
+    )
+
+
+def clean_html(html: str) -> str:
+    soup = BeautifulSoup(html, "html.parser")
+
+    # 1. Remove <script> & <style>
+    for tag in soup(["script", "style"]):
+        tag.decompose()
+
+    # 2. Drop whole <section> blocks whose first heading is unwanted
+    for sec in soup.find_all("section"):
+        h = sec.find(["h1", "h2", "h3", "h4", "h5", "h6"])
+        if h and any(h.get_text(strip=True).lower().startswith(u) for u in UNWANTED_SECTIONS):
+            sec.decompose()
+
+    # 3. Additional filtering by CSS selector
+    for selector in [".toc", ".navbox", ".vertical-navbox", ".hatnote", ".reflist", ".mw-references-wrap"]:
+        for el in soup.select(selector):
+            el.decompose()
+
+    # 4. Isolate the main content container if present
+    main = soup.find("div", class_="mw-parser-output")
+    return str(main or soup)
+
+
+def get_wikipedia_article(query: str, lang: str = "en") -> str:
+    """Fetches a Wikipedia article for a given query and returns its content in Markdown format.
 
     Args:
         query (str): The search query.
+        lang (str): The language code for the search. Default is "en".
     """
-    search_results = wikipedia.search(query)
-    if not search_results:
+    headers = {
+        'User-Agent': 'MyLLMAgent ([email protected])'
+    }
+
+    # Step 1: Search
+    search_url = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}/search/page"
+    search_params = {'q': query, 'limit': 1}
+    search_response = requests.get(search_url, headers=headers, params=search_params, timeout=15)
+
+    if search_response.status_code != 200:
+        return f"Search error: {search_response.status_code}"
+
+    results = search_response.json().get("pages", [])
+    if not results:
         return "No results found."
-    page_title = search_results[0]
-    summary = wikipedia.summary(page_title)
-    # Alternatively wikipedia.page(page_title).content[:max_length]
-    return f"Title: {page_title}\n\nSummary: {summary}"
+
+    page = results[0]
+    page_key = page["key"]
+
+    # Step 2: Get the wiki page, only keep relevant content and convert to Markdown
+    content_url = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}/page/{page_key}/html"
+    content_response = requests.get(content_url, timeout=15)
+
+    if content_response.status_code != 200:
+        return f"Content fetch error: {content_response.status_code}"
+
+    html = clean_html(content_response.text)
+
+    markdown = md(
+        html,
+        heading_style="ATX",
+        bullets="*+-",
+        table_infer_header=True,
+        strip=['a', 'span']
+    )
+    return markdown
+
+
+@tool
+def wiki_search(query: str, question: str, lang: str = "en") -> str:
+    """Searches Wikipedia for a specific article and answers a question based on its content.
+
+    The function retrieves a Wikipedia article based on the provided query, converts it to Markdown,
+    and uses a retrieval-based QA system to answer the specified question.
+
+    Args:
+        query (str): A concise topic name with optional keywords, ideally matching the relevant Wikipedia page title.
+        question (str): The question to answer using the article.
+        lang (str): Language code for the Wikipedia edition to search (default: "en").
+    """
+    markdown = get_wikipedia_article(query, lang)
+    qa = get_retrieval_qa(markdown)
+    return qa.invoke(question)
 
 
 @tool
@@ -243,7 +366,7 @@ def ask_about_image(image_path: str, question: str) -> str:
         question (str): Your question about the image, as a natural language sentence. Provide as much context as possible.
     """
     load_dotenv(find_dotenv())
-    llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
+    llm = init_chat_model("groq:meta-llama/llama-4-maverick-17b-128e-instruct")
     prompt = ChatPromptTemplate(
         [
             {
@@ -256,16 +379,26 @@
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": "data:image/jpeg;base64,{base64_image}",
+                            "url": "data:image/{image_format};base64,{base64_image}",
                         },
                     },
                 ],
             }
         ]
     )
+    file_suffix = os.path.splitext(image_path)[-1]
+    if file_suffix == ".png":
+        image_format = "png"
+    else:
+        # We could handle other formats explicitly, but for simplicity we assume JPEG
+        image_format = "jpeg"
     chain = prompt | llm
     response = chain.invoke(
-        {"question": question, "base64_image": encode_image(image_path)}
+        {
+            "question": question,
+            "base64_image": encode_image(image_path),
+            "image_format": image_format,
+        }
     )
     return response.text()
 
@@ -322,6 +455,7 @@ def inspect_file_as_text(file_path: str) -> str:
     Args:
         file_path (str): The path to the file you want to read as text. If it is an image, use `vision_qa` tool.
     """
+    # TODO we could also pass the file content to a retrieval chain
     try:
         suffix = os.path.splitext(file_path)[-1]
         if suffix in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff"]: