Spaces:
Sleeping
Sleeping
Rework wiki search by directly using Wikimedia API and RetrievalQA chain
Browse files- agent.py +41 -20
- requirements.txt +3 -0
- tools.py +149 -15
agent.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
-
from typing import Annotated,
|
2 |
|
3 |
from dotenv import find_dotenv, load_dotenv
|
4 |
from langchain.chat_models import init_chat_model
|
5 |
-
from langchain_core.messages import
|
|
|
6 |
from langgraph.graph.message import add_messages
|
7 |
-
from langgraph.
|
|
|
8 |
|
9 |
from tools import (add, ask_about_image, divide, get_current_time_and_date,
|
10 |
get_sum, get_weather_info, get_youtube_transcript,
|
@@ -14,26 +16,27 @@ from tools import (add, ask_about_image, divide, get_current_time_and_date,
|
|
14 |
|
15 |
|
16 |
class AgentState(TypedDict):
|
17 |
-
|
18 |
-
messages: Annotated[list[AnyMessage], add_messages]
|
19 |
|
20 |
|
21 |
class BasicAgent:
|
22 |
def __init__(self):
|
23 |
load_dotenv(find_dotenv())
|
24 |
-
|
25 |
system_prompt = (
|
26 |
-
"You are a general AI assistant
|
27 |
-
"
|
28 |
-
"
|
29 |
-
"
|
30 |
-
"
|
31 |
-
"
|
32 |
-
"
|
33 |
-
"
|
34 |
-
"
|
35 |
-
"
|
36 |
-
"the
|
|
|
|
|
37 |
)
|
38 |
tools = [
|
39 |
get_weather_info,
|
@@ -52,14 +55,32 @@ class BasicAgent:
|
|
52 |
get_youtube_video_info,
|
53 |
get_youtube_transcript,
|
54 |
]
|
|
|
55 |
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
print("BasicAgent initialized.")
|
58 |
|
59 |
def __call__(self, question: str) -> str:
|
60 |
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
61 |
messages = [HumanMessage(content=question)]
|
62 |
-
|
63 |
-
response_string =
|
64 |
print(f"Agent's response: {response_string}")
|
65 |
return response_string
|
|
|
1 |
+
from typing import Annotated, TypedDict
|
2 |
|
3 |
from dotenv import find_dotenv, load_dotenv
|
4 |
from langchain.chat_models import init_chat_model
|
5 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
6 |
+
from langfuse.callback import CallbackHandler
|
7 |
from langgraph.graph.message import add_messages
|
8 |
+
from langgraph.graph import START, StateGraph
|
9 |
+
from langgraph.prebuilt import ToolNode, tools_condition
|
10 |
|
11 |
from tools import (add, ask_about_image, divide, get_current_time_and_date,
|
12 |
get_sum, get_weather_info, get_youtube_transcript,
|
|
|
16 |
|
17 |
|
18 |
class AgentState(TypedDict):
    """State passed between the nodes of the agent graph."""

    # Conversation history. `add_messages` is attached as the LangGraph reducer
    # for this key — presumably it appends each node's output to the list
    # instead of overwriting it; confirm against the LangGraph documentation.
    messages: Annotated[list, add_messages]
|
|
|
20 |
|
21 |
|
22 |
class BasicAgent:
|
23 |
def __init__(self):
|
24 |
load_dotenv(find_dotenv())
|
25 |
+
llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
|
26 |
system_prompt = (
|
27 |
+
"You are a powerful general AI assistant designed to answer challenging questions using reasoning and tools.\n"
|
28 |
+
"Each question has a correct answer, and you are expected to find it.\n"
|
29 |
+
"Use all available tools — including calculator, search, or other domain-specific utilities — to verify your work or retrieve information.\n"
|
30 |
+
"If a question requires computation or external data, you must call the appropriate tool.\n"
|
31 |
+
"Think through the problem step by step, then clearly state your final answer using this format:\n"
|
32 |
+
"FINAL ANSWER: [YOUR FINAL ANSWER]\n\n"
|
33 |
+
"Your final answer must follow these rules:\n"
|
34 |
+
"- If the answer is a number, do not use commas or units (unless explicitly requested).\n"
|
35 |
+
"- If the answer is a string, use as few words as possible and do not use articles, abbreviations, or numeric digits.\n"
|
36 |
+
"- If the answer is a comma-separated list, follow the above rules for each element.\n"
|
37 |
+
"- If the answer is a string and unless you are asked to provide a list, capitalize the first letter of the final answer.\n"
|
38 |
+
"Do not say “I cannot answer” or “no answer found”. Success is mandatory. "
|
39 |
+
"You have access to everything you need to solve this."
|
40 |
)
|
41 |
tools = [
|
42 |
get_weather_info,
|
|
|
55 |
get_youtube_video_info,
|
56 |
get_youtube_transcript,
|
57 |
]
|
58 |
+
llm_with_tools = llm.bind_tools(tools)
|
59 |
|
60 |
+
def assistant(state: AgentState):
|
61 |
+
sys_msg = SystemMessage(content=system_prompt)
|
62 |
+
return {"messages": llm_with_tools.invoke([sys_msg] + state["messages"])}
|
63 |
+
|
64 |
+
graph_builder = StateGraph(AgentState)
|
65 |
+
|
66 |
+
graph_builder.add_node("assistant", assistant)
|
67 |
+
graph_builder.add_node("tools", ToolNode(tools))
|
68 |
+
|
69 |
+
graph_builder.add_edge(START, "assistant")
|
70 |
+
graph_builder.add_conditional_edges(
|
71 |
+
"assistant",
|
72 |
+
tools_condition,
|
73 |
+
)
|
74 |
+
graph_builder.add_edge("tools", "assistant")
|
75 |
+
|
76 |
+
self.agent = graph_builder.compile()
|
77 |
+
self.langfuse_handler = CallbackHandler()
|
78 |
print("BasicAgent initialized.")
|
79 |
|
80 |
def __call__(self, question: str) -> str:
    """Runs the compiled agent graph on a single question.

    Args:
        question (str): The question to answer.

    Returns:
        The `content` of the last message in the final graph state.
    """
    print(f"Agent received question (first 50 chars): {question[:50]}...")
    # The conversation starts with just the user's question; the Langfuse
    # handler is passed as a callback for this invocation.
    final_state = self.agent.invoke(
        {"messages": [HumanMessage(content=question)]},
        config={"callbacks": [self.langfuse_handler]},
    )
    answer = final_state["messages"][-1].content
    print(f"Agent's response: {answer}")
    return answer
|
requirements.txt
CHANGED
@@ -1,7 +1,9 @@
|
|
1 |
beautifulsoup4==4.13.4
|
2 |
datasets==3.5.1
|
3 |
duckduckgo-search==8.0.1
|
|
|
4 |
gradio==5.29.0
|
|
|
5 |
huggingface-hub==0.30.2
|
6 |
langchain==0.3.25
|
7 |
langchain-community==0.3.23
|
@@ -9,6 +11,7 @@ langchain-core==0.3.58
|
|
9 |
langchain_groq==0.3.2
|
10 |
langchain-huggingface==0.1.2
|
11 |
langchain-openai==0.3.16
|
|
|
12 |
langgraph==0.4.1
|
13 |
numpy==2.2.5
|
14 |
openai-whisper==20240930
|
|
|
1 |
beautifulsoup4==4.13.4
|
2 |
datasets==3.5.1
|
3 |
duckduckgo-search==8.0.1
|
4 |
+
faiss-cpu==1.11.0
|
5 |
gradio==5.29.0
|
6 |
+
hf_xet==1.1.2
|
7 |
huggingface-hub==0.30.2
|
8 |
langchain==0.3.25
|
9 |
langchain-community==0.3.23
|
|
|
11 |
langchain_groq==0.3.2
|
12 |
langchain-huggingface==0.1.2
|
13 |
langchain-openai==0.3.16
|
14 |
+
langfuse==2.60.5
|
15 |
langgraph==0.4.1
|
16 |
numpy==2.2.5
|
17 |
openai-whisper==20240930
|
tools.py
CHANGED
@@ -1,12 +1,15 @@
|
|
1 |
import base64
|
2 |
import os
|
3 |
-
from
|
4 |
|
5 |
import pandas as pd
|
6 |
import requests
|
7 |
import whisper
|
8 |
-
|
|
|
|
|
9 |
from dotenv import find_dotenv, load_dotenv
|
|
|
10 |
from langchain.chat_models import init_chat_model
|
11 |
from langchain_community.document_loaders import (
|
12 |
UnstructuredPDFLoader, UnstructuredPowerPointLoader,
|
@@ -14,13 +17,26 @@ from langchain_community.document_loaders import (
|
|
14 |
from langchain_community.tools import DuckDuckGoSearchRun
|
15 |
from langchain_core.prompts import ChatPromptTemplate
|
16 |
from langchain_core.tools import tool
|
|
|
|
|
|
|
|
|
|
|
17 |
from youtube_transcript_api import YouTubeTranscriptApi
|
18 |
from yt_dlp import YoutubeDL
|
19 |
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
@tool
|
22 |
def get_weather_info(location: str) -> str:
|
23 |
-
"""Fetches
|
24 |
|
25 |
Usage:
|
26 |
```
|
@@ -127,20 +143,127 @@ def reverse_text(text: str) -> str:
|
|
127 |
return text[::-1]
|
128 |
|
129 |
|
130 |
-
|
131 |
-
|
132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
|
134 |
Args:
|
135 |
query (str): The search query.
|
|
|
136 |
"""
|
137 |
-
|
138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
return "No results found."
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
|
145 |
|
146 |
@tool
|
@@ -243,7 +366,7 @@ def ask_about_image(image_path: str, question: str) -> str:
|
|
243 |
question (str): Your question about the image, as a natural language sentence. Provide as much context as possible.
|
244 |
"""
|
245 |
load_dotenv(find_dotenv())
|
246 |
-
llm = init_chat_model("groq:meta-llama/llama-4-
|
247 |
prompt = ChatPromptTemplate(
|
248 |
[
|
249 |
{
|
@@ -256,16 +379,26 @@ def ask_about_image(image_path: str, question: str) -> str:
|
|
256 |
{
|
257 |
"type": "image_url",
|
258 |
"image_url": {
|
259 |
-
"url": "data:image/
|
260 |
},
|
261 |
},
|
262 |
],
|
263 |
}
|
264 |
]
|
265 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
266 |
chain = prompt | llm
|
267 |
response = chain.invoke(
|
268 |
-
{
|
|
|
|
|
|
|
|
|
269 |
)
|
270 |
return response.text()
|
271 |
|
@@ -322,6 +455,7 @@ def inspect_file_as_text(file_path: str) -> str:
|
|
322 |
Args:
|
323 |
file_path (str): The path to the file you want to read as text. If it is an image, use `vision_qa` tool.
|
324 |
"""
|
|
|
325 |
try:
|
326 |
suffix = os.path.splitext(file_path)[-1]
|
327 |
if suffix in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff"]:
|
|
|
1 |
import base64
|
2 |
import os
|
3 |
+
from typing import Optional
|
4 |
|
5 |
import pandas as pd
|
6 |
import requests
|
7 |
import whisper
|
8 |
+
|
9 |
+
from bs4 import BeautifulSoup
|
10 |
+
from datetime import datetime
|
11 |
from dotenv import find_dotenv, load_dotenv
|
12 |
+
from langchain.chains import RetrievalQA
|
13 |
from langchain.chat_models import init_chat_model
|
14 |
from langchain_community.document_loaders import (
|
15 |
UnstructuredPDFLoader, UnstructuredPowerPointLoader,
|
|
|
17 |
from langchain_community.tools import DuckDuckGoSearchRun
|
18 |
from langchain_core.prompts import ChatPromptTemplate
|
19 |
from langchain_core.tools import tool
|
20 |
+
from langchain.schema import Document
|
21 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
22 |
+
from langchain_community.vectorstores import FAISS
|
23 |
+
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
|
24 |
+
from markdownify import markdownify as md
|
25 |
from youtube_transcript_api import YouTubeTranscriptApi
|
26 |
from yt_dlp import YoutubeDL
|
27 |
|
28 |
|
29 |
+
UNWANTED_SECTIONS = {
|
30 |
+
"references",
|
31 |
+
"external links",
|
32 |
+
"further reading",
|
33 |
+
"see also",
|
34 |
+
"notes",
|
35 |
+
}
|
36 |
+
|
37 |
@tool
|
38 |
def get_weather_info(location: str) -> str:
|
39 |
+
"""Fetches weather information for a given location.
|
40 |
|
41 |
Usage:
|
42 |
```
|
|
|
143 |
return text[::-1]
|
144 |
|
145 |
|
146 |
+
def build_retriever(text: str):
    """Builds a retriever from the given text.

    The text is split into overlapping chunks, each chunk is embedded with the
    `sentence-transformers/all-MiniLM-L6-v2` model and stored in a FAISS index.

    Args:
        text (str): The text to be used for retrieval.

    Returns:
        A retriever over the FAISS index that returns the 3 best-matching chunks.

    Raises:
        ValueError: If `text` yields no chunks (empty or whitespace-only input).
    """
    splitter = RecursiveCharacterTextSplitter(
        # Markdown headings are the preferred split points. The trailing
        # fallbacks (paragraph, line, word, character) are needed because the
        # splitter keeps a piece whole once it runs out of separators, so a
        # long section without sub-headings would otherwise exceed chunk_size.
        separators=["\n### ", "\n## ", "\n# ", "\n\n", "\n", " ", ""],
        chunk_size=1000,
        chunk_overlap=200,
    )
    docs = [Document(page_content=chunk) for chunk in splitter.split_text(text)]
    if not docs:
        # Fail with a clear message instead of an opaque error from the
        # vector store when there is nothing to index.
        raise ValueError("Cannot build a retriever from empty text.")

    hf_embed = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    index = FAISS.from_documents(docs, hf_embed)
    return index.as_retriever(search_kwargs={"k": 3})
|
167 |
+
|
168 |
+
|
169 |
+
def get_retrieval_qa(text: str):
    """Creates a RetrievalQA instance for the given text.

    Args:
        text (str): The text to be used for retrieval.

    Returns:
        A "stuff"-type RetrievalQA chain whose retriever is built from `text`
        and which is configured to return its source documents.
    """
    # Keyword arguments are evaluated left to right, so the retriever is still
    # built before the chat model is initialised.
    return RetrievalQA.from_chain_type(
        retriever=build_retriever(text),
        llm=init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct"),
        chain_type="stuff",
        return_source_documents=True,
    )
|
182 |
+
|
183 |
+
|
184 |
+
def clean_html(html: str) -> str:
    """Removes non-article markup from an HTML page.

    Args:
        html (str): The raw HTML of the page.

    Returns:
        The HTML of the `div.mw-parser-output` container when the page has
        one, otherwise the HTML of the whole cleaned document.
    """
    soup = BeautifulSoup(html, "html.parser")

    # 1. Scripts and stylesheets carry no readable content.
    for node in soup.find_all(["script", "style"]):
        node.decompose()

    # 2. A <section> is dropped as a whole when the first heading found inside
    #    it starts with one of the unwanted section titles.
    heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
    unwanted_prefixes = tuple(UNWANTED_SECTIONS)
    for section in soup.find_all("section"):
        heading = section.find(heading_tags)
        if not heading:
            continue
        title = heading.get_text(strip=True).lower()
        if title.startswith(unwanted_prefixes):
            section.decompose()

    # 3. Remaining elements are removed by CSS class.
    noise_selectors = (
        ".toc",
        ".navbox",
        ".vertical-navbox",
        ".hatnote",
        ".reflist",
        ".mw-references-wrap",
    )
    for selector in noise_selectors:
        for element in soup.select(selector):
            element.decompose()

    # 4. Prefer the main content container; fall back to the whole document.
    content_root = soup.find("div", class_="mw-parser-output")
    return str(content_root or soup)
|
205 |
+
|
206 |
+
|
207 |
+
def get_wikipedia_article(query: str, lang: str = "en") -> str:
    """Fetches a Wikipedia article for a given query and returns its content in Markdown format.

    The top search hit for `query` is looked up through the Wikimedia REST API,
    its HTML is downloaded, reduced with `clean_html` and converted to Markdown.

    Args:
        query (str): The search query.
        lang (str): The language code for the search. Default is "en".

    Returns:
        The article as Markdown. When the lookup fails, one of the messages
        "No results found.", "Search error: ..." or "Content fetch error: ..."
        is returned instead.
    """
    from urllib.parse import quote

    # NOTE(review): the contact address looks redacted in this copy of the
    # file; confirm that a real contact is sent in the User-Agent.
    headers = {
        'User-Agent': 'MyLLMAgent ([email protected])'
    }
    api_base = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}"

    # Step 1: Search
    try:
        search_response = requests.get(
            f"{api_base}/search/page",
            headers=headers,
            params={'q': query, 'limit': 1},
            timeout=15,
        )
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP failures.
        return f"Search error: {exc}"

    if search_response.status_code != 200:
        return f"Search error: {search_response.status_code}"

    results = search_response.json().get("pages", [])
    if not results:
        return "No results found."

    # The key is a page title and may contain "/" or "?", which must not be
    # interpreted as path or query delimiters.
    page_key = quote(results[0]["key"], safe="")

    # Step 2: Get the wiki page, only keep relevant content and convert to Markdown
    try:
        content_response = requests.get(
            f"{api_base}/page/{page_key}/html",
            headers=headers,  # same identification as the search request
            timeout=15,
        )
    except requests.RequestException as exc:
        return f"Content fetch error: {exc}"

    if content_response.status_code != 200:
        return f"Content fetch error: {content_response.status_code}"

    return md(
        clean_html(content_response.text),
        heading_style="ATX",
        bullets="*+-",
        table_infer_header=True,
        strip=['a', 'span'],
    )
|
250 |
+
|
251 |
+
|
252 |
+
@tool
def wiki_search(query: str, question: str, lang: str = "en") -> str:
    """Searches Wikipedia for a specific article and answers a question based on its content.

    The function retrieves a Wikipedia article based on the provided query, converts it to Markdown,
    and uses a retrieval-based QA system to answer the specified question.

    Args:
        query (str): A concise topic name with optional keywords, ideally matching the relevant Wikipedia page title.
        question (str): The question to answer using the article.
        lang (str): Language code for the Wikipedia edition to search (default: "en").
    """
    # The docstring above is left unchanged because the @tool decorator may
    # expose it to the model as the tool description.
    markdown = get_wikipedia_article(query, lang)

    # get_wikipedia_article reports failures as plain messages. Hand them back
    # directly instead of indexing an error message and querying the LLM on it.
    if not markdown.strip():
        return "No results found."
    if markdown == "No results found." or markdown.startswith(
        ("Search error:", "Content fetch error:")
    ):
        return markdown

    qa = get_retrieval_qa(markdown)
    # The chain returns a dict (query, result, source documents); only the
    # answer text matches the declared `str` return type.
    return qa.invoke(question)["result"]
|
267 |
|
268 |
|
269 |
@tool
|
|
|
366 |
question (str): Your question about the image, as a natural language sentence. Provide as much context as possible.
|
367 |
"""
|
368 |
load_dotenv(find_dotenv())
|
369 |
+
llm = init_chat_model("groq:meta-llama/llama-4-maverick-17b-128e-instruct")
|
370 |
prompt = ChatPromptTemplate(
|
371 |
[
|
372 |
{
|
|
|
379 |
{
|
380 |
"type": "image_url",
|
381 |
"image_url": {
|
382 |
+
"url": "data:image/{image_format};base64,{base64_image}",
|
383 |
},
|
384 |
},
|
385 |
],
|
386 |
}
|
387 |
]
|
388 |
)
|
389 |
+
file_suffix = os.path.splitext(image_path)[-1]
|
390 |
+
if file_suffix == ".png":
|
391 |
+
image_format = "png"
|
392 |
+
else:
|
393 |
+
# We could handle other formats explicitly, but for simplicity we assume JPEG
|
394 |
+
image_format = "jpeg"
|
395 |
chain = prompt | llm
|
396 |
response = chain.invoke(
|
397 |
+
{
|
398 |
+
"question": question,
|
399 |
+
"base64_image": encode_image(image_path),
|
400 |
+
"image_format": image_format,
|
401 |
+
}
|
402 |
)
|
403 |
return response.text()
|
404 |
|
|
|
455 |
Args:
|
456 |
file_path (str): The path to the file you want to read as text. If it is an image, use `vision_qa` tool.
|
457 |
"""
|
458 |
+
# TODO we could also pass the file content to a retrieval chain
|
459 |
try:
|
460 |
suffix = os.path.splitext(file_path)[-1]
|
461 |
if suffix in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff"]:
|