Move utility code into separate modules and add MarkdownWebBaseLoader implementation
- app.py +1 -1
- requirements.txt +10 -5
- retrieval.py +80 -0
- tools.py +20 -174
- web_utilities.py +297 -0
app.py (CHANGED)

@@ -148,7 +148,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             )
         else:
             print(f"Running agent on {len(questions_data)} questions...")
-            for item in questions_data:
+            for item in filtered_questions_data:
                 result = solve_question(item)
                 results_log.append(result)
             with open(results_file_path, "w") as results_file:
requirements.txt (CHANGED)

@@ -1,3 +1,4 @@
+anthropic==0.52.2
 beautifulsoup4==4.13.4
 datasets==3.5.1
 duckduckgo-search==8.0.1
@@ -6,17 +7,21 @@ gradio==5.29.0
 hf_xet==1.1.2
 huggingface-hub==0.30.2
 langchain==0.3.25
-langchain-…
-langchain-…
-…
-langchain-…
-langchain-…
+langchain-anthropic==0.3.15
+langchain-community==0.3.24
+langchain-core==0.3.64
+langchain-groq==0.3.2
+langchain-huggingface==0.2.0
+langchain-openai==0.3.21
+langchain-tavily==0.1.6
 langfuse==2.60.5
 langgraph==0.4.1
+markdownify==1.1.0
 numpy==2.2.5
 openai-whisper==20240930
 openpyxl==3.1.5
 pandas==2.2.3
+playwright==1.52.0
 pyrootutils~=1.0.4
 python-dotenv~=1.1.0
 requests==2.32.3
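Note on the playwright pin added above: the Python package alone does not ship browser binaries, so the Playwright rendering fallback in web_utilities.py typically also needs a one-time browser install. The command below is the standard Playwright CLI step; whether and where this Space runs it is an assumption, not part of the commit.

playwright install chromium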
retrieval.py (ADDED)

@@ -0,0 +1,80 @@
from typing import List, Union
from dotenv import find_dotenv, load_dotenv
from langchain.chains import RetrievalQA
from langchain.chat_models import init_chat_model
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings


def get_default_splitter() -> RecursiveCharacterTextSplitter:
    """Returns a pre-configured text splitter."""
    return RecursiveCharacterTextSplitter(
        # Using markdown headers as separators is a good strategy
        separators=["\n### ", "\n## ", "\n# ", "\n\n", "\n", " "],
        chunk_size=1000,
        chunk_overlap=200,
    )


def get_default_embeddings() -> HuggingFaceEmbeddings:
    """Returns a pre-configured embedding model."""
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )


def build_retriever(
    data: Union[str, List[Document]],
    splitter: RecursiveCharacterTextSplitter = None,
    embeddings: HuggingFaceEmbeddings = None,
    top_k: int = 5):
    """Builds a retriever from either a raw text string or a list of documents.

    Args:
        data (Union[str, List[Document]]): The source data to build the retriever from.
        splitter (RecursiveCharacterTextSplitter, optional): The text splitter to use.
            Defaults to get_default_splitter().
        embeddings (HuggingFaceEmbeddings, optional): The embedding model to use.
            Defaults to get_default_embeddings().
        top_k (int, optional): The number of top results to return. Defaults to 5.
    """
    splitter = splitter or get_default_splitter()
    embeddings = embeddings or get_default_embeddings()
    if isinstance(data, str):
        # If the input is a raw string, split it into chunks first
        chunks = splitter.split_text(data)
        # Then convert those chunks into Document objects
        docs = [Document(page_content=chunk) for chunk in chunks]
    elif isinstance(data, list):
        # If the input is already a list of documents, split them directly
        docs = splitter.split_documents(data)
    else:
        raise ValueError(f"Unsupported data type: {type(data)}. Must be str or List[Document].")

    index = FAISS.from_documents(docs, embeddings)
    return index.as_retriever(search_kwargs={"k": top_k})


def create_retrieval_qa(
    retriever,
    llm=None
) -> RetrievalQA:
    """Creates a RetrievalQA instance from a given retriever and LLM.

    Args:
        retriever (BaseRetriever): The retriever to be used by the QA chain.
        llm (LLM, optional): The language model to use. If not provided,
            a default model will be initialized.
    """
    if llm is None:
        load_dotenv(find_dotenv())
        llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )
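A minimal usage sketch of the new retrieval module (illustrative only: the sample Markdown, the question, and a GROQ_API_KEY available via .env are assumptions, not part of the commit):

# Illustrative sketch: build a FAISS retriever over some Markdown and ask one question.
from retrieval import build_retriever, create_retrieval_qa

sample_md = "# Topic\n\n## Background\nSome article text to index..."  # assumed sample input
retriever = build_retriever(sample_md, top_k=3)   # chunks the text and indexes it with FAISS
qa = create_retrieval_qa(retriever)               # loads .env and initializes the default Groq model
result = qa.invoke({"query": "What is the background section about?"})
print(result["result"], len(result["source_documents"]))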
tools.py (CHANGED)

@@ -1,42 +1,30 @@
 import base64
 import json
 import os
-import re
-from typing import Optional, Dict
-
 import pandas as pd
+import re
 import requests
 import whisper
 
-from bs4 import BeautifulSoup
 from datetime import datetime
 from dotenv import find_dotenv, load_dotenv
 from langchain.chains import RetrievalQA
 from langchain.chat_models import init_chat_model
-from langchain.schema import Document
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import (
     UnstructuredPDFLoader, UnstructuredPowerPointLoader,
     UnstructuredWordDocumentLoader, WebBaseLoader)
-from langchain_community.tools import DuckDuckGoSearchResults
+from langchain_community.tools import DuckDuckGoSearchResults
 from langchain_community.utilities import GoogleSerperAPIWrapper
-from langchain_community.vectorstores import FAISS
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.tools import tool
-from langchain_huggingface.embeddings import HuggingFaceEmbeddings
 from langchain_tavily import TavilySearch
-from …
+from typing import Optional
 from youtube_transcript_api import YouTubeTranscriptApi
 from yt_dlp import YoutubeDL
 
+from retrieval import build_retriever, create_retrieval_qa
+from web_utilities import get_wikipedia_article, parse_sections, fetch_wikipedia_page, MarkdownWebBaseLoader
 
-UNWANTED_SECTIONS = {
-    "references",
-    "external links",
-    "further reading",
-    "see also",
-    "notes",
-}
 
 @tool
 def get_weather_info(location: str) -> str:
@@ -147,153 +135,6 @@ def reverse_text(text: str) -> str:
     return text[::-1]
 
 
-def build_retriever(text: str):
-    """Builds a retriever from the given text.
-
-    Args:
-        text (str): The text to be used for retrieval.
-    """
-    splitter = RecursiveCharacterTextSplitter(
-        separators=["\n### ", "\n## ", "\n# "],
-        chunk_size=1000,
-        chunk_overlap=200,
-    )
-    chunks = splitter.split_text(text)
-    docs = [
-        Document(page_content=chunk)
-        for chunk in chunks
-    ]
-    hf_embed = HuggingFaceEmbeddings(
-        model_name="sentence-transformers/all-MiniLM-L6-v2"
-    )
-    index = FAISS.from_documents(docs, hf_embed)
-    return index.as_retriever(search_kwargs={"k": 3})
-
-
-def get_retrieval_qa(text: str):
-    """Creates a RetrievalQA instance for the given text.
-    Args:
-        text (str): The text to be used for retrieval.
-    """
-    retriever = build_retriever(text)
-    llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
-    return RetrievalQA.from_chain_type(
-        llm=llm,
-        chain_type="stuff",
-        retriever=retriever,
-        return_source_documents=True,
-    )
-
-
-def clean_html(html: str) -> str:
-    soup = BeautifulSoup(html, "html.parser")
-
-    # 1. Remove <script> & <style>
-    for tag in soup(["script", "style"]):
-        tag.decompose()
-
-    # 2. Drop whole <section> blocks whose first heading is unwanted
-    for sec in soup.find_all("section"):
-        h = sec.find(["h1","h2","h3","h4","h5","h6"])
-        if h and any(h.get_text(strip=True).lower().startswith(u) for u in UNWANTED_SECTIONS):
-            sec.decompose()
-
-    # 3. Additional filtering by CSS selector
-    for selector in [".toc", ".navbox", ".vertical-navbox", ".hatnote", ".reflist", ".mw-references-wrap"]:
-        for el in soup.select(selector):
-            el.decompose()
-
-    # 4. Isolate the main content container if present
-    main = soup.find("div", class_="mw-parser-output")
-    return str(main or soup)
-
-
-def fetch_page_markdown(page_key: str, lang: str="en") -> str:
-    """Fetches the page HTML and returns the <body> as Markdown.
-    Args:
-        page_key (str): The unique key of the Wikipedia page.
-        lang (str): The language code for the Wikipedia edition to fetch (default: "en").
-    """
-    url = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}/page/{page_key}/html"
-    resp = requests.get(url, timeout=15)
-    resp.raise_for_status()
-    html = clean_html(resp.text)  # Optional, but recommended: clean the HTML to remove unwanted sections
-
-    markdown = md(
-        html,
-        heading_style="ATX",
-        bullets="*+-",
-        table_infer_header=True,
-        strip=['a', 'span']
-    )
-    return markdown
-
-
-def get_wikipedia_article(query: str) -> Dict[str, str]:
-    """Fetches a Wikipedia article for a given query and returns its content in Markdown format.
-
-    Args:
-        query (str): The search query.
-    """
-    headers = {
-        'User-Agent': 'MyLLMAgent ([email protected])'
-    }
-
-    # Step 1: Search
-    search_url = f"https://api.wikimedia.org/core/v1/wikipedia/en/search/page"
-    search_params = {'q': query, 'limit': 1}
-    search_response = requests.get(search_url, headers=headers, params=search_params, timeout=15)
-
-    if search_response.status_code != 200:
-        raise Exception(f"Search error: {search_response.status_code} - {search_response.text}")
-
-    results = search_response.json().get("pages", [])
-    if not results:
-        raise Exception(f"No results found for query: {query}")
-
-    page = results[0]
-    page_key = page["key"]
-
-    # Step 2: Get the wiki page, only keep relevant content and convert to Markdown
-    markdown = fetch_page_markdown(page_key)
-    return {
-        "page_key": page_key,
-        "markdown": markdown,
-    }
-
-
-def parse_sections(markdown_text: str) -> Dict[str, Dict]:
-    """
-    Parses markdown into a nested dict:
-    { section_title: {
-        "full": full_section_md,
-        "subsections": { sub_title: sub_md, ... }
-      }, ... }
-    """
-    # First split top-level sections
-    top_pat = re.compile(r"^##\s+(.*)$", re.MULTILINE)
-    top_matches = list(top_pat.finditer(markdown_text))
-    sections: Dict[str, Dict] = {}
-    for i, m in enumerate(top_matches):
-        sec_title = m.group(1).strip()
-        start = m.start()
-        end = top_matches[i+1].start() if i+1 < len(top_matches) else len(markdown_text)
-        sec_md = markdown_text[start:end].strip()
-
-        # Now split subsections within this block
-        sub_pat = re.compile(r"^###\s+(.*)$", re.MULTILINE)
-        subs: Dict[str, str] = {}
-        sub_matches = list(sub_pat.finditer(sec_md))
-        for j, sm in enumerate(sub_matches):
-            sub_title = sm.group(1).strip()
-            sub_start = sm.start()
-            sub_end = sub_matches[j+1].start() if j+1 < len(sub_matches) else len(sec_md)
-            subs[sub_title] = sec_md[sub_start:sub_end].strip()
-
-        sections[sec_title] = {"full": sec_md, "subsections": subs}
-    return sections
-
-
 @tool
 def wiki_search_qa(query: str, question: str) -> str:
     """Searches Wikipedia for a specific article and answers a question based on its content.
@@ -304,10 +145,13 @@ def wiki_search_qa(query: str, question: str) -> str:
     Args:
         query (str): A concise topic name with optional keywords, ideally matching the relevant Wikipedia page title.
         question (str): The question to answer using the article.
+    Returns:
+        str: The answer to the question based on the retrieved article.
     """
     article = get_wikipedia_article(query)
     markdown = article["markdown"]
-    qa = get_retrieval_qa(markdown)
+    retriever = build_retriever(markdown)
+    qa = create_retrieval_qa(retriever=retriever)
    return qa.invoke(question)
 
 
@@ -344,8 +188,8 @@ def wiki_get_section(
     Returns:
        Markdown string of either the entire section or just the named subsection.
     """
-    …
-    markdown = …
+    result_dict = fetch_wikipedia_page(page_key=page_key)
+    markdown = result_dict.get("markdown")
     sections = parse_sections(markdown)
 
     sec_info = sections.get(section)
@@ -368,7 +212,7 @@ def web_search(query: str, max_results: int = 5) -> str:
 
     Args:
         query (str): The search query.
-        max_results (int): The maximum number of results to return. Default is …
+        max_results (int): The maximum number of results to return. Default is 3.
     """
     if os.getenv("SERPER_API_KEY"):
         # Preferred choice: Use Google Serper API for search
@@ -400,6 +244,8 @@ def web_search(query: str, max_results: int = 5) -> str:
         search_tool = DuckDuckGoSearchResults()
         results = search_tool.invoke(query)
         if results:
+            # Clean up the results to remove any unnecessary spaces or newlines, e.g. \n\n\n
+            results = re.sub(r"\n{2,}", "\n", results.strip())
             return results
         else:
             return "No results found."
@@ -412,12 +258,12 @@ def visit_website(url: str) -> str:
     Args:
         url (str): The URL of the website to visit.
     """
-    …
-    …
-    …
-        return …
-    …
-        return "…
+    try:
+        page_content = MarkdownWebBaseLoader(url).load()[0].page_content
+        # Use retrieval chain if page_content is large
+        return page_content
+    except Exception as e:
+        return f"Could not retrieve website content. Error: {e}"
 
 
 @tool
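A quick smoke test of the refactored tools (hypothetical invocation: the URL, query, and question are placeholders, and running it assumes the search/LLM API keys implied by the requirements above are configured):

# Hypothetical smoke test; LangChain @tool objects take a dict of arguments via .invoke().
from tools import web_search, visit_website, wiki_search_qa

print(web_search.invoke({"query": "LangChain WebBaseLoader markdown"}))
print(visit_website.invoke({"url": "https://example.com"})[:300])
print(wiki_search_qa.invoke({"query": "Python (programming language)", "question": "Who created Python?"}))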
web_utilities.py (ADDED)

@@ -0,0 +1,297 @@
import asyncio
import logging
import re
import requests

from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from markdownify import markdownify as md
from playwright.async_api import async_playwright
from typing import Any, AsyncIterator, Dict, List, Iterator, Optional, Sequence, Union


logger = logging.getLogger(__name__)

UNWANTED_SECTIONS = {
    "references",
    "external links",
    "further reading",
    "see also",
    "notes",
}


def build_metadata(soup: Any, url: str) -> dict:
    """Build metadata from BeautifulSoup output."""
    metadata = {"source": url}
    if title := soup.find("title"):
        metadata["title"] = title.get_text()
    if description := soup.find("meta", attrs={"name": "description"}):
        metadata["description"] = description.get("content", "No description found.")
    if html := soup.find("html"):
        metadata["language"] = html.get("lang", "No language found.")
    return metadata


class MarkdownWebBaseLoader(WebBaseLoader):
    """
    A WebBaseLoader subclass that uses Playwright to render JS, then
    strips boilerplate and converts structured pieces to Markdown.
    """
    def __init__(
        self,
        web_path: Union[str, Sequence[str]] = "",
        header_template: Optional[dict] = None,
        verify_ssl: bool = True,
        proxies: Optional[dict] = None,
        continue_on_failure: bool = False,
        autoset_encoding: bool = True,
        encoding: Optional[str] = None,
        web_paths: Sequence[str] = (),
        requests_per_second: int = 2,
        default_parser: str = "html.parser",
        requests_kwargs: Optional[Dict[str, Any]] = None,
        raise_for_status: bool = False,
        bs_get_text_kwargs: Optional[Dict[str, Any]] = None,
        bs_kwargs: Optional[Dict[str, Any]] = None,
        session: Any = None,
        markdown_kwargs: Optional[Dict[str, Any]] = None,
        unwanted_css: Optional[List[str]] = None,
        unwanted_headings: Optional[List[str]] = None,
        render_wait: float = 1.0,
        *,
        show_progress: bool = True,
        trust_env: bool = False,
    ) -> None:
        """Initialize loader.

        Args:
            markdown_kwargs: Optional[Dict[str, Any]]: Arguments for markdownify.
            unwanted_css: Optional[List[str]]: CSS selectors to remove from the page.
            unwanted_headings: Optional[List[str]]: Headings to remove from the page.
            render_wait: float: Time to wait for JS rendering (default: 1.0 seconds).
        """
        super().__init__(
            web_path=web_path,
            header_template=header_template,
            verify_ssl=verify_ssl,
            proxies=proxies,
            continue_on_failure=continue_on_failure,
            autoset_encoding=autoset_encoding,
            encoding=encoding,
            web_paths=web_paths,
            requests_per_second=requests_per_second,
            default_parser=default_parser,
            requests_kwargs=requests_kwargs,
            raise_for_status=raise_for_status,
            bs_get_text_kwargs=bs_get_text_kwargs,
            bs_kwargs=bs_kwargs,
            session=session,
            show_progress=show_progress,
            trust_env=trust_env,
        )
        self.markdown_kwargs = markdown_kwargs or {
            "heading_style": "ATX",
            "bullets": "*+-",
            "strip": ["a", "span"],
            "table_infer_header": True
        }
        self.unwanted_css = unwanted_css or [
            ".toc", ".navbox", ".sidebar", ".advertisement", ".cookie-banner", ".vertical-navbox",
            ".hatnote", ".reflist", ".mw-references-wrap"
        ]
        self.unwanted_headings = [h.lower() for h in (unwanted_headings or UNWANTED_SECTIONS)]
        self.render_wait = render_wait

    @staticmethod
    def _should_render(html: str, soup: Any) -> bool:
        low_text = len(soup.get_text(strip=True)) < 100
        has_noscript = bool(soup.find("noscript"))
        cf_challenge = "just a moment" in html.lower() or "enable javascript" in html.lower()
        many_scripts = len(soup.find_all("script")) > 20
        return has_noscript or cf_challenge or low_text or many_scripts

    async def _fetch_with_playwright(self, url: str) -> str:
        async with async_playwright() as pw:
            browser = await pw.chromium.launch(headless=True)
            page = await browser.new_page()
            # If you need cookies/auth, you can do:
            # await page.set_extra_http_headers(self.session.headers)
            await page.goto(url)
            await asyncio.sleep(self.render_wait)  # allow JS to finish
            content = await page.content()
            await browser.close()
            return content

    def _scrape(
        self,
        url: str,
        parser: Union[str, None] = None,
        bs_kwargs: Optional[dict] = None,
    ) -> Any:
        if parser is None:
            parser = "xml" if url.endswith(".xml") else self.default_parser
        self._check_parser(parser)

        resp = self.session.get(url, **self.requests_kwargs)
        if self.raise_for_status:
            resp.raise_for_status()
        if self.encoding is not None:
            resp.encoding = self.encoding
        elif self.autoset_encoding:
            resp.encoding = resp.apparent_encoding
        html = resp.text

        soup = BeautifulSoup(html, parser, **(bs_kwargs or {}))

        # If the html looks JS-heavy, re-render with Playwright
        if not url.endswith(".xml") and self._should_render(html, soup):
            try:
                rendered = asyncio.run(self._fetch_with_playwright(url))
                soup = BeautifulSoup(rendered, parser, **(bs_kwargs or {}))
            except Exception as e:
                logger.warning("Playwright rendering failed for %s: %s. Falling back to requests.", url, e)

        return soup

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """
        Collapse runs of spaces, tabs, etc. down to single spaces—but skip
        inside fenced code blocks ```…``` or inline code `…`.
        """
        # Replace non-breaking and invisible spaces with regular spaces
        text = text.replace("\u00A0", " ")
        # Strip zero-width spaces:
        text = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", text)

        # Split out fenced code -> keep code blocks intact while normalizing other text
        parts = re.split(r'(```.*?```)', text, flags=re.S)
        for i, part in enumerate(parts):
            if not part.startswith("```"):
                # further split out inline code
                subparts = re.split(r'(`[^`\n]+`)', part)
                for j, sp in enumerate(subparts):
                    if not sp.startswith("`"):
                        # collapse whitespace, strip edges of each segment
                        subparts[j] = re.sub(r'[ \t\r\f\v]+', ' ', sp).strip()
                parts[i] = "".join(subparts)
        # Rejoin and ensure paragraphs are separated by a single blank line
        normalized = "\n\n".join(p for p in parts if p.strip() != "")
        return normalized

    def _convert_soup_to_text(self, soup: Any) -> str:
        # Strip scripts & styles
        for tag in soup(["script", "style"]):
            tag.decompose()
        # Drop blocks whose first heading matches unwanted
        for sec in soup.find_all(["section", "div", "aside"]):
            h = sec.find(["h1", "h2", "h3", "h4", "h5", "h6"])
            if h and any(h.get_text(strip=True).lower().startswith(u) for u in self.unwanted_headings):
                sec.decompose()
        # Drop by CSS selector
        for sel in self.unwanted_css:
            for el in soup.select(sel):
                el.decompose()
        # Isolate the main content container if present
        soup = soup.find("div", class_="mw-parser-output") or soup.find("main") or soup.find("article") or soup

        # Convert to Markdown text with markdownify
        markdown = md(str(soup), **self.markdown_kwargs)
        markdown = self.normalize_whitespace(markdown)
        return markdown

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load text from the url(s) in web_path."""
        for path in self.web_paths:
            soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
            text = self._convert_soup_to_text(soup)
            metadata = build_metadata(soup, path)
            yield Document(page_content=text, metadata=metadata)

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Async lazy load text from the url(s) in web_path."""
        results = await self.ascrape_all(self.web_paths)
        for path, soup in zip(self.web_paths, results):
            text = self._convert_soup_to_text(soup)
            metadata = build_metadata(soup, path)
            yield Document(page_content=text, metadata=metadata)


def fetch_wikipedia_page(page_key: str, lang: str = "en") -> Dict[str, str]:
    """Fetches a Wikipedia page by its key and returns its content in Markdown format.

    Args:
        page_key (str): The unique key of the Wikipedia page.
        lang (str): The language code for the Wikipedia edition to fetch (default: "en").
    """
    page_key = page_key.replace(" ", "_")  # Ensure the page key is URL-safe
    page_url = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}/page/{page_key}/html"
    visit_website_tool = MarkdownWebBaseLoader(page_url)
    markdown = visit_website_tool.load()[0].page_content
    return {
        "page_key": page_key,
        "markdown": markdown,
    }


def get_wikipedia_article(query: str, lang: str = "en") -> Dict[str, str]:
    """Searches and fetches a Wikipedia article for a given query and returns its content in Markdown format.

    Args:
        query (str): The search query.
        lang (str): The language code for the Wikipedia edition to search (default: "en").
    """
    headers = {
        'User-Agent': 'MyLLMAgent ([email protected])'
    }

    search_url = f"https://api.wikimedia.org/core/v1/wikipedia/en/search/page"
    search_params = {'q': query, 'limit': 1}
    search_response = requests.get(search_url, headers=headers, params=search_params, timeout=15)

    if search_response.status_code != 200:
        raise Exception(f"Search error: {search_response.status_code} - {search_response.text}")

    results = search_response.json().get("pages", [])
    if not results:
        raise Exception(f"No results found for query: {query}")

    page = results[0]
    page_key = page["key"]

    return fetch_wikipedia_page(page_key, lang)


def parse_sections(markdown_text: str) -> Dict[str, Dict]:
    """
    Parses markdown into a nested dict:
    { section_title: {
        "full": full_section_md,
        "subsections": { sub_title: sub_md, ... }
      }, ... }
    """
    # First split top-level sections
    top_pat = re.compile(r"^##\s+(.*)$", re.MULTILINE)
    top_matches = list(top_pat.finditer(markdown_text))
    sections: Dict[str, Dict] = {}
    for i, m in enumerate(top_matches):
        sec_title = m.group(1).strip()
        start = m.start()
        end = top_matches[i+1].start() if i+1 < len(top_matches) else len(markdown_text)
        sec_md = markdown_text[start:end].strip()

        # Now split subsections within this block
        sub_pat = re.compile(r"^###\s+(.*)$", re.MULTILINE)
        subs: Dict[str, str] = {}
        sub_matches = list(sub_pat.finditer(sec_md))
        for j, sm in enumerate(sub_matches):
            sub_title = sm.group(1).strip()
            sub_start = sm.start()
            sub_end = sub_matches[j+1].start() if j+1 < len(sub_matches) else len(sec_md)
            subs[sub_title] = sec_md[sub_start:sub_end].strip()

        sections[sec_title] = {"full": sec_md, "subsections": subs}
    return sections
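A minimal sketch of how the new loader and helpers could be exercised on their own (the example URL and query are placeholders; the loader only falls back to Playwright rendering when the plain HTTP response looks JS-heavy):

# Illustrative sketch: load one page as Markdown, then pull section titles from a Wikipedia article.
from web_utilities import MarkdownWebBaseLoader, get_wikipedia_article, parse_sections

doc = MarkdownWebBaseLoader("https://example.com").load()[0]
print(doc.metadata.get("title"), len(doc.page_content))

article = get_wikipedia_article("Alan Turing")   # placeholder query
sections = parse_sections(article["markdown"])
print(list(sections.keys())[:5])                 # top-level "## " headings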