phucdev committed on
Commit
1284099
·
1 Parent(s): 88a1595

Move utility code into separate modules and add MarkdownWebBaseLoader implementation

Browse files
Files changed (5) hide show
  1. app.py +1 -1
  2. requirements.txt +10 -5
  3. retrieval.py +80 -0
  4. tools.py +20 -174
  5. web_utilities.py +297 -0
app.py CHANGED
@@ -148,7 +148,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
148
  )
149
  else:
150
  print(f"Running agent on {len(questions_data)} questions...")
151
- for item in questions_data:
152
  result = solve_question(item)
153
  results_log.append(result)
154
  with open(results_file_path, "w") as results_file:
 
148
  )
149
  else:
150
  print(f"Running agent on {len(questions_data)} questions...")
151
+ for item in filtered_questions_data:
152
  result = solve_question(item)
153
  results_log.append(result)
154
  with open(results_file_path, "w") as results_file:
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 
1
  beautifulsoup4==4.13.4
2
  datasets==3.5.1
3
  duckduckgo-search==8.0.1
@@ -6,17 +7,21 @@ gradio==5.29.0
6
  hf_xet==1.1.2
7
  huggingface-hub==0.30.2
8
  langchain==0.3.25
9
- langchain-community==0.3.23
10
- langchain-core==0.3.58
11
- langchain_groq==0.3.2
12
- langchain-huggingface==0.1.2
13
- langchain-openai==0.3.16
 
 
14
  langfuse==2.60.5
15
  langgraph==0.4.1
 
16
  numpy==2.2.5
17
  openai-whisper==20240930
18
  openpyxl==3.1.5
19
  pandas==2.2.3
 
20
  pyrootutils~=1.0.4
21
  python-dotenv~=1.1.0
22
  requests==2.32.3
 
1
+ anthropic==0.52.2
2
  beautifulsoup4==4.13.4
3
  datasets==3.5.1
4
  duckduckgo-search==8.0.1
 
7
  hf_xet==1.1.2
8
  huggingface-hub==0.30.2
9
  langchain==0.3.25
10
+ langchain-anthropic==0.3.15
11
+ langchain-community==0.3.24
12
+ langchain-core==0.3.64
13
+ langchain-groq==0.3.2
14
+ langchain-huggingface==0.2.0
15
+ langchain-openai==0.3.21
16
+ langchain-tavily==0.1.6
17
  langfuse==2.60.5
18
  langgraph==0.4.1
19
+ markdownify==1.1.0
20
  numpy==2.2.5
21
  openai-whisper==20240930
22
  openpyxl==3.1.5
23
  pandas==2.2.3
24
+ playwright==1.52.0
25
  pyrootutils~=1.0.4
26
  python-dotenv~=1.1.0
27
  requests==2.32.3
retrieval.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+ from dotenv import find_dotenv, load_dotenv
3
+ from langchain.chains import RetrievalQA
4
+ from langchain.chat_models import init_chat_model
5
+ from langchain.schema import Document
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain_community.vectorstores import FAISS
8
+ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
9
+
10
+
11
def get_default_splitter() -> RecursiveCharacterTextSplitter:
    """Create the text splitter used when a caller does not supply one.

    The separator list starts with Markdown headings (``###``, ``##``, ``#``)
    and then falls back to blank lines, single newlines and spaces. The
    splitter is configured with ``chunk_size=1000`` and ``chunk_overlap=200``.
    """
    # Markdown headings come first so chunks tend to follow section boundaries.
    markdown_first_separators = ["\n### ", "\n## ", "\n# ", "\n\n", "\n", " "]
    return RecursiveCharacterTextSplitter(
        separators=markdown_first_separators,
        chunk_size=1000,
        chunk_overlap=200,
    )
19
+
20
def get_default_embeddings() -> HuggingFaceEmbeddings:
    """Create the embedding model used when a caller does not supply one.

    Uses the ``sentence-transformers/all-MiniLM-L6-v2`` model and passes
    ``device='cpu'`` through ``model_kwargs``.
    """
    cpu_only_kwargs = {'device': 'cpu'}
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs=cpu_only_kwargs,
    )
26
+
27
+
28
def build_retriever(
        data: Union[str, List[Document]],
        splitter: Union[RecursiveCharacterTextSplitter, None] = None,
        embeddings: Union[HuggingFaceEmbeddings, None] = None,
        top_k: int = 5):
    """Builds a retriever from either a raw text string or a list of documents.

    Args:
        data (Union[str, List[Document]]): The source data to build the retriever from.
        splitter (RecursiveCharacterTextSplitter, optional): The text splitter to use.
            Defaults to get_default_splitter().
        embeddings (HuggingFaceEmbeddings, optional): The embedding model to use.
            Defaults to get_default_embeddings().
        top_k (int, optional): The number of top results to return. Defaults to 5.

    Returns:
        The retriever created by ``FAISS.as_retriever`` with ``k`` set to ``top_k``.

    Raises:
        ValueError: If ``data`` is neither a ``str`` nor a ``list``, or if
            splitting it produces no chunks to index.
    """
    splitter = splitter or get_default_splitter()
    embeddings = embeddings or get_default_embeddings()

    if isinstance(data, str):
        # Raw text: split into chunks, then wrap each chunk in a Document.
        docs = [Document(page_content=chunk) for chunk in splitter.split_text(data)]
    elif isinstance(data, list):
        # Already Documents: let the splitter chunk them directly.
        docs = splitter.split_documents(data)
    else:
        raise ValueError(f"Unsupported data type: {type(data)}. Must be str or List[Document].")

    if not docs:
        # Fail early with a clear message instead of letting the vector store
        # choke on an empty batch of embeddings.
        raise ValueError("Cannot build a retriever from empty data: splitting produced no chunks.")

    index = FAISS.from_documents(docs, embeddings)
    return index.as_retriever(search_kwargs={"k": top_k})
59
+
60
+
61
def create_retrieval_qa(
        retriever,
        llm=None
) -> RetrievalQA:
    """Assemble a RetrievalQA chain around the given retriever.

    Args:
        retriever (BaseRetriever): The retriever the QA chain queries.
        llm (LLM, optional): The language model to use. When omitted, the
            environment is loaded from a .env file and a default Groq-hosted
            chat model is initialized.
    """
    chat_model = llm
    if chat_model is None:
        # Load the .env file before initializing the default model.
        load_dotenv(find_dotenv())
        chat_model = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")

    chain_options = {
        "llm": chat_model,
        "chain_type": "stuff",
        "retriever": retriever,
        "return_source_documents": True,
    }
    return RetrievalQA.from_chain_type(**chain_options)
tools.py CHANGED
@@ -1,42 +1,30 @@
1
  import base64
2
  import json
3
  import os
4
- import re
5
- from typing import Optional, Dict
6
-
7
  import pandas as pd
 
8
  import requests
9
  import whisper
10
 
11
- from bs4 import BeautifulSoup
12
  from datetime import datetime
13
  from dotenv import find_dotenv, load_dotenv
14
  from langchain.chains import RetrievalQA
15
  from langchain.chat_models import init_chat_model
16
- from langchain.schema import Document
17
- from langchain.text_splitter import RecursiveCharacterTextSplitter
18
  from langchain_community.document_loaders import (
19
  UnstructuredPDFLoader, UnstructuredPowerPointLoader,
20
  UnstructuredWordDocumentLoader, WebBaseLoader)
21
- from langchain_community.tools import DuckDuckGoSearchResults, GoogleSearchResults
22
  from langchain_community.utilities import GoogleSerperAPIWrapper
23
- from langchain_community.vectorstores import FAISS
24
  from langchain_core.prompts import ChatPromptTemplate
25
  from langchain_core.tools import tool
26
- from langchain_huggingface.embeddings import HuggingFaceEmbeddings
27
  from langchain_tavily import TavilySearch
28
- from markdownify import markdownify as md
29
  from youtube_transcript_api import YouTubeTranscriptApi
30
  from yt_dlp import YoutubeDL
31
 
 
 
32
 
33
- UNWANTED_SECTIONS = {
34
- "references",
35
- "external links",
36
- "further reading",
37
- "see also",
38
- "notes",
39
- }
40
 
41
  @tool
42
  def get_weather_info(location: str) -> str:
@@ -147,153 +135,6 @@ def reverse_text(text: str) -> str:
147
  return text[::-1]
148
 
149
 
150
- def build_retriever(text: str):
151
- """Builds a retriever from the given text.
152
-
153
- Args:
154
- text (str): The text to be used for retrieval.
155
- """
156
- splitter = RecursiveCharacterTextSplitter(
157
- separators=["\n### ", "\n## ", "\n# "],
158
- chunk_size=1000,
159
- chunk_overlap=200,
160
- )
161
- chunks = splitter.split_text(text)
162
- docs = [
163
- Document(page_content=chunk)
164
- for chunk in chunks
165
- ]
166
- hf_embed = HuggingFaceEmbeddings(
167
- model_name="sentence-transformers/all-MiniLM-L6-v2"
168
- )
169
- index = FAISS.from_documents(docs, hf_embed)
170
- return index.as_retriever(search_kwargs={"k": 3})
171
-
172
-
173
- def get_retrieval_qa(text: str):
174
- """Creates a RetrievalQA instance for the given text.
175
- Args:
176
- text (str): The text to be used for retrieval.
177
- """
178
- retriever = build_retriever(text)
179
- llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
180
- return RetrievalQA.from_chain_type(
181
- llm=llm,
182
- chain_type="stuff",
183
- retriever=retriever,
184
- return_source_documents=True,
185
- )
186
-
187
-
188
- def clean_html(html: str) -> str:
189
- soup = BeautifulSoup(html, "html.parser")
190
-
191
- # 1. Remove <script> & <style>
192
- for tag in soup(["script", "style"]):
193
- tag.decompose()
194
-
195
- # 2. Drop whole <section> blocks whose first heading is unwanted
196
- for sec in soup.find_all("section"):
197
- h = sec.find(["h1","h2","h3","h4","h5","h6"])
198
- if h and any(h.get_text(strip=True).lower().startswith(u) for u in UNWANTED_SECTIONS):
199
- sec.decompose()
200
-
201
- # 3. Additional filtering by CSS selector
202
- for selector in [".toc", ".navbox", ".vertical-navbox", ".hatnote", ".reflist", ".mw-references-wrap"]:
203
- for el in soup.select(selector):
204
- el.decompose()
205
-
206
- # 4. Isolate the main content container if present
207
- main = soup.find("div", class_="mw-parser-output")
208
- return str(main or soup)
209
-
210
-
211
- def fetch_page_markdown(page_key: str, lang: str="en") -> str:
212
- """Fetches the page HTML and returns the <body> as Markdown.
213
- Args:
214
- page_key (str): The unique key of the Wikipedia page.
215
- lang (str): The language code for the Wikipedia edition to fetch (default: "en").
216
- """
217
- url = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}/page/{page_key}/html"
218
- resp = requests.get(url, timeout=15)
219
- resp.raise_for_status()
220
- html = clean_html(resp.text) # Optional, but recommended: clean the HTML to remove unwanted sections
221
-
222
- markdown = md(
223
- html,
224
- heading_style="ATX",
225
- bullets="*+-",
226
- table_infer_header=True,
227
- strip=['a', 'span']
228
- )
229
- return markdown
230
-
231
-
232
- def get_wikipedia_article(query: str) -> Dict[str, str]:
233
- """Fetches a Wikipedia article for a given query and returns its content in Markdown format.
234
-
235
- Args:
236
- query (str): The search query.
237
- """
238
- headers = {
239
- 'User-Agent': 'MyLLMAgent ([email protected])'
240
- }
241
-
242
- # Step 1: Search
243
- search_url = f"https://api.wikimedia.org/core/v1/wikipedia/en/search/page"
244
- search_params = {'q': query, 'limit': 1}
245
- search_response = requests.get(search_url, headers=headers, params=search_params, timeout=15)
246
-
247
- if search_response.status_code != 200:
248
- raise Exception(f"Search error: {search_response.status_code} - {search_response.text}")
249
-
250
- results = search_response.json().get("pages", [])
251
- if not results:
252
- raise Exception(f"No results found for query: {query}")
253
-
254
- page = results[0]
255
- page_key = page["key"]
256
-
257
- # Step 2: Get the wiki page, only keep relevant content and convert to Markdown
258
- markdown = fetch_page_markdown(page_key)
259
- return {
260
- "page_key": page_key,
261
- "markdown": markdown,
262
- }
263
-
264
-
265
- def parse_sections(markdown_text: str) -> Dict[str, Dict]:
266
- """
267
- Parses markdown into a nested dict:
268
- { section_title: {
269
- "full": full_section_md,
270
- "subsections": { sub_title: sub_md, ... }
271
- }, ... }
272
- """
273
- # First split top-level sections
274
- top_pat = re.compile(r"^##\s+(.*)$", re.MULTILINE)
275
- top_matches = list(top_pat.finditer(markdown_text))
276
- sections: Dict[str, Dict] = {}
277
- for i, m in enumerate(top_matches):
278
- sec_title = m.group(1).strip()
279
- start = m.start()
280
- end = top_matches[i+1].start() if i+1 < len(top_matches) else len(markdown_text)
281
- sec_md = markdown_text[start:end].strip()
282
-
283
- # Now split subsections within this block
284
- sub_pat = re.compile(r"^###\s+(.*)$", re.MULTILINE)
285
- subs: Dict[str, str] = {}
286
- sub_matches = list(sub_pat.finditer(sec_md))
287
- for j, sm in enumerate(sub_matches):
288
- sub_title = sm.group(1).strip()
289
- sub_start = sm.start()
290
- sub_end = sub_matches[j+1].start() if j+1 < len(sub_matches) else len(sec_md)
291
- subs[sub_title] = sec_md[sub_start:sub_end].strip()
292
-
293
- sections[sec_title] = {"full": sec_md, "subsections": subs}
294
- return sections
295
-
296
-
297
  @tool
298
  def wiki_search_qa(query: str, question: str) -> str:
299
  """Searches Wikipedia for a specific article and answers a question based on its content.
@@ -304,10 +145,13 @@ def wiki_search_qa(query: str, question: str) -> str:
304
  Args:
305
  query (str): A concise topic name with optional keywords, ideally matching the relevant Wikipedia page title.
306
  question (str): The question to answer using the article.
 
 
307
  """
308
  article = get_wikipedia_article(query)
309
  markdown = article["markdown"]
310
- qa = get_retrieval_qa(markdown)
 
311
  return qa.invoke(question)
312
 
313
 
@@ -344,8 +188,8 @@ def wiki_get_section(
344
  Returns:
345
  Markdown string of either the entire section or just the named subsection.
346
  """
347
- page_key = page_key.strip().replace(" ", "_")
348
- markdown = fetch_page_markdown(page_key)
349
  sections = parse_sections(markdown)
350
 
351
  sec_info = sections.get(section)
@@ -368,7 +212,7 @@ def web_search(query: str, max_results: int = 5) -> str:
368
 
369
  Args:
370
  query (str): The search query.
371
- max_results (int): The maximum number of results to return. Default is 5.
372
  """
373
  if os.getenv("SERPER_API_KEY"):
374
  # Preferred choice: Use Google Serper API for search
@@ -400,6 +244,8 @@ def web_search(query: str, max_results: int = 5) -> str:
400
  search_tool = DuckDuckGoSearchResults()
401
  results = search_tool.invoke(query)
402
  if results:
 
 
403
  return results
404
  else:
405
  return "No results found."
@@ -412,12 +258,12 @@ def visit_website(url: str) -> str:
412
  Args:
413
  url (str): The URL of the website to visit.
414
  """
415
- loader = WebBaseLoader(url)
416
- documents = loader.load()
417
- if documents:
418
- return documents[0].page_content
419
- else:
420
- return "No content found."
421
 
422
 
423
  @tool
 
1
  import base64
2
  import json
3
  import os
 
 
 
4
  import pandas as pd
5
+ import re
6
  import requests
7
  import whisper
8
 
 
9
  from datetime import datetime
10
  from dotenv import find_dotenv, load_dotenv
11
  from langchain.chains import RetrievalQA
12
  from langchain.chat_models import init_chat_model
 
 
13
  from langchain_community.document_loaders import (
14
  UnstructuredPDFLoader, UnstructuredPowerPointLoader,
15
  UnstructuredWordDocumentLoader, WebBaseLoader)
16
+ from langchain_community.tools import DuckDuckGoSearchResults
17
  from langchain_community.utilities import GoogleSerperAPIWrapper
 
18
  from langchain_core.prompts import ChatPromptTemplate
19
  from langchain_core.tools import tool
 
20
  from langchain_tavily import TavilySearch
21
+ from typing import Optional
22
  from youtube_transcript_api import YouTubeTranscriptApi
23
  from yt_dlp import YoutubeDL
24
 
25
+ from retrieval import build_retriever, create_retrieval_qa
26
+ from web_utilities import get_wikipedia_article, parse_sections, fetch_wikipedia_page, MarkdownWebBaseLoader
27
 
 
 
 
 
 
 
 
28
 
29
  @tool
30
  def get_weather_info(location: str) -> str:
 
135
  return text[::-1]
136
 
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  @tool
139
  def wiki_search_qa(query: str, question: str) -> str:
140
  """Searches Wikipedia for a specific article and answers a question based on its content.
 
145
  Args:
146
  query (str): A concise topic name with optional keywords, ideally matching the relevant Wikipedia page title.
147
  question (str): The question to answer using the article.
148
+ Returns:
149
+ str: The answer to the question based on the retrieved article.
150
  """
151
  article = get_wikipedia_article(query)
152
  markdown = article["markdown"]
153
+ retriever = build_retriever(markdown)
154
+ qa = create_retrieval_qa(retriever=retriever)
155
  return qa.invoke(question)
156
 
157
 
 
188
  Returns:
189
  Markdown string of either the entire section or just the named subsection.
190
  """
191
+ result_dict = fetch_wikipedia_page(page_key=page_key)
192
+ markdown = result_dict.get("markdown")
193
  sections = parse_sections(markdown)
194
 
195
  sec_info = sections.get(section)
 
212
 
213
  Args:
214
  query (str): The search query.
215
+ max_results (int): The maximum number of results to return. Default is 5.
216
  """
217
  if os.getenv("SERPER_API_KEY"):
218
  # Preferred choice: Use Google Serper API for search
 
244
  search_tool = DuckDuckGoSearchResults()
245
  results = search_tool.invoke(query)
246
  if results:
247
+ # Clean up the results to remove any unnecessary spaces or newlines, e.g. \n\n\n
248
+ results = re.sub(r"\n{2,}", "\n", results.strip())
249
  return results
250
  else:
251
  return "No results found."
 
258
  Args:
259
  url (str): The URL of the website to visit.
260
  """
261
+ try:
262
+ page_content = MarkdownWebBaseLoader(url).load()[0].page_content
263
+ # Use retrieval chain if page_content is large
264
+ return page_content
265
+ except Exception as e:
266
+ return f"Could not retrieve website content. Error: {e}"
267
 
268
 
269
  @tool
web_utilities.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ import re
4
+ import requests
5
+
6
+ from bs4 import BeautifulSoup
7
+ from langchain.chains import RetrievalQA
8
+ from langchain_community.document_loaders import WebBaseLoader
9
+ from langchain_core.documents import Document
10
+ from markdownify import markdownify as md
11
+ from playwright.async_api import async_playwright
12
+ from typing import Any, AsyncIterator, Dict, List, Iterator, Optional, Sequence, Union
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ UNWANTED_SECTIONS = {
18
+ "references",
19
+ "external links",
20
+ "further reading",
21
+ "see also",
22
+ "notes",
23
+ }
24
+
25
+
26
def build_metadata(soup: Any, url: str) -> dict:
    """Collect document metadata from a parsed page.

    Always records ``url`` under ``"source"``. Adds ``"title"``,
    ``"description"`` and ``"language"`` only when the corresponding
    ``<title>``, ``<meta name="description">`` and ``<html>`` elements are
    found; a found element lacking the expected attribute yields a
    placeholder string instead.
    """
    metadata = {"source": url}

    title_tag = soup.find("title")
    if title_tag:
        metadata["title"] = title_tag.get_text()

    description_tag = soup.find("meta", attrs={"name": "description"})
    if description_tag:
        metadata["description"] = description_tag.get("content", "No description found.")

    html_tag = soup.find("html")
    if html_tag:
        metadata["language"] = html_tag.get("lang", "No language found.")

    return metadata
36
+
37
+
38
class MarkdownWebBaseLoader(WebBaseLoader):
    """
    A WebBaseLoader subclass that fetches a page with requests, re-renders it
    with Playwright when the HTML looks JavaScript-dependent, strips
    boilerplate and converts the remaining HTML to Markdown.
    """

    def __init__(
        self,
        web_path: Union[str, Sequence[str]] = "",
        header_template: Optional[dict] = None,
        verify_ssl: bool = True,
        proxies: Optional[dict] = None,
        continue_on_failure: bool = False,
        autoset_encoding: bool = True,
        encoding: Optional[str] = None,
        web_paths: Sequence[str] = (),
        requests_per_second: int = 2,
        default_parser: str = "html.parser",
        requests_kwargs: Optional[Dict[str, Any]] = None,
        raise_for_status: bool = False,
        bs_get_text_kwargs: Optional[Dict[str, Any]] = None,
        bs_kwargs: Optional[Dict[str, Any]] = None,
        session: Any = None,
        markdown_kwargs: Optional[Dict[str, Any]] = None,
        unwanted_css: Optional[List[str]] = None,
        unwanted_headings: Optional[List[str]] = None,
        render_wait: float = 1.0,
        *,
        show_progress: bool = True,
        trust_env: bool = False,
    ) -> None:
        """Initialize loader.

        All parameters not listed below are forwarded unchanged to
        ``WebBaseLoader.__init__``.

        Args:
            markdown_kwargs: Optional[Dict[str, Any]]: Arguments for markdownify.
            unwanted_css: Optional[List[str]]: CSS selectors to remove from the page.
            unwanted_headings: Optional[List[str]]: Headings to remove from the page.
                Matching is case-insensitive and by prefix.
            render_wait: float: Time to wait for JS rendering (default: 1.0 seconds).
        """
        super().__init__(
            web_path=web_path,
            header_template=header_template,
            verify_ssl=verify_ssl,
            proxies=proxies,
            continue_on_failure=continue_on_failure,
            autoset_encoding=autoset_encoding,
            encoding=encoding,
            web_paths=web_paths,
            requests_per_second=requests_per_second,
            default_parser=default_parser,
            requests_kwargs=requests_kwargs,
            raise_for_status=raise_for_status,
            bs_get_text_kwargs=bs_get_text_kwargs,
            bs_kwargs=bs_kwargs,
            session=session,
            show_progress=show_progress,
            trust_env=trust_env,
        )
        self.markdown_kwargs = markdown_kwargs or {
            "heading_style": "ATX",
            "bullets": "*+-",
            "strip": ["a", "span"],
            "table_infer_header": True
        }
        self.unwanted_css = unwanted_css or [
            ".toc", ".navbox", ".sidebar", ".advertisement", ".cookie-banner", ".vertical-navbox",
            ".hatnote", ".reflist", ".mw-references-wrap"
        ]
        # Lower-cased once here so heading comparison is case-insensitive.
        self.unwanted_headings = [h.lower() for h in (unwanted_headings or UNWANTED_SECTIONS)]
        self.render_wait = render_wait

    @staticmethod
    def _should_render(html: str, soup: Any) -> bool:
        """Heuristically decide whether the page needs a JavaScript render.

        Returns True when the parsed page has fewer than 100 characters of
        text, contains a ``<noscript>`` element, mentions "just a moment" or
        "enable javascript", or has more than 20 ``<script>`` elements.
        """
        html_lower = html.lower()
        low_text = len(soup.get_text(strip=True)) < 100
        has_noscript = bool(soup.find("noscript"))
        cf_challenge = "just a moment" in html_lower or "enable javascript" in html_lower
        many_scripts = len(soup.find_all("script")) > 20
        return has_noscript or cf_challenge or low_text or many_scripts

    async def _fetch_with_playwright(self, url: str) -> str:
        """Load ``url`` in headless Chromium and return the rendered HTML."""
        async with async_playwright() as pw:
            browser = await pw.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                # If you need cookies/auth, set extra HTTP headers on the page
                # (e.g. from self.session.headers) before navigating.
                await page.goto(url)
                await asyncio.sleep(self.render_wait)  # allow JS to finish
                return await page.content()
            finally:
                # Close the browser even when navigation or rendering fails.
                await browser.close()

    def _scrape(
        self,
        url: str,
        parser: Union[str, None] = None,
        bs_kwargs: Optional[dict] = None,
    ) -> Any:
        """Fetch ``url`` and return it parsed as a BeautifulSoup object.

        The page is first fetched through ``self.session``. If it is not an
        ``.xml`` URL and ``_should_render`` flags it, the page is rendered
        again with Playwright; any failure there is logged and the
        requests-based result is returned instead.
        """
        if parser is None:
            parser = "xml" if url.endswith(".xml") else self.default_parser
        self._check_parser(parser)

        resp = self.session.get(url, **self.requests_kwargs)
        if self.raise_for_status:
            resp.raise_for_status()
        if self.encoding is not None:
            resp.encoding = self.encoding
        elif self.autoset_encoding:
            resp.encoding = resp.apparent_encoding
        html = resp.text

        soup = BeautifulSoup(html, parser, **(bs_kwargs or {}))

        # If the html looks JS-heavy, re-render with Playwright
        if not url.endswith(".xml") and self._should_render(html, soup):
            try:
                # NOTE: asyncio.run raises RuntimeError when called from a
                # thread that already runs an event loop; that case is caught
                # below and falls back to the requests result.
                rendered = asyncio.run(self._fetch_with_playwright(url))
                soup = BeautifulSoup(rendered, parser, **(bs_kwargs or {}))
            except Exception as e:
                logger.warning("Playwright rendering failed for %s: %s. Falling back to requests.", url, e)

        return soup

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """
        Collapse runs of spaces, tabs, etc. down to single spaces and runs of
        blank lines down to one blank line, but leave fenced code blocks
        ```…``` and inline code `…` untouched.

        Spaces and newlines that separate inline code from the surrounding
        prose are kept, so "use `foo` to" stays readable.
        """
        # Replace non-breaking spaces with regular spaces
        text = text.replace("\u00A0", " ")
        # Strip zero-width characters
        text = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", text)

        # Split out fenced code -> keep code blocks intact while normalizing other text
        parts = re.split(r'(```.*?```)', text, flags=re.S)
        for i, part in enumerate(parts):
            if part.startswith("```"):
                continue
            # Further split out inline code so its contents are not altered
            subparts = re.split(r'(`[^`\n]+`)', part)
            for j, sp in enumerate(subparts):
                if not sp.startswith("`"):
                    # Collapse horizontal whitespace only; do NOT strip the
                    # segment, or words would be glued onto adjacent code spans.
                    subparts[j] = re.sub(r'[ \t\r\f\v]+', ' ', sp)
            prose = "".join(subparts)
            prose = re.sub(r' +\n', '\n', prose)       # drop trailing spaces
            prose = re.sub(r'\n{3,}', '\n\n', prose)   # at most one blank line
            parts[i] = prose.strip()
        # Rejoin prose and fenced blocks, separated by a single blank line
        return "\n\n".join(p for p in parts if p.strip() != "")

    def _convert_soup_to_text(self, soup: Any) -> str:
        """Strip boilerplate from ``soup`` (in place) and return Markdown."""
        # Strip scripts & styles
        for tag in soup(["script", "style"]):
            tag.decompose()
        # Drop blocks whose first heading matches unwanted
        for sec in soup.find_all(["section", "div", "aside"]):
            # A container nested inside one removed earlier in this loop has
            # already been destroyed; skip it.
            if getattr(sec, "decomposed", False):
                continue
            h = sec.find(["h1", "h2", "h3", "h4", "h5", "h6"])
            if h is None:
                continue
            heading = h.get_text(strip=True).lower()
            if any(heading.startswith(u) for u in self.unwanted_headings):
                sec.decompose()
        # Drop by CSS selector
        for sel in self.unwanted_css:
            for el in soup.select(sel):
                el.decompose()
        # Isolate the main content container if present
        soup = soup.find("div", class_="mw-parser-output") or soup.find("main") or soup.find("article") or soup

        # Convert to Markdown text with markdownify
        markdown = md(str(soup), **self.markdown_kwargs)
        return self.normalize_whitespace(markdown)

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load text from the url(s) in web_path."""
        for path in self.web_paths:
            soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
            # Read metadata before conversion, which prunes the soup in place.
            metadata = build_metadata(soup, path)
            text = self._convert_soup_to_text(soup)
            yield Document(page_content=text, metadata=metadata)

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Async lazy load text from the url(s) in web_path."""
        # NOTE(review): ascrape_all is inherited from WebBaseLoader and
        # presumably does not go through _scrape, so no Playwright fallback
        # happens on this path - confirm.
        results = await self.ascrape_all(self.web_paths)
        for path, soup in zip(self.web_paths, results):
            # Read metadata before conversion, which prunes the soup in place.
            metadata = build_metadata(soup, path)
            text = self._convert_soup_to_text(soup)
            yield Document(page_content=text, metadata=metadata)
221
+
222
+
223
def fetch_wikipedia_page(page_key: str, lang: str = "en") -> Dict[str, str]:
    """Fetches a Wikipedia page by its key and returns its content in Markdown format.

    Args:
        page_key (str): The unique key of the Wikipedia page. Surrounding
            whitespace is removed and inner spaces become underscores.
        lang (str): The language code for the Wikipedia edition to fetch (default: "en").

    Returns:
        Dict[str, str]: ``{"page_key": <normalized key>, "markdown": <page content>}``.
    """
    from urllib.parse import quote

    # Normalize to the underscore form used in page keys.
    page_key = page_key.strip().replace(" ", "_")
    # Percent-encode the key for use as a single path segment, so characters
    # such as "/", "?" or "#" in a title cannot change the request path.
    encoded_key = quote(page_key, safe="")
    page_url = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}/page/{encoded_key}/html"
    loader = MarkdownWebBaseLoader(page_url)
    markdown = loader.load()[0].page_content
    return {
        "page_key": page_key,
        "markdown": markdown,
    }
238
+
239
+
240
def get_wikipedia_article(query: str, lang: str = "en") -> Dict[str, str]:
    """Searches and fetches a Wikipedia article for a given query and returns its content in Markdown format.

    Args:
        query (str): The search query.
        lang (str): The language code for the Wikipedia edition to search (default: "en").

    Returns:
        Dict[str, str]: The result of ``fetch_wikipedia_page`` for the top search hit.

    Raises:
        RuntimeError: If the search request does not return HTTP 200.
        LookupError: If the search returns no pages for ``query``.
    """
    headers = {
        'User-Agent': 'MyLLMAgent ([email protected])'
    }

    # Search the same language edition the page is later fetched from.
    search_url = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}/search/page"
    search_params = {'q': query, 'limit': 1}
    search_response = requests.get(search_url, headers=headers, params=search_params, timeout=15)

    if search_response.status_code != 200:
        raise RuntimeError(f"Search error: {search_response.status_code} - {search_response.text}")

    results = search_response.json().get("pages", [])
    if not results:
        raise LookupError(f"No results found for query: {query}")

    # Only the top hit is requested (limit=1), so take the first entry.
    page_key = results[0]["key"]
    return fetch_wikipedia_page(page_key, lang)
266
+
267
+
268
# Heading patterns, compiled once. `[ \t]+` (not `\s+`) keeps the match on the
# heading line, so an empty heading cannot take the next line as its title.
_TOP_SECTION_RE = re.compile(r"^##[ \t]+(.*)$", re.MULTILINE)
_SUBSECTION_RE = re.compile(r"^###[ \t]+(.*)$", re.MULTILINE)


def parse_sections(markdown_text: str) -> Dict[str, Dict]:
    """
    Parses markdown into a nested dict:
    { section_title: {
        "full": full_section_md,
        "subsections": { sub_title: sub_md, ... }
      }, ... }

    Sections start at level-2 (``##``) headings and subsections at level-3
    (``###``) headings. Text before the first ``##`` heading is not included.
    If a title occurs more than once, the last occurrence wins.
    """
    sections: Dict[str, Dict] = {}
    top_matches = list(_TOP_SECTION_RE.finditer(markdown_text))
    for i, m in enumerate(top_matches):
        sec_title = m.group(1).strip()
        # A section runs from its heading to the next "##" heading (or the end).
        end = top_matches[i + 1].start() if i + 1 < len(top_matches) else len(markdown_text)
        sec_md = markdown_text[m.start():end].strip()

        # Split subsections within this section the same way.
        subs: Dict[str, str] = {}
        sub_matches = list(_SUBSECTION_RE.finditer(sec_md))
        for j, sm in enumerate(sub_matches):
            sub_end = sub_matches[j + 1].start() if j + 1 < len(sub_matches) else len(sec_md)
            subs[sm.group(1).strip()] = sec_md[sm.start():sub_end].strip()

        sections[sec_title] = {"full": sec_md, "subsections": subs}
    return sections