Final_Assignment_Template

Sleeping

App Files Files Community

phucdev committed 19 days ago

Commit

1284099

1 Parent(s): 88a1595

Move utility code into separate modules and add MarkdownWebBaseLoader implementation

Browse files

Files changed (5) hide show

app.py +1 -1
requirements.txt +10 -5
retrieval.py +80 -0
tools.py +20 -174
web_utilities.py +297 -0

app.py CHANGED Viewed

@@ -148,7 +148,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         )
     else:
         print(f"Running agent on {len(questions_data)} questions...")
-    for item in questions_data:
         result = solve_question(item)
         results_log.append(result)
     with open(results_file_path, "w") as results_file:

         )
     else:
         print(f"Running agent on {len(questions_data)} questions...")
+    for item in filtered_questions_data:
         result = solve_question(item)
         results_log.append(result)
     with open(results_file_path, "w") as results_file:

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 beautifulsoup4==4.13.4
 datasets==3.5.1
 duckduckgo-search==8.0.1
@@ -6,17 +7,21 @@ gradio==5.29.0
 hf_xet==1.1.2
 huggingface-hub==0.30.2
 langchain==0.3.25
-langchain-community==0.3.23
-langchain-core==0.3.58
-langchain_groq==0.3.2
-langchain-huggingface==0.1.2
-langchain-openai==0.3.16
 langfuse==2.60.5
 langgraph==0.4.1
 numpy==2.2.5
 openai-whisper==20240930
 openpyxl==3.1.5
 pandas==2.2.3
 pyrootutils~=1.0.4
 python-dotenv~=1.1.0
 requests==2.32.3

+anthropic==0.52.2
 beautifulsoup4==4.13.4
 datasets==3.5.1
 duckduckgo-search==8.0.1
 hf_xet==1.1.2
 huggingface-hub==0.30.2
 langchain==0.3.25
+langchain-anthropic==0.3.15
+langchain-community==0.3.24
+langchain-core==0.3.64
+langchain-groq==0.3.2
+langchain-huggingface==0.2.0
+langchain-openai==0.3.21
+langchain-tavily==0.1.6
 langfuse==2.60.5
 langgraph==0.4.1
+markdownify==1.1.0
 numpy==2.2.5
 openai-whisper==20240930
 openpyxl==3.1.5
 pandas==2.2.3
+playwright==1.52.0
 pyrootutils~=1.0.4
 python-dotenv~=1.1.0
 requests==2.32.3

retrieval.py ADDED Viewed

	@@ -0,0 +1,80 @@

+from typing import List, Union
+from dotenv import find_dotenv, load_dotenv
+from langchain.chains import RetrievalQA
+from langchain.chat_models import init_chat_model
+from langchain.schema import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import FAISS
+from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+def get_default_splitter() -> RecursiveCharacterTextSplitter:
+    """Create the text splitter used when the caller does not supply one.
+    Returns:
+        RecursiveCharacterTextSplitter: Splitter configured with a chunk size
+        of 1000 and a chunk overlap of 200.
+    """
+    # Markdown headings come first in the separator list, followed by
+    # paragraph breaks, line breaks and single spaces.
+    markdown_first_separators = ["\n### ", "\n## ", "\n# ", "\n\n", "\n", " "]
+    splitter = RecursiveCharacterTextSplitter(
+        separators=markdown_first_separators,
+        chunk_size=1000,
+        chunk_overlap=200,
+    )
+    return splitter
+def get_default_embeddings() -> HuggingFaceEmbeddings:
+    """Create the embedding model used when the caller does not supply one.
+    Returns:
+        HuggingFaceEmbeddings: Wrapper around the
+        sentence-transformers/all-MiniLM-L6-v2 model, configured for the CPU device.
+    """
+    embedding_options = {
+        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
+        "model_kwargs": {"device": "cpu"},
+    }
+    return HuggingFaceEmbeddings(**embedding_options)
+def build_retriever(
+        data: Union[str, List[Document]],
+        splitter: Union[RecursiveCharacterTextSplitter, None] = None,
+        embeddings: Union[HuggingFaceEmbeddings, None] = None,
+        top_k: int = 5):
+    """Builds a FAISS-backed retriever from either a raw text string or a list of documents.
+    Args:
+        data (Union[str, List[Document]]): The source data to build the retriever from.
+        splitter (RecursiveCharacterTextSplitter, optional): The text splitter to use.
+                                                            Defaults to get_default_splitter().
+        embeddings (HuggingFaceEmbeddings, optional): The embedding model to use.
+                                                     Defaults to get_default_embeddings().
+        top_k (int, optional): The number of top results to return. Defaults to 5.
+    Returns:
+        The retriever created by ``as_retriever`` on the FAISS index, with ``k`` set to ``top_k``.
+    Raises:
+        ValueError: If ``data`` is neither a str nor a list, or if splitting it
+            produces no chunks to index.
+    """
+    splitter = splitter or get_default_splitter()
+    embeddings = embeddings or get_default_embeddings()
+    if isinstance(data, str):
+        # If the input is a raw string, split it into chunks first
+        chunks = splitter.split_text(data)
+        # Then convert those chunks into Document objects
+        docs = [Document(page_content=chunk) for chunk in chunks]
+    elif isinstance(data, list):
+        # If the input is already a list of documents, split them directly
+        docs = splitter.split_documents(data)
+    else:
+        raise ValueError(f"Unsupported data type: {type(data)}. Must be str or List[Document].")
+    if not docs:
+        # Nothing to embed (e.g. empty page text): fail here with a clear message
+        # instead of handing the vector store an empty batch.
+        raise ValueError("No content to index: the input produced no text chunks.")
+    index = FAISS.from_documents(docs, embeddings)
+    return index.as_retriever(search_kwargs={"k": top_k})
+def create_retrieval_qa(
+        retriever,
+        llm=None
+    ) -> RetrievalQA:
+    """Wraps a retriever and a language model in a "stuff" RetrievalQA chain.
+    Args:
+        retriever (BaseRetriever): The retriever that supplies context to the chain.
+        llm (LLM, optional): The language model to use. When omitted, the .env file
+                                is loaded and the Groq llama-4-scout chat model is initialized.
+    Returns:
+        RetrievalQA: The chain, configured to also return its source documents.
+    """
+    qa_llm = llm
+    if qa_llm is None:
+        # Load environment variables from the discovered .env file before
+        # initializing the default chat model.
+        load_dotenv(find_dotenv())
+        qa_llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
+    chain_options = {
+        "chain_type": "stuff",
+        "retriever": retriever,
+        "return_source_documents": True,
+    }
+    return RetrievalQA.from_chain_type(llm=qa_llm, **chain_options)

tools.py CHANGED Viewed

@@ -1,42 +1,30 @@
 import base64
 import json
 import os
-import re
-from typing import Optional, Dict
 import pandas as pd
 import requests
 import whisper
-from bs4 import BeautifulSoup
 from datetime import datetime
 from dotenv import find_dotenv, load_dotenv
 from langchain.chains import RetrievalQA
 from langchain.chat_models import init_chat_model
-from langchain.schema import Document
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import (
     UnstructuredPDFLoader, UnstructuredPowerPointLoader,
     UnstructuredWordDocumentLoader, WebBaseLoader)
-from langchain_community.tools import DuckDuckGoSearchResults, GoogleSearchResults
 from langchain_community.utilities import GoogleSerperAPIWrapper
-from langchain_community.vectorstores import FAISS
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.tools import tool
-from langchain_huggingface.embeddings import HuggingFaceEmbeddings
 from langchain_tavily import TavilySearch
-from markdownify import markdownify as md
 from youtube_transcript_api import YouTubeTranscriptApi
 from yt_dlp import YoutubeDL
-UNWANTED_SECTIONS = {
-    "references",
-    "external links",
-    "further reading",
-    "see also",
-    "notes",
-}
 @tool
 def get_weather_info(location: str) -> str:
@@ -147,153 +135,6 @@ def reverse_text(text: str) -> str:
     return text[::-1]
-def build_retriever(text: str):
-    """Builds a retriever from the given text.
-    Args:
-        text (str): The text to be used for retrieval.
-    """
-    splitter = RecursiveCharacterTextSplitter(
-        separators=["\n### ", "\n## ", "\n# "],
-        chunk_size=1000,
-        chunk_overlap=200,
-    )
-    chunks = splitter.split_text(text)
-    docs = [
-        Document(page_content=chunk)
-        for chunk in chunks
-    ]
-    hf_embed = HuggingFaceEmbeddings(
-        model_name="sentence-transformers/all-MiniLM-L6-v2"
-    )
-    index = FAISS.from_documents(docs, hf_embed)
-    return index.as_retriever(search_kwargs={"k": 3})
-def get_retrieval_qa(text: str):
-    """Creates a RetrievalQA instance for the given text.
-    Args:
-        text (str): The text to be used for retrieval.
-    """
-    retriever = build_retriever(text)
-    llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
-    return RetrievalQA.from_chain_type(
-        llm=llm,
-        chain_type="stuff",
-        retriever=retriever,
-        return_source_documents=True,
-    )
-def clean_html(html: str) -> str:
-    soup = BeautifulSoup(html, "html.parser")
-    # 1. Remove <script> & <style>
-    for tag in soup(["script", "style"]):
-        tag.decompose()
-    # 2. Drop whole <section> blocks whose first heading is unwanted
-    for sec in soup.find_all("section"):
-        h = sec.find(["h1","h2","h3","h4","h5","h6"])
-        if h and any(h.get_text(strip=True).lower().startswith(u) for u in UNWANTED_SECTIONS):
-            sec.decompose()
-    # 3. Additional filtering by CSS selector
-    for selector in [".toc", ".navbox", ".vertical-navbox", ".hatnote", ".reflist", ".mw-references-wrap"]:
-        for el in soup.select(selector):
-            el.decompose()
-    # 4. Isolate the main content container if present
-    main = soup.find("div", class_="mw-parser-output")
-    return str(main or soup)
-def fetch_page_markdown(page_key: str, lang: str="en") -> str:
-    """Fetches the page HTML and returns the <body> as Markdown.
-    Args:
-        page_key (str): The unique key of the Wikipedia page.
-        lang (str): The language code for the Wikipedia edition to fetch (default: "en").
-    """
-    url = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}/page/{page_key}/html"
-    resp = requests.get(url, timeout=15)
-    resp.raise_for_status()
-    html = clean_html(resp.text)    # Optional, but recommended: clean the HTML to remove unwanted sections
-    markdown = md(
-        html,
-        heading_style="ATX",
-        bullets="*+-",
-        table_infer_header=True,
-        strip=['a', 'span']
-    )
-    return markdown
-def get_wikipedia_article(query: str) -> Dict[str, str]:
-    """Fetches a Wikipedia article for a given query and returns its content in Markdown format.
-    Args:
-        query (str): The search query.
-    """
-    headers = {
-        'User-Agent': 'MyLLMAgent ([email protected])'
-    }
-    # Step 1: Search
-    search_url = f"https://api.wikimedia.org/core/v1/wikipedia/en/search/page"
-    search_params = {'q': query, 'limit': 1}
-    search_response = requests.get(search_url, headers=headers, params=search_params, timeout=15)
-    if search_response.status_code != 200:
-        raise Exception(f"Search error: {search_response.status_code} - {search_response.text}")
-    results = search_response.json().get("pages", [])
-    if not results:
-        raise Exception(f"No results found for query: {query}")
-    page = results[0]
-    page_key = page["key"]
-    # Step 2: Get the wiki page, only keep relevant content and convert to Markdown
-    markdown = fetch_page_markdown(page_key)
-    return {
-        "page_key": page_key,
-        "markdown": markdown,
-    }
-def parse_sections(markdown_text: str) -> Dict[str, Dict]:
-    """
-    Parses markdown into a nested dict:
-    { section_title: {
-         "full": full_section_md,
-         "subsections": { sub_title: sub_md, ... }
-      }, ... }
-    """
-    # First split top-level sections
-    top_pat = re.compile(r"^##\s+(.*)$", re.MULTILINE)
-    top_matches = list(top_pat.finditer(markdown_text))
-    sections: Dict[str, Dict] = {}
-    for i, m in enumerate(top_matches):
-        sec_title = m.group(1).strip()
-        start = m.start()
-        end = top_matches[i+1].start() if i+1 < len(top_matches) else len(markdown_text)
-        sec_md = markdown_text[start:end].strip()
-        # Now split subsections within this block
-        sub_pat = re.compile(r"^###\s+(.*)$", re.MULTILINE)
-        subs: Dict[str, str] = {}
-        sub_matches = list(sub_pat.finditer(sec_md))
-        for j, sm in enumerate(sub_matches):
-            sub_title = sm.group(1).strip()
-            sub_start = sm.start()
-            sub_end = sub_matches[j+1].start() if j+1 < len(sub_matches) else len(sec_md)
-            subs[sub_title] = sec_md[sub_start:sub_end].strip()
-        sections[sec_title] = {"full": sec_md, "subsections": subs}
-    return sections
 @tool
 def wiki_search_qa(query: str, question: str) -> str:
     """Searches Wikipedia for a specific article and answers a question based on its content.
@@ -304,10 +145,13 @@ def wiki_search_qa(query: str, question: str) -> str:
     Args:
         query (str): A concise topic name with optional keywords, ideally matching the relevant Wikipedia page title.
         question (str): The question to answer using the article.
     """
     article = get_wikipedia_article(query)
     markdown = article["markdown"]
-    qa = get_retrieval_qa(markdown)
     return qa.invoke(question)
@@ -344,8 +188,8 @@ def wiki_get_section(
     Returns:
         Markdown string of either the entire section or just the named subsection.
     """
-    page_key = page_key.strip().replace(" ", "_")
-    markdown = fetch_page_markdown(page_key)
     sections = parse_sections(markdown)
     sec_info = sections.get(section)
@@ -368,7 +212,7 @@ def web_search(query: str, max_results: int = 5) -> str:
     Args:
         query (str): The search query.
-        max_results (int): The maximum number of results to return. Default is 5.
     """
     if os.getenv("SERPER_API_KEY"):
         # Preferred choice: Use Google Serper API for search
@@ -400,6 +244,8 @@ def web_search(query: str, max_results: int = 5) -> str:
         search_tool = DuckDuckGoSearchResults()
         results = search_tool.invoke(query)
     if results:
         return results
     else:
         return "No results found."
@@ -412,12 +258,12 @@ def visit_website(url: str) -> str:
     Args:
         url (str): The URL of the website to visit.
     """
-    loader = WebBaseLoader(url)
-    documents = loader.load()
-    if documents:
-        return documents[0].page_content
-    else:
-        return "No content found."
 @tool

 import base64
 import json
 import os
 import pandas as pd
+import re
 import requests
 import whisper
 from datetime import datetime
 from dotenv import find_dotenv, load_dotenv
 from langchain.chains import RetrievalQA
 from langchain.chat_models import init_chat_model
 from langchain_community.document_loaders import (
     UnstructuredPDFLoader, UnstructuredPowerPointLoader,
     UnstructuredWordDocumentLoader, WebBaseLoader)
+from langchain_community.tools import DuckDuckGoSearchResults
 from langchain_community.utilities import GoogleSerperAPIWrapper
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.tools import tool
 from langchain_tavily import TavilySearch
+from typing import Optional
 from youtube_transcript_api import YouTubeTranscriptApi
 from yt_dlp import YoutubeDL
+from retrieval import build_retriever, create_retrieval_qa
+from web_utilities import get_wikipedia_article, parse_sections, fetch_wikipedia_page, MarkdownWebBaseLoader
 @tool
 def get_weather_info(location: str) -> str:
     return text[::-1]
 @tool
 def wiki_search_qa(query: str, question: str) -> str:
     """Searches Wikipedia for a specific article and answers a question based on its content.
     Args:
         query (str): A concise topic name with optional keywords, ideally matching the relevant Wikipedia page title.
         question (str): The question to answer using the article.
+    Returns:
+        str: The answer to the question based on the retrieved article.
     """
     article = get_wikipedia_article(query)
     markdown = article["markdown"]
+    retriever = build_retriever(markdown)
+    qa = create_retrieval_qa(retriever=retriever)
     return qa.invoke(question)
     Returns:
         Markdown string of either the entire section or just the named subsection.
     """
+    result_dict = fetch_wikipedia_page(page_key=page_key)
+    markdown = result_dict.get("markdown")
     sections = parse_sections(markdown)
     sec_info = sections.get(section)
     Args:
         query (str): The search query.
+        max_results (int): The maximum number of results to return. Default is 5.
     """
     if os.getenv("SERPER_API_KEY"):
         # Preferred choice: Use Google Serper API for search
         search_tool = DuckDuckGoSearchResults()
         results = search_tool.invoke(query)
     if results:
+        # Clean up the results to remove any unnecessary spaces or newlines, e.g. \n\n\n
+        results = re.sub(r"\n{2,}", "\n", results.strip())
         return results
     else:
         return "No results found."
     Args:
         url (str): The URL of the website to visit.
     """
+    try:
+        page_content = MarkdownWebBaseLoader(url).load()[0].page_content
+        # TODO: use a retrieval chain when page_content is large; for now the full page is returned
+        return page_content
+    except Exception as e:
+        return f"Could not retrieve website content. Error: {e}"
 @tool

web_utilities.py ADDED Viewed

	@@ -0,0 +1,297 @@

+import asyncio
+import logging
+import re
+import requests
+from bs4 import BeautifulSoup
+from langchain.chains import RetrievalQA
+from langchain_community.document_loaders import WebBaseLoader
+from langchain_core.documents import Document
+from markdownify import markdownify as md
+from playwright.async_api import async_playwright
+from typing import Any, AsyncIterator, Dict, List, Iterator, Optional, Sequence, Union
+logger = logging.getLogger(__name__)
+# Lower-case heading prefixes; page blocks whose first heading starts with one
+# of these are removed before the HTML is converted to Markdown.
+UNWANTED_SECTIONS = {
+    "external links",
+    "further reading",
+    "notes",
+    "references",
+    "see also",
+}
+def build_metadata(soup: Any, url: str) -> dict:
+    """Collect page-level metadata from parsed HTML.
+    Args:
+        soup (Any): Parsed document exposing a BeautifulSoup-style ``find``.
+        url (str): The address the document was loaded from; stored as "source".
+    Returns:
+        dict: Always contains "source"; "title", "description" and "language"
+        are added only when the corresponding tag is found.
+    """
+    metadata = {"source": url}
+    title_tag = soup.find("title")
+    if title_tag:
+        metadata["title"] = title_tag.get_text()
+    description_tag = soup.find("meta", attrs={"name": "description"})
+    if description_tag:
+        metadata["description"] = description_tag.get("content", "No description found.")
+    html_tag = soup.find("html")
+    if html_tag:
+        metadata["language"] = html_tag.get("lang", "No language found.")
+    return metadata
+class MarkdownWebBaseLoader(WebBaseLoader):
+    """
+    A WebBaseLoader subclass that returns page content as Markdown.
+    Pages are fetched with the loader's requests session; when the response
+    looks JS-dependent (see ``_should_render``) the synchronous ``_scrape``
+    path re-renders it with Playwright. Boilerplate is then stripped and the
+    remaining HTML is converted with markdownify.
+    """
+    def __init__(
+        self,
+        web_path: Union[str, Sequence[str]] = "",
+        header_template: Optional[dict] = None,
+        verify_ssl: bool = True,
+        proxies: Optional[dict] = None,
+        continue_on_failure: bool = False,
+        autoset_encoding: bool = True,
+        encoding: Optional[str] = None,
+        web_paths: Sequence[str] = (),
+        requests_per_second: int = 2,
+        default_parser: str = "html.parser",
+        requests_kwargs: Optional[Dict[str, Any]] = None,
+        raise_for_status: bool = False,
+        bs_get_text_kwargs: Optional[Dict[str, Any]] = None,
+        bs_kwargs: Optional[Dict[str, Any]] = None,
+        session: Any = None,
+        markdown_kwargs: Optional[Dict[str, Any]] = None,
+        unwanted_css: Optional[List[str]] = None,
+        unwanted_headings: Optional[List[str]] = None,
+        render_wait: float = 1.0,
+        *,
+        show_progress: bool = True,
+        trust_env: bool = False,
+    ) -> None:
+        """Initialize loader.
+        All parameters not listed below are forwarded unchanged to WebBaseLoader.
+        Args:
+            markdown_kwargs: Optional[Dict[str, Any]]: Arguments for markdownify.
+                Defaults to ATX headings, "*+-" bullets, stripped <a>/<span> tags
+                and inferred table headers.
+            unwanted_css: Optional[List[str]]: CSS selectors to remove from the page.
+            unwanted_headings: Optional[List[str]]: Headings to remove from the page
+                (compared lower-cased; defaults to UNWANTED_SECTIONS).
+            render_wait: float: Time to wait for JS rendering (default: 1.0 seconds).
+        """
+        super().__init__(
+            web_path=web_path,
+            header_template=header_template,
+            verify_ssl=verify_ssl,
+            proxies=proxies,
+            continue_on_failure=continue_on_failure,
+            autoset_encoding=autoset_encoding,
+            encoding=encoding,
+            web_paths=web_paths,
+            requests_per_second=requests_per_second,
+            default_parser=default_parser,
+            requests_kwargs=requests_kwargs,
+            raise_for_status=raise_for_status,
+            bs_get_text_kwargs=bs_get_text_kwargs,
+            bs_kwargs=bs_kwargs,
+            session=session,
+            show_progress=show_progress,
+            trust_env=trust_env,
+        )
+        self.markdown_kwargs = markdown_kwargs or {
+            "heading_style": "ATX",
+            "bullets": "*+-",
+            "strip": ["a", "span"],
+            "table_infer_header": True
+        }
+        self.unwanted_css = unwanted_css or [
+            ".toc", ".navbox", ".sidebar", ".advertisement", ".cookie-banner", ".vertical-navbox",
+            ".hatnote", ".reflist", ".mw-references-wrap"
+        ]
+        self.unwanted_headings = [h.lower() for h in (unwanted_headings or UNWANTED_SECTIONS)]
+        self.render_wait = render_wait
+    @staticmethod
+    def _should_render(html: str, soup: Any) -> bool:
+        """Heuristic: return True when the fetched HTML likely needs a JS-capable browser.
+        Any single signal is enough: very little visible text, a <noscript> tag,
+        a "just a moment" / "enable javascript" notice, or more than 20 <script> tags.
+        """
+        lowered = html.lower()
+        low_text = len(soup.get_text(strip=True)) < 100
+        has_noscript = bool(soup.find("noscript"))
+        cf_challenge = "just a moment" in lowered or "enable javascript" in lowered
+        many_scripts = len(soup.find_all("script")) > 20
+        return has_noscript or cf_challenge or low_text or many_scripts
+    async def _fetch_with_playwright(self, url: str) -> str:
+        """Load ``url`` in headless Chromium and return the rendered page HTML."""
+        async with async_playwright() as pw:
+            browser = await pw.chromium.launch(headless=True)
+            try:
+                page = await browser.new_page()
+                # If you need cookies/auth, you can do:
+                # await page.set_extra_http_headers(self.session.headers)
+                await page.goto(url)
+                await asyncio.sleep(self.render_wait)  # allow JS to finish
+                return await page.content()
+            finally:
+                # Close the browser even when navigation or rendering fails.
+                await browser.close()
+    def _scrape(
+            self,
+            url: str,
+            parser: Union[str, None] = None,
+            bs_kwargs: Optional[dict] = None,
+    ) -> Any:
+        """Fetch ``url`` and return it parsed as BeautifulSoup.
+        The page is first requested through the loader's session. If it is not an
+        ``.xml`` URL and ``_should_render`` flags it, the page is rendered again
+        with Playwright; when that fails the plain response is used and a
+        warning is logged.
+        """
+        if parser is None:
+            parser = "xml" if url.endswith(".xml") else self.default_parser
+        self._check_parser(parser)
+        resp = self.session.get(url, **self.requests_kwargs)
+        if self.raise_for_status:
+            resp.raise_for_status()
+        if self.encoding is not None:
+            resp.encoding = self.encoding
+        elif self.autoset_encoding:
+            resp.encoding = resp.apparent_encoding
+        html = resp.text
+        soup = BeautifulSoup(html, parser, **(bs_kwargs or {}))
+        # If the html looks JS-heavy, re-render with Playwright
+        if not url.endswith(".xml") and self._should_render(html, soup):
+            try:
+                rendered = asyncio.run(self._fetch_with_playwright(url))
+                soup = BeautifulSoup(rendered, parser, **(bs_kwargs or {}))
+            except Exception as e:
+                # Best effort: keep the soup built from the plain response.
+                logger.warning("Playwright rendering failed for %s: %s. Falling back to requests.", url, e)
+        return soup
+    @staticmethod
+    def normalize_whitespace(text: str) -> str:
+        """
+        Collapse runs of spaces, tabs, etc. down to single spaces and runs of
+        blank lines down to one blank line, but leave fenced code blocks
+        ```…``` and inline code `…` untouched.
+        Text and fenced code blocks are joined with a single blank line.
+        """
+        # Replace non-breaking and invisible spaces with regular spaces
+        text = text.replace("\u00A0", " ")
+        # Strip zero-width spaces:
+        text = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", text)
+        # Split out fenced code -> keep code blocks intact while normalizing other text
+        parts = re.split(r'(```.*?```)', text, flags=re.S)
+        for i, part in enumerate(parts):
+            if part.startswith("```"):
+                continue
+            # further split out inline code
+            subparts = re.split(r'(`[^`\n]+`)', part)
+            for j, sp in enumerate(subparts):
+                if not sp.startswith("`"):
+                    # Collapse horizontal whitespace only. The segments are NOT
+                    # stripped individually: that would delete the space or
+                    # newline separating an inline code span from its neighbours.
+                    subparts[j] = re.sub(r'[ \t\r\f\v]+', ' ', sp)
+            joined = "".join(subparts)
+            # Reduce three or more consecutive line breaks (optionally separated
+            # by a single leftover space) to exactly one blank line.
+            joined = re.sub(r'\n(?: ?\n){2,}', '\n\n', joined)
+            parts[i] = joined.strip()
+        # Rejoin text and code blocks, separated by a single blank line
+        normalized = "\n\n".join(p for p in parts if p.strip() != "")
+        return normalized
+    def _convert_soup_to_text(self, soup: Any) -> str:
+        """Strip boilerplate from ``soup`` (in place) and return the main content as Markdown."""
+        # Strip scripts & styles
+        for tag in soup(["script", "style"]):
+            tag.decompose()
+        # Drop blocks whose first heading matches unwanted
+        for sec in soup.find_all(["section", "div", "aside"]):
+            h = sec.find(["h1", "h2", "h3", "h4", "h5", "h6"])
+            if h and any(h.get_text(strip=True).lower().startswith(u) for u in self.unwanted_headings):
+                sec.decompose()
+        # Drop by CSS selector
+        for sel in self.unwanted_css:
+            for el in soup.select(sel):
+                el.decompose()
+        # Isolate the main content container if present
+        soup = soup.find("div", class_="mw-parser-output") or soup.find("main") or soup.find("article") or soup
+        # Convert to Markdown text with markdownify
+        markdown = md(str(soup), **self.markdown_kwargs)
+        markdown = self.normalize_whitespace(markdown)
+        return markdown
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazy load text from the url(s) in web_path."""
+        for path in self.web_paths:
+            soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
+            text = self._convert_soup_to_text(soup)
+            metadata = build_metadata(soup, path)
+            yield Document(page_content=text, metadata=metadata)
+    async def alazy_load(self) -> AsyncIterator[Document]:
+        """Async lazy load text from the url(s) in web_path."""
+        # NOTE(review): this relies on the inherited ascrape_all; presumably it
+        # does not go through _scrape and therefore skips the Playwright
+        # re-render. Confirm against WebBaseLoader.
+        results = await self.ascrape_all(self.web_paths)
+        for path, soup in zip(self.web_paths, results):
+            text = self._convert_soup_to_text(soup)
+            metadata = build_metadata(soup, path)
+            yield Document(page_content=text, metadata=metadata)
+def fetch_wikipedia_page(page_key: str, lang: str = "en") -> Dict[str, str]:
+    """Fetches a Wikipedia page by its key and returns its content in Markdown format.
+    Args:
+        page_key (str): The unique key of the Wikipedia page.
+        lang (str): The language code for the Wikipedia edition to fetch (default: "en").
+    Returns:
+        Dict[str, str]: ``{"page_key": <key with spaces replaced by underscores>,
+        "markdown": <page content as Markdown>}``.
+    Raises:
+        requests.HTTPError: If the page request returns an error status (e.g. unknown page).
+    """
+    page_key = page_key.replace(" ", "_")  # Page keys use underscores instead of spaces
+    page_url = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}/page/{page_key}/html"
+    # raise_for_status=True: fail on HTTP errors instead of converting the
+    # error response body to Markdown and returning it as article text.
+    loader = MarkdownWebBaseLoader(page_url, raise_for_status=True)
+    markdown = loader.load()[0].page_content
+    return {
+        "page_key": page_key,
+        "markdown": markdown,
+    }
+def get_wikipedia_article(query: str, lang: str = "en") -> Dict[str, str]:
+    """Searches Wikipedia for a query and returns the top hit's content in Markdown format.
+    Args:
+        query (str): The search query.
+        lang (str): The language code for the Wikipedia edition to search (default: "en").
+    Returns:
+        Dict[str, str]: The result of fetch_wikipedia_page() for the first search hit.
+    Raises:
+        Exception: If the search request does not return HTTP 200, or returns no pages.
+    """
+    headers = {
+        'User-Agent': 'MyLLMAgent ([email protected])'
+    }
+    # Search the same language edition the page is fetched from below, so the
+    # returned page key is valid for that edition.
+    search_url = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}/search/page"
+    search_params = {'q': query, 'limit': 1}
+    search_response = requests.get(search_url, headers=headers, params=search_params, timeout=15)
+    if search_response.status_code != 200:
+        raise Exception(f"Search error: {search_response.status_code} - {search_response.text}")
+    results = search_response.json().get("pages", [])
+    if not results:
+        raise Exception(f"No results found for query: {query}")
+    # Only one result was requested; use its key to fetch the full page.
+    page = results[0]
+    page_key = page["key"]
+    return fetch_wikipedia_page(page_key, lang)
+def parse_sections(markdown_text: str) -> Dict[str, Dict]:
+    """
+    Splits markdown on "## " headings and, inside each section, on "### " headings.
+    Returns a nested dict:
+    { section_title: {
+         "full": section markdown including its heading line,
+         "subsections": { sub_title: subsection markdown, ... }
+      }, ... }
+    Text before the first "## " heading is not included.
+    """
+    def _split_on_headings(text: str, heading_re) -> Dict[str, str]:
+        # Map each heading title to the stripped text running from that heading
+        # up to the next match (or the end of the text).
+        hits = list(heading_re.finditer(text))
+        boundaries = [hit.start() for hit in hits] + [len(text)]
+        return {
+            hit.group(1).strip(): text[boundaries[k]:boundaries[k + 1]].strip()
+            for k, hit in enumerate(hits)
+        }
+    section_re = re.compile(r"^##\s+(.*)$", re.MULTILINE)
+    subsection_re = re.compile(r"^###\s+(.*)$", re.MULTILINE)
+    return {
+        title: {"full": body, "subsections": _split_on_headings(body, subsection_re)}
+        for title, body in _split_on_headings(markdown_text, section_re).items()
+    }