Nymbo committed
Commit 9e4885c · verified · Parent: b5c0401

adding markdown scraper option to Fetch_Webpage tool

Files changed (1): app.py (+61 −6)
app.py CHANGED
@@ -19,6 +19,7 @@ from typing import List, Dict, Tuple, Annotated
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
+from markdownify import markdownify as md
 from readability import Document
 from urllib.parse import urljoin, urldefrag, urlparse
 from duckduckgo_search import DDGS
@@ -175,6 +176,50 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
     return clean_text, s
 
 
+def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str) -> str:
+    """
+    Convert the page's main content (or body fallback) to Markdown, similar to
+    web-scraper's Content Scraper tool, but without any file download side-effects.
+
+    Steps:
+    - Remove noisy elements (script/style/nav/footer/header/aside)
+    - Prefer <main>, <article>, or common content containers; fallback to <body>
+    - Convert to Markdown with ATX headings
+    - Clean up excessive newlines, empty links, and whitespace
+    - Prepend a title header when available
+    """
+    # Remove unwanted elements globally first
+    for element in full_soup.select("script, style, nav, footer, header, aside"):
+        element.decompose()
+
+    # Try common main-content containers, then fallback to body
+    main = (
+        full_soup.find("main")
+        or full_soup.find("article")
+        or full_soup.find("div", class_=re.compile(r"content|main|post|article", re.I))
+        or full_soup.find("body")
+    )
+
+    if not main:
+        return "No main content found on the webpage."
+
+    # Convert selected HTML to Markdown
+    markdown_text = md(str(main), heading_style="ATX")
+
+    # Clean up the markdown similar to web-scraper
+    markdown_text = re.sub(r"\n{3,}", "\n\n", markdown_text)
+    markdown_text = re.sub(r"\[\s*\]\([^)]*\)", "", markdown_text)  # empty links
+    markdown_text = re.sub(r"[ \t]+", " ", markdown_text)
+    markdown_text = markdown_text.strip()
+
+    # Add title if present
+    title = full_soup.find("title")
+    if title and title.get_text(strip=True):
+        markdown_text = f"# {title.get_text(strip=True)}\n\n{markdown_text}"
+
+    return markdown_text or "No content could be extracted."
+
+
 def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
     """
     Collect clean, unique, absolute links from the readable section only.
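For reviewers who want to exercise the new helper's pipeline (strip noise, pick a content root, markdownify, regex cleanup) in isolation, here is a minimal standalone sketch. The sample HTML, variable names, and expected output are illustrative, not part of the commit:

```python
import re
from bs4 import BeautifulSoup
from markdownify import markdownify as md  # same import alias the commit adds

# Illustrative input; any page with a nav/main structure behaves the same way
html = (
    "<html><head><title>Demo</title></head><body>"
    "<nav>menu that should disappear</nav>"
    "<main><h1>Hello</h1><p>Some   body   text.</p></main>"
    "</body></html>"
)
soup = BeautifulSoup(html, "lxml")

# 1) Drop noisy elements globally
for el in soup.select("script, style, nav, footer, header, aside"):
    el.decompose()

# 2) Prefer a main-content container, falling back to <body>
main = soup.find("main") or soup.find("article") or soup.find("body")

# 3) Convert to Markdown with ATX ("# Heading") style headings
text = md(str(main), heading_style="ATX")

# 4) Same cleanup passes as the helper
text = re.sub(r"\n{3,}", "\n\n", text)        # collapse runs of blank lines
text = re.sub(r"\[\s*\]\([^)]*\)", "", text)  # drop empty links like [](url)
text = re.sub(r"[ \t]+", " ", text).strip()   # squeeze spaces and tabs

print(text)  # roughly: "# Hello\n\nSome body text."
```

Note that the helper also calls `re.compile` and `re.sub`, yet the diff adds no `import re`, so `app.py` presumably already imports it at module level.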
@@ -277,6 +322,7 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     include_links: Annotated[bool, "Include outbound links discovered in the readable section."] = True,
     max_chars: Annotated[int, "Hard cap for body characters after the verbosity preset. Use 0 to disable the cap."] = 3000,
     max_links: Annotated[int, "Maximum number of links to include from the readable content. Set 0 to omit links."] = 20,
+    full_page_markdown: Annotated[bool, "If true, return the page as full Markdown (Content Scraper mode) instead of a compact summary."] = False,
 ) -> str:
     """
     Fetch a web page and return a compact Markdown summary containing title, key
@@ -297,6 +343,10 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     - Metadata (optional)
     - Text (optional, may be trimmed)
     - Links (optional, deduped and absolute)
+
+    Special mode:
+    If full_page_markdown=True, the function returns the page converted to Markdown,
+    similar to the "Content Scraper" tool, ignoring verbosity/include_* limits.
     """
     if not url or not url.strip():
         return "Please enter a valid URL."
@@ -316,10 +366,14 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     resp.encoding = resp.encoding or resp.apparent_encoding
     html = resp.text
 
-    # Full-page soup for metadata
+    # Full-page soup for metadata (and potential Markdown conversion)
     full_soup = BeautifulSoup(html, "lxml")
     meta = _extract_metadata(full_soup, final_url)
 
+    # Content Scraper mode: return full-page Markdown early
+    if full_page_markdown:
+        return _fullpage_markdown_from_soup(full_soup, final_url)
+
     # Readable content
     body_text, readable_soup = _extract_main_text(html)
     if not body_text:
@@ -679,17 +733,18 @@ fetch_interface = gr.Interface(
         gr.Checkbox(value=True, label="Include Links"),
         gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),
         gr.Slider(0, 100, value=20, step=1, label="Max Links"),
+        gr.Checkbox(value=False, label="Full-page Markdown (Content Scraper mode)"),
     ],
     outputs=gr.Markdown(label="Extracted Summary"),
     title="Fetch Webpage",
     description=(
-        "<div style=\"text-align:center\">Extract title, key metadata, readable text, and links from webpages. No noisy HTML.</div>"
+        "<div style=\"text-align:center\">Extract title, key metadata, readable text, and links from webpages or toggle full-page Markdown.</div>"
     ),
     api_description=(
-        "Fetch a web page and return a compact Markdown summary with title, key "
-        "metadata, readable body text, and outbound links. Parameters let you "
-        "control verbosity, whether to include metadata/text/links, and limits "
-        "for characters and number of links."
+        "Fetch a web page and return a compact Markdown summary with title, key "
+        "metadata, readable body text, and outbound links. Or, enable the "
+        "'Full-page Markdown (Content Scraper mode)' option to return the page "
+        "converted to Markdown."
     ),
     allow_flagging="never",
 )
 
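One wiring detail worth noting for the last hunk: `gr.Interface` maps its `inputs` list to the function's parameters positionally, which is why the new checkbox is appended directly after the Max Links slider, matching `full_page_markdown`'s position in the signature. A stripped-down sketch of the same pattern, with a hypothetical stand-in for the real tool function:

```python
import gradio as gr

# Hypothetical stand-in for Fetch_Webpage with the same trailing parameter
def tool(url: str, max_links: int = 20, full_page_markdown: bool = False) -> str:
    mode = "full-page Markdown" if full_page_markdown else "compact summary"
    return f"Would fetch {url} ({mode}, up to {max_links} links)"

demo = gr.Interface(
    fn=tool,
    inputs=[
        gr.Textbox(label="URL"),
        gr.Slider(0, 100, value=20, step=1, label="Max Links"),
        # New input goes last, in the same position as the new parameter
        gr.Checkbox(value=False, label="Full-page Markdown (Content Scraper mode)"),
    ],
    outputs=gr.Markdown(label="Extracted Summary"),
)

if __name__ == "__main__":
    demo.launch()
```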