Adding a Markdown scraper option to the Fetch_Webpage tool
app.py (changed)
@@ -19,6 +19,7 @@ from typing import List, Dict, Tuple, Annotated
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
+from markdownify import markdownify as md
 from readability import Document
 from urllib.parse import urljoin, urldefrag, urlparse
 from duckduckgo_search import DDGS
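The only new dependency is markdownify; everything else builds on imports app.py already has. A quick standalone sketch of the conversion call the new helper relies on (not taken from app.py):

```python
from markdownify import markdownify as md

# heading_style="ATX" yields "#"-prefixed headings instead of underlined ones.
print(md("<h1>Title</h1><p>Some <b>bold</b> text.</p>", heading_style="ATX"))
# Expected output, roughly:
# # Title
#
# Some **bold** text.
```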
@@ -175,6 +176,50 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
     return clean_text, s


+def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str) -> str:
+    """
+    Convert the page's main content (or body fallback) to Markdown, similar to
+    web-scraper's Content Scraper tool, but without any file download side-effects.
+
+    Steps:
+    - Remove noisy elements (script/style/nav/footer/header/aside)
+    - Prefer <main>, <article>, or common content containers; fallback to <body>
+    - Convert to Markdown with ATX headings
+    - Clean up excessive newlines, empty links, and whitespace
+    - Prepend a title header when available
+    """
+    # Remove unwanted elements globally first
+    for element in full_soup.select("script, style, nav, footer, header, aside"):
+        element.decompose()
+
+    # Try common main-content containers, then fallback to body
+    main = (
+        full_soup.find("main")
+        or full_soup.find("article")
+        or full_soup.find("div", class_=re.compile(r"content|main|post|article", re.I))
+        or full_soup.find("body")
+    )
+
+    if not main:
+        return "No main content found on the webpage."
+
+    # Convert selected HTML to Markdown
+    markdown_text = md(str(main), heading_style="ATX")
+
+    # Clean up the markdown similar to web-scraper
+    markdown_text = re.sub(r"\n{3,}", "\n\n", markdown_text)
+    markdown_text = re.sub(r"\[\s*\]\([^)]*\)", "", markdown_text)  # empty links
+    markdown_text = re.sub(r"[ \t]+", " ", markdown_text)
+    markdown_text = markdown_text.strip()
+
+    # Add title if present
+    title = full_soup.find("title")
+    if title and title.get_text(strip=True):
+        markdown_text = f"# {title.get_text(strip=True)}\n\n{markdown_text}"
+
+    return markdown_text or "No content could be extracted."
+
+
 def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
     """
     Collect clean, unique, absolute links from the readable section only.
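The helper is self-contained apart from the module's imports, so it can be sanity-checked outside Gradio. A minimal sketch, assuming _fullpage_markdown_from_soup is importable from app.py; the HTML snippet and URL are illustrative:

```python
from bs4 import BeautifulSoup

html = """
<html><head><title>Demo Page</title></head>
<body>
  <nav><a href="/home">Home</a></nav>
  <main><h2>Section</h2><p>Hello <b>world</b>.</p></main>
  <footer>(c) 2024</footer>
</body></html>
"""

# The <nav> and <footer> are decomposed, <main> is selected as the content
# container, and the <title> text is prepended as an H1.
soup = BeautifulSoup(html, "lxml")
print(_fullpage_markdown_from_soup(soup, "https://example.com"))
# Expected output, roughly:
# # Demo Page
#
# ## Section
#
# Hello **world**.
```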
@@ -277,6 +322,7 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     include_links: Annotated[bool, "Include outbound links discovered in the readable section."] = True,
     max_chars: Annotated[int, "Hard cap for body characters after the verbosity preset. Use 0 to disable the cap."] = 3000,
     max_links: Annotated[int, "Maximum number of links to include from the readable content. Set 0 to omit links."] = 20,
+    full_page_markdown: Annotated[bool, "If true, return the page as full Markdown (Content Scraper mode) instead of a compact summary."] = False,
 ) -> str:
     """
     Fetch a web page and return a compact Markdown summary containing title, key
@@ -297,6 +343,10 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     - Metadata (optional)
     - Text (optional, may be trimmed)
     - Links (optional, deduped and absolute)
+
+    Special mode:
+    If full_page_markdown=True, the function returns the page converted to Markdown,
+    similar to the "Content Scraper" tool, ignoring verbosity/include_* limits.
     """
     if not url or not url.strip():
         return "Please enter a valid URL."
@@ -316,10 +366,14 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     resp.encoding = resp.encoding or resp.apparent_encoding
     html = resp.text

-    # Full-page soup for metadata
+    # Full-page soup for metadata (and potential Markdown conversion)
     full_soup = BeautifulSoup(html, "lxml")
     meta = _extract_metadata(full_soup, final_url)

+    # Content Scraper mode: return full-page Markdown early
+    if full_page_markdown:
+        return _fullpage_markdown_from_soup(full_soup, final_url)
+
     # Readable content
     body_text, readable_soup = _extract_main_text(html)
     if not body_text:
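With that early return in place, Content Scraper mode short-circuits before the readability pass, so none of the summary options apply. A hedged usage sketch of the tool function (URL illustrative; other parameters keep their defaults):

```python
# Default: compact Markdown summary (title, metadata, text, links).
summary = Fetch_Webpage(url="https://example.com")

# Content Scraper mode: the whole page as Markdown; verbosity and the
# include_*/max_* limits are ignored.
full_md = Fetch_Webpage(url="https://example.com", full_page_markdown=True)
print(full_md)
```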
@@ -679,17 +733,18 @@ fetch_interface = gr.Interface(
         gr.Checkbox(value=True, label="Include Links"),
         gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),
         gr.Slider(0, 100, value=20, step=1, label="Max Links"),
+        gr.Checkbox(value=False, label="Full-page Markdown (Content Scraper mode)"),
     ],
     outputs=gr.Markdown(label="Extracted Summary"),
     title="Fetch Webpage",
     description=(
-
+        "<div style=\"text-align:center\">Extract title, key metadata, readable text, and links from webpages — or toggle full-page Markdown.</div>"
     ),
     api_description=(
-
-
-
-
+        "Fetch a web page and return a compact Markdown summary with title, key "
+        "metadata, readable body text, and outbound links. Or, enable the "
+        "'Full-page Markdown (Content Scraper mode)' option to return the page "
+        "converted to Markdown."
     ),
     allow_flagging="never",
 )
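One wiring detail worth noting: gr.Interface binds its inputs components to the wrapped function's parameters positionally, which is why the new Checkbox is appended after the Max Links slider, mirroring full_page_markdown's position in the signature. A minimal sketch of the pattern, with hypothetical names:

```python
import gradio as gr

def demo_fn(url: str, full_page_markdown: bool = False) -> str:
    # Components map to parameters in order: Textbox -> url, Checkbox -> full_page_markdown.
    return f"full_page_markdown={full_page_markdown} for {url}"

demo = gr.Interface(
    fn=demo_fn,
    inputs=[gr.Textbox(label="URL"), gr.Checkbox(value=False, label="Full-page Markdown")],
    outputs=gr.Markdown(),
)
# demo.launch()
```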
|