Nymbo committed
Commit 9e4885c · verified · Parent: b5c0401

adding markdown scraper option to Fetch_Webpage tool

Files changed (1): app.py (+61 −6)
app.py CHANGED
@@ -19,6 +19,7 @@ from typing import List, Dict, Tuple, Annotated
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
+from markdownify import markdownify as md
 from readability import Document
 from urllib.parse import urljoin, urldefrag, urlparse
 from duckduckgo_search import DDGS
@@ -175,6 +176,50 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
     return clean_text, s
 
 
+def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str) -> str:
+    """
+    Convert the page's main content (or body fallback) to Markdown, similar to
+    web-scraper's Content Scraper tool, but without any file download side-effects.
+
+    Steps:
+    - Remove noisy elements (script/style/nav/footer/header/aside)
+    - Prefer <main>, <article>, or common content containers; fallback to <body>
+    - Convert to Markdown with ATX headings
+    - Clean up excessive newlines, empty links, and whitespace
+    - Prepend a title header when available
+    """
+    # Remove unwanted elements globally first
+    for element in full_soup.select("script, style, nav, footer, header, aside"):
+        element.decompose()
+
+    # Try common main-content containers, then fallback to body
+    main = (
+        full_soup.find("main")
+        or full_soup.find("article")
+        or full_soup.find("div", class_=re.compile(r"content|main|post|article", re.I))
+        or full_soup.find("body")
+    )
+
+    if not main:
+        return "No main content found on the webpage."
+
+    # Convert selected HTML to Markdown
+    markdown_text = md(str(main), heading_style="ATX")
+
+    # Clean up the markdown similar to web-scraper
+    markdown_text = re.sub(r"\n{3,}", "\n\n", markdown_text)
+    markdown_text = re.sub(r"\[\s*\]\([^)]*\)", "", markdown_text)  # empty links
+    markdown_text = re.sub(r"[ \t]+", " ", markdown_text)
+    markdown_text = markdown_text.strip()
+
+    # Add title if present
+    title = full_soup.find("title")
+    if title and title.get_text(strip=True):
+        markdown_text = f"# {title.get_text(strip=True)}\n\n{markdown_text}"
+
+    return markdown_text or "No content could be extracted."
+
+
 def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
     """
     Collect clean, unique, absolute links from the readable section only.
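For reviewers who want to exercise the new helper's pipeline (strip noise, pick a content root, markdownify, regex cleanup) in isolation, here is a minimal standalone sketch. The sample HTML, variable names, and expected output are illustrative, not part of the commit:

```python
import re
from bs4 import BeautifulSoup
from markdownify import markdownify as md  # same import alias the commit adds

# Illustrative input; any page with a nav/main structure behaves the same way
html = (
    "<html><head><title>Demo</title></head><body>"
    "<nav>menu that should disappear</nav>"
    "<main><h1>Hello</h1><p>Some   body   text.</p></main>"
    "</body></html>"
)
soup = BeautifulSoup(html, "lxml")

# 1) Drop noisy elements globally
for el in soup.select("script, style, nav, footer, header, aside"):
    el.decompose()

# 2) Prefer a main-content container, falling back to <body>
main = soup.find("main") or soup.find("article") or soup.find("body")

# 3) Convert to Markdown with ATX ("# Heading") style headings
text = md(str(main), heading_style="ATX")

# 4) Same cleanup passes as the helper
text = re.sub(r"\n{3,}", "\n\n", text)        # collapse runs of blank lines
text = re.sub(r"\[\s*\]\([^)]*\)", "", text)  # drop empty links like [](url)
text = re.sub(r"[ \t]+", " ", text).strip()   # squeeze spaces and tabs

print(text)  # roughly: "# Hello\n\nSome body text."
```

Note that the helper also calls `re.compile` and `re.sub`, yet the diff adds no `import re`, so `app.py` presumably already imports it at module level.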
@@ -277,6 +322,7 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     include_links: Annotated[bool, "Include outbound links discovered in the readable section."] = True,
     max_chars: Annotated[int, "Hard cap for body characters after the verbosity preset. Use 0 to disable the cap."] = 3000,
     max_links: Annotated[int, "Maximum number of links to include from the readable content. Set 0 to omit links."] = 20,
+    full_page_markdown: Annotated[bool, "If true, return the page as full Markdown (Content Scraper mode) instead of a compact summary."] = False,
 ) -> str:
     """
     Fetch a web page and return a compact Markdown summary containing title, key
@@ -297,6 +343,10 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     - Metadata (optional)
     - Text (optional, may be trimmed)
     - Links (optional, deduped and absolute)
+
+    Special mode:
+    If full_page_markdown=True, the function returns the page converted to Markdown,
+    similar to the "Content Scraper" tool, ignoring verbosity/include_* limits.
     """
     if not url or not url.strip():
         return "Please enter a valid URL."
@@ -316,10 +366,14 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     resp.encoding = resp.encoding or resp.apparent_encoding
     html = resp.text
 
-    # Full-page soup for metadata
+    # Full-page soup for metadata (and potential Markdown conversion)
     full_soup = BeautifulSoup(html, "lxml")
     meta = _extract_metadata(full_soup, final_url)
 
+    # Content Scraper mode: return full-page Markdown early
+    if full_page_markdown:
+        return _fullpage_markdown_from_soup(full_soup, final_url)
+
     # Readable content
     body_text, readable_soup = _extract_main_text(html)
     if not body_text:
@@ -679,17 +733,18 @@ fetch_interface = gr.Interface(
         gr.Checkbox(value=True, label="Include Links"),
         gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),
         gr.Slider(0, 100, value=20, step=1, label="Max Links"),
+        gr.Checkbox(value=False, label="Full-page Markdown (Content Scraper mode)"),
     ],
     outputs=gr.Markdown(label="Extracted Summary"),
     title="Fetch Webpage",
     description=(
-        "<div style=\"text-align:center\">Extract title, key metadata, readable text, and links from webpages. No noisy HTML.</div>"
+        "<div style=\"text-align:center\">Extract title, key metadata, readable text, and links from webpages or toggle full-page Markdown.</div>"
     ),
     api_description=(
-        "Fetch a web page and return a compact Markdown summary with title, key "
-        "metadata, readable body text, and outbound links. Parameters let you "
-        "control verbosity, whether to include metadata/text/links, and limits "
-        "for characters and number of links."
+        "Fetch a web page and return a compact Markdown summary with title, key "
+        "metadata, readable body text, and outbound links. Or, enable the "
+        "'Full-page Markdown (Content Scraper mode)' option to return the page "
+        "converted to Markdown."
     ),
     allow_flagging="never",
 )
 
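One wiring detail worth noting for the last hunk: `gr.Interface` maps its `inputs` list to the function's parameters positionally, which is why the new checkbox is appended directly after the Max Links slider, matching `full_page_markdown`'s position in the signature. A stripped-down sketch of the same pattern, with a hypothetical stand-in for the real tool function:

```python
import gradio as gr

# Hypothetical stand-in for Fetch_Webpage with the same trailing parameter
def tool(url: str, max_links: int = 20, full_page_markdown: bool = False) -> str:
    mode = "full-page Markdown" if full_page_markdown else "compact summary"
    return f"Would fetch {url} ({mode}, up to {max_links} links)"

demo = gr.Interface(
    fn=tool,
    inputs=[
        gr.Textbox(label="URL"),
        gr.Slider(0, 100, value=20, step=1, label="Max Links"),
        # New input goes last, in the same position as the new parameter
        gr.Checkbox(value=False, label="Full-page Markdown (Content Scraper mode)"),
    ],
    outputs=gr.Markdown(label="Extracted Summary"),
)

if __name__ == "__main__":
    demo.launch()
```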