Update app.py
Browse files
app.py
CHANGED
@@ -1,9 +1,8 @@
|
|
1 |
# File: main/app.py
|
2 |
-
# Purpose: One Space that offers four tools/tabs:
|
3 |
# 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
|
4 |
# 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
|
5 |
-
# 3) Generate Sitemap — grouped Markdown sitemap of a page's anchor links (optional per-domain cap)
|
6 |
-
# 4) Python Code Executor — run Python code and capture stdout/errors
|
7 |
|
8 |
from __future__ import annotations
|
9 |
|
@@ -379,129 +378,6 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
|
|
379 |
return "\n".join(lines)
|
380 |
|
381 |
|
382 |
-
# ============================================
|
383 |
-
# Generate Sitemap (new MCP tool #5)
|
384 |
-
# ============================================
|
385 |
-
|
386 |
-
def Generate_Sitemap(
|
387 |
-
url: str,
|
388 |
-
max_links_per_domain: int = 0,
|
389 |
-
) -> str:
|
390 |
-
"""
|
391 |
-
Generate a grouped sitemap (Markdown) of anchor links on a page, with an optional
|
392 |
-
per-domain cap.
|
393 |
-
|
394 |
-
Args:
|
395 |
-
url (str): The starting page URL (http/https). If the scheme is omitted,
|
396 |
-
https is assumed.
|
397 |
-
max_links_per_domain (int): Limit the number of links shown per domain.
|
398 |
-
Use 0 to show all links.
|
399 |
-
|
400 |
-
Returns:
|
401 |
-
str: Markdown text containing grouped links under "Internal Links" and
|
402 |
-
per-domain "External Links (domain)" sections. If an error occurs or no
|
403 |
-
links are found, a short message is returned.
|
404 |
-
"""
|
405 |
-
# --- Basic validation & normalization ---
|
406 |
-
if not url or not url.strip():
|
407 |
-
return "Please enter a valid URL."
|
408 |
-
|
409 |
-
# If the user forgot the scheme, assume https
|
410 |
-
if not url.lower().startswith(("http://", "https://")):
|
411 |
-
url = "https://" + url.strip()
|
412 |
-
|
413 |
-
# --- Fetch the page safely ---
|
414 |
-
try:
|
415 |
-
resp = _http_get(url)
|
416 |
-
resp.raise_for_status()
|
417 |
-
except requests.exceptions.RequestException as e:
|
418 |
-
return f"Error fetching URL: {str(e)}"
|
419 |
-
|
420 |
-
base_url = str(resp.url) # follow redirects and use the final URL
|
421 |
-
content_type = resp.headers.get("Content-Type", "")
|
422 |
-
if "html" not in content_type.lower():
|
423 |
-
return "The provided URL does not appear to be an HTML page."
|
424 |
-
|
425 |
-
# --- Parse and collect links ---
|
426 |
-
soup = BeautifulSoup(resp.content, "lxml") # fast, lenient HTML parsing
|
427 |
-
anchors = soup.find_all("a", href=True)
|
428 |
-
|
429 |
-
seen_urls: set[str] = set()
|
430 |
-
items: List[Dict[str, str]] = []
|
431 |
-
|
432 |
-
for a in anchors:
|
433 |
-
href = (a.get("href") or "").strip()
|
434 |
-
if not href:
|
435 |
-
continue
|
436 |
-
|
437 |
-
# Skip non-navigational/unsupported schemes
|
438 |
-
if href.startswith(("#", "javascript:", "mailto:", "tel:")):
|
439 |
-
continue
|
440 |
-
|
441 |
-
# Resolve relative links and strip fragments
|
442 |
-
absolute = urljoin(base_url, href)
|
443 |
-
absolute, _ = urldefrag(absolute)
|
444 |
-
|
445 |
-
# Deduplicate and skip self
|
446 |
-
if absolute in seen_urls or absolute == base_url:
|
447 |
-
continue
|
448 |
-
seen_urls.add(absolute)
|
449 |
-
|
450 |
-
# Use link text if available; otherwise the URL itself
|
451 |
-
text = (a.get_text(" ", strip=True) or href).strip()
|
452 |
-
if len(text) > 100:
|
453 |
-
text = text[:100] + "..."
|
454 |
-
|
455 |
-
items.append({"text": text, "url": absolute})
|
456 |
-
|
457 |
-
if not items:
|
458 |
-
return "No links found on this page."
|
459 |
-
|
460 |
-
# --- Group by Internal vs External domains ---
|
461 |
-
base_netloc = urlparse(base_url).netloc
|
462 |
-
domain_groups: Dict[str, List[Dict[str, str]]] = {}
|
463 |
-
|
464 |
-
for it in items:
|
465 |
-
netloc = urlparse(it["url"]).netloc
|
466 |
-
key = "Internal Links" if netloc == base_netloc else f"External Links ({netloc})"
|
467 |
-
domain_groups.setdefault(key, []).append(it)
|
468 |
-
|
469 |
-
# --- Build Markdown with optional per-domain limit ---
|
470 |
-
total_links = len(items)
|
471 |
-
md_lines: List[str] = []
|
472 |
-
md_lines.append("# Sitemap")
|
473 |
-
md_lines.append(f"Base URL: {base_url}")
|
474 |
-
md_lines.append(f"Found {total_links} links:\n")
|
475 |
-
|
476 |
-
# Show Internal first, then external groups sorted by name
|
477 |
-
keys_sorted = ["Internal Links"] + sorted([k for k in domain_groups if k != "Internal Links"])
|
478 |
-
|
479 |
-
for group_key in keys_sorted:
|
480 |
-
if group_key not in domain_groups:
|
481 |
-
continue
|
482 |
-
|
483 |
-
group_links = domain_groups[group_key]
|
484 |
-
md_lines.append(f"## {group_key}\n")
|
485 |
-
|
486 |
-
if max_links_per_domain and max_links_per_domain > 0:
|
487 |
-
links_to_show = group_links[:max_links_per_domain]
|
488 |
-
remaining = max(0, len(group_links) - max_links_per_domain)
|
489 |
-
else:
|
490 |
-
links_to_show = group_links
|
491 |
-
remaining = 0
|
492 |
-
|
493 |
-
for link in links_to_show:
|
494 |
-
md_lines.append(f"- [{link['text']}]({link['url']})")
|
495 |
-
|
496 |
-
if remaining > 0:
|
497 |
-
md_lines.append(f"- ... and {remaining} more links")
|
498 |
-
|
499 |
-
md_lines.append("") # blank line after each group
|
500 |
-
|
501 |
-
sitemap_md = "\n".join(md_lines).strip()
|
502 |
-
return sitemap_md
|
503 |
-
|
504 |
-
|
505 |
# ======================================
|
506 |
# Code Execution: Python (MCP tool #6)
|
507 |
# ======================================
|
@@ -526,7 +402,7 @@ def Execute_Python(code: str) -> str:
|
|
526 |
|
527 |
|
528 |
# ======================
|
529 |
-
# UI: four-tab interface
|
530 |
# ======================
|
531 |
|
532 |
# --- Fetch tab (compact controllable extraction) ---
|
@@ -578,35 +454,7 @@ concise_interface = gr.Interface(
|
|
578 |
submit_btn="Search",
|
579 |
)
|
580 |
|
581 |
-
## Removed Structured and Raw tabs
|
582 |
-
|
583 |
-
# --- Generate Sitemap tab (LIMITED, grouped + optional per-domain cap) ---
|
584 |
-
sitemap_interface = gr.Interface(
|
585 |
-
fn=Generate_Sitemap,
|
586 |
-
inputs=[
|
587 |
-
gr.Textbox(
|
588 |
-
label="Website URL",
|
589 |
-
placeholder="https://example.com or example.com"
|
590 |
-
),
|
591 |
-
gr.Slider(
|
592 |
-
minimum=0,
|
593 |
-
maximum=1000,
|
594 |
-
value=0,
|
595 |
-
step=1,
|
596 |
-
label="Max links per domain (0 = show all)"
|
597 |
-
),
|
598 |
-
],
|
599 |
-
outputs=gr.Markdown(label="Sitemap (Markdown)"),
|
600 |
-
title="Generate Sitemap",
|
601 |
-
description="Group links by Internal/External domains; optionally limit links per domain.",
|
602 |
-
api_description=(
|
603 |
-
"Scan a page and build a grouped sitemap of anchor links. Links are grouped as "
|
604 |
-
"Internal or External (per domain). Set a per-domain cap; 0 shows all."
|
605 |
-
),
|
606 |
-
allow_flagging="never",
|
607 |
-
theme="Nymbo/Nymbo_Theme",
|
608 |
-
submit_btn="Generate",
|
609 |
-
)
|
610 |
|
611 |
# --- Execute Python tab (simple code interpreter) ---
|
612 |
code_interface = gr.Interface(
|
@@ -621,14 +469,13 @@ code_interface = gr.Interface(
|
|
621 |
|
622 |
# --- Combine all into a single app with tabs ---
|
623 |
demo = gr.TabbedInterface(
|
624 |
-
interface_list=[fetch_interface, concise_interface, sitemap_interface, code_interface],
|
625 |
tab_names=[
|
626 |
"Fetch Webpage",
|
627 |
"DuckDuckGo Search",
|
628 |
-
"Generate Sitemap",
|
629 |
"Python Code Executor",
|
630 |
],
|
631 |
-
title="Web MCP — Fetch, Search, Sitemap, and Code Execution.",
|
632 |
theme="Nymbo/Nymbo_Theme",
|
633 |
)
|
634 |
|
|
|
1 |
# File: main/app.py
|
2 |
+
# Purpose: One Space that offers three tools/tabs:
|
3 |
# 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
|
4 |
# 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
|
5 |
+
# 3) Python Code Executor — run Python code and capture stdout/errors
|
|
|
6 |
|
7 |
from __future__ import annotations
|
8 |
|
|
|
378 |
return "\n".join(lines)
|
379 |
|
380 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
381 |
# ======================================
|
382 |
# Code Execution: Python (MCP tool #3)
|
383 |
# ======================================
|
|
|
402 |
|
403 |
|
404 |
# ======================
|
405 |
+
# UI: three-tab interface
|
406 |
# ======================
|
407 |
|
408 |
# --- Fetch tab (compact controllable extraction) ---
|
|
|
454 |
submit_btn="Search",
|
455 |
)
|
456 |
|
457 |
+
## Removed Structured, Raw, and Sitemap tabs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
458 |
|
459 |
# --- Execute Python tab (simple code interpreter) ---
|
460 |
code_interface = gr.Interface(
|
|
|
469 |
|
470 |
# --- Combine all into a single app with tabs ---
|
471 |
demo = gr.TabbedInterface(
|
472 |
+
interface_list=[fetch_interface, concise_interface, code_interface],
|
473 |
tab_names=[
|
474 |
"Fetch Webpage",
|
475 |
"DuckDuckGo Search",
|
|
|
476 |
"Python Code Executor",
|
477 |
],
|
478 |
+
title="Web MCP — Fetch, Search, and Code Execution.",
|
479 |
theme="Nymbo/Nymbo_Theme",
|
480 |
)
|
481 |
|