Spaces:

Nymbo
/

Tools

Running

App Files Community

Nymbo committed 8 days ago

Commit

4add2a4

verified ·

1 Parent(s): 43cc0e4

Update app.py

Browse files

Files changed (1) hide show

app.py +6 -159

app.py CHANGED Viewed

@@ -1,9 +1,8 @@
 # File: main/app.py
-# Purpose: One Space that offers four tools/tabs:
 #   1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
 #   2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
-#   3) Generate Sitemap — grouped internal/external links with an optional per-domain cap
-#   4) Python Code Executor — run Python code and capture stdout/errors
 from __future__ import annotations
@@ -379,129 +378,6 @@ def Search_DuckDuckGo(  # <-- MCP tool #2 (DDG Search)
     return "\n".join(lines)
-# ============================================
-# Generate Sitemap (new MCP tool #5)
-# ============================================
-def Generate_Sitemap(
-    url: str,
-    max_links_per_domain: int = 0,
-) -> str:
-    """
-    Generate a grouped sitemap (Markdown) of anchor links on a page, with an optional
-    per-domain cap.
-    Args:
-        url (str): The starting page URL (http/https). If the scheme is omitted,
-            https is assumed.
-        max_links_per_domain (int): Limit the number of links shown per domain.
-            Use 0 to show all links.
-    Returns:
-        str: Markdown text containing grouped links under "Internal Links" and
-        per-domain "External Links (domain)" sections. If an error occurs or no
-        links are found, a short message is returned.
-    """
-    # --- Basic validation & normalization ---
-    if not url or not url.strip():
-        return "Please enter a valid URL."
-    # If the user forgot the scheme, assume https
-    if not url.lower().startswith(("http://", "https://")):
-        url = "https://" + url.strip()
-    # --- Fetch the page safely ---
-    try:
-        resp = _http_get(url)
-        resp.raise_for_status()
-    except requests.exceptions.RequestException as e:
-        return f"Error fetching URL: {str(e)}"
-    base_url = str(resp.url)  # follow redirects and use the final URL
-    content_type = resp.headers.get("Content-Type", "")
-    if "html" not in content_type.lower():
-        return "The provided URL does not appear to be an HTML page."
-    # --- Parse and collect links ---
-    soup = BeautifulSoup(resp.content, "lxml")  # fast, lenient HTML parsing
-    anchors = soup.find_all("a", href=True)
-    seen_urls: set[str] = set()
-    items: List[Dict[str, str]] = []
-    for a in anchors:
-        href = (a.get("href") or "").strip()
-        if not href:
-            continue
-        # Skip non-navigational/unsupported schemes
-        if href.startswith(("#", "javascript:", "mailto:", "tel:")):
-            continue
-        # Resolve relative links and strip fragments
-        absolute = urljoin(base_url, href)
-        absolute, _ = urldefrag(absolute)
-        # Deduplicate and skip self
-        if absolute in seen_urls or absolute == base_url:
-            continue
-        seen_urls.add(absolute)
-        # Use link text if available; otherwise the URL itself
-        text = (a.get_text(" ", strip=True) or href).strip()
-        if len(text) > 100:
-            text = text[:100] + "..."
-        items.append({"text": text, "url": absolute})
-    if not items:
-        return "No links found on this page."
-    # --- Group by Internal vs External domains ---
-    base_netloc = urlparse(base_url).netloc
-    domain_groups: Dict[str, List[Dict[str, str]]] = {}
-    for it in items:
-        netloc = urlparse(it["url"]).netloc
-        key = "Internal Links" if netloc == base_netloc else f"External Links ({netloc})"
-        domain_groups.setdefault(key, []).append(it)
-    # --- Build Markdown with optional per-domain limit ---
-    total_links = len(items)
-    md_lines: List[str] = []
-    md_lines.append("# Sitemap")
-    md_lines.append(f"Base URL: {base_url}")
-    md_lines.append(f"Found {total_links} links:\n")
-    # Show Internal first, then external groups sorted by name
-    keys_sorted = ["Internal Links"] + sorted([k for k in domain_groups if k != "Internal Links"])
-    for group_key in keys_sorted:
-        if group_key not in domain_groups:
-            continue
-        group_links = domain_groups[group_key]
-        md_lines.append(f"## {group_key}\n")
-        if max_links_per_domain and max_links_per_domain > 0:
-            links_to_show = group_links[:max_links_per_domain]
-            remaining = max(0, len(group_links) - max_links_per_domain)
-        else:
-            links_to_show = group_links
-            remaining = 0
-        for link in links_to_show:
-            md_lines.append(f"- [{link['text']}]({link['url']})")
-        if remaining > 0:
-            md_lines.append(f"- ... and {remaining} more links")
-        md_lines.append("")  # blank line after each group
-    sitemap_md = "\n".join(md_lines).strip()
-    return sitemap_md
 # ======================================
 # Code Execution: Python (MCP tool #6)
 # ======================================
@@ -526,7 +402,7 @@ def Execute_Python(code: str) -> str:
 # ======================
-# UI: six-tab interface
 # ======================
 # --- Fetch tab (compact controllable extraction) ---
@@ -578,35 +454,7 @@ concise_interface = gr.Interface(
     submit_btn="Search",
 )
-## Removed Structured and Raw tabs
-# --- Generate Sitemap tab (LIMITED, grouped + optional per-domain cap) ---
-sitemap_interface = gr.Interface(
-    fn=Generate_Sitemap,
-    inputs=[
-        gr.Textbox(
-            label="Website URL",
-            placeholder="https://example.com or example.com"
-        ),
-        gr.Slider(
-            minimum=0,
-            maximum=1000,
-            value=0,
-            step=1,
-            label="Max links per domain (0 = show all)"
-        ),
-    ],
-    outputs=gr.Markdown(label="Sitemap (Markdown)"),
-    title="Generate Sitemap",
-    description="Group links by Internal/External domains; optionally limit links per domain.",
-    api_description=(
-        "Scan a page and build a grouped sitemap of anchor links. Links are grouped as "
-        "Internal or External (per domain). Set a per-domain cap; 0 shows all."
-    ),
-    allow_flagging="never",
-    theme="Nymbo/Nymbo_Theme",
-    submit_btn="Generate",
-)
 # --- Execute Python tab (simple code interpreter) ---
 code_interface = gr.Interface(
@@ -621,14 +469,13 @@ code_interface = gr.Interface(
 # --- Combine all into a single app with tabs ---
 demo = gr.TabbedInterface(
-    interface_list=[fetch_interface, concise_interface, sitemap_interface, code_interface],
     tab_names=[
         "Fetch Webpage",
         "DuckDuckGo Search",
-        "Generate Sitemap",
         "Python Code Executor",
     ],
-    title="Web MCP — Fetch, Search, Sitemaps, and Code Execution.",
     theme="Nymbo/Nymbo_Theme",
 )

 # File: main/app.py
+# Purpose: One Space that offers three tools/tabs:
 #   1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
 #   2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
+#   3) Python Code Executor — run Python code and capture stdout/errors
 from __future__ import annotations
     return "\n".join(lines)
 # ======================================
 # Code Execution: Python (MCP tool #6)
 # ======================================
 # ======================
+# UI: three-tab interface
 # ======================
 # --- Fetch tab (compact controllable extraction) ---
     submit_btn="Search",
 )
+## Removed Structured, Raw, and Sitemap tabs
 # --- Execute Python tab (simple code interpreter) ---
 code_interface = gr.Interface(
 # --- Combine all into a single app with tabs ---
 demo = gr.TabbedInterface(
+    interface_list=[fetch_interface, concise_interface, code_interface],
     tab_names=[
         "Fetch Webpage",
         "DuckDuckGo Search",
         "Python Code Executor",
     ],
+    title="Web MCP — Fetch, Search, and Code Execution.",
     theme="Nymbo/Nymbo_Theme",
 )