Nymbo committed on
Commit
4add2a4
·
verified ·
1 Parent(s): 43cc0e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -159
app.py CHANGED
@@ -1,9 +1,8 @@
1
  # File: main/app.py
2
- # Purpose: One Space that offers four tools/tabs:
3
  # 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
4
  # 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
5
- # 3) Generate Sitemap — grouped internal/external links with an optional per-domain cap
6
- # 4) Python Code Executor — run Python code and capture stdout/errors
7
 
8
  from __future__ import annotations
9
 
@@ -379,129 +378,6 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
379
  return "\n".join(lines)
380
 
381
 
382
- # ============================================
383
- # Generate Sitemap (new MCP tool #5)
384
- # ============================================
385
-
386
- def Generate_Sitemap(
387
- url: str,
388
- max_links_per_domain: int = 0,
389
- ) -> str:
390
- """
391
- Generate a grouped sitemap (Markdown) of anchor links on a page, with an optional
392
- per-domain cap.
393
-
394
- Args:
395
- url (str): The starting page URL (http/https). If the scheme is omitted,
396
- https is assumed.
397
- max_links_per_domain (int): Limit the number of links shown per domain.
398
- Use 0 to show all links.
399
-
400
- Returns:
401
- str: Markdown text containing grouped links under "Internal Links" and
402
- per-domain "External Links (domain)" sections. If an error occurs or no
403
- links are found, a short message is returned.
404
- """
405
- # --- Basic validation & normalization ---
406
- if not url or not url.strip():
407
- return "Please enter a valid URL."
408
-
409
- # If the user forgot the scheme, assume https
410
- if not url.lower().startswith(("http://", "https://")):
411
- url = "https://" + url.strip()
412
-
413
- # --- Fetch the page safely ---
414
- try:
415
- resp = _http_get(url)
416
- resp.raise_for_status()
417
- except requests.exceptions.RequestException as e:
418
- return f"Error fetching URL: {str(e)}"
419
-
420
- base_url = str(resp.url) # follow redirects and use the final URL
421
- content_type = resp.headers.get("Content-Type", "")
422
- if "html" not in content_type.lower():
423
- return "The provided URL does not appear to be an HTML page."
424
-
425
- # --- Parse and collect links ---
426
- soup = BeautifulSoup(resp.content, "lxml") # fast, lenient HTML parsing
427
- anchors = soup.find_all("a", href=True)
428
-
429
- seen_urls: set[str] = set()
430
- items: List[Dict[str, str]] = []
431
-
432
- for a in anchors:
433
- href = (a.get("href") or "").strip()
434
- if not href:
435
- continue
436
-
437
- # Skip non-navigational/unsupported schemes
438
- if href.startswith(("#", "javascript:", "mailto:", "tel:")):
439
- continue
440
-
441
- # Resolve relative links and strip fragments
442
- absolute = urljoin(base_url, href)
443
- absolute, _ = urldefrag(absolute)
444
-
445
- # Deduplicate and skip self
446
- if absolute in seen_urls or absolute == base_url:
447
- continue
448
- seen_urls.add(absolute)
449
-
450
- # Use link text if available; otherwise the URL itself
451
- text = (a.get_text(" ", strip=True) or href).strip()
452
- if len(text) > 100:
453
- text = text[:100] + "..."
454
-
455
- items.append({"text": text, "url": absolute})
456
-
457
- if not items:
458
- return "No links found on this page."
459
-
460
- # --- Group by Internal vs External domains ---
461
- base_netloc = urlparse(base_url).netloc
462
- domain_groups: Dict[str, List[Dict[str, str]]] = {}
463
-
464
- for it in items:
465
- netloc = urlparse(it["url"]).netloc
466
- key = "Internal Links" if netloc == base_netloc else f"External Links ({netloc})"
467
- domain_groups.setdefault(key, []).append(it)
468
-
469
- # --- Build Markdown with optional per-domain limit ---
470
- total_links = len(items)
471
- md_lines: List[str] = []
472
- md_lines.append("# Sitemap")
473
- md_lines.append(f"Base URL: {base_url}")
474
- md_lines.append(f"Found {total_links} links:\n")
475
-
476
- # Show Internal first, then external groups sorted by name
477
- keys_sorted = ["Internal Links"] + sorted([k for k in domain_groups if k != "Internal Links"])
478
-
479
- for group_key in keys_sorted:
480
- if group_key not in domain_groups:
481
- continue
482
-
483
- group_links = domain_groups[group_key]
484
- md_lines.append(f"## {group_key}\n")
485
-
486
- if max_links_per_domain and max_links_per_domain > 0:
487
- links_to_show = group_links[:max_links_per_domain]
488
- remaining = max(0, len(group_links) - max_links_per_domain)
489
- else:
490
- links_to_show = group_links
491
- remaining = 0
492
-
493
- for link in links_to_show:
494
- md_lines.append(f"- [{link['text']}]({link['url']})")
495
-
496
- if remaining > 0:
497
- md_lines.append(f"- ... and {remaining} more links")
498
-
499
- md_lines.append("") # blank line after each group
500
-
501
- sitemap_md = "\n".join(md_lines).strip()
502
- return sitemap_md
503
-
504
-
505
  # ======================================
506
  # Code Execution: Python (MCP tool #6)
507
  # ======================================
@@ -526,7 +402,7 @@ def Execute_Python(code: str) -> str:
526
 
527
 
528
  # ======================
529
- # UI: six-tab interface
530
  # ======================
531
 
532
  # --- Fetch tab (compact controllable extraction) ---
@@ -578,35 +454,7 @@ concise_interface = gr.Interface(
578
  submit_btn="Search",
579
  )
580
 
581
- ## Removed Structured and Raw tabs
582
-
583
- # --- Generate Sitemap tab (LIMITED, grouped + optional per-domain cap) ---
584
- sitemap_interface = gr.Interface(
585
- fn=Generate_Sitemap,
586
- inputs=[
587
- gr.Textbox(
588
- label="Website URL",
589
- placeholder="https://example.com or example.com"
590
- ),
591
- gr.Slider(
592
- minimum=0,
593
- maximum=1000,
594
- value=0,
595
- step=1,
596
- label="Max links per domain (0 = show all)"
597
- ),
598
- ],
599
- outputs=gr.Markdown(label="Sitemap (Markdown)"),
600
- title="Generate Sitemap",
601
- description="Group links by Internal/External domains; optionally limit links per domain.",
602
- api_description=(
603
- "Scan a page and build a grouped sitemap of anchor links. Links are grouped as "
604
- "Internal or External (per domain). Set a per-domain cap; 0 shows all."
605
- ),
606
- allow_flagging="never",
607
- theme="Nymbo/Nymbo_Theme",
608
- submit_btn="Generate",
609
- )
610
 
611
  # --- Execute Python tab (simple code interpreter) ---
612
  code_interface = gr.Interface(
@@ -621,14 +469,13 @@ code_interface = gr.Interface(
621
 
622
  # --- Combine all into a single app with tabs ---
623
  demo = gr.TabbedInterface(
624
- interface_list=[fetch_interface, concise_interface, sitemap_interface, code_interface],
625
  tab_names=[
626
  "Fetch Webpage",
627
  "DuckDuckGo Search",
628
- "Generate Sitemap",
629
  "Python Code Executor",
630
  ],
631
- title="Web MCP — Fetch, Search, Sitemaps, and Code Execution.",
632
  theme="Nymbo/Nymbo_Theme",
633
  )
634
 
 
1
  # File: main/app.py
2
+ # Purpose: One Space that offers three tools/tabs:
3
  # 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
4
  # 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
5
+ # 3) Python Code Executor — run Python code and capture stdout/errors
 
6
 
7
  from __future__ import annotations
8
 
 
378
  return "\n".join(lines)
379
 
380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  # ======================================
382
  # Code Execution: Python (MCP tool #6)
383
  # ======================================
 
402
 
403
 
404
  # ======================
405
+ # UI: three-tab interface
406
  # ======================
407
 
408
  # --- Fetch tab (compact controllable extraction) ---
 
454
  submit_btn="Search",
455
  )
456
 
457
+ ## Removed Structured, Raw, and Sitemap tabs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
 
459
  # --- Execute Python tab (simple code interpreter) ---
460
  code_interface = gr.Interface(
 
469
 
470
  # --- Combine all into a single app with tabs ---
471
  demo = gr.TabbedInterface(
472
+ interface_list=[fetch_interface, concise_interface, code_interface],
473
  tab_names=[
474
  "Fetch Webpage",
475
  "DuckDuckGo Search",
 
476
  "Python Code Executor",
477
  ],
478
+ title="Web MCP — Fetch, Search, and Code Execution.",
479
  theme="Nymbo/Nymbo_Theme",
480
  )
481