Nymbo commited on
Commit
c730636
·
verified ·
1 Parent(s): f9ecb69

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +710 -0
app.py ADDED
@@ -0,0 +1,710 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # File: main/app.py
2
+ # Purpose: One Space that offers five tools/tabs:
3
+ # 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
4
+ # 2) DDG (Concise) — ultra-succinct DuckDuckGo search that emits JSONL with short keys to minimize tokens
5
+ # 3) Websearch — structured DuckDuckGo search via LangChain tool (JSON)
6
+ # 4) Unstructured DDG — raw DuckDuckGo list[dict] rendered into a Textbox
7
+ # 5) Generate Sitemap — LIMITED: grouped internal/external links with an optional per-domain cap (and a .md download)
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ import json
13
+ import sys
14
+ from io import StringIO
15
+ from typing import List, Dict, Literal, Tuple
16
+
17
+ import gradio as gr
18
+ import requests
19
+ from bs4 import BeautifulSoup
20
+ from readability import Document
21
+ from urllib.parse import urljoin, urldefrag, urlparse
22
+ from langchain_community.tools import DuckDuckGoSearchResults
23
+ from duckduckgo_search import DDGS
24
+
25
+
26
+ # ==============================
27
+ # Fetch: HTTP + extraction utils
28
+ # ==============================
29
+
30
def _http_get(url: str) -> requests.Response:
    """
    Fetch *url* with browser-like headers and a 15-second timeout.

    (Layman's terms: grab the web page like a normal browser would, but quickly.)
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    return requests.get(url, headers=browser_headers, timeout=15)
41
+
42
+
43
+ def _normalize_whitespace(text: str) -> str:
44
+ """
45
+ Squeeze extra spaces and blank lines to keep things compact.
46
+ (Layman's terms: tidy up the text so it’s not full of weird spacing.)
47
+ """
48
+ text = re.sub(r"[ \t\u00A0]+", " ", text)
49
+ text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
50
+ return text.strip()
51
+
52
+
53
+ def _truncate(text: str, max_chars: int) -> Tuple[str, bool]:
54
+ """
55
+ Cut text if it gets too long; return the text and whether we trimmed.
56
+ (Layman's terms: shorten long text and tell us if we had to cut it.)
57
+ """
58
+ if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
59
+ return text, False
60
+ return text[:max_chars].rstrip() + " …", True
61
+
62
+
63
+ def _shorten(text: str, limit: int) -> str:
64
+ """
65
+ Hard cap a string with an ellipsis to keep tokens small.
66
+ (Layman's terms: force a string to a max length with an ellipsis.)
67
+ """
68
+ if limit <= 0 or len(text) <= limit:
69
+ return text
70
+ return text[: max(0, limit - 1)].rstrip() + "…"
71
+
72
+
73
+ def _domain_of(url: str) -> str:
74
+ """
75
+ Show a friendly site name like "example.com".
76
+ (Layman's terms: pull the website's domain.)
77
+ """
78
+ try:
79
+ return urlparse(url).netloc or ""
80
+ except Exception:
81
+ return ""
82
+
83
+
84
def _meta(soup: BeautifulSoup, name: str) -> str | None:
    """Return the content of a <meta name=...> tag, or None when absent."""
    found = soup.find("meta", attrs={"name": name})
    if found and found.has_attr("content"):
        return found.get("content")
    return None
87
+
88
+
89
def _og(soup: BeautifulSoup, prop: str) -> str | None:
    """Return the content of a <meta property=...> (OpenGraph) tag, or None."""
    found = soup.find("meta", attrs={"property": prop})
    if found and found.has_attr("content"):
        return found.get("content")
    return None
92
+
93
+
94
def _extract_metadata(soup: BeautifulSoup, final_url: str) -> Dict[str, str]:
    """
    Collect the useful page basics — title, description, canonical URL, site
    name, language — plus the final fetched URL and its domain.

    (Layman's terms: gather page basics like title/description/address.)
    """
    info: Dict[str, str] = {}

    # Title preference: <title> > og:title > twitter:title
    raw_titles = [
        soup.title.string if soup.title and soup.title.string else None,
        _og(soup, "og:title"),
        _meta(soup, "twitter:title"),
    ]
    info["title"] = next((t.strip() for t in raw_titles if t and t.strip()), "")

    # Description preference: description > og:description > twitter:description
    raw_descs = [
        _meta(soup, "description"),
        _og(soup, "og:description"),
        _meta(soup, "twitter:description"),
    ]
    info["description"] = next((d.strip() for d in raw_descs if d and d.strip()), "")

    # Canonical link helps callers dedupe mirrored URLs.
    canonical_tag = soup.find("link", rel=lambda v: v and "canonical" in v)
    info["canonical"] = (canonical_tag.get("href") or "").strip() if canonical_tag else ""

    # Site name and document language, when declared.
    info["site_name"] = (_og(soup, "og:site_name") or "").strip()
    html_el = soup.find("html")
    info["lang"] = (html_el.get("lang") or "").strip() if html_el else ""

    # Where we actually ended up after redirects.
    info["fetched_url"] = final_url
    info["domain"] = _domain_of(final_url)

    return info
131
+
132
+
133
def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
    """
    Run Readability over the raw HTML and flatten the simplified article into
    plain text. Returns (clean_text, soup_of_readable_html).

    (Layman's terms: find the real article text and clean it.)
    """
    # Readability isolates the likely article region of the page.
    simplified_html = Document(html).summary(html_partial=True)
    readable = BeautifulSoup(simplified_html, "lxml")

    # Strip tags that never carry readable text.
    for noisy in ("script", "style", "noscript", "iframe", "svg"):
        for tag in readable.select(noisy):
            tag.decompose()

    # Keep paragraphs, list items, and subheadings for structure without bloat.
    chunks = [
        element.get_text(" ", strip=True)
        for element in readable.find_all(["p", "li", "h2", "h3", "h4", "blockquote"])
    ]
    clean = _normalize_whitespace("\n\n".join(c for c in chunks if c))
    return clean, readable
160
+
161
+
162
def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> List[Tuple[str, str]]:
    """
    Pull unique, absolute (text, url) pairs from the readable article body.

    A non-positive *max_links* means "no cap".
    (Layman's terms: pull a tidy list of links from the article body.)
    """
    collected: List[Tuple[str, str]] = []
    visited = set()

    for anchor in readable_soup.find_all("a", href=True):
        href = anchor.get("href").strip()
        # Ignore anchors that cannot be navigated to.
        if not href or href.startswith(("#", "mailto:", "javascript:")):
            continue

        # Make the URL absolute and drop any #fragment.
        target, _ = urldefrag(urljoin(base_url, href))
        if target in visited:
            continue
        visited.add(target)

        label = anchor.get_text(" ", strip=True)
        if len(label) > 120:
            label = label[:117] + "…"

        collected.append((label or target, target))

        # Stop once the (positive) cap is reached.
        if 0 < max_links <= len(collected):
            break

    return collected
194
+
195
+
196
+ def _format_markdown(
197
+ meta: Dict[str, str],
198
+ body: str,
199
+ body_truncated: bool,
200
+ links: List[Tuple[str, str]],
201
+ include_text: bool,
202
+ include_metadata: bool,
203
+ include_links: bool,
204
+ verbosity: str,
205
+ ) -> str:
206
+ """
207
+ Assemble a compact Markdown summary with optional sections.
208
+ (Layman's terms: build the final markdown output with options.)
209
+ """
210
+ lines: List[str] = []
211
+
212
+ # Title header
213
+ title = meta.get("title") or meta.get("domain") or "Untitled"
214
+ lines.append(f"# {title}")
215
+
216
+ # Metadata section (only show what exists)
217
+ if include_metadata:
218
+ md: List[str] = []
219
+ if meta.get("description"):
220
+ md.append(f"- **Description:** {meta['description']}")
221
+ if meta.get("site_name"):
222
+ md.append(f"- **Site:** {meta['site_name']}")
223
+ if meta.get("canonical"):
224
+ md.append(f"- **Canonical:** {meta['canonical']}")
225
+ if meta.get("lang"):
226
+ md.append(f"- **Language:** {meta['lang']}")
227
+ if meta.get("fetched_url"):
228
+ md.append(f"- **Fetched From:** {meta['fetched_url']}")
229
+ if md:
230
+ lines.append("## Metadata")
231
+ lines.extend(md)
232
+
233
+ # Body text
234
+ if include_text and body:
235
+ if verbosity == "Brief":
236
+ brief, was_more = _truncate(body, 800)
237
+ lines.append("## Text")
238
+ lines.append(brief)
239
+ if was_more or body_truncated:
240
+ lines.append("\n> (Trimmed for brevity)")
241
+ else:
242
+ lines.append("## Text")
243
+ lines.append(body)
244
+ if body_truncated:
245
+ lines.append("\n> (Trimmed for brevity)")
246
+
247
+ # Links section
248
+ if include_links and links:
249
+ lines.append(f"## Links ({len(links)})")
250
+ for text, url in links:
251
+ lines.append(f"- [{text}]({url})")
252
+
253
+ return "\n\n".join(lines).strip()
254
+
255
+
256
def Fetch_Webpage(  # <-- MCP tool #1 (Fetch)
    url: str,
    verbosity: str = "Standard",
    include_metadata: bool = True,
    include_text: bool = True,
    include_links: bool = True,
    max_chars: int = 3000,
    max_links: int = 20,
) -> str:
    """
    Fetch a web page and return a compact Markdown summary that includes title, key
    metadata, readable main text, and outbound links.

    Args:
        url: The page to fetch (http/https).
        verbosity: "Brief" | "Standard" | "Full" body-length preset.
        include_metadata: Emit the Metadata section.
        include_text: Emit the readable body text.
        include_links: Emit the Links section.
        max_chars: User cap on body characters; the smaller of this and the
            verbosity preset wins. Non-positive means "use the preset".
        max_links: Cap on extracted links (0 = unlimited).

    Returns:
        Markdown summary, or a short error message on failure.

    (Layman's terms: summarize a page with clean text + useful details.)
    """
    if not url or not url.strip():
        return "Please enter a valid URL."

    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    final_url = str(resp.url)
    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return f"Unsupported content type for extraction: {ctype or 'unknown'}"

    # Decode to text; fall back to charset detection when the header is silent.
    resp.encoding = resp.encoding or resp.apparent_encoding
    html = resp.text

    # Full-page soup for metadata.
    full_soup = BeautifulSoup(html, "lxml")
    meta = _extract_metadata(full_soup, final_url)

    # Readable content, with a whole-page fallback when Readability finds nothing.
    body_text, readable_soup = _extract_main_text(html)
    if not body_text:
        body_text = _normalize_whitespace(full_soup.get_text(" ", strip=True))

    # Verbosity presets (we keep the smaller of preset vs. user cap).
    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999_999}
    target_cap = preset_caps.get(verbosity, 3000)
    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)

    # BUGFIX: previously, disabling links passed max_links=0, which
    # _extract_links treats as "no cap" — so every anchor on the page was
    # extracted and then thrown away. Skip extraction entirely instead.
    links = _extract_links(readable_soup, final_url, max_links=max_links) if include_links else []

    # Final compact Markdown.
    md = _format_markdown(
        meta=meta,
        body=body_text,
        body_truncated=truncated,
        links=links,
        include_text=include_text,
        include_metadata=include_metadata,
        include_links=include_links,
        verbosity=verbosity,
    )
    return md or "No content could be extracted."
321
+
322
+
323
+ # ==========================
324
+ # Websearch: DuckDuckGo tool
325
+ # ==========================
326
+
327
def Search_Structured(  # <-- MCP tool #3 (Structured DDG)
    input_query: str,
    max_results: int = 5,
) -> List[Dict[Literal["snippet", "title", "link"], str]]:
    """
    Query DuckDuckGo via the LangChain community wrapper and return a list of
    {snippet, title, link} dictionaries.

    (Layman's terms: search DDG and get clean JSON objects.)
    """
    if not input_query or not input_query.strip():
        return []

    # Build the LangChain tool and run it in one pass.
    ddg_tool = DuckDuckGoSearchResults(output_format="list", num_results=max_results)
    return ddg_tool.invoke(input_query)
344
+
345
+
346
+ # ========================================
347
+ # Unstructured DDG: raw list into Textbox
348
+ # ========================================
349
+
350
def Search_Raw(  # <-- MCP tool #4 (Unstructured DDG)
    query: str,
) -> list[dict]:
    """
    Query DuckDuckGo with the native `duckduckgo_search` client and return its
    raw list[dict] output untouched.

    (Layman's terms: search DDG and show exactly what the library returns.)
    """
    if not query or not query.strip():
        return []
    with DDGS() as client:
        return client.text(query, max_results=5)
363
+
364
+
365
+ # ============================================
366
+ # Concise DDG: ultra-succinct JSONL for tokens
367
+ # ============================================
368
+
369
def Search_Concise(  # <-- MCP tool #2 (Concise DDG)
    query: str,
    max_results: int = 5,
    include_snippets: bool = False,
    max_snippet_chars: int = 80,
    dedupe_domains: bool = True,
    title_chars: int = 80,
) -> str:
    """
    DuckDuckGo search that emits one minified JSON object per line, using
    short keys (t=title, u=url, optional s=snippet) to keep token usage low.

    (Layman's terms: the tiniest useful search output possible.)
    """
    if not query or not query.strip():
        return ""

    try:
        with DDGS() as ddgs:
            hits = ddgs.text(query, max_results=max_results)
    except Exception as e:
        return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))

    out_lines: List[str] = []
    used_domains = set()

    for hit in hits or []:
        link = (hit.get("href") or hit.get("link") or "").strip()
        if not link:
            continue

        # Optionally keep only the first result per domain.
        if dedupe_domains:
            domain = _domain_of(link)
            if domain in used_domains:
                continue
            used_domains.add(domain)

        title = _shorten((hit.get("title") or "").strip(), title_chars)
        snippet = (hit.get("body") or hit.get("snippet") or "").strip()

        record = {"t": title or _domain_of(link), "u": link}
        if include_snippets and snippet:
            record["s"] = _shorten(snippet, max_snippet_chars)

        # Most compact JSON possible: no spaces around separators.
        out_lines.append(json.dumps(record, ensure_ascii=False, separators=(",", ":")))

    # JSONL: one result per line.
    return "\n".join(out_lines)
419
+
420
+
421
+ # ============================================
422
+ # Generate Sitemap (new MCP tool #5)
423
+ # ============================================
424
+
425
def Generate_Sitemap(
    url: str,
    max_links_per_domain: int = 0,
) -> str:
    """
    Generate a grouped sitemap (Markdown) of anchor links on a page, with an optional
    per-domain cap.

    Args:
        url (str): The starting page URL (http/https). If the scheme is omitted,
            https is assumed. Surrounding whitespace is ignored.
        max_links_per_domain (int): Limit the number of links shown per domain.
            Use 0 to show all links.

    Returns:
        str: Markdown text containing grouped links under "Internal Links" and
            per-domain "External Links (domain)" sections. If an error occurs or no
            links are found, a short message is returned.
    """
    # --- Basic validation & normalization ---
    if not url or not url.strip():
        return "Please enter a valid URL."

    # BUGFIX: strip whitespace *before* the scheme check. Previously only the
    # scheme-less branch stripped, so " https://example.com" (leading space)
    # failed startswith() and was mangled into "https:// https://example.com".
    url = url.strip()
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    # --- Fetch the page safely ---
    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"Error fetching URL: {str(e)}"

    base_url = str(resp.url)  # follow redirects and use the final URL
    content_type = resp.headers.get("Content-Type", "")
    if "html" not in content_type.lower():
        return "The provided URL does not appear to be an HTML page."

    # --- Parse and collect links ---
    soup = BeautifulSoup(resp.content, "lxml")  # fast, lenient HTML parsing
    seen_urls: set[str] = set()
    items: List[Dict[str, str]] = []

    for a in soup.find_all("a", href=True):
        href = (a.get("href") or "").strip()
        if not href:
            continue

        # Skip non-navigational/unsupported schemes
        if href.startswith(("#", "javascript:", "mailto:", "tel:")):
            continue

        # Resolve relative links and strip fragments
        absolute = urljoin(base_url, href)
        absolute, _ = urldefrag(absolute)

        # Deduplicate and skip self
        if absolute in seen_urls or absolute == base_url:
            continue
        seen_urls.add(absolute)

        # Use link text if available; otherwise the URL itself
        text = (a.get_text(" ", strip=True) or href).strip()
        if len(text) > 100:
            text = text[:100] + "..."

        items.append({"text": text, "url": absolute})

    if not items:
        return "No links found on this page."

    # --- Group by Internal vs External domains ---
    base_netloc = urlparse(base_url).netloc
    domain_groups: Dict[str, List[Dict[str, str]]] = {}
    for it in items:
        netloc = urlparse(it["url"]).netloc
        key = "Internal Links" if netloc == base_netloc else f"External Links ({netloc})"
        domain_groups.setdefault(key, []).append(it)

    # --- Build Markdown with optional per-domain limit ---
    md_lines: List[str] = [
        "# Sitemap",
        f"Base URL: {base_url}",
        f"Found {len(items)} links:\n",
    ]

    # Show Internal first, then external groups sorted by name
    keys_sorted = ["Internal Links"] + sorted(k for k in domain_groups if k != "Internal Links")

    for group_key in keys_sorted:
        if group_key not in domain_groups:
            continue

        group_links = domain_groups[group_key]
        md_lines.append(f"## {group_key}\n")

        if max_links_per_domain and max_links_per_domain > 0:
            links_to_show = group_links[:max_links_per_domain]
            remaining = max(0, len(group_links) - max_links_per_domain)
        else:
            links_to_show = group_links
            remaining = 0

        for link in links_to_show:
            md_lines.append(f"- [{link['text']}]({link['url']})")

        if remaining > 0:
            md_lines.append(f"- ... and {remaining} more links")

        md_lines.append("")  # blank line after each group

    return "\n".join(md_lines).strip()
542
+
543
+
544
+ # ======================================
545
+ # Code Execution: Python (MCP tool #6)
546
+ # ======================================
547
+
548
def Execute_Python(code: str) -> str:
    """
    Execute Python code and return captured stdout, or the error message on
    failure. Mirrors the standalone code interpreter behavior.

    SECURITY: exec() runs arbitrary code in-process with no sandboxing. That is
    this tool's explicit purpose, but never expose it to untrusted callers.
    """
    # Local import keeps this change self-contained; contextlib is stdlib.
    from contextlib import redirect_stdout

    if code is None:
        return "No code provided."

    captured = StringIO()
    try:
        # redirect_stdout restores sys.stdout even when exec() raises,
        # replacing the manual save/assign/finally dance.
        with redirect_stdout(captured):
            exec(code)
        return captured.getvalue()
    except Exception as e:
        return str(e)
565
+
566
+
567
+ # ======================
568
+ # UI: six-tab interface
569
+ # ======================
570
+
571
# --- Tab 1: Fetch (compact, controllable page extraction) ---
fetch_interface = gr.Interface(
    fn=Fetch_Webpage,
    title="Fetch Webpage",
    description="Extract title, key metadata, readable text, and links. No noisy HTML.",
    api_description=(
        "Fetch a web page and return a compact Markdown summary with title, key "
        "metadata, readable body text, and outbound links. Parameters let you "
        "control verbosity, whether to include metadata/text/links, and limits "
        "for characters and number of links."
    ),
    inputs=[
        gr.Textbox(label="URL", placeholder="https://example.com/article"),
        gr.Dropdown(label="Verbosity", choices=["Brief", "Standard", "Full"], value="Standard"),
        gr.Checkbox(value=True, label="Include Metadata"),
        gr.Checkbox(value=True, label="Include Main Text"),
        gr.Checkbox(value=True, label="Include Links"),
        gr.Slider(400, 12000, value=3000, step=100, label="Max Characters (body text)"),
        gr.Slider(0, 100, value=20, step=1, label="Max Links"),
    ],
    outputs=gr.Markdown(label="Extracted Summary"),
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)
595
+
596
# --- Tab 2: Concise DDG (JSONL with short keys, minimal tokens) ---
concise_interface = gr.Interface(
    fn=Search_Concise,
    title="DuckDuckGo Search (Concise)",
    description="Emits JSONL with short keys (t,u[,s]). Defaults avoid snippets and duplicate domains.",
    api_description=(
        "Run a DuckDuckGo search and return newline-delimited JSON with short keys: "
        "t=title, u=url, optional s=snippet. Options control result count, "
        "snippet inclusion and length, domain deduping, and title length."
    ),
    inputs=[
        gr.Textbox(label="Query", placeholder="topic OR site:example.com"),
        gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
        gr.Checkbox(value=False, label="Include snippets (adds tokens)"),
        gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
        gr.Checkbox(value=True, label="Dedupe by domain"),
        gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
    ],
    outputs=gr.Textbox(label="Results (JSONL)", interactive=False),
    submit_btn="Search",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)
619
+
620
# --- Tab 3: Websearch (structured DDG via LangChain) ---
websearch_interface = gr.Interface(
    fn=Search_Structured,
    title="DuckDuckGo Search (Structured)",
    description="Search the web using DuckDuckGo; returns snippet, title, and link.",
    api_description=(
        "Run a DuckDuckGo web search and return a list of objects with keys: "
        "snippet, title, and link. Configure the number of results."
    ),
    inputs=[
        gr.Textbox(value="", label="Search query", placeholder="site:example.com interesting topic"),
        gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
    ],
    outputs=gr.JSON(label="Search results"),
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)
637
+
638
# --- Tab 4: Unstructured DDG (raw library output as text) ---
unstructured_interface = gr.Interface(
    fn=Search_Raw,
    title="DuckDuckGo Search (Raw)",
    description="Returns the raw list of results (list[dict]) shown as text.",
    api_description=(
        "Run DuckDuckGo via the native client and return the raw list[dict] as "
        "provided by duckduckgo_search (fields like title, href/link, body/snippet)."
    ),
    inputs=gr.Textbox(label="Enter Search Query"),
    outputs=gr.Textbox(label="Results", interactive=False),
    submit_btn="Search",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)
653
+
654
# --- Tab 5: Generate Sitemap (grouped links, optional per-domain cap) ---
sitemap_interface = gr.Interface(
    fn=Generate_Sitemap,
    title="Generate Sitemap",
    description="Group links by Internal/External domains; optionally limit links per domain.",
    api_description=(
        "Scan a page and build a grouped sitemap of anchor links. Links are grouped as "
        "Internal or External (per domain). Set a per-domain cap; 0 shows all."
    ),
    inputs=[
        gr.Textbox(label="Website URL", placeholder="https://example.com or example.com"),
        gr.Slider(minimum=0, maximum=1000, value=0, step=1, label="Max links per domain (0 = show all)"),
    ],
    outputs=gr.Markdown(label="Sitemap (Markdown)"),
    submit_btn="Generate",
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)
681
+
682
# --- Tab 6: Execute Python (simple code interpreter) ---
code_interface = gr.Interface(
    fn=Execute_Python,
    title="Python Code Executor",
    description="Execute Python code and see the output. This app is also an MCP server for LLMs.",
    inputs=gr.Code(label="Python Code", language="python"),
    outputs=gr.Textbox(label="Output"),
    allow_flagging="never",
    theme="Nymbo/Nymbo_Theme",
)
692
+
693
# --- Assemble the six tools into one tabbed app ---
_interfaces = [
    fetch_interface,
    concise_interface,
    websearch_interface,
    unstructured_interface,
    sitemap_interface,
    code_interface,
]
_tab_names = [
    "Fetch Webpage",
    "DuckDuckGo Search (Concise)",
    "DuckDuckGo Search (Structured)",
    "DuckDuckGo Search (Raw)",
    "Generate Sitemap",
    "Python Code Executor",
]

demo = gr.TabbedInterface(
    interface_list=_interfaces,
    tab_names=_tab_names,
    title="Web MCP — Fetch, Search, Sitemaps, and Code Execution.",
    theme="Nymbo/Nymbo_Theme",
)

# Launch the UI; mcp_server=True exposes the wired functions as MCP tools.
if __name__ == "__main__":
    demo.launch(mcp_server=True)