broadfield-dev committed on
Commit
224e219
·
verified ·
1 Parent(s): e8c80cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -65
app.py CHANGED
@@ -1,36 +1,68 @@
1
  import os
2
  os.system("playwright install")
3
- import os
4
  import re
5
  import urllib.parse
6
  import asyncio
7
- from typing import Dict
 
8
 
9
  import gradio as gr
10
  from bs4 import BeautifulSoup, NavigableString
11
  from playwright.async_api import async_playwright
12
 
13
- # --- 1. GLOBAL RESOURCES & CONFIGURATION ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- # This dictionary will hold the long-lived Playwright and Browser objects.
16
- # It starts empty and browsers are added on-demand.
17
  PLAYWRIGHT_STATE: Dict = {}
 
18
 
19
- # A comprehensive list of search engines
20
  SEARCH_ENGINES = {
21
- "DuckDuckGo": "https://duckduckgo.com/html/?q={query}", "Google": "https://www.google.com/search?q={query}",
22
- "Bing": "https://www.bing.com/search?q={query}", "Brave": "https://search.brave.com/search?q={query}",
23
- "Ecosia": "https://www.ecosia.org/search?q={query}", "Yahoo": "https://search.yahoo.com/search?p={query}",
24
- "Startpage": "https://www.startpage.com/sp/search?q={query}", "Qwant": "https://www.qwant.com/?q={query}",
25
- "Swisscows": "https://swisscows.com/web?query={query}", "You.com": "https://you.com/search?q={query}",
26
- "SearXNG": "https://searx.be/search?q={query}", "MetaGer": "https://metager.org/meta/meta.ger-en?eingabe={query}",
27
- "Yandex": "https://yandex.com/search/?text={query}", "Baidu": "https://www.baidu.com/s?wd={query}",
 
 
 
 
 
 
 
28
  "Perplexity": "https://www.perplexity.ai/search?q={query}"
29
  }
30
 
31
- # --- 2. ADVANCED HTML-TO-MARKDOWN CONVERTER (Unchanged) ---
32
  class HTML_TO_MARKDOWN_CONVERTER:
33
- # ... [The class code is identical and correct] ...
34
  def __init__(self, soup: BeautifulSoup, base_url: str):
35
  self.soup = soup
36
  self.base_url = base_url
@@ -51,7 +83,7 @@ class HTML_TO_MARKDOWN_CONVERTER:
51
  self._cleanup_html()
52
  content_node = self.soup.find('main') or self.soup.find('article') or self.soup.find('body')
53
  if not content_node:
54
- return "Could not find main content."
55
  md = self._process_node(content_node)
56
  return re.sub(r'\n{3,}', '\n\n', md).strip()
57
 
@@ -88,91 +120,87 @@ class HTML_TO_MARKDOWN_CONVERTER:
88
  return f"\n\n![{alt}]({full_src})\n\n"
89
  return inner_md
90
 
91
-
92
- # --- 3. CORE API FUNCTION (WITH LAZY LOADING) ---
93
-
94
  async def perform_web_browse(query: str, browser_name: str, search_engine: str):
95
- """
96
- A stateless function that takes a query, browser, and search engine,
97
- then returns the parsed content of the resulting page.
98
- It launches and caches browsers on-demand.
99
- """
100
- # Step 1: Initialize Playwright process itself if not already running.
101
  if "playwright" not in PLAYWRIGHT_STATE:
102
- print("🚀 First request received, starting Playwright process...")
103
  PLAYWRIGHT_STATE["playwright"] = await async_playwright().start()
104
- print("✅ Playwright process is running.")
105
 
106
- # Step 2: Check if the *specific browser requested* has been launched.
107
- browser_key = browser_name.lower()
108
  if browser_key not in PLAYWRIGHT_STATE:
109
- print(f"🚀 Launching '{browser_key}' for the first time...")
110
  try:
111
  p = PLAYWRIGHT_STATE["playwright"]
112
- if browser_key == 'firefox':
113
- browser_instance = await p.firefox.launch(headless=True)
114
- elif browser_key == 'chromium':
115
- browser_instance = await p.chromium.launch(headless=True)
116
- elif browser_key == 'webkit':
117
- browser_instance = await p.webkit.launch(headless=True)
118
- else:
119
- raise ValueError(f"Invalid browser name: {browser_name}")
120
  PLAYWRIGHT_STATE[browser_key] = browser_instance
121
- print(f"✅ '{browser_key}' is now running and cached.")
122
  except Exception as e:
123
- error_message = str(e).splitlines()[0]
124
- print(f"❌ Failed to launch '{browser_key}': {error_message}")
125
- return {"status": "error", "query": query, "error_message": f"Failed to launch browser '{browser_key}'. Your system might be missing dependencies. Error: {error_message}"}
126
 
127
  browser_instance = PLAYWRIGHT_STATE[browser_key]
128
 
129
- # Step 3: Determine URL
130
- is_url = urllib.parse.urlparse(query).scheme in ['http', 'https']
131
- if is_url:
132
  url = query
133
  else:
134
  search_url_template = SEARCH_ENGINES.get(search_engine)
135
  if not search_url_template:
136
- return {"error": f"Invalid search engine: '{search_engine}'."}
137
  url = search_url_template.format(query=urllib.parse.quote_plus(query))
138
 
139
- # Step 4: Create isolated context and browse
140
- context = await browser_instance.new_context(user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
 
 
 
 
 
 
 
 
 
 
141
  page = await context.new_page()
142
 
143
  try:
144
- print(f"Navigating to: {url} using {browser_name}...")
145
- await page.goto(url, wait_until='domcontentloaded', timeout=30000)
 
 
 
 
 
 
146
  final_url, title = page.url, await page.title() or "No Title"
147
- print(f"Arrived at: {final_url}")
148
 
149
- soup = BeautifulSoup(await page.content(), 'lxml')
 
 
150
  converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
151
  markdown_text = converter.convert()
 
152
 
153
- print("Content parsed successfully.")
154
- return {"status": "success", "query": query, "final_url": final_url, "page_title": title, "markdown_content": markdown_text}
 
 
155
  except Exception as e:
156
- error_message = str(e).splitlines()[0]
157
- print(f"An error occurred: {error_message}")
158
- return {"status": "error", "query": query, "error_message": error_message}
159
  finally:
160
- if page: await page.close()
161
- if context: await context.close()
162
- print("Session context closed.")
163
 
164
-
165
- # --- 4. GRADIO INTERFACE & API LAUNCH (Unchanged) ---
166
  with gr.Blocks(title="Web Browse API", theme=gr.themes.Soft()) as demo:
167
- # ... UI definition is identical ...
168
  gr.Markdown("# Web Browse API")
169
- gr.Markdown("This interface exposes a stateless API endpoint (`/api/web_browse`) to fetch and parse web content.")
170
- query_input = gr.Textbox(label="URL or Search Query", placeholder="e.g., https://openai.com or 'history of artificial intelligence'")
 
 
171
  with gr.Row():
172
  browser_input = gr.Dropdown(label="Browser", choices=["firefox", "chromium", "webkit"], value="firefox", scale=1)
173
  search_engine_input = gr.Dropdown(label="Search Engine (for non-URL queries)", choices=sorted(list(SEARCH_ENGINES.keys())), value="DuckDuckGo", scale=2)
 
174
  submit_button = gr.Button("Browse", variant="primary")
175
  output_json = gr.JSON(label="API Result")
 
176
  submit_button.click(fn=perform_web_browse, inputs=[query_input, browser_input, search_engine_input], outputs=output_json, api_name="web_browse")
177
 
178
  if __name__ == "__main__":
 
1
  import os
2
  os.system("playwright install")
 
3
  import re
4
  import urllib.parse
5
  import asyncio
6
+ from typing import Dict, Optional
7
+ from itertools import cycle
8
 
9
  import gradio as gr
10
  from bs4 import BeautifulSoup, NavigableString
11
  from playwright.async_api import async_playwright
12
 
13
class CredentialRevolver:
    """Round-robin pool of HTTP proxies parsed from a newline-separated string.

    Accepted line formats are ``host:port`` and ``user:pass@host:port``;
    lines that cannot be parsed into a host and port are silently skipped.
    """

    def __init__(self, proxy_string: str):
        # Parse once up front; an empty pool makes get_next() always return None.
        self.proxies = self._parse_proxies(proxy_string)
        self.proxy_cycler = cycle(self.proxies) if self.proxies else None

    def _parse_proxies(self, proxy_string: str):
        """Return Playwright-style proxy dicts parsed from *proxy_string*."""
        entries = []
        if not proxy_string:
            return entries
        for raw_line in proxy_string.strip().splitlines():
            try:
                # The "//" prefix makes urlparse treat the line as a netloc,
                # so credentials, host, and port are all split out for us.
                parsed = urllib.parse.urlparse(f"//{raw_line.strip()}")
                host, port = parsed.hostname, parsed.port  # .port may raise ValueError
            except Exception:
                continue  # best-effort: skip malformed lines
            if not host or not port:
                continue
            entry = {"server": f"http://{host}:{port}"}
            if parsed.username:
                entry["username"] = parsed.username
            if parsed.password:
                entry["password"] = parsed.password
            entries.append(entry)
        return entries

    def get_next(self) -> Optional[Dict]:
        """Return the next proxy dict in rotation, or None if no proxies loaded."""
        if self.proxy_cycler is None:
            return None
        return next(self.proxy_cycler)

    def count(self) -> int:
        """Number of proxies successfully parsed at construction time."""
        return len(self.proxies)
43
 
 
 
44
# Long-lived Playwright resources, populated lazily on first request.
# Keys: "playwright" (the driver) plus one entry per launched engine
# ("firefox" / "chromium" / "webkit").
PLAYWRIGHT_STATE: Dict = {}
# Rotating proxy pool read from the PROXY_LIST env var (newline-separated;
# may be empty, in which case requests go out directly).
REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))

# Search-engine URL templates; {query} is replaced with the URL-encoded query
# for non-URL inputs to perform_web_browse.
SEARCH_ENGINES = {
    "Google": "https://www.google.com/search?q={query}",
    "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
    "Bing": "https://www.bing.com/search?q={query}",
    "Brave": "https://search.brave.com/search?q={query}",
    "Ecosia": "https://www.ecosia.org/search?q={query}",
    "Yahoo": "https://search.yahoo.com/search?p={query}",
    "Startpage": "https://www.startpage.com/sp/search?q={query}",
    "Qwant": "https://www.qwant.com/?q={query}",
    "Swisscows": "https://swisscows.com/web?query={query}",
    "You.com": "https://you.com/search?q={query}",
    "SearXNG": "https://searx.be/search?q={query}",
    "MetaGer": "https://metager.org/meta/meta.ger-en?eingabe={query}",
    "Yandex": "https://yandex.com/search/?text={query}",
    "Baidu": "https://www.baidu.com/s?wd={query}",
    "Perplexity": "https://www.perplexity.ai/search?q={query}"
}
64
 
 
65
  class HTML_TO_MARKDOWN_CONVERTER:
 
66
  def __init__(self, soup: BeautifulSoup, base_url: str):
67
  self.soup = soup
68
  self.base_url = base_url
 
83
  self._cleanup_html()
84
  content_node = self.soup.find('main') or self.soup.find('article') or self.soup.find('body')
85
  if not content_node:
86
+ return ""
87
  md = self._process_node(content_node)
88
  return re.sub(r'\n{3,}', '\n\n', md).strip()
89
 
 
120
  return f"\n\n![{alt}]({full_src})\n\n"
121
  return inner_md
122
 
 
 
 
123
async def perform_web_browse(query: str, browser_name: str, search_engine: str):
    """Fetch *query* (a direct URL or search terms) and return a JSON-serializable
    dict describing the result.

    Browsers are launched lazily and cached in PLAYWRIGHT_STATE. Each request
    runs in a fresh, isolated context — optionally routed through the next
    proxy from REVOLVER — which is always closed before returning.

    Returns a dict with "status": "success" (final_url, page_title,
    http_status, proxy_used, markdown_content) or "status": "error"
    (error_message).
    """
    browser_key = browser_name.lower()

    # Start the Playwright driver process on first use only.
    if "playwright" not in PLAYWRIGHT_STATE:
        PLAYWRIGHT_STATE["playwright"] = await async_playwright().start()

    # Launch and cache the requested browser engine on demand.
    if browser_key not in PLAYWRIGHT_STATE:
        try:
            p = PLAYWRIGHT_STATE["playwright"]
            if browser_key == 'firefox':
                browser_instance = await p.firefox.launch(headless=True)
            elif browser_key == 'chromium':
                browser_instance = await p.chromium.launch(headless=True)
            elif browser_key == 'webkit':
                browser_instance = await p.webkit.launch(headless=True)
            else:
                raise ValueError(f"Invalid browser name: {browser_name}")
            PLAYWRIGHT_STATE[browser_key] = browser_instance
        except Exception as e:
            return {"status": "error", "query": query, "error_message": f"Failed to launch '{browser_key}'. Error: {str(e).splitlines()[0]}"}

    browser_instance = PLAYWRIGHT_STATE[browser_key]

    # Decide whether the query is a direct URL or needs a search engine.
    # Parse once (the original parsed the query twice on the same line).
    parsed_query = urllib.parse.urlparse(query)
    if parsed_query.scheme in ['http', 'https'] and '.' in parsed_query.netloc:
        url = query
    else:
        search_url_template = SEARCH_ENGINES.get(search_engine)
        if not search_url_template:
            return {"status": "error", "query": query, "error_message": f"Invalid search engine: '{search_engine}'."}
        url = search_url_template.format(query=urllib.parse.quote_plus(query))

    proxy_config = REVOLVER.get_next()
    proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"

    context_args = {
        'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'java_script_enabled': True,
        'ignore_https_errors': True
    }
    if proxy_config:
        context_args['proxy'] = proxy_config

    context = await browser_instance.new_context(**context_args)
    page = None
    try:
        # new_page() lives inside the try so the context is closed even if it
        # fails — the original created the page before the try and leaked the
        # context (and relied on a fragile "'page' in locals()" check).
        page = await context.new_page()
        response = await page.goto(url, wait_until='domcontentloaded', timeout=45000)

        # Some engines render results client-side; wait for them to settle.
        current_url = page.url
        if "google.com" in current_url:
            await page.wait_for_selector('div#rso, div#search, body[jsmodel]', timeout=15000)
        elif "perplexity.ai" in current_url or "you.com" in current_url:
            await page.wait_for_timeout(4000)

        final_url, title = page.url, await page.title() or "No Title"

        html_content = await page.content()
        soup = BeautifulSoup(html_content, 'lxml')

        converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
        markdown_text = converter.convert()
        status_code = response.status if response else "N/A"

        return {
            "status": "success", "query": query, "final_url": final_url, "page_title": title,
            "http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
        }
    except Exception as e:
        return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
    finally:
        # Always tear down per-request resources, page first, then context.
        if page is not None and not page.is_closed():
            await page.close()
        await context.close()
 
190
 
 
 
191
# Gradio UI; doubles as the definition of the /api/web_browse endpoint.
with gr.Blocks(title="Web Browse API", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Web Browse API")
    gr.Markdown(f"This interface exposes a stateless API endpoint (`/api/web_browse`) to fetch and parse web content. {REVOLVER.count()} proxies loaded.")

    # Free-form input: either a full URL or plain search terms.
    query_input = gr.Textbox(label="URL or Search Query", placeholder="e.g., https://gradio.app or 'how does gradio work'")

    with gr.Row():
        browser_input = gr.Dropdown(label="Browser", choices=["firefox", "chromium", "webkit"], value="firefox", scale=1)
        search_engine_input = gr.Dropdown(label="Search Engine (for non-URL queries)", choices=sorted(list(SEARCH_ENGINES.keys())), value="DuckDuckGo", scale=2)

    submit_button = gr.Button("Browse", variant="primary")
    output_json = gr.JSON(label="API Result")

    # api_name exposes this handler at /api/web_browse for programmatic clients.
    submit_button.click(fn=perform_web_browse, inputs=[query_input, browser_input, search_engine_input], outputs=output_json, api_name="web_browse")
205
 
206
  if __name__ == "__main__":