Spaces:

broadfield-dev
/

browser-gradio

Running

App Files Files Community

broadfield-dev committed on Jun 25

Commit

10dbcf2

verified ·

1 Parent(s): 9b96fc7

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -133

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
 os.system("playwright install")
-# app.py (Final Version with Search Engines and Improved Formatting)
 import gradio as gr
 from playwright.async_api import async_playwright
@@ -17,7 +17,6 @@ REVOLVER = None
 LIVE_CONTEXTS = {}
 APP_STARTED = False
-# New: Search Engine Configuration
 SEARCH_ENGINES = {
     "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
     "Google": "https://www.google.com/search?q={query}",
@@ -41,92 +40,49 @@ class CredentialRevolver:
         self.proxies = self._parse_proxies(proxy_string)
         self.proxy_cycler = cycle(self.proxies) if self.proxies else None
     def _parse_proxies(self, proxy_string: str):
-        proxies = []
         for line in proxy_string.strip().splitlines():
             try:
-                parsed = urllib.parse.urlparse(f"//{line.strip()}")
-                server = f"{parsed.scheme or 'http'}://{parsed.hostname}:{parsed.port}"
                 proxies.append({"server": server, "username": parsed.username, "password": parsed.password})
             except: pass
         return proxies
     def get_next(self): return next(self.proxy_cycler) if self.proxy_cycler else None
     def count(self): return len(self.proxies) if self.proxies else 0
-# --- 3. CORE ASYNC LOGIC with NEW FORMATTER ---
-def _format_html_to_markdown(soup):
-    """
-    NEW: Intelligently converts a BeautifulSoup object to formatted Markdown.
-    """
-    text_parts = []
-    # Use a more specific main content area if available
-    content_node = soup.find('main') or soup.find('body')
-    if not content_node:
-        return "Could not find main body content."
-    for element in content_node.find_all(recursive=False):
-        # Ignore common non-content sections
-        if element.name in ['nav', 'footer', 'header', 'aside', 'form', 'script', 'style']:
-            continue
-        text_parts.append(_process_element_to_markdown(element))
-    return '\n'.join(filter(None, text_parts))
 def _process_element_to_markdown(element):
-    """NEW: Recursive helper to process each element into Markdown."""
-    if isinstance(element, NavigableString):
-        return element.strip()
-    if element.name is None:
-        return ''
-    # Get the inner text by recursively processing children
     inner_text = ''.join(_process_element_to_markdown(child) for child in element.children)
-    # Apply formatting based on tag name
-    if element.name in ['p', 'div', 'article', 'section']:
-        return f"\n{inner_text.strip()}\n"
-    if element.name == 'h1':
-        return f"\n# {inner_text.strip()}\n"
-    if element.name == 'h2':
-        return f"\n## {inner_text.strip()}\n"
-    if element.name == 'h3':
-        return f"\n### {inner_text.strip()}\n"
-    if element.name in ['h4', 'h5', 'h6']:
-        return f"\n#### {inner_text.strip()}\n"
-    if element.name == 'li':
-        return f"* {inner_text.strip()}\n"
-    if element.name in ['ul', 'ol']:
-        return f"\n{inner_text}\n"
-    if element.name in ['strong', 'b']:
-        return f"**{inner_text.strip()}**"
-    if element.name in ['em', 'i']:
-        return f"*{inner_text.strip()}*"
-    if element.name in ['pre', 'code']:
-        return f"\n```\n{inner_text.strip()}\n```\n"
-    if element.name == 'a':
-        href = element.get('href', '')
-        return f"[{inner_text.strip()}]({href})"
-    if element.name == 'hr':
-        return "\n---\n"
-    return inner_text # For other tags like span, etc.
 async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
-    """Uses Playwright to navigate and the new formatter to parse."""
     log = f"▶️ Navigating to {url}..."; live_page = LIVE_CONTEXTS[tab_state.id]["page"]
     try:
         await live_page.goto(url, wait_until='domcontentloaded', timeout=30000)
         tab_state.url = live_page.url; tab_state.title = await live_page.title() or "No Title"
         log += f"\n✅ Arrived at: {tab_state.url}"
-        html_content = await live_page.content()
-        soup = BeautifulSoup(html_content, 'lxml')
-        # Use the new formatter
         tab_state.parsed_text = _format_html_to_markdown(soup)
         tab_state.links = [{'text': link.get_text(strip=True) or "[No Link Text]", 'url': urllib.parse.urljoin(tab_state.url, link['href'])} for link in soup.find_all('a', href=True) if link.get('href', '').startswith('http')]
         log += f"\n🔗 Found {len(tab_state.links)} links."
     except Exception as e:
@@ -136,19 +92,11 @@ async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
     return log
 async def handle_action(browser_state: BrowserState, search_engine: str, action: str, value=None):
-    """Main logic handler, now with search engine selection."""
-    log = ""; active_tab_state = browser_state.get_active_tab()
     if action == "go" and active_tab_state and value:
         is_url = urllib.parse.urlparse(value).scheme and urllib.parse.urlparse(value).netloc
-        if is_url:
-            url = value
-        else: # It's a search query
-            base_url = SEARCH_ENGINES.get(search_engine, SEARCH_ENGINES["DuckDuckGo"])
-            url = base_url.format(query=urllib.parse.quote_plus(value))
         log = await _fetch_and_update_tab_state(active_tab_state, url)
-    # --- Other actions remain the same ---
     elif action == "new_tab":
         tab_id, proxy_config = str(uuid.uuid4()), REVOLVER.get_next()
         context = await BROWSER.new_context(proxy=proxy_config)
@@ -156,7 +104,7 @@ async def handle_action(browser_state: BrowserState, search_engine: str, action:
         LIVE_CONTEXTS[tab_id] = {"context": context, "page": page}
         new_tab = TabState(tab_id, proxy_used=proxy_config['server'].split('@')[-1] if proxy_config else "Direct")
         browser_state.tabs.append(new_tab); browser_state.active_tab_id = tab_id
-        log = await _fetch_and_update_tab_state(new_tab, "https://www.startpage.com/") # New default page
     elif action == "click" and active_tab_state and value is not None:
         try:
             link_index = int(value)
@@ -172,36 +120,39 @@ async def handle_action(browser_state: BrowserState, search_engine: str, action:
         else: log = "Cannot close the last tab."
     elif action == "switch_tab" and value is not None:
         browser_state.active_tab_id = value; log = f"Switched to tab."
     return browser_state, log
 # --- 4. GRADIO UI AND EVENT HANDLING ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
     browser_state = gr.State(BrowserState())
-    gr.Markdown("# 🛰️ Real Browser Demo v2")
     with gr.Row():
         with gr.Column(scale=4):
-            with gr.Row():
-                url_textbox = gr.Textbox(label="Enter URL or Search Query", interactive=True, scale=4)
-                go_btn = gr.Button("Go", variant="primary", scale=1)
-            with gr.Accordion("Page Content (Formatted)", open=True):
-                page_content = gr.Markdown("Loading...")
         with gr.Column(scale=2):
-            # NEW: Search Engine Selector
-            search_engine_selector = gr.Radio(
-                list(SEARCH_ENGINES.keys()), value="DuckDuckGo", label="Search Engine"
-            )
-            with gr.Row():
-                new_tab_btn = gr.Button("➕ New Tab"); close_tab_btn = gr.Button("❌ Close Tab")
             tab_selector = gr.Radio(choices=[], label="Active Tabs", interactive=True)
             with gr.Accordion("Clickable Links", open=True):
                 links_display = gr.Markdown("...");
-                with gr.Row():
-                    click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1)
-                    click_btn = gr.Button("Click Link", scale=2)
     log_display = gr.Textbox(label="Status Log", interactive=False)
     all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]
     async def master_handler(current_state, search_engine, action, value=None):
@@ -213,44 +164,24 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
             print(f"✅ Playwright started. {REVOLVER.count()} proxies loaded."); APP_STARTED = True
         new_state, log = await handle_action(current_state, search_engine, action, value)
-        ui_updates = update_ui_components(current_state) # Use current state for UI updates
-        return (
-            new_state,
-            ui_updates[page_content],
-            ui_updates[url_textbox],
-            ui_updates[links_display],
-            ui_updates[tab_selector],
-            log
-        )
-    # Each event listener is its own async function.
-    async def on_load(state, search_engine):
-        return await master_handler(state, search_engine, "new_tab", None)
-    async def on_go_click(state, search_engine, value):
-        return await master_handler(state, search_engine, "go", value)
-    async def on_click_link(state, search_engine, value):
-        return await master_handler(state, search_engine, "click", value)
-    async def on_new_tab(state, search_engine):
-        return await master_handler(state, search_engine, "new_tab", None)
-    async def on_close_tab(state, search_engine):
-        return await master_handler(state, search_engine, "close_tab", None)
-    async def on_switch_tab(state, search_engine, value):
-        return await master_handler(state, search_engine, "switch_tab", value)
-    # Wire up the event handlers
-    go_inputs = [browser_state, search_engine_selector, url_textbox]
-    click_inputs = [browser_state, search_engine_selector, click_num_box]
-    tab_inputs = [browser_state, search_engine_selector]
-    switch_inputs = [browser_state, search_engine_selector, tab_selector]
     outputs = [browser_state, *all_outputs]
-    demo.load(on_load, tab_inputs, outputs)
-    go_btn.click(on_go_click, go_inputs, outputs, show_progress="full")
-    url_textbox.submit(on_go_click, go_inputs, outputs, show_progress="full")
-    click_btn.click(on_click_link, click_inputs, outputs, show_progress="full")
-    new_tab_btn.click(on_new_tab, tab_inputs, outputs, show_progress="full")
-    close_tab_btn.click(on_close_tab, tab_inputs, outputs)
-    tab_selector.input(on_switch_tab, switch_inputs, outputs)
 demo.launch()

 import os
 os.system("playwright install")
+# app.py (Final, Working Async Version with All Bugs Fixed)
 import gradio as gr
 from playwright.async_api import async_playwright
 LIVE_CONTEXTS = {}
 APP_STARTED = False
 SEARCH_ENGINES = {
     "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
     "Google": "https://www.google.com/search?q={query}",
         self.proxies = self._parse_proxies(proxy_string)
         self.proxy_cycler = cycle(self.proxies) if self.proxies else None
     def _parse_proxies(self, proxy_string: str):
         """Parse a newline-separated proxy list into proxy config dicts.

         Each non-blank line may be ``host:port`` or ``user:pass@host:port``,
         optionally prefixed with a scheme (``http://``, ``socks5://`` ...).
         Lines without a scheme default to ``http``.

         Args:
             proxy_string: Raw multi-line proxy list. ``None`` or an empty
                 string yields an empty list.

         Returns:
             A list of dicts with the keys ``server``, ``username`` and
             ``password`` (the latter two are ``None`` when the line has no
             credentials). Blank lines and lines that cannot be parsed
             (no host, non-numeric or out-of-range port) are skipped.
         """
         proxies = []
         for raw_line in (proxy_string or "").splitlines():
             entry = raw_line.strip()
             if not entry:
                 continue
             # urlparse only recognises the netloc after "//". Add it for bare
             # "host:port" lines, but never in front of an explicit scheme,
             # which would turn the scheme itself into the host name.
             target = entry if "://" in entry else f"//{entry}"
             try:
                 parsed = urllib.parse.urlparse(target)
                 port = parsed.port  # raises ValueError on a malformed port
             except ValueError:
                 continue
             host = parsed.hostname
             if not host:
                 continue
             if ":" in host:
                 # .hostname strips the brackets of an IPv6 literal; restore them.
                 host = f"[{host}]"
             server = f"{parsed.scheme or 'http'}://{host}"
             if port is not None:
                 server += f":{port}"
             proxies.append({"server": server, "username": parsed.username, "password": parsed.password})
         return proxies
     def get_next(self):
         """Return the next proxy config from the rotation, or None when there is no cycler."""
         if not self.proxy_cycler:
             return None
         return next(self.proxy_cycler)
     def count(self):
         """Return the number of loaded proxies (0 when the list is empty or unset)."""
         if self.proxies:
             return len(self.proxies)
         return 0
+# --- 3. CORE ASYNC LOGIC & FORMATTING ---
 def _process_element_to_markdown(element):
     """Recursively render a parsed HTML node as Markdown text.

     Text nodes return their text with surrounding whitespace stripped.
     Tags first render all of their children (concatenated with no
     separator) and then wrap that text according to the tag name; tags
     without a dedicated rule pass their children's text through unchanged.
     """
     # Leaf case: a plain text node.
     if isinstance(element, NavigableString):
         return element.strip()

     tag = element.name
     if tag is None:
         return ''

     rendered_children = [_process_element_to_markdown(node) for node in element.children]
     inner_text = ''.join(rendered_children)
     body = inner_text.strip()

     # Block-level containers become their own paragraph.
     if tag in ('p', 'div', 'article', 'section'):
         return f"\n{body}\n"

     # Headings: h4-h6 all collapse to the fourth level.
     heading_marks = {'h1': '#', 'h2': '##', 'h3': '###', 'h4': '####', 'h5': '####', 'h6': '####'}
     mark = heading_marks.get(tag)
     if mark is not None:
         return f"\n{mark} {body}\n"

     if tag == 'li':
         return f"* {body}\n"
     if tag in ('ul', 'ol'):
         # List containers keep the children's text exactly as rendered.
         return f"\n{inner_text}\n"
     if tag in ('strong', 'b'):
         return f"**{body}**"
     if tag in ('em', 'i'):
         return f"*{body}*"
     if tag in ('pre', 'code'):
         return f"\n```\n{body}\n```\n"
     if tag == 'a':
         target = element.get('href', '')
         return f"[{body}]({target})"
     if tag == 'hr':
         return "\n---\n"

     # Any other tag (span, etc.) is transparent.
     return inner_text
+def _format_html_to_markdown(soup):
+    content_node = soup.find('main') or soup.find('body')
+    if not content_node: return "Could not find main body content."
+    for el in content_node.select('nav, footer, header, aside, form, script, style'): el.decompose()
+    return _process_element_to_markdown(content_node)
 async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
     log = f"▶️ Navigating to {url}..."; live_page = LIVE_CONTEXTS[tab_state.id]["page"]
     try:
         await live_page.goto(url, wait_until='domcontentloaded', timeout=30000)
         tab_state.url = live_page.url; tab_state.title = await live_page.title() or "No Title"
         log += f"\n✅ Arrived at: {tab_state.url}"
+        html_content = await live_page.content(); soup = BeautifulSoup(html_content, 'lxml')
         tab_state.parsed_text = _format_html_to_markdown(soup)
         tab_state.links = [{'text': link.get_text(strip=True) or "[No Link Text]", 'url': urllib.parse.urljoin(tab_state.url, link['href'])} for link in soup.find_all('a', href=True) if link.get('href', '').startswith('http')]
         log += f"\n🔗 Found {len(tab_state.links)} links."
     except Exception as e:
     return log
 async def handle_action(browser_state: BrowserState, search_engine: str, action: str, value=None):
+    active_tab_state = browser_state.get_active_tab()
     if action == "go" and active_tab_state and value:
         is_url = urllib.parse.urlparse(value).scheme and urllib.parse.urlparse(value).netloc
+        url = value if is_url else SEARCH_ENGINES.get(search_engine, SEARCH_ENGINES["DuckDuckGo"]).format(query=urllib.parse.quote_plus(value))
         log = await _fetch_and_update_tab_state(active_tab_state, url)
     elif action == "new_tab":
         tab_id, proxy_config = str(uuid.uuid4()), REVOLVER.get_next()
         context = await BROWSER.new_context(proxy=proxy_config)
         LIVE_CONTEXTS[tab_id] = {"context": context, "page": page}
         new_tab = TabState(tab_id, proxy_used=proxy_config['server'].split('@')[-1] if proxy_config else "Direct")
         browser_state.tabs.append(new_tab); browser_state.active_tab_id = tab_id
+        log = await _fetch_and_update_tab_state(new_tab, "https://www.startpage.com/")
     elif action == "click" and active_tab_state and value is not None:
         try:
             link_index = int(value)
         else: log = "Cannot close the last tab."
     elif action == "switch_tab" and value is not None:
         browser_state.active_tab_id = value; log = f"Switched to tab."
+    else: log = "No action taken."
     return browser_state, log
+# ** CRITICAL BUG FIX: `NameError` is fixed by defining this function before it is called **
+def update_ui_components(browser_state: BrowserState):
+    active_tab = browser_state.get_active_tab()
+    if not active_tab: return {page_content: gr.Markdown("No active tabs."), url_textbox: "", links_display: "", tab_selector: gr.Radio(choices=[])}
+    tab_choices = [(f"Tab {i}: {t.title[:25]}... ({t.proxy_used})", t.id) for i, t in enumerate(browser_state.tabs)]
+    links_md = "### 🔗 Links on Page\n" + ('\n'.join(f"{i}. [{link['text'][:80]}]({link['url']})" for i, link in enumerate(active_tab.links[:25])) if active_tab.links else "_No links found._")
+    page_md = f"# {active_tab.title}\n**URL:** {active_tab.url}\n\n---\n\n{active_tab.parsed_text}"
+    return {
+        page_content: gr.Markdown(page_md),
+        url_textbox: gr.Textbox(value=active_tab.url), links_display: gr.Markdown(links_md),
+        tab_selector: gr.Radio(choices=tab_choices, value=active_tab.id, label="Active Tabs"),
+    }
 # --- 4. GRADIO UI AND EVENT HANDLING ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
     browser_state = gr.State(BrowserState())
+    gr.Markdown("# 🛰️ Real Browser Demo v2.1")
     with gr.Row():
         with gr.Column(scale=4):
+            with gr.Row(): url_textbox = gr.Textbox(label="Enter URL or Search Query", interactive=True, scale=4); go_btn = gr.Button("Go", variant="primary", scale=1)
+            with gr.Accordion("Page Content (Formatted)", open=True): page_content = gr.Markdown("Loading...")
         with gr.Column(scale=2):
+            search_engine_selector = gr.Radio(list(SEARCH_ENGINES.keys()), value="DuckDuckGo", label="Search Engine")
+            with gr.Row(): new_tab_btn = gr.Button("➕ New Tab"); close_tab_btn = gr.Button("❌ Close Tab")
             tab_selector = gr.Radio(choices=[], label="Active Tabs", interactive=True)
             with gr.Accordion("Clickable Links", open=True):
                 links_display = gr.Markdown("...");
+                with gr.Row(): click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1); click_btn = gr.Button("Click Link", scale=2)
     log_display = gr.Textbox(label="Status Log", interactive=False)
     all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]
     async def master_handler(current_state, search_engine, action, value=None):
             print(f"✅ Playwright started. {REVOLVER.count()} proxies loaded."); APP_STARTED = True
         new_state, log = await handle_action(current_state, search_engine, action, value)
+        ui_updates = update_ui_components(new_state)
+        return (new_state, ui_updates[page_content], ui_updates[url_textbox], ui_updates[links_display], ui_updates[tab_selector], log)
+    async def on_load(state, search_engine): return await master_handler(state, search_engine, "new_tab", None)
+    async def on_go_click(state, search_engine, value): return await master_handler(state, search_engine, "go", value)
+    async def on_click_link(state, search_engine, value): return await master_handler(state, search_engine, "click", value)
+    async def on_new_tab(state, search_engine): return await master_handler(state, search_engine, "new_tab", None)
+    async def on_close_tab(state, search_engine): return await master_handler(state, search_engine, "close_tab", None)
+    async def on_switch_tab(state, search_engine, value): return await master_handler(state, search_engine, "switch_tab", value)
     outputs = [browser_state, *all_outputs]
+    demo.load(on_load, [browser_state, search_engine_selector], outputs)
+    go_btn.click(on_go_click, [browser_state, search_engine_selector, url_textbox], outputs, show_progress="full")
+    url_textbox.submit(on_go_click, [browser_state, search_engine_selector, url_textbox], outputs, show_progress="full")
+    click_btn.click(on_click_link, [browser_state, search_engine_selector, click_num_box], outputs, show_progress="full")
+    new_tab_btn.click(on_new_tab, [browser_state, search_engine_selector], outputs, show_progress="full")
+    close_tab_btn.click(on_close_tab, [browser_state, search_engine_selector], outputs)
+    tab_selector.input(on_switch_tab, [browser_state, search_engine_selector, tab_selector], outputs)
 demo.launch()