Spaces:

broadfield-dev
/

browser-gradio

Running

File size: 12,048 Bytes

4c3fe29
 
1bb1ac8
35ae779
3819331
1bb1ac8
 
31d5b37
35ae779
15fdb32
35ae779
31d5b37
d74e8cc
80fef4e
 
 
d164b37
80fef4e
006e72f
1bb1ac8
 
 
 
 
 
 
 
 
d164b37
7f2bf6a
 
d164b37
 
31d5b37
7f2bf6a
d164b37
 
3819331
006e72f
7f2bf6a
 
d164b37
7f2bf6a
f2b00a1
7f2bf6a
f2b00a1
 
 
 
7f2bf6a
 
d164b37
 
31d5b37
1bb1ac8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80fef4e
1bb1ac8
006e72f
7f2bf6a
80fef4e
 
7f2bf6a
1bb1ac8
 
 
 
 
 
 
80fef4e
7f2bf6a
d164b37
 
7f2bf6a
 
 
3819331
1bb1ac8
 
35ae779
1bb1ac8
 
 
 
 
 
 
 
 
 
 
 
80fef4e
 
 
7f2bf6a
d164b37
80fef4e
1bb1ac8
35ae779
15fdb32
7f2bf6a
80fef4e
7f2bf6a
 
 
35ae779
80fef4e
 
7f2bf6a
80fef4e
35ae779
 
 
7f2bf6a
 
f2b00a1
31d5b37
7f2bf6a
1bb1ac8
d164b37
3819331
1bb1ac8
 
 
 
 
 
 
 
 
 
 
 
 
3819331
31d5b37
006e72f
1bb1ac8
 
 
80fef4e
1bb1ac8
 
31d5b37
d74e8cc
1bb1ac8
80fef4e
 
 
 
 
 
 
1bb1ac8
 
d74e8cc
 
d164b37
 
 
 
 
 
d74e8cc
 
1bb1ac8
 
 
 
 
 
 
 
 
 
 
 
 
d74e8cc
1bb1ac8
 
 
 
 
 
 
 
 
 
 
 
 
 
3819331

import os

# Make sure the Playwright browser binaries are present before the app tries to
# launch one. os.system returns the command's exit status, so report a failed
# install here instead of letting it surface later as an obscure launch error.
if os.system("playwright install") != 0:
    print("⚠️ 'playwright install' did not exit cleanly; launching a browser may fail.")
# app.py (Final Version with Search Engines and Improved Formatting)

import gradio as gr
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup, NavigableString
import urllib.parse
import os
from itertools import cycle
import uuid

# --- 1. GLOBAL RESOURCES & STATE ---
# Handle returned by async_playwright().start(); stays None until master_handler
# performs the lazy start-up on the first request.
P = None
# Headless Firefox instance launched from P; every tab gets its own context from it.
BROWSER = None
# CredentialRevolver built from the PROXY_LIST environment variable at start-up.
REVOLVER = None
# Maps tab id -> {"context": <browser context>, "page": <page>} for each open tab;
# entries are added by the "new_tab" action and removed by "close_tab".
LIVE_CONTEXTS = {}
# Flipped to True once the lazy Playwright start-up in master_handler has finished.
APP_STARTED = False

# Search engine configuration: display name -> results-page URL template.
# handle_action fills "{query}" with the URL-encoded search terms and falls back
# to the "DuckDuckGo" entry when the selected name is not a key of this dict.
# The keys are also used as the choices of the "Search Engine" radio in the UI.
SEARCH_ENGINES = {
    "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
    "Google": "https://www.google.com/search?q={query}",
    "Bing": "https://www.bing.com/search?q={query}",
    "Brave": "https://search.brave.com/search?q={query}",
    "Ecosia": "https://www.ecosia.org/search?q={query}"
}

# --- 2. PLAIN DATA STATE CLASSES ---
class TabState:
    """Plain data record describing a single browser tab."""

    def __init__(self, tab_id, proxy_used="Direct Connection"):
        """Create a tab that starts out on a blank page.

        Args:
            tab_id: Identifier stored as ``id``.
            proxy_used: Label describing the connection the tab uses.
        """
        # Identity and connection label.
        self.id = tab_id
        self.proxy_used = proxy_used

        # Page data; overwritten after each navigation.
        self.url = "about:blank"
        self.title = "New Tab"
        self.parsed_text = "Welcome!"
        self.links = []

class BrowserState:
    """Holds the list of open tabs and the id of the tab currently active."""

    def __init__(self):
        self.tabs = []
        self.active_tab_id = None

    def get_active_tab(self):
        """Return the first tab whose ``id`` equals ``active_tab_id``, else None."""
        for tab in self.tabs:
            if tab.id == self.active_tab_id:
                return tab
        return None

class CredentialRevolver:
    """Round-robin pool of proxy configurations parsed from a text blob.

    The blob holds one proxy per line, either ``[user:pass@]host[:port]`` or a
    full ``scheme://[user:pass@]host[:port]`` URL. Each usable line becomes a
    dict with ``server``, ``username`` and ``password`` keys.
    """

    def __init__(self, proxy_string: str):
        """Parse *proxy_string* and prepare an endless cycle over the result."""
        self.proxies = self._parse_proxies(proxy_string)
        self.proxy_cycler = cycle(self.proxies) if self.proxies else None

    def _parse_proxies(self, proxy_string: str):
        """Return the list of proxy dicts found in *proxy_string*.

        Blank lines and lines without a usable host or with an invalid port are
        skipped instead of producing a bogus ``http://None:None`` entry.
        """
        proxies = []
        for raw_line in (proxy_string or "").splitlines():
            line = raw_line.strip()
            if not line:
                continue
            # urlparse only recognises the authority part after "//"; add it
            # when the line has no scheme, but leave full URLs untouched.
            target = line if "://" in line else f"//{line}"
            try:
                parsed = urllib.parse.urlparse(target)
                host, port = parsed.hostname, parsed.port
            except ValueError:
                # Malformed authority or a non-numeric / out-of-range port.
                continue
            if not host:
                continue
            if ":" in host:
                # hostname drops the brackets of an IPv6 literal; restore them.
                host = f"[{host}]"
            server = f"{parsed.scheme or 'http'}://{host}"
            if port is not None:
                server += f":{port}"
            proxies.append({
                "server": server,
                "username": parsed.username,
                "password": parsed.password,
            })
        return proxies

    def get_next(self):
        """Return the next proxy dict in rotation, or None when none are loaded."""
        return next(self.proxy_cycler) if self.proxy_cycler else None

    def count(self):
        """Return how many proxies were parsed."""
        return len(self.proxies)

# --- 3. CORE ASYNC LOGIC with NEW FORMATTER ---

def _format_html_to_markdown(soup):
    """Render the main content of a parsed page as Markdown.

    Looks for a ``<main>`` element and falls back to ``<body>``. Each direct
    child tag of that node is converted with ``_process_element_to_markdown``,
    except for tags that usually hold navigation, forms, scripts or styling.
    Non-empty results are joined with newlines.

    Args:
        soup: Parsed document exposing ``find``.

    Returns:
        The Markdown text, or a fixed message when neither ``<main>`` nor
        ``<body>`` exists.
    """
    skipped_tags = ('nav', 'footer', 'header', 'aside', 'form', 'script', 'style')

    root = soup.find('main')
    if not root:
        root = soup.find('body')
    if not root:
        return "Could not find main body content."

    rendered = [
        _process_element_to_markdown(child)
        for child in root.find_all(recursive=False)
        if child.name not in skipped_tags
    ]
    return '\n'.join(piece for piece in rendered if piece)

def _process_element_to_markdown(element):
    """Recursively render one BeautifulSoup node as Markdown text.

    Text nodes are trimmed, but a single space is kept on any side where the
    source had whitespace so that text next to inline tags does not fuse
    ("Hello <b>world</b>" becomes "Hello **world**"). HTML comments and the
    contents of ``<script>``/``<style>`` are dropped at any depth.

    Args:
        element: A text node or tag from the parsed document.

    Returns:
        The Markdown fragment for *element* (possibly an empty string).
    """
    # Local import: the file-level import only brings in NavigableString.
    from bs4 import Comment

    if isinstance(element, NavigableString):
        if isinstance(element, Comment):
            return ''  # Comment subclasses NavigableString; never show it as page text.
        raw = str(element)
        core = raw.strip()
        if not core:
            return ''
        lead = ' ' if raw[0].isspace() else ''
        trail = ' ' if raw[-1].isspace() else ''
        return f"{lead}{core}{trail}"

    name = element.name
    if name is None or name in ('script', 'style'):
        return ''

    if name == 'pre':
        # Take the raw text so the code keeps its layout and a nested <code>
        # does not add a second pair of fences.
        return f"\n```\n{element.get_text().strip()}\n```\n"

    # Get the inner text by recursively processing children
    inner_text = ''.join(_process_element_to_markdown(child) for child in element.children)
    body = inner_text.strip()

    # Apply formatting based on tag name
    if name in ['p', 'div', 'article', 'section']:
        return f"\n{body}\n"
    if name == 'h1':
        return f"\n# {body}\n"
    if name == 'h2':
        return f"\n## {body}\n"
    if name == 'h3':
        return f"\n### {body}\n"
    if name in ['h4', 'h5', 'h6']:
        return f"\n#### {body}\n"
    if name == 'li':
        return f"* {body}\n"
    if name in ['ul', 'ol']:
        return f"\n{inner_text}\n"
    if name in ['strong', 'b']:
        return f"**{body}**"
    if name in ['em', 'i']:
        return f"*{body}*"
    if name == 'code':
        # <code> inside <pre> is handled above, so this is inline code unless
        # it spans several lines.
        if not body:
            return ''
        if '\n' in body:
            return f"\n```\n{body}\n```\n"
        return f"`{body}`"
    if name == 'a':
        href = element.get('href', '')
        return f"[{body}]({href})"
    if name == 'hr':
        return "\n---\n"

    return inner_text  # For other tags like span, etc.

async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
    """Navigate the tab's live page to *url* and refresh *tab_state* from it.

    On success ``url``, ``title``, ``parsed_text`` and ``links`` are replaced
    with data from the loaded page. On failure the tab is put into an error
    state (title "Error", empty link list) instead of raising.

    Args:
        tab_state: Tab record to update; its ``id`` selects the live page in
            ``LIVE_CONTEXTS``.
        url: Address to load.

    Returns:
        A multi-line, human-readable log of what happened.
    """
    log = f"▶️ Navigating to {url}..."
    live_page = LIVE_CONTEXTS[tab_state.id]["page"]
    try:
        await live_page.goto(url, wait_until='domcontentloaded', timeout=30000)
        # Read the URL back from the page so redirects are reflected.
        tab_state.url = live_page.url
        tab_state.title = await live_page.title() or "No Title"
        log += f"\n✅ Arrived at: {tab_state.url}"

        html_content = await live_page.content()
        soup = BeautifulSoup(html_content, 'lxml')

        # Use the new formatter
        tab_state.parsed_text = _format_html_to_markdown(soup)

        # Resolve every href against the page URL first, then keep only web
        # links. Filtering the raw href would discard all relative links.
        links = []
        for anchor in soup.find_all('a', href=True):
            try:
                absolute_url = urllib.parse.urljoin(tab_state.url, anchor['href'])
            except ValueError:
                continue  # one malformed href must not fail the whole page
            if not absolute_url.lower().startswith(('http://', 'https://')):
                continue
            links.append({
                'text': anchor.get_text(strip=True) or "[No Link Text]",
                'url': absolute_url,
            })
        tab_state.links = links
        log += f"\n🔗 Found {len(tab_state.links)} links."
    except Exception as e:
        # Some exceptions carry no message; fall back to the exception type so
        # building the error text cannot itself raise.
        message_lines = str(e).splitlines()
        error_message = message_lines[0] if message_lines else type(e).__name__
        tab_state.title = "Error"
        tab_state.url = url
        tab_state.parsed_text = f"❌ Failed to load page.\n\nError: {error_message}"
        tab_state.links = []
        log += f"\n❌ {error_message}"
    return log

async def handle_action(browser_state: BrowserState, search_engine: str, action: str, value=None):
    """Apply one UI action to *browser_state* and return the updated state.

    Supported actions:
        "go": load *value* as a URL when it has a scheme and host, otherwise
            search for it with the selected engine.
        "new_tab": open a new browser context (using the next proxy, if any)
            and make it the active tab.
        "click": follow the link whose index in the active tab is *value*.
        "close_tab": close the active tab unless it is the last one.
        "switch_tab": make the tab whose id is *value* the active tab.

    Args:
        browser_state: State object that is mutated in place.
        search_engine: Key into ``SEARCH_ENGINES``; unknown names fall back to
            "DuckDuckGo".
        action: One of the action names above; anything else is a no-op.
        value: Action-specific argument (query/URL, link index or tab id).

    Returns:
        ``(browser_state, log)`` where *log* is a status message (may be empty).
    """
    log = ""
    active_tab_state = browser_state.get_active_tab()

    if action == "go" and active_tab_state and value:
        try:
            parsed = urllib.parse.urlparse(value)
            is_url = bool(parsed.scheme and parsed.netloc)
        except ValueError:
            # Unparseable input (e.g. a stray "[") is treated as search text.
            is_url = False
        if is_url:
            url = value
        else:  # It's a search query
            base_url = SEARCH_ENGINES.get(search_engine, SEARCH_ENGINES["DuckDuckGo"])
            url = base_url.format(query=urllib.parse.quote_plus(value))
        log = await _fetch_and_update_tab_state(active_tab_state, url)

    elif action == "new_tab":
        tab_id = str(uuid.uuid4())
        proxy_config = REVOLVER.get_next() if REVOLVER else None
        context = await BROWSER.new_context(proxy=proxy_config)
        page = await context.new_page()
        LIVE_CONTEXTS[tab_id] = {"context": context, "page": page}
        new_tab = TabState(tab_id, proxy_used=proxy_config['server'].split('@')[-1] if proxy_config else "Direct")
        browser_state.tabs.append(new_tab)
        browser_state.active_tab_id = tab_id
        log = await _fetch_and_update_tab_state(new_tab, "https://www.startpage.com/")  # New default page

    elif action == "click" and active_tab_state:
        # Only the number conversion is guarded; the navigation runs outside
        # the try so real errors and task cancellation are not swallowed.
        try:
            link_index = int(value)
        except (TypeError, ValueError, OverflowError):
            log = "Please enter a valid number to click."
        else:
            if 0 <= link_index < len(active_tab_state.links):
                log = await _fetch_and_update_tab_state(active_tab_state, active_tab_state.links[link_index]['url'])
            else:
                log = "Invalid link number."

    elif action == "close_tab" and active_tab_state:
        if len(browser_state.tabs) > 1:
            tab_to_close_id = browser_state.active_tab_id
            tab_index = browser_state.tabs.index(active_tab_state)
            browser_state.tabs.pop(tab_index)
            # Activate the tab to the left, or the new first tab.
            new_index = tab_index - 1 if tab_index > 0 else 0
            browser_state.active_tab_id = browser_state.tabs[new_index].id
            resources = LIVE_CONTEXTS.pop(tab_to_close_id, None)
            if resources is not None:
                await resources['context'].close()
            log = "💣 Tab closed."
        else:
            log = "Cannot close the last tab."

    elif action == "switch_tab" and value is not None:
        # Ignore ids that match no open tab; otherwise no tab would be active
        # and every later action would silently do nothing.
        if any(tab.id == value for tab in browser_state.tabs):
            browser_state.active_tab_id = value
            log = "Switched to tab."
        else:
            log = "Tab not found."

    return browser_state, log

# --- 4. GRADIO UI AND EVENT HANDLING ---
with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
    browser_state = gr.State(BrowserState())
    gr.Markdown("# 🛰️ Real Browser Demo v2")

    with gr.Row():
        with gr.Column(scale=4):
            with gr.Row():
                url_textbox = gr.Textbox(label="Enter URL or Search Query", interactive=True, scale=4)
                go_btn = gr.Button("Go", variant="primary", scale=1)
            with gr.Accordion("Page Content (Formatted)", open=True):
                page_content = gr.Markdown("Loading...")
        with gr.Column(scale=2):
            # Search engine used when the input is not a URL.
            search_engine_selector = gr.Radio(
                list(SEARCH_ENGINES.keys()), value="DuckDuckGo", label="Search Engine"
            )
            with gr.Row():
                new_tab_btn = gr.Button("➕ New Tab")
                close_tab_btn = gr.Button("❌ Close Tab")
            tab_selector = gr.Radio(choices=[], label="Active Tabs", interactive=True)
            with gr.Accordion("Clickable Links", open=True):
                links_display = gr.Markdown("...")
                with gr.Row():
                    click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1)
                    click_btn = gr.Button("Click Link", scale=2)

    log_display = gr.Textbox(label="Status Log", interactive=False)

    all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]

    def update_ui_components(state):
        """Build the new values of the display components from *state*.

        master_handler calls this after every action; without it each event
        fails with a NameError.

        Returns:
            A dict keyed by ``page_content``, ``url_textbox``,
            ``links_display`` and ``tab_selector`` holding the update for each.
        """
        active_tab = state.get_active_tab()
        if active_tab is None:
            return {
                page_content: gr.update(value="No active tab."),
                url_textbox: gr.update(value=""),
                links_display: gr.update(value="_No links._"),
                tab_selector: gr.update(choices=[], value=None),
            }

        # (label, value) pairs: the radio shows a readable label but hands the
        # tab id to the "switch_tab" action.
        tab_choices = [
            (f"{index}: {tab.title[:40]} ({tab.proxy_used})", tab.id)
            for index, tab in enumerate(state.tabs)
        ]

        # The shown number is the list index expected by the "click" action.
        if active_tab.links:
            links_md = "\n".join(
                f"- `{index}` [{link['text']}]({link['url']})"
                for index, link in enumerate(active_tab.links)
            )
        else:
            links_md = "_No links found on this page._"

        page_md = f"# {active_tab.title}\n**URL:** {active_tab.url}\n\n---\n\n{active_tab.parsed_text}"

        return {
            page_content: gr.update(value=page_md),
            url_textbox: gr.update(value=active_tab.url),
            links_display: gr.update(value=links_md),
            tab_selector: gr.update(choices=tab_choices, value=active_tab.id),
        }

    async def master_handler(current_state, search_engine, action, value=None):
        """Run one action and return the values for every output component.

        Playwright, the browser and the proxy pool are started lazily on the
        first call.

        Returns:
            ``(state, page_content, url_textbox, links_display, tab_selector,
            log)`` in the order of the ``outputs`` list used when wiring events.
        """
        global APP_STARTED, P, BROWSER, REVOLVER
        if not APP_STARTED:
            print("🚀 First request received, starting up Playwright...")
            P = await async_playwright().start()
            BROWSER = await P.firefox.launch(headless=True)
            proxy_list_str = os.getenv("PROXY_LIST", "")
            REVOLVER = CredentialRevolver(proxy_list_str)
            print(f"✅ Playwright started. {REVOLVER.count()} proxies loaded.")
            APP_STARTED = True

        new_state, log = await handle_action(current_state, search_engine, action, value)
        # handle_action mutates and returns the same state object.
        ui_updates = update_ui_components(new_state)

        return (
            new_state,
            ui_updates[page_content],
            ui_updates[url_textbox],
            ui_updates[links_display],
            ui_updates[tab_selector],
            log
        )

    # Each event listener is its own async function.
    async def on_load(state, search_engine):
        return await master_handler(state, search_engine, "new_tab", None)

    async def on_go_click(state, search_engine, value):
        return await master_handler(state, search_engine, "go", value)

    async def on_click_link(state, search_engine, value):
        return await master_handler(state, search_engine, "click", value)

    async def on_new_tab(state, search_engine):
        return await master_handler(state, search_engine, "new_tab", None)

    async def on_close_tab(state, search_engine):
        return await master_handler(state, search_engine, "close_tab", None)

    async def on_switch_tab(state, search_engine, value):
        return await master_handler(state, search_engine, "switch_tab", value)

    # Wire up the event handlers
    go_inputs = [browser_state, search_engine_selector, url_textbox]
    click_inputs = [browser_state, search_engine_selector, click_num_box]
    tab_inputs = [browser_state, search_engine_selector]
    switch_inputs = [browser_state, search_engine_selector, tab_selector]
    outputs = [browser_state, *all_outputs]

    demo.load(on_load, tab_inputs, outputs)
    go_btn.click(on_go_click, go_inputs, outputs, show_progress="full")
    url_textbox.submit(on_go_click, go_inputs, outputs, show_progress="full")
    click_btn.click(on_click_link, click_inputs, outputs, show_progress="full")
    new_tab_btn.click(on_new_tab, tab_inputs, outputs, show_progress="full")
    close_tab_btn.click(on_close_tab, tab_inputs, outputs)
    tab_selector.input(on_switch_tab, switch_inputs, outputs)

demo.launch()