File size: 14,158 Bytes
4c3fe29
 
d2ba6e0
35ae779
3819331
1bb1ac8
 
31d5b37
35ae779
15fdb32
35ae779
d2ba6e0
31d5b37
d74e8cc
d2ba6e0
1bb1ac8
d2ba6e0
 
1bb1ac8
 
 
d2ba6e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f2bf6a
 
d164b37
 
31d5b37
7f2bf6a
d164b37
 
3819331
006e72f
7f2bf6a
 
d164b37
7f2bf6a
10dbcf2
7f2bf6a
f2b00a1
10dbcf2
f2b00a1
7f2bf6a
 
d164b37
 
31d5b37
1bb1ac8
d2ba6e0
80fef4e
006e72f
7f2bf6a
80fef4e
 
7f2bf6a
d2ba6e0
10dbcf2
d2ba6e0
 
 
 
 
80fef4e
7f2bf6a
d164b37
 
7f2bf6a
 
 
3819331
1bb1ac8
10dbcf2
1bb1ac8
 
10dbcf2
1bb1ac8
 
80fef4e
 
 
7f2bf6a
d164b37
80fef4e
10dbcf2
35ae779
15fdb32
7f2bf6a
80fef4e
7f2bf6a
 
 
35ae779
80fef4e
 
7f2bf6a
80fef4e
35ae779
 
 
10dbcf2
7f2bf6a
 
10dbcf2
 
 
 
 
d2ba6e0
 
10dbcf2
 
 
 
 
 
d2ba6e0
31d5b37
7f2bf6a
d2ba6e0
3819331
1bb1ac8
10dbcf2
 
1bb1ac8
10dbcf2
 
3819331
31d5b37
006e72f
10dbcf2
1bb1ac8
10dbcf2
31d5b37
d74e8cc
1bb1ac8
80fef4e
 
 
 
 
 
 
1bb1ac8
10dbcf2
d74e8cc
10dbcf2
d74e8cc
10dbcf2
 
 
 
 
 
d74e8cc
1bb1ac8
d2ba6e0
 
 
 
 
 
 
 
 
 
 
 
3819331
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# Install Playwright's browser binaries at startup (required on fresh hosts
# such as Hugging Face Spaces, where only the Python package is pre-installed).
# subprocess.run with an argument list avoids spawning a shell, unlike the
# previous os.system() call; OSError is swallowed so a missing `playwright`
# binary stays non-fatal, mirroring os.system()'s silent non-zero exit.
import os
import subprocess

try:
    subprocess.run(["playwright", "install"], check=False)
except OSError:
    pass
# app.py (Final Version with Advanced Formatting)

import os
import re
import urllib.parse
import uuid
from itertools import cycle

import gradio as gr
from bs4 import BeautifulSoup, Comment, NavigableString
from playwright.async_api import async_playwright

# --- 1. GLOBAL RESOURCES & STATE ---
# P: Playwright driver handle, BROWSER: shared Firefox instance,
# REVOLVER: proxy rotator, LIVE_CONTEXTS: tab_id -> {"context", "page"}
# live Playwright objects, APP_STARTED: lazy-init flag set on first request.
P, BROWSER, REVOLVER, LIVE_CONTEXTS, APP_STARTED = None, None, None, {}, False
# Search engines offered in the UI; {query} is filled with the URL-encoded query.
SEARCH_ENGINES = {
    "DuckDuckGo": "https://duckduckgo.com/html/?q={query}", "Google": "https://www.google.com/search?q={query}",
    "Bing": "https://www.bing.com/search?q={query}", "Brave": "https://search.brave.com/search?q={query}",
    "Ecosia": "https://www.ecosia.org/search?q={query}"
}

# --- 2. NEW: ADVANCED HTML-TO-MARKDOWN CONVERTER ---
class HTML_TO_MARKDOWN_CONVERTER:
    """Convert a cleaned HTML document into readable Markdown.

    Boilerplate sections (navigation, footers, cookie banners, ...) are
    stripped first, then the DOM is walked recursively and each node is
    rendered as a Markdown fragment.
    """

    def __init__(self, soup: BeautifulSoup, base_url: str):
        self.soup = soup          # parsed document (mutated by _cleanup_html)
        self.base_url = base_url  # base for resolving relative link/image URLs

    def _cleanup_html(self):
        """Aggressively remove non-content tags and sections from the HTML."""
        selectors_to_remove = [
            'nav', 'footer', 'header', 'aside', 'form', 'script', 'style', 'svg', 'button', 'input', 'textarea',
            '[role="navigation"]', '[role="search"]', '[id*="comment"]', '[class*="comment-"]',
            '[id*="sidebar"]', '[class*="sidebar"]', '[id*="related"]', '[class*="related"]',
            '[id*="share"]', '[class*="share"]', '[id*="social"]', '[class*="social"]',
            '[id*="cookie"]', '[class*="cookie"]'
        ]
        for selector in selectors_to_remove:
            for element in self.soup.select(selector):
                element.decompose()

    def convert(self):
        """Clean the document and return its main content as Markdown."""
        self._cleanup_html()
        # Prefer semantic containers; fall back to <body> for plain pages.
        content_node = self.soup.find('main') or self.soup.find('article') or self.soup.find('body')
        if not content_node:
            return "Could not find main content."
        return self._process_node(content_node)

    def _process_node(self, element):
        """Recursively render one DOM node (and its children) as Markdown."""
        # BUGFIX: bs4 Comment is a NavigableString subclass, so without this
        # guard HTML comments leaked into the rendered page text.
        if isinstance(element, Comment):
            return ''
        if isinstance(element, NavigableString):
            # Collapse runs of whitespace into single spaces.
            return re.sub(r'\s+', ' ', element.strip())
        if not element.name:
            return ''

        # <pre> keeps its raw text; recursing into children would mangle it
        # (and the recursion result was previously computed then discarded).
        if element.name == 'pre':
            return f"\n```\n{element.get_text(strip=True)}\n```\n\n"
        # BUGFIX: <br> previously collapsed to nothing; emit a line break.
        if element.name == 'br':
            return '\n'

        # Render children first so inline markup nests correctly.
        inner_md = " ".join(self._process_node(child) for child in element.children).strip()

        # Block-level tags add surrounding newlines.
        if element.name in ['p', 'div', 'section']:
            return f"\n\n{inner_md}\n\n"
        if element.name == 'h1': return f"\n\n# {inner_md}\n\n"
        if element.name == 'h2': return f"\n\n## {inner_md}\n\n"
        if element.name == 'h3': return f"\n\n### {inner_md}\n\n"
        if element.name in ['h4', 'h5', 'h6']: return f"\n\n#### {inner_md}\n\n"
        if element.name == 'li': return f"* {inner_md}\n"
        if element.name in ['ul', 'ol']: return f"\n{inner_md}\n"
        if element.name == 'blockquote': return f"> {inner_md.replace(chr(10), chr(10) + '> ')}\n\n"
        if element.name == 'hr': return "\n\n---\n\n"

        # Table conversion: header row, separator row, then body rows.
        if element.name == 'table':
            header_cells = [th.get_text(strip=True) for th in element.select('thead th, tr th')]
            if not header_cells:
                # BUGFIX: a headerless table used to emit a broken separator row.
                return f"\n\n{inner_md}\n\n"
            header = " | ".join(f"**{cell}**" for cell in header_cells)
            # BUGFIX: separator width is the column count, not the number of
            # '|' characters in the rendered header (cells may contain '|').
            separator = " | ".join(['---'] * len(header_cells))
            rows = [" | ".join(td.get_text(strip=True) for td in tr.find_all('td')) for tr in element.select('tbody tr')]
            return f"\n\n{header}\n{separator}\n" + "\n".join(rows) + "\n\n"

        if element.name == 'code': return f"`{inner_md}`"

        # Inline emphasis.
        if element.name in ['strong', 'b']: return f"**{inner_md}**"
        if element.name in ['em', 'i']: return f"*{inner_md}*"

        # Links and images: resolve relative URLs against the page URL.
        if element.name == 'a':
            full_href = urllib.parse.urljoin(self.base_url, element.get('href', ''))
            return f"[{inner_md}]({full_href})"
        if element.name == 'img':
            alt = element.get('alt', 'Image').strip()
            full_src = urllib.parse.urljoin(self.base_url, element.get('src', ''))
            return f"\n\n![{alt}]({full_src})\n\n"

        # Unrecognized tags (span, etc.): pass inner markdown through.
        return inner_md


# --- 3. PLAIN DATA STATE CLASSES ---
class TabState:
    """Plain-data snapshot of a single browser tab (no live Playwright objects)."""

    def __init__(self, tab_id, proxy_used="Direct Connection"):
        # Identity and navigation state.
        self.id = tab_id
        self.url = "about:blank"
        self.title = "New Tab"
        # Content extracted from the most recent successful page load.
        self.parsed_text = "Welcome!"
        self.links = []
        # Human-readable label of the proxy this tab routes through.
        self.proxy_used = proxy_used

class BrowserState:
    """Collection of TabState objects plus the id of the currently active tab."""

    def __init__(self):
        self.tabs = []
        self.active_tab_id = None

    def get_active_tab(self):
        """Return the tab whose id matches active_tab_id, or None if absent."""
        for tab in self.tabs:
            if tab.id == self.active_tab_id:
                return tab
        return None

class CredentialRevolver:
    """Round-robin supplier of Playwright proxy configs parsed from a string.

    Each non-empty input line has the form ``[user:pass@]host:port``; the
    scheme defaults to http. Malformed lines are skipped best-effort.
    """

    def __init__(self, proxy_string: str):
        self.proxies = self._parse_proxies(proxy_string)
        self.proxy_cycler = cycle(self.proxies) if self.proxies else None

    def _parse_proxies(self, proxy_string: str):
        """Parse one proxy dict per valid line of *proxy_string*."""
        proxies = []
        for line in proxy_string.strip().splitlines():
            line = line.strip()
            if not line:
                # BUGFIX: blank lines used to produce bogus "http://None:None" entries.
                continue
            try:
                # Prefix "//" so urlparse treats the whole line as a netloc.
                parsed = urllib.parse.urlparse(f"//{line}")
                if not parsed.hostname or not parsed.port:
                    continue  # BUGFIX: a missing port used to yield "host:None"
                server = f"{parsed.scheme or 'http'}://{parsed.hostname}:{parsed.port}"
                proxies.append({"server": server, "username": parsed.username, "password": parsed.password})
            except ValueError:
                # Malformed line (e.g. non-numeric port) — skip it.
                # (Previously a bare `except:` that also swallowed system exits.)
                continue
        return proxies

    def get_next(self):
        """Return the next proxy config, or None when no proxies are configured."""
        return next(self.proxy_cycler) if self.proxy_cycler else None

    def count(self):
        """Number of usable proxies parsed from the input."""
        return len(self.proxies) if self.proxies else 0


# --- 4. CORE ASYNC LOGIC ---
async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
    """Navigate the tab's live page to *url* and refresh its TabState snapshot.

    Returns a human-readable status log. Never propagates navigation
    failures: errors are recorded on the tab state instead.
    """
    page = LIVE_CONTEXTS[tab_state.id]["page"]
    log_parts = [f"▶️ Navigating to {url}..."]
    try:
        await page.goto(url, wait_until='domcontentloaded', timeout=30000)
        tab_state.url = page.url
        tab_state.title = await page.title() or "No Title"
        log_parts.append(f"✅ Arrived at: {tab_state.url}")

        soup = BeautifulSoup(await page.content(), 'lxml')
        tab_state.parsed_text = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=tab_state.url).convert()

        collected = []
        for anchor in soup.find_all('a', href=True):
            href = anchor.get('href', '')
            if not href.startswith('http'):
                continue  # only absolute http(s) links are clickable in the UI
            collected.append({
                'text': anchor.get_text(strip=True) or "[No Link Text]",
                'url': urllib.parse.urljoin(tab_state.url, href),
            })
        tab_state.links = collected
        log_parts.append(f"🔗 Found {len(tab_state.links)} links.")
    except Exception as exc:
        first_line = str(exc).splitlines()[0]
        tab_state.title = "Error"
        tab_state.url = url
        tab_state.parsed_text = f"❌ Failed to load page.\n\nError: {first_line}"
        tab_state.links = []
        log_parts.append(f"❌ {first_line}")
    return "\n".join(log_parts)

async def handle_action(browser_state: BrowserState, search_engine: str, action: str, value=None):
    """Dispatch a UI action and return (mutated browser_state, status log).

    Supported actions: "go" (URL or search query), "new_tab", "click"
    (link index), "close_tab", "switch_tab" (tab id).
    """
    active_tab_state = browser_state.get_active_tab()
    if action == "go" and active_tab_state and value:
        # Parse once (the original parsed the value twice).
        parsed = urllib.parse.urlparse(value)
        is_url = parsed.scheme and parsed.netloc
        url = value if is_url else SEARCH_ENGINES.get(search_engine, SEARCH_ENGINES["DuckDuckGo"]).format(query=urllib.parse.quote_plus(value))
        log = await _fetch_and_update_tab_state(active_tab_state, url)
    elif action == "new_tab":
        tab_id = str(uuid.uuid4())
        proxy_config = REVOLVER.get_next()
        context = await BROWSER.new_context(proxy=proxy_config)
        page = await context.new_page()
        LIVE_CONTEXTS[tab_id] = {"context": context, "page": page}
        # Show only host:port in the tab label, never proxy credentials.
        proxy_label = proxy_config['server'].split('@')[-1] if proxy_config else "Direct"
        new_tab = TabState(tab_id, proxy_used=proxy_label)
        browser_state.tabs.append(new_tab)
        browser_state.active_tab_id = tab_id
        log = await _fetch_and_update_tab_state(new_tab, "https://www.startpage.com/")
    elif action == "click" and active_tab_state and value is not None:
        # BUGFIX: the original bare `except:` wrapped the whole branch and
        # silently swallowed every exception, including failures from the
        # fetch itself. Only the int() conversion is guarded now.
        try:
            link_index = int(value)
        except (TypeError, ValueError):
            log = "Please enter a valid number to click."
        else:
            if 0 <= link_index < len(active_tab_state.links):
                log = await _fetch_and_update_tab_state(active_tab_state, active_tab_state.links[link_index]['url'])
            else:
                log = "Invalid link number."
    elif action == "close_tab" and active_tab_state:
        if len(browser_state.tabs) > 1:
            tab_to_close_id = browser_state.active_tab_id
            tab_index = browser_state.tabs.index(active_tab_state)
            browser_state.tabs.pop(tab_index)
            # Activate the previous tab (or the new first tab).
            new_index = tab_index - 1 if tab_index > 0 else 0
            browser_state.active_tab_id = browser_state.tabs[new_index].id
            # Closing the context releases the Playwright page and proxy slot.
            resources = LIVE_CONTEXTS.pop(tab_to_close_id)
            await resources['context'].close()
            log = "💣 Tab closed."
        else:
            log = "Cannot close the last tab."
    elif action == "switch_tab" and value is not None:
        browser_state.active_tab_id = value
        log = "Switched to tab."
    else:
        log = "No action taken."
    return browser_state, log

def update_ui_components(browser_state: BrowserState):
    """Build the Gradio component updates reflecting the active tab's state."""
    active_tab = browser_state.get_active_tab()
    if not active_tab:
        return {page_content: gr.Markdown("No active tabs."), url_textbox: "", links_display: "", tab_selector: gr.Radio(choices=[])}

    # Radio choices: (label, tab_id) pairs, label truncated for the sidebar.
    tab_choices = []
    for idx, tab in enumerate(browser_state.tabs):
        tab_choices.append((f"Tab {idx}: {tab.title[:25]}... ({tab.proxy_used})", tab.id))

    # Numbered list of the first 25 links, matching the "click" action's indices.
    if active_tab.links:
        link_lines = '\n'.join(f"{idx}. [{link['text'][:80]}]({link['url']})" for idx, link in enumerate(active_tab.links[:25]))
    else:
        link_lines = "_No links found._"
    links_md = "### 🔗 Links on Page\n" + link_lines

    # Collapse runs of 3+ newlines left over from the HTML-to-Markdown pass.
    raw_md = f"# {active_tab.title}\n**URL:** {active_tab.url}\n\n---\n\n{active_tab.parsed_text}"
    page_md = re.sub(r'\n{3,}', '\n\n', raw_md).strip()

    return {
        page_content: gr.Markdown(page_md),
        url_textbox: gr.Textbox(value=active_tab.url),
        links_display: gr.Markdown(links_md),
        tab_selector: gr.Radio(choices=tab_choices, value=active_tab.id, label="Active Tabs"),
    }

# --- 5. GRADIO UI AND EVENT HANDLING ---
with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
    # Per-session plain-data state; the live Playwright objects are kept in
    # the module-level LIVE_CONTEXTS dict, keyed by tab id, not in gr.State.
    browser_state = gr.State(BrowserState())
    gr.Markdown("# 🛰️ Real Browser Demo v2.2 (Advanced Formatting)")
    with gr.Row():
        with gr.Column(scale=4):
            # Address bar + formatted page body.
            with gr.Row(): url_textbox = gr.Textbox(label="Enter URL or Search Query", interactive=True, scale=4); go_btn = gr.Button("Go", variant="primary", scale=1)
            with gr.Accordion("Page Content (Formatted)", open=True): page_content = gr.Markdown("Loading...")
        with gr.Column(scale=2):
            # Tab management and link-clicking controls.
            search_engine_selector = gr.Radio(list(SEARCH_ENGINES.keys()), value="DuckDuckGo", label="Search Engine")
            with gr.Row(): new_tab_btn = gr.Button("➕ New Tab"); close_tab_btn = gr.Button("❌ Close Tab")
            tab_selector = gr.Radio(choices=[], label="Active Tabs", interactive=True)
            with gr.Accordion("Clickable Links", open=True):
                links_display = gr.Markdown("...");
                with gr.Row(): click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1); click_btn = gr.Button("Click Link", scale=2)
    log_display = gr.Textbox(label="Status Log", interactive=False)

    # Components refreshed after every action; update_ui_components keys on these.
    all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]

    async def master_handler(current_state, search_engine, action, value=None):
        """Single entry point for all UI events: lazily boots Playwright,
        dispatches the action, and maps the new state onto the components."""
        global APP_STARTED, P, BROWSER, REVOLVER
        if not APP_STARTED:
            # Lazy one-time startup: module top level cannot await, so the
            # browser is launched on the first request instead of at import.
            print("🚀 First request received, starting up Playwright...");
            P = await async_playwright().start(); BROWSER = await P.firefox.launch(headless=True)
            proxy_list_str = os.getenv("PROXY_LIST", ""); REVOLVER = CredentialRevolver(proxy_list_str)
            print(f"✅ Playwright started. {REVOLVER.count()} proxies loaded."); APP_STARTED = True

        new_state, log = await handle_action(current_state, search_engine, action, value)
        ui_updates = update_ui_components(new_state)

        # Tuple order must match `outputs` below: state first, then all_outputs.
        return (new_state, ui_updates[page_content], ui_updates[url_textbox], ui_updates[links_display], ui_updates[tab_selector], log)

    # Thin per-event wrappers mapping each Gradio event to one action string.
    async def on_load(state, search_engine): return await master_handler(state, search_engine, "new_tab", None)
    async def on_go_click(state, search_engine, value): return await master_handler(state, search_engine, "go", value)
    async def on_click_link(state, search_engine, value): return await master_handler(state, search_engine, "click", value)
    async def on_new_tab(state, search_engine): return await master_handler(state, search_engine, "new_tab", None)
    async def on_close_tab(state, search_engine): return await master_handler(state, search_engine, "close_tab", None)
    async def on_switch_tab(state, search_engine, value): return await master_handler(state, search_engine, "switch_tab", value)

    outputs = [browser_state, *all_outputs]
    go_inputs = [browser_state, search_engine_selector, url_textbox]
    click_inputs = [browser_state, search_engine_selector, click_num_box]
    tab_inputs = [browser_state, search_engine_selector]
    switch_inputs = [browser_state, search_engine_selector, tab_selector]

    # Event wiring; demo.load opens the first tab when a session starts.
    demo.load(on_load, tab_inputs, outputs)
    go_btn.click(on_go_click, go_inputs, outputs, show_progress="full")
    url_textbox.submit(on_go_click, go_inputs, outputs, show_progress="full")
    click_btn.click(on_click_link, click_inputs, outputs, show_progress="full")
    new_tab_btn.click(on_new_tab, tab_inputs, outputs, show_progress="full")
    close_tab_btn.click(on_close_tab, tab_inputs, outputs)
    tab_selector.input(on_switch_tab, switch_inputs, outputs)

demo.launch()