File size: 12,650 Bytes
3819331
 
 
31d5b37
 
 
3819331
31d5b37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3819331
 
31d5b37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3819331
31d5b37
 
3819331
 
31d5b37
3819331
 
31d5b37
 
3819331
31d5b37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3819331
31d5b37
 
3819331
31d5b37
 
 
 
 
 
 
 
 
3819331
31d5b37
3819331
 
 
31d5b37
 
 
 
 
3819331
 
 
31d5b37
 
 
 
 
3819331
 
31d5b37
 
 
 
3819331
 
31d5b37
 
 
 
3819331
31d5b37
3819331
 
31d5b37
 
3819331
31d5b37
 
 
3819331
 
 
31d5b37
 
3819331
31d5b37
 
 
 
 
 
 
 
 
 
3819331
 
31d5b37
3819331
31d5b37
 
 
 
 
 
 
 
3819331
31d5b37
 
 
 
 
 
 
 
 
3819331
31d5b37
 
3819331
31d5b37
 
 
3819331
 
 
 
31d5b37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3819331
31d5b37
3819331
 
 
 
 
 
 
31d5b37
 
 
3819331
31d5b37
 
3819331
 
 
 
31d5b37
 
 
 
 
 
3819331
31d5b37
 
3819331
31d5b37
3819331
 
 
 
31d5b37
3819331
 
31d5b37
 
 
 
 
3819331
 
31d5b37
3819331
 
 
31d5b37
3819331
31d5b37
3819331
 
31d5b37
 
 
 
3819331
31d5b37
 
 
3819331
31d5b37
 
 
3819331
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# app.py

import gradio as gr
from playwright.sync_api import sync_playwright, Error as PlaywrightError
from bs4 import BeautifulSoup
import urllib.parse
import datetime
import atexit
import re

# --- GLOBAL PLAYWRIGHT SETUP ---
# Launch Playwright and a browser instance once when the app starts.
# This is crucial for performance and state management.
# `p` is pre-bound so the failure path can tell whether the Playwright driver
# was started before the error occurred.
p = None
try:
    p = sync_playwright().start()
    # Using Firefox can sometimes be less prone to bot detection than Chromium.
    # headless=True is essential for running on a server like Hugging Face Spaces.
    browser = p.firefox.launch(headless=True, timeout=60000)
    print("✅ Playwright browser launched successfully.")
except Exception as e:
    print(f"❌ Could not launch Playwright browser: {e}")
    # Stop the driver if it started but the browser launch failed, so no
    # helper process is left behind.
    if p is not None:
        p.stop()
    # Exit with a non-zero status so the hosting platform sees the failure.
    # SystemExit is raised directly instead of calling the `exit()` helper,
    # which is injected by the `site` module and is not always available.
    raise SystemExit(1) from e

# Ensure the browser is closed gracefully when the app exits.
def cleanup():
    """Close the shared browser and stop the Playwright driver at exit.

    Registered with ``atexit`` below. The driver is stopped even when closing
    the browser raises, so a failing ``browser.close()`` cannot leave the
    driver running.
    """
    print("🧹 Cleaning up: Closing Playwright browser...")
    try:
        browser.close()
    finally:
        p.stop()


atexit.register(cleanup)


# --- Core Browser Logic (Powered by Playwright) ---

class Tab:
    """State for one browser tab, wrapping a Playwright Page object."""

    def __init__(self, playwright_page):
        # Placeholder values; they stay in place until a navigation
        # overwrites them.
        self.page = playwright_page
        self.title, self.url = "New Tab", "about:blank"
        self.parsed_text = "Welcome! Navigate to a URL or search to get started."
        self.links = []  # Entries are {'text': str, 'url': str} dicts.

    def close(self):
        """Close the wrapped page; do nothing if it is already closed."""
        page = self.page
        if page.is_closed():
            return
        page.close()

class RealBrowser:
    """Manages multiple tabs and browser-level state.

    Each tab wraps a Playwright page created from the module-level
    ``browser``. Every public method returns a human-readable status string
    intended for the UI log.
    """

    # Navigation settings shared by every page load, history move and reload.
    _WAIT_UNTIL = 'domcontentloaded'
    _NAV_TIMEOUT_MS = 30000

    def __init__(self):
        self.tabs = []
        self.active_tab_index = -1
        self.bookmarks = set()
        self.global_history = []  # (timestamp, url) tuples appended by go()
        self.new_tab()  # Start with one tab

    def _get_active_tab(self):
        """Return the active tab, or None when the index points at no tab."""
        if 0 <= self.active_tab_index < len(self.tabs):
            return self.tabs[self.active_tab_index]
        return None

    @staticmethod
    def _describe_error(error):
        """Turn a Playwright error into a short user-facing message."""
        message = str(error)
        # "net::ERR" is Chromium's wording; Firefox (the engine launched by
        # this app) reports network failures as "NS_ERROR_...".
        if "net::ERR" in message or "NS_ERROR" in message:
            return "Network error: Could not resolve host or connect."
        if "Timeout" in message:
            return "Timeout: The page took too long to load."
        return message

    def _record_failure(self, tab, url, error):
        """Put *tab* into its error state and return the log line for it."""
        error_message = self._describe_error(error)
        tab.title = "Error"
        tab.url = url
        tab.parsed_text = f"❌ Failed to load page.\n\nError: {error_message}"
        tab.links = []
        return f"\n❌ {error_message}"

    def _parse_page(self, tab):
        """Refresh *tab* from whatever its page currently shows.

        Reads the final URL, title and rendered HTML from the page, stores
        the visible text and the absolute http(s) links on the tab, and
        returns the log lines describing the result. No navigation is
        triggered here, so callers can use it after goto, history moves and
        reloads alike.
        """
        # Use the page's own URL so redirects are reflected.
        tab.url = tab.page.url
        tab.title = tab.page.title() or "No Title"

        soup = BeautifulSoup(tab.page.content(), 'lxml')

        # Drop non-content elements before extracting text. Links inside
        # <nav>/<footer> are removed with them, as in the original behavior.
        for element in soup(["script", "style", "nav", "footer"]):
            element.extract()
        tab.parsed_text = soup.get_text(separator='\n', strip=True)

        links = []
        for anchor in soup.find_all('a', href=True):
            try:
                absolute_url = urllib.parse.urljoin(tab.url, anchor['href'])
            except ValueError:
                # A malformed href must not abort parsing of the whole page.
                continue
            # Keeping only http(s) also excludes javascript:, mailto:, etc.
            if absolute_url.lower().startswith(('http://', 'https://')):
                link_text = anchor.get_text(strip=True) or "[No Link Text]"
                links.append({'text': link_text, 'url': absolute_url})
        tab.links = links

        return (
            f"\n✅ Arrived at: {tab.url}"
            f"\n📄 Title: {tab.title}"
            f"\n🔗 Found {len(tab.links)} links."
        )

    def _fetch_and_parse(self, tab, url):
        """Navigate *tab* to *url* with Playwright and parse the result.

        Returns the status log. Playwright errors are caught and recorded on
        the tab instead of propagating.
        """
        log = f"▶️ Navigating to {url}..."
        try:
            tab.page.goto(url, wait_until=self._WAIT_UNTIL, timeout=self._NAV_TIMEOUT_MS)
            log += self._parse_page(tab)
        except PlaywrightError as e:
            log += self._record_failure(tab, url, e)
        return log

    def _traverse_history(self, direction, failure_message):
        """Move the active tab one step "back" or "forward" in its history.

        Returns *failure_message* when there is no active tab or no history
        entry in that direction; otherwise returns the navigation log.
        """
        tab = self._get_active_tab()
        if not tab:
            return failure_message

        page = tab.page
        url_before = page.url
        move = page.go_back if direction == "back" else page.go_forward
        try:
            response = move(wait_until=self._WAIT_UNTIL, timeout=self._NAV_TIMEOUT_MS)
            # Playwright returns None when there is no entry to move to. The
            # URL check avoids mistaking a response-less move for that case.
            if response is None and page.url == url_before:
                return failure_message
            # The history move already loaded the page; parse it in place
            # rather than navigating to the same URL a second time.
            return f"▶️ Navigating to {page.url}..." + self._parse_page(tab)
        except PlaywrightError as e:
            return f"▶️ Navigating {direction}..." + self._record_failure(tab, url_before, e)

    def go(self, term_or_url):
        """Opens a URL or performs a search in the active tab.

        Input with both a scheme and a host is treated as a URL; anything
        else becomes a DuckDuckGo HTML search. Blank or missing input is
        rejected without navigating or touching the history.
        """
        tab = self._get_active_tab()
        if not tab:
            return "No active tab."

        query = (term_or_url or "").strip()
        if not query:
            return "Please enter a URL or search term."

        try:
            parsed_url = urllib.parse.urlparse(query)
            is_url = bool(parsed_url.scheme and parsed_url.netloc)
        except ValueError:
            # urlparse rejects some malformed inputs; search for them instead.
            is_url = False

        if is_url:
            url = query
        else:
            url = f"https://duckduckgo.com/html/?q={urllib.parse.quote_plus(query)}"

        self.global_history.append((datetime.datetime.now(), url))
        return self._fetch_and_parse(tab, url)

    def back(self):
        """Go one step back in the active tab's history."""
        return self._traverse_history("back", "Cannot go back.")

    def forward(self):
        """Go one step forward in the active tab's history."""
        return self._traverse_history("forward", "Cannot go forward.")

    def refresh(self):
        """Reload the active tab's current page and re-parse it."""
        tab = self._get_active_tab()
        if not tab:
            return "No active tab."

        current_url = tab.page.url
        log = f"▶️ Navigating to {current_url}..."
        try:
            tab.page.reload(wait_until=self._WAIT_UNTIL, timeout=self._NAV_TIMEOUT_MS)
            log += self._parse_page(tab)
        except PlaywrightError as e:
            log += self._record_failure(tab, current_url, e)
        return log

    def new_tab(self):
        """Open a new tab, make it active and load the default search page."""
        # The page comes from the shared module-level browser instance.
        page = browser.new_page()
        tab = Tab(page)
        self.tabs.append(tab)
        self.active_tab_index = len(self.tabs) - 1
        return self.go("https://duckduckgo.com/html/?q=news")  # Navigate new tab to a default search

    def close_tab(self):
        """Close the active tab; the last remaining tab is never closed."""
        if len(self.tabs) <= 1:
            return "Cannot close the last tab."

        tab_to_close = self.tabs.pop(self.active_tab_index)
        tab_to_close.close()

        # If the closed tab was the right-most one, fall back to the new last tab.
        if self.active_tab_index >= len(self.tabs):
            self.active_tab_index = len(self.tabs) - 1

        # No need to re-fetch, just update the UI state
        return f"Tab closed. Switched to Tab {self.active_tab_index}."

    def switch_tab(self, tab_label):
        """Activate the tab named by a "Tab <index>: <title>" label.

        Returns a status string. A missing or unparseable label yields
        "Invalid tab format." rather than raising.
        """
        try:
            index = int(tab_label.split(":")[0].replace("Tab", "").strip())
        except (AttributeError, ValueError, IndexError):
            # AttributeError covers a None label (no radio option selected).
            return "Invalid tab format."

        if 0 <= index < len(self.tabs):
            self.active_tab_index = index
            return f"Switched to Tab {index}."
        return "Invalid tab index."

# --- Gradio UI and Event Handlers ---

def update_ui_components(browser_state: RealBrowser):
    """Build the update mapping for the page, URL box, link list and tab selector.

    Returns a dict keyed by the module-level Gradio components. When the
    browser has no active tab, placeholder values are returned instead.
    """
    current = browser_state._get_active_tab()
    if not current:
        return {
            page_content: gr.Markdown("No active tabs. Please create a new one."),
            url_textbox: "",
            links_display: "",
            tab_selector: gr.Radio(choices=[], label="Active Tabs"),
        }

    # One radio label per open tab; the selected label uses the same format
    # so it matches its entry in the choices list.
    labels = []
    for position, open_tab in enumerate(browser_state.tabs):
        labels.append(f"Tab {position}: {open_tab.title[:40]}...")
    selected_label = f"Tab {browser_state.active_tab_index}: {current.title[:40]}..."

    # Numbered Markdown list of the first 25 links on the page.
    heading = "### 🔗 Links on Page\n"
    if current.links:
        rows = [
            f"{number}. [{entry['text'][:80]}]({entry['url']})\n"
            for number, entry in enumerate(current.links[:25])
        ]
        links_markdown = heading + "".join(rows)
    else:
        links_markdown = heading + "_No links found or page failed to load._"

    # Page body: title, URL and the first 2000 characters of extracted text.
    page_markdown = gr.Markdown(f"# {current.title}\n**URL:** {current.url}\n\n---\n\n{current.parsed_text[:2000]}...")
    url_value = gr.Textbox(value=current.url)
    links_value = gr.Markdown(links_markdown)
    tabs_value = gr.Radio(choices=labels, value=selected_label, label="Active Tabs")

    return {
        page_content: page_markdown,
        url_textbox: url_value,
        links_display: links_value,
        tab_selector: tabs_value,
    }

# --- Event Handlers ---
def handle_action(browser_state, action, value=None):
    """Run one browser action and return the refreshed UI update mapping.

    ``action`` selects the operation ("go", "click", "back", "forward",
    "refresh", "new_tab", "close_tab" or "switch_tab"); ``value`` carries
    its argument where one is needed. The returned dict is the output of
    ``update_ui_components`` plus the status log entry.
    """
    # Actions that take no argument and share their name with the
    # RealBrowser method that implements them.
    argless_actions = ("back", "forward", "refresh", "new_tab", "close_tab")

    if action in argless_actions:
        log = getattr(browser_state, action)()
    elif action == "go":
        log = browser_state.go(value)
    elif action == "switch_tab":
        log = browser_state.switch_tab(value)
    elif action == "click":
        current = browser_state._get_active_tab()
        try:
            position = int(value)
            in_range = bool(current) and 0 <= position < len(current.links)
            if in_range:
                log = browser_state.go(current.links[position]['url'])
            else:
                log = "Invalid link number."
        except (ValueError, TypeError):
            log = "Please enter a valid number to click."
    else:
        log = "Unknown action."

    # After any action, rebuild the whole UI from the new state and attach the log.
    ui_updates = update_ui_components(browser_state)
    ui_updates[log_display] = gr.Textbox(log)
    return ui_updates

# --- Gradio Interface Layout ---

# Builds the two-column UI: navigation controls, page text and status log on
# the left; tab management and the numbered link list on the right. Every
# event is routed through handle_action, which returns updates for all outputs.
with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
    # The gr.State holds our Python class instance, persisting it across calls.
    # NOTE(review): RealBrowser() is constructed here, at import time, and its
    # __init__ opens a tab and navigates to the default search page. Confirm
    # how Gradio copies or shares this initial value between user sessions.
    browser_state = gr.State(RealBrowser())

    gr.Markdown("# 🌐 Real Browser Demo (Powered by Playwright)")
    gr.Markdown("Type a URL or search term. This demo runs a real headless browser on the server to fetch and parse live websites.")

    with gr.Row():
        # Left column: navigation controls, page text and status log.
        with gr.Column(scale=3):
            with gr.Row():
                back_btn = gr.Button("◀ Back")
                forward_btn = gr.Button("▶ Forward")
                refresh_btn = gr.Button("🔄 Refresh")
            
            url_textbox = gr.Textbox(label="URL or Search Term", placeholder="https://news.ycombinator.com or 'best python libraries'", interactive=True)
            go_btn = gr.Button("Go", variant="primary")
            
            with gr.Accordion("Page Content (Text Only)", open=True):
                page_content = gr.Markdown("Loading...")
            
            log_display = gr.Textbox(label="Status Log", interactive=False)

        # Right column: tab management and the numbered link list.
        with gr.Column(scale=1):
            with gr.Row():
                new_tab_btn = gr.Button("➕ New Tab")
                close_tab_btn = gr.Button("❌ Close Tab")
            tab_selector = gr.Radio(choices=[], label="Active Tabs", interactive=True)
            
            with gr.Accordion("Clickable Links", open=True):
                links_display = gr.Markdown("...")
                with gr.Row():
                    # The number entered here is used as an index into the active tab's links.
                    click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1)
                    click_btn = gr.Button("Click Link", scale=2)

    # --- Component Wiring ---
    # Every handler returns a dict keyed by these components.
    all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]

    # Initial load
    # Populate the UI from the current browser state when the page loads.
    demo.load(
        lambda state: {**update_ui_components(state), log_display: "🚀 Browser Initialized! Ready to navigate."},
        inputs=[browser_state],
        outputs=all_outputs
    )

    # Event listeners
    # Each lambda forwards the state (and an input value where needed) to
    # handle_action together with the action name.
    go_btn.click(lambda s, v: handle_action(s, "go", v), [browser_state, url_textbox], all_outputs, show_progress="full")
    url_textbox.submit(lambda s, v: handle_action(s, "go", v), [browser_state, url_textbox], all_outputs, show_progress="full")
    click_btn.click(lambda s, v: handle_action(s, "click", v), [browser_state, click_num_box], all_outputs, show_progress="full")
    
    back_btn.click(lambda s: handle_action(s, "back"), [browser_state], all_outputs, show_progress="full")
    forward_btn.click(lambda s: handle_action(s, "forward"), [browser_state], all_outputs, show_progress="full")
    refresh_btn.click(lambda s: handle_action(s, "refresh"), [browser_state], all_outputs, show_progress="full")
    
    new_tab_btn.click(lambda s: handle_action(s, "new_tab"), [browser_state], all_outputs, show_progress="full")
    close_tab_btn.click(lambda s: handle_action(s, "close_tab"), [browser_state], all_outputs)
    tab_selector.input(lambda s, v: handle_action(s, "switch_tab", v), [browser_state, tab_selector], all_outputs)


# Start the Gradio server. This runs on import as well, since there is no
# __main__ guard.
demo.launch()