# Hugging Face Space status: Running
# app.py | |
import gradio as gr | |
from playwright.sync_api import sync_playwright, Error as PlaywrightError | |
from bs4 import BeautifulSoup | |
import urllib.parse | |
import datetime | |
import atexit | |
import re | |
# --- GLOBAL PLAYWRIGHT SETUP ---
# Launch Playwright and a single browser instance once, when the app starts.
# Every tab created later is a page inside this one browser process.
# NOTE(review): Playwright's sync API is documented as not thread-safe; confirm
# that the Gradio event handlers use these objects from a compatible thread.
p = None
try:
    p = sync_playwright().start()
    # Using Firefox can sometimes be less prone to bot detection than Chromium.
    # headless=True is essential for running on a server like Hugging Face Spaces.
    browser = p.firefox.launch(headless=True, timeout=60000)
    print("✅ Playwright browser launched successfully.")
except Exception as e:
    print(f"❌ Could not launch Playwright browser: {e}")
    # If the driver started but the browser did not, stop the driver so it is
    # not left running after we bail out.
    if p is not None:
        p.stop()
    # Exit with a non-zero status so the failure is visible to the host; the
    # site-provided exit() helper is not guaranteed to exist and reports success.
    raise SystemExit(1)
# Ensure the browser is closed gracefully when the app exits.
def cleanup():
    """Close the shared browser and stop the Playwright driver at interpreter exit.

    The driver is stopped even when closing the browser raises, so a failing
    ``browser.close()`` cannot leave the driver process behind.
    """
    print("🧹 Cleaning up: Closing Playwright browser...")
    try:
        browser.close()
    finally:
        p.stop()


atexit.register(cleanup)
# --- Core Browser Logic (Powered by Playwright) --- | |
class Tab:
    """One browser tab: a Playwright page plus the last parsed snapshot of it."""

    def __init__(self, playwright_page):
        """Wrap ``playwright_page`` and start with placeholder content."""
        # Snapshot fields, overwritten after each navigation.
        self.links = []  # list of {'text': str, 'url': str}
        self.parsed_text = "Welcome! Navigate to a URL or search to get started."
        self.url = "about:blank"
        self.title = "New Tab"
        # The live Playwright page object backing this tab.
        self.page = playwright_page

    def close(self):
        """Close the underlying page unless it has already been closed."""
        if self.page.is_closed():
            return
        self.page.close()
class RealBrowser:
    """Manages multiple tabs and browser-level state.

    Each tab wraps a Playwright page created from the module-level ``browser``.
    Navigation methods return a human-readable log string for the status box
    and update the active tab's ``title``, ``url``, ``parsed_text`` and
    ``links`` in place.
    """

    # Per-navigation timeout, in milliseconds.
    NAV_TIMEOUT_MS = 30000

    def __init__(self):
        self.tabs = []
        self.active_tab_index = -1
        self.bookmarks = set()
        self.global_history = []  # list of (datetime, url) in visit order
        self.new_tab()  # Start with one tab

    def _get_active_tab(self):
        """Return the active Tab, or None when the index is out of range."""
        if not 0 <= self.active_tab_index < len(self.tabs):
            return None
        return self.tabs[self.active_tab_index]

    @staticmethod
    def _resolve_target(term):
        """Return ``term`` itself if it is an http(s) URL, else a search URL.

        Only http and https are navigated directly, so inputs such as
        ``file://localhost/...`` become search queries instead of being opened
        by the server-side browser.
        """
        try:
            parsed = urllib.parse.urlparse(term)
            is_web_url = parsed.scheme in ("http", "https") and bool(parsed.netloc)
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. a broken IPv6 host).
            is_web_url = False
        if is_web_url:
            return term
        return f"https://duckduckgo.com/html/?q={urllib.parse.quote_plus(term)}"

    @staticmethod
    def _describe_error(error):
        """Map a navigation exception to a short user-facing message."""
        message = str(error)
        # Chromium reports "net::ERR_*"; Firefox (the browser launched by this
        # app) reports "NS_ERROR_*".
        if "net::ERR" in message or "NS_ERROR" in message:
            return "Network error: Could not resolve host or connect."
        if "Timeout" in message:
            return "Timeout: The page took too long to load."
        return message

    def _parse_loaded_page(self, tab):
        """Refresh ``tab`` from the page it currently shows; return log lines.

        Does not navigate: it reads the page's current URL, title and rendered
        HTML, then extracts the visible text and the absolute http(s) links.
        """
        # Use the final URL, i.e. after any redirects.
        tab.url = tab.page.url
        tab.title = tab.page.title() or "No Title"

        soup = BeautifulSoup(tab.page.content(), 'lxml')
        # Drop non-content elements before extracting text and links.
        for element in soup(["script", "style", "nav", "footer"]):
            element.extract()
        tab.parsed_text = soup.get_text(separator='\n', strip=True)

        tab.links = []
        for anchor in soup.find_all('a', href=True):
            absolute_url = urllib.parse.urljoin(tab.url, anchor['href'])
            # Keep only web links; this also drops javascript: and mailto:.
            if absolute_url.startswith(('http://', 'https://')):
                link_text = anchor.get_text(strip=True) or "[No Link Text]"
                tab.links.append({'text': link_text, 'url': absolute_url})

        return (
            f"\n✅ Arrived at: {tab.url}"
            f"\n📄 Title: {tab.title}"
            f"\n🔗 Found {len(tab.links)} links."
        )

    def _record_failure(self, tab, url, error):
        """Put ``tab`` into its error state for ``url``; return the log line."""
        error_message = self._describe_error(error)
        tab.title = "Error"
        tab.url = url
        tab.parsed_text = f"❌ Failed to load page.\n\nError: {error_message}"
        tab.links = []
        return f"\n❌ {error_message}"

    def _fetch_and_parse(self, tab, url):
        """Navigate ``tab`` to ``url`` with Playwright and parse the result.

        Returns the status log. Playwright errors are reported in the log and
        in the tab's content rather than raised.
        """
        log = f"▶️ Navigating to {url}..."
        try:
            # 'domcontentloaded' is a good balance of speed and completeness.
            tab.page.goto(url, wait_until='domcontentloaded', timeout=self.NAV_TIMEOUT_MS)
            log += self._parse_loaded_page(tab)
        except PlaywrightError as e:
            log += self._record_failure(tab, url, e)
        return log

    def _history_step(self, tab, step, direction, unavailable_message):
        """Run ``step`` (page.go_back / page.go_forward) and re-parse the page.

        The page is parsed in place instead of being re-navigated with goto(),
        which would push a new history entry and discard the forward history.
        """
        url_before = tab.page.url
        log = f"▶️ Navigating {direction}..."
        try:
            response = step(wait_until='domcontentloaded', timeout=self.NAV_TIMEOUT_MS)
            # Playwright returns None when there is no history entry to move
            # to; the unchanged URL confirms that nothing happened.
            if response is None and tab.page.url == url_before:
                return unavailable_message
            log += self._parse_loaded_page(tab)
        except PlaywrightError as e:
            log += self._record_failure(tab, url_before, e)
        return log

    def go(self, term_or_url):
        """Opens a URL or performs a search in the active tab."""
        tab = self._get_active_tab()
        if not tab:
            return "No active tab."
        query = str(term_or_url or "").strip()
        if not query:
            return "Please enter a URL or search term."
        url = self._resolve_target(query)
        self.global_history.append((datetime.datetime.now(), url))
        return self._fetch_and_parse(tab, url)

    def back(self):
        """Move the active tab one step back in its history."""
        tab = self._get_active_tab()
        if not tab:
            return "Cannot go back."
        return self._history_step(tab, tab.page.go_back, "back", "Cannot go back.")

    def forward(self):
        """Move the active tab one step forward in its history."""
        tab = self._get_active_tab()
        if not tab:
            return "Cannot go forward."
        return self._history_step(tab, tab.page.go_forward, "forward", "Cannot go forward.")

    def refresh(self):
        """Reload the active tab's current page and re-parse it."""
        tab = self._get_active_tab()
        if not tab:
            return "No active tab."
        current_url = tab.page.url
        log = f"▶️ Reloading {current_url}..."
        try:
            tab.page.reload(wait_until='domcontentloaded', timeout=self.NAV_TIMEOUT_MS)
            log += self._parse_loaded_page(tab)
        except PlaywrightError as e:
            log += self._record_failure(tab, current_url, e)
        return log

    def new_tab(self):
        """Open a new tab, make it active, and load the default search page."""
        # Create a new page in the shared module-level browser.
        page = browser.new_page()
        self.tabs.append(Tab(page))
        self.active_tab_index = len(self.tabs) - 1
        return self.go("https://duckduckgo.com/html/?q=news")

    def close_tab(self):
        """Close the active tab; the last remaining tab cannot be closed."""
        if len(self.tabs) <= 1:
            return "Cannot close the last tab."
        tab_to_close = self.tabs.pop(self.active_tab_index)
        tab_to_close.close()
        if self.active_tab_index >= len(self.tabs):
            self.active_tab_index = len(self.tabs) - 1
        # No need to re-fetch, just update the UI state
        return f"Tab closed. Switched to Tab {self.active_tab_index}."

    def switch_tab(self, tab_label):
        """Activate the tab named by a ``"Tab N: title"`` label from the UI."""
        try:
            index = int(tab_label.split(":")[0].replace("Tab", "").strip())
        except (ValueError, IndexError, AttributeError):
            # AttributeError covers a non-string label, e.g. None from an
            # empty selection.
            return "Invalid tab format."
        if 0 <= index < len(self.tabs):
            self.active_tab_index = index
            return f"Switched to Tab {index}."
        return "Invalid tab index."
# --- Gradio UI and Event Handlers --- | |
def update_ui_components(browser_state: RealBrowser): | |
"""Generates all UI component values from the browser state.""" | |
active_tab = browser_state._get_active_tab() | |
if not active_tab: | |
return { | |
page_content: gr.Markdown("No active tabs. Please create a new one."), | |
url_textbox: "", | |
links_display: "", | |
tab_selector: gr.Radio(choices=[], label="Active Tabs"), | |
} | |
# Tab Selector | |
tab_choices = [f"Tab {i}: {tab.title[:40]}..." for i, tab in enumerate(browser_state.tabs)] | |
active_tab_label = f"Tab {browser_state.active_tab_index}: {active_tab.title[:40]}..." | |
# Links Display | |
links_md = "### 🔗 Links on Page\n" | |
if active_tab.links: | |
for i, link in enumerate(active_tab.links[:25]): # Show first 25 links | |
links_md += f"{i}. [{link['text'][:80]}]({link['url']})\n" | |
else: | |
links_md += "_No links found or page failed to load._" | |
return { | |
page_content: gr.Markdown(f"# {active_tab.title}\n**URL:** {active_tab.url}\n\n---\n\n{active_tab.parsed_text[:2000]}..."), | |
url_textbox: gr.Textbox(value=active_tab.url), | |
links_display: gr.Markdown(links_md), | |
tab_selector: gr.Radio(choices=tab_choices, value=active_tab_label, label="Active Tabs"), | |
} | |
# --- Event Handlers --- | |
def handle_action(browser_state, action, value=None):
    """Run one named browser action and return the refreshed UI values.

    ``action`` selects the operation ("go", "click", "back", "forward",
    "refresh", "new_tab", "close_tab", "switch_tab"); ``value`` carries its
    argument where one is needed. The result is the dict produced by
    ``update_ui_components`` plus the status log entry.
    """
    # Actions whose name matches a zero-argument browser method.
    argless_actions = ("back", "forward", "refresh", "new_tab", "close_tab")

    if action in argless_actions:
        log = getattr(browser_state, action)()
    elif action == "go":
        log = browser_state.go(value)
    elif action == "switch_tab":
        log = browser_state.switch_tab(value)
    elif action == "click":
        current = browser_state._get_active_tab()
        try:
            position = int(value)
            in_range = current and 0 <= position < len(current.links)
            if in_range:
                log = browser_state.go(current.links[position]['url'])
            else:
                log = "Invalid link number."
        except (ValueError, TypeError):
            log = "Please enter a valid number to click."
    else:
        log = "Unknown action."

    # After any action, rebuild the whole UI from the new state.
    updates = dict(update_ui_components(browser_state))
    updates[log_display] = gr.Textbox(log)
    return updates
# --- Gradio Interface Layout ---
with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
    # The gr.State holds our Python class instance, persisting it across calls.
    # NOTE(review): Gradio documents that the initial State value is deep-copied;
    # confirm a RealBrowser holding Playwright pages behaves as intended here.
    browser_state = gr.State(RealBrowser())

    gr.Markdown("# 🌐 Real Browser Demo (Powered by Playwright)")
    gr.Markdown("Type a URL or search term. This demo runs a real headless browser on the server to fetch and parse live websites.")

    with gr.Row():
        # Left column: navigation controls, page text and status log.
        with gr.Column(scale=3):
            with gr.Row():
                back_btn = gr.Button("◀ Back")
                forward_btn = gr.Button("▶ Forward")
                refresh_btn = gr.Button("🔄 Refresh")
                url_textbox = gr.Textbox(label="URL or Search Term", placeholder="https://news.ycombinator.com or 'best python libraries'", interactive=True)
                go_btn = gr.Button("Go", variant="primary")
            with gr.Accordion("Page Content (Text Only)", open=True):
                page_content = gr.Markdown("Loading...")
            log_display = gr.Textbox(label="Status Log", interactive=False)
        # Right column: tab management and the numbered link list.
        with gr.Column(scale=1):
            with gr.Row():
                new_tab_btn = gr.Button("➕ New Tab")
                close_tab_btn = gr.Button("❌ Close Tab")
            tab_selector = gr.Radio(choices=[], label="Active Tabs", interactive=True)
            with gr.Accordion("Clickable Links", open=True):
                links_display = gr.Markdown("...")
                with gr.Row():
                    click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1)
                    click_btn = gr.Button("Click Link", scale=2)

    # --- Component Wiring ---
    all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]

    def _initial_render(state):
        """Fill every output once when the page first loads."""
        return {**update_ui_components(state), log_display: "🚀 Browser Initialized! Ready to navigate."}

    def _with_value(action):
        """Build a handler for an action that takes the state and one input value."""
        return lambda s, v: handle_action(s, action, v)

    def _state_only(action):
        """Build a handler for an action that needs only the browser state."""
        return lambda s: handle_action(s, action)

    # Initial load
    demo.load(_initial_render, inputs=[browser_state], outputs=all_outputs)

    # Event listeners: navigation from the URL box and the link number box.
    go_btn.click(_with_value("go"), [browser_state, url_textbox], all_outputs, show_progress="full")
    url_textbox.submit(_with_value("go"), [browser_state, url_textbox], all_outputs, show_progress="full")
    click_btn.click(_with_value("click"), [browser_state, click_num_box], all_outputs, show_progress="full")

    # Buttons that act on the active tab (or open a new one) with a progress indicator.
    for _button, _action in (
        (back_btn, "back"),
        (forward_btn, "forward"),
        (refresh_btn, "refresh"),
        (new_tab_btn, "new_tab"),
    ):
        _button.click(_state_only(_action), [browser_state], all_outputs, show_progress="full")

    # Tab bookkeeping does no page load, so it uses the default progress setting.
    close_tab_btn.click(_state_only("close_tab"), [browser_state], all_outputs)
    tab_selector.input(_with_value("switch_tab"), [browser_state, tab_selector], all_outputs)

demo.launch()