broadfield-dev committed on
Commit
1bb1ac8
·
verified ·
1 Parent(s): d164b37

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +145 -59
app.py CHANGED
@@ -1,23 +1,31 @@
1
  import os
2
  os.system("playwright install")
3
- # app.py (Final, Working Async Version with All Bugs Fixed)
4
 
5
  import gradio as gr
6
- from playwright.async_api import async_playwright, Error as PlaywrightError
7
- from bs4 import BeautifulSoup
8
  import urllib.parse
9
  import os
10
  from itertools import cycle
11
  import uuid
12
 
13
  # --- 1. GLOBAL RESOURCES & STATE ---
14
- # Initialized on the first request to be compatible with Spaces.
15
  P = None
16
  BROWSER = None
17
  REVOLVER = None
18
  LIVE_CONTEXTS = {}
19
  APP_STARTED = False
20
 
 
 
 
 
 
 
 
 
 
21
  # --- 2. PLAIN DATA STATE CLASSES ---
22
  class TabState:
23
  def __init__(self, tab_id, proxy_used="Direct Connection"):
@@ -44,16 +52,81 @@ class CredentialRevolver:
44
  def get_next(self): return next(self.proxy_cycler) if self.proxy_cycler else None
45
  def count(self): return len(self.proxies) if self.proxies else 0
46
 
47
- # --- 3. CORE ASYNC LOGIC ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
 
49
  log = f"▶️ Navigating to {url}..."; live_page = LIVE_CONTEXTS[tab_state.id]["page"]
50
  try:
51
  await live_page.goto(url, wait_until='domcontentloaded', timeout=30000)
52
  tab_state.url = live_page.url; tab_state.title = await live_page.title() or "No Title"
53
  log += f"\n✅ Arrived at: {tab_state.url}"
54
- html_content = await live_page.content(); soup = BeautifulSoup(html_content, 'lxml')
55
- for el in soup(["script", "style", "nav", "footer", "aside"]): el.extract()
56
- tab_state.parsed_text = soup.get_text(separator='\n', strip=True)
 
 
 
 
57
  tab_state.links = [{'text': link.get_text(strip=True) or "[No Link Text]", 'url': urllib.parse.urljoin(tab_state.url, link['href'])} for link in soup.find_all('a', href=True) if link.get('href', '').startswith('http')]
58
  log += f"\n🔗 Found {len(tab_state.links)} links."
59
  except Exception as e:
@@ -62,19 +135,28 @@ async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
62
  tab_state.links = []; log += f"\n❌ {error_message}"
63
  return log
64
 
65
- async def handle_action(browser_state: BrowserState, action: str, value=None):
 
66
  log = ""; active_tab_state = browser_state.get_active_tab()
67
- if action == "new_tab":
 
 
 
 
 
 
 
 
 
 
 
68
  tab_id, proxy_config = str(uuid.uuid4()), REVOLVER.get_next()
69
  context = await BROWSER.new_context(proxy=proxy_config)
70
  page = await context.new_page()
71
  LIVE_CONTEXTS[tab_id] = {"context": context, "page": page}
72
  new_tab = TabState(tab_id, proxy_used=proxy_config['server'].split('@')[-1] if proxy_config else "Direct")
73
  browser_state.tabs.append(new_tab); browser_state.active_tab_id = tab_id
74
- log = await _fetch_and_update_tab_state(new_tab, "https://www.whatsmyip.org/")
75
- elif action == "go" and active_tab_state and value:
76
- url = value if (urllib.parse.urlparse(value).scheme and urllib.parse.urlparse(value).netloc) else f"https://duckduckgo.com/html/?q={urllib.parse.quote_plus(value)}"
77
- log = await _fetch_and_update_tab_state(active_tab_state, url)
78
  elif action == "click" and active_tab_state and value is not None:
79
  try:
80
  link_index = int(value)
@@ -92,37 +174,37 @@ async def handle_action(browser_state: BrowserState, action: str, value=None):
92
  browser_state.active_tab_id = value; log = f"Switched to tab."
93
  return browser_state, log
94
 
95
- def update_ui_components(browser_state: BrowserState):
96
- active_tab = browser_state.get_active_tab()
97
- if not active_tab: return {page_content: gr.Markdown("No active tabs."), url_textbox: "", links_display: "", tab_selector: gr.Radio(choices=[])}
98
- tab_choices = [(f"Tab {i}: {t.title[:25]}... ({t.proxy_used})", t.id) for i, t in enumerate(browser_state.tabs)]
99
- links_md = "### 🔗 Links on Page\n" + ('\n'.join(f"{i}. [{link['text'][:80]}]({link['url']})" for i, link in enumerate(active_tab.links[:25])) if active_tab.links else "_No links found._")
100
- return {
101
- page_content: gr.Markdown(f"# {active_tab.title}\n**URL:** {active_tab.url}\n\n---\n\n{active_tab.parsed_text[:2000]}..."),
102
- url_textbox: gr.Textbox(value=active_tab.url), links_display: gr.Markdown(links_md),
103
- tab_selector: gr.Radio(choices=tab_choices, value=active_tab.id, label="Active Tabs"),
104
- }
105
-
106
  # --- 4. GRADIO UI AND EVENT HANDLING ---
107
  with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
108
  browser_state = gr.State(BrowserState())
109
- gr.Markdown("# 🛰️ Real Browser Demo (Final Working Version)")
110
 
111
  with gr.Row():
112
- with gr.Column(scale=3):
113
- url_textbox = gr.Textbox(label="URL or Search Term", interactive=True); go_btn = gr.Button("Go", variant="primary")
114
- with gr.Accordion("Page Content (Text Only)", open=True): page_content = gr.Markdown("Loading...")
115
- log_display = gr.Textbox(label="Status Log", interactive=False)
116
- with gr.Column(scale=1):
117
- with gr.Row(): new_tab_btn = gr.Button("➕ New Tab"); close_tab_btn = gr.Button("❌ Close Tab")
 
 
 
 
 
 
 
118
  tab_selector = gr.Radio(choices=[], label="Active Tabs", interactive=True)
119
  with gr.Accordion("Clickable Links", open=True):
120
  links_display = gr.Markdown("...");
121
- with gr.Row(): click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1); click_btn = gr.Button("Click Link", scale=2)
 
 
122
 
 
 
123
  all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]
124
 
125
- async def master_handler(current_state, action, value=None):
126
  global APP_STARTED, P, BROWSER, REVOLVER
127
  if not APP_STARTED:
128
  print("🚀 First request received, starting up Playwright...");
@@ -130,8 +212,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
130
  proxy_list_str = os.getenv("PROXY_LIST", ""); REVOLVER = CredentialRevolver(proxy_list_str)
131
  print(f"✅ Playwright started. {REVOLVER.count()} proxies loaded."); APP_STARTED = True
132
 
133
- new_state, log = await handle_action(current_state, action, value)
134
- ui_updates = update_ui_components(new_state)
135
 
136
  return (
137
  new_state,
@@ -142,29 +224,33 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
142
  log
143
  )
144
 
145
- # ** THE DEFINITIVE FIX FOR GRADIO'S ASYNC RETURN VALUE ERROR **
146
- # Each event listener is its own async function that awaits the handler.
147
- # This ensures Gradio correctly receives the unpacked tuple of return values.
148
-
149
- async def on_load(state):
150
- return await master_handler(state, "new_tab", None)
151
- async def on_go_click(state, value):
152
- return await master_handler(state, "go", value)
153
- async def on_click_link(state, value):
154
- return await master_handler(state, "click", value)
155
- async def on_new_tab(state):
156
- return await master_handler(state, "new_tab", None)
157
- async def on_close_tab(state):
158
- return await master_handler(state, "close_tab", None)
159
- async def on_switch_tab(state, value):
160
- return await master_handler(state, "switch_tab", value)
161
 
162
- demo.load(on_load, inputs=[browser_state], outputs=[browser_state, *all_outputs])
163
- go_btn.click(on_go_click, [browser_state, url_textbox], [browser_state, *all_outputs], show_progress="full")
164
- url_textbox.submit(on_go_click, [browser_state, url_textbox], [browser_state, *all_outputs], show_progress="full")
165
- click_btn.click(on_click_link, [browser_state, click_num_box], [browser_state, *all_outputs], show_progress="full")
166
- new_tab_btn.click(on_new_tab, [browser_state], [browser_state, *all_outputs], show_progress="full")
167
- close_tab_btn.click(on_close_tab, [browser_state], [browser_state, *all_outputs])
168
- tab_selector.input(on_switch_tab, [browser_state, tab_selector], [browser_state, *all_outputs])
 
 
 
 
 
 
 
169
 
170
  demo.launch()
 
1
  import os
2
  os.system("playwright install")
3
+ # app.py (Final Version with Search Engines and Improved Formatting)
4
 
5
  import gradio as gr
6
+ from playwright.async_api import async_playwright
7
+ from bs4 import BeautifulSoup, NavigableString
8
  import urllib.parse
9
  import os
10
  from itertools import cycle
11
  import uuid
12
 
13
  # --- 1. GLOBAL RESOURCES & STATE ---
 
14
  P = None
15
  BROWSER = None
16
  REVOLVER = None
17
  LIVE_CONTEXTS = {}
18
  APP_STARTED = False
19
 
20
+ # Search-URL templates keyed by the engine name shown in the UI selector; "{query}" is filled with the URL-encoded search text, and "DuckDuckGo" is the fallback key used by handle_action.
21
+ SEARCH_ENGINES = {
22
+ "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
23
+ "Google": "https://www.google.com/search?q={query}",
24
+ "Bing": "https://www.bing.com/search?q={query}",
25
+ "Brave": "https://search.brave.com/search?q={query}",
26
+ "Ecosia": "https://www.ecosia.org/search?q={query}"
27
+ }
28
+
29
  # --- 2. PLAIN DATA STATE CLASSES ---
30
  class TabState:
31
  def __init__(self, tab_id, proxy_used="Direct Connection"):
 
52
  def get_next(self): return next(self.proxy_cycler) if self.proxy_cycler else None
53
  def count(self): return len(self.proxies) if self.proxies else 0
54
 
55
+ # --- 3. CORE ASYNC LOGIC with NEW FORMATTER ---
56
+
57
+ def _format_html_to_markdown(soup):
58
+ """
59
+ NEW: Intelligently converts a BeautifulSoup object to formatted Markdown.
60
+ """
61
+ text_parts = []
62
+
63
+ # Use a more specific main content area if available
64
+ content_node = soup.find('main') or soup.find('body')
65
+ if not content_node:
66
+ return "Could not find main body content."
67
+
68
+ for element in content_node.find_all(recursive=False):
69
+ # Ignore common non-content sections
70
+ if element.name in ['nav', 'footer', 'header', 'aside', 'form', 'script', 'style']:
71
+ continue
72
+ text_parts.append(_process_element_to_markdown(element))
73
+
74
+ return '\n'.join(filter(None, text_parts))
75
+
76
+ def _process_element_to_markdown(element):
77
+ """NEW: Recursive helper to process each element into Markdown."""
78
+ if isinstance(element, NavigableString):
79
+ return element.strip()
80
+
81
+ if element.name is None:
82
+ return ''
83
+
84
+ # Get the inner text by recursively processing children
85
+ inner_text = ''.join(_process_element_to_markdown(child) for child in element.children)
86
+
87
+ # Apply formatting based on tag name
88
+ if element.name in ['p', 'div', 'article', 'section']:
89
+ return f"\n{inner_text.strip()}\n"
90
+ if element.name == 'h1':
91
+ return f"\n# {inner_text.strip()}\n"
92
+ if element.name == 'h2':
93
+ return f"\n## {inner_text.strip()}\n"
94
+ if element.name == 'h3':
95
+ return f"\n### {inner_text.strip()}\n"
96
+ if element.name in ['h4', 'h5', 'h6']:
97
+ return f"\n#### {inner_text.strip()}\n"
98
+ if element.name == 'li':
99
+ return f"* {inner_text.strip()}\n"
100
+ if element.name in ['ul', 'ol']:
101
+ return f"\n{inner_text}\n"
102
+ if element.name in ['strong', 'b']:
103
+ return f"**{inner_text.strip()}**"
104
+ if element.name in ['em', 'i']:
105
+ return f"*{inner_text.strip()}*"
106
+ if element.name in ['pre', 'code']:
107
+ return f"\n```\n{inner_text.strip()}\n```\n"
108
+ if element.name == 'a':
109
+ href = element.get('href', '')
110
+ return f"[{inner_text.strip()}]({href})"
111
+ if element.name == 'hr':
112
+ return "\n---\n"
113
+
114
+ return inner_text # For other tags like span, etc.
115
+
116
  async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
117
+ """Uses Playwright to navigate and the new formatter to parse."""
118
  log = f"▶️ Navigating to {url}..."; live_page = LIVE_CONTEXTS[tab_state.id]["page"]
119
  try:
120
  await live_page.goto(url, wait_until='domcontentloaded', timeout=30000)
121
  tab_state.url = live_page.url; tab_state.title = await live_page.title() or "No Title"
122
  log += f"\n✅ Arrived at: {tab_state.url}"
123
+
124
+ html_content = await live_page.content()
125
+ soup = BeautifulSoup(html_content, 'lxml')
126
+
127
+ # Use the new formatter
128
+ tab_state.parsed_text = _format_html_to_markdown(soup)
129
+
130
  tab_state.links = [{'text': link.get_text(strip=True) or "[No Link Text]", 'url': urllib.parse.urljoin(tab_state.url, link['href'])} for link in soup.find_all('a', href=True) if link.get('href', '').startswith('http')]
131
  log += f"\n🔗 Found {len(tab_state.links)} links."
132
  except Exception as e:
 
135
  tab_state.links = []; log += f"\n❌ {error_message}"
136
  return log
137
 
138
+ async def handle_action(browser_state: BrowserState, search_engine: str, action: str, value=None):
139
+ """Main logic handler, now with search engine selection."""
140
  log = ""; active_tab_state = browser_state.get_active_tab()
141
+
142
+ if action == "go" and active_tab_state and value:
143
+ is_url = urllib.parse.urlparse(value).scheme and urllib.parse.urlparse(value).netloc
144
+ if is_url:
145
+ url = value
146
+ else: # It's a search query
147
+ base_url = SEARCH_ENGINES.get(search_engine, SEARCH_ENGINES["DuckDuckGo"])
148
+ url = base_url.format(query=urllib.parse.quote_plus(value))
149
+ log = await _fetch_and_update_tab_state(active_tab_state, url)
150
+
151
+ # --- Other actions remain the same ---
152
+ elif action == "new_tab":
153
  tab_id, proxy_config = str(uuid.uuid4()), REVOLVER.get_next()
154
  context = await BROWSER.new_context(proxy=proxy_config)
155
  page = await context.new_page()
156
  LIVE_CONTEXTS[tab_id] = {"context": context, "page": page}
157
  new_tab = TabState(tab_id, proxy_used=proxy_config['server'].split('@')[-1] if proxy_config else "Direct")
158
  browser_state.tabs.append(new_tab); browser_state.active_tab_id = tab_id
159
+ log = await _fetch_and_update_tab_state(new_tab, "https://www.startpage.com/") # New default page
 
 
 
160
  elif action == "click" and active_tab_state and value is not None:
161
  try:
162
  link_index = int(value)
 
174
  browser_state.active_tab_id = value; log = f"Switched to tab."
175
  return browser_state, log
176
 
 
 
 
 
 
 
 
 
 
 
 
177
  # --- 4. GRADIO UI AND EVENT HANDLING ---
178
  with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
179
  browser_state = gr.State(BrowserState())
180
+ gr.Markdown("# 🛰️ Real Browser Demo v2")
181
 
182
  with gr.Row():
183
+ with gr.Column(scale=4):
184
+ with gr.Row():
185
+ url_textbox = gr.Textbox(label="Enter URL or Search Query", interactive=True, scale=4)
186
+ go_btn = gr.Button("Go", variant="primary", scale=1)
187
+ with gr.Accordion("Page Content (Formatted)", open=True):
188
+ page_content = gr.Markdown("Loading...")
189
+ with gr.Column(scale=2):
190
+ # NEW: Search Engine Selector
191
+ search_engine_selector = gr.Radio(
192
+ list(SEARCH_ENGINES.keys()), value="DuckDuckGo", label="Search Engine"
193
+ )
194
+ with gr.Row():
195
+ new_tab_btn = gr.Button("➕ New Tab"); close_tab_btn = gr.Button("❌ Close Tab")
196
  tab_selector = gr.Radio(choices=[], label="Active Tabs", interactive=True)
197
  with gr.Accordion("Clickable Links", open=True):
198
  links_display = gr.Markdown("...");
199
+ with gr.Row():
200
+ click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1)
201
+ click_btn = gr.Button("Click Link", scale=2)
202
 
203
+ log_display = gr.Textbox(label="Status Log", interactive=False)
204
+
205
  all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]
206
 
207
+ async def master_handler(current_state, search_engine, action, value=None):
208
  global APP_STARTED, P, BROWSER, REVOLVER
209
  if not APP_STARTED:
210
  print("🚀 First request received, starting up Playwright...");
 
212
  proxy_list_str = os.getenv("PROXY_LIST", ""); REVOLVER = CredentialRevolver(proxy_list_str)
213
  print(f"✅ Playwright started. {REVOLVER.count()} proxies loaded."); APP_STARTED = True
214
 
215
+ new_state, log = await handle_action(current_state, search_engine, action, value)
216
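+ ui_updates = update_ui_components(current_state) # NOTE(review): this commit deletes update_ui_components (old lines 95-104) but still calls it here; unless it is defined elsewhere, this raises NameError on every event - confirm.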
217
 
218
  return (
219
  new_state,
 
224
  log
225
  )
226
 
227
+ # One thin async wrapper per UI event: each forwards the state and the selected search engine to master_handler with a fixed action name, plus the event's value where it has one (None otherwise).
228
+ async def on_load(state, search_engine):
229
+ return await master_handler(state, search_engine, "new_tab", None)
230
+ async def on_go_click(state, search_engine, value):
231
+ return await master_handler(state, search_engine, "go", value)
232
+ async def on_click_link(state, search_engine, value):
233
+ return await master_handler(state, search_engine, "click", value)
234
+ async def on_new_tab(state, search_engine):
235
+ return await master_handler(state, search_engine, "new_tab", None)
236
+ async def on_close_tab(state, search_engine):
237
+ return await master_handler(state, search_engine, "close_tab", None)
238
+ async def on_switch_tab(state, search_engine, value):
239
+ return await master_handler(state, search_engine, "switch_tab", value)
 
 
 
240
 
241
+ # Wire up the event handlers: every input list starts with browser_state and the search-engine selector, and every event writes to the same outputs list (browser_state followed by all_outputs).
242
+ go_inputs = [browser_state, search_engine_selector, url_textbox]
243
+ click_inputs = [browser_state, search_engine_selector, click_num_box]
244
+ tab_inputs = [browser_state, search_engine_selector]
245
+ switch_inputs = [browser_state, search_engine_selector, tab_selector]
246
+ outputs = [browser_state, *all_outputs]
247
+
248
+ demo.load(on_load, tab_inputs, outputs)
249
+ go_btn.click(on_go_click, go_inputs, outputs, show_progress="full")
250
+ url_textbox.submit(on_go_click, go_inputs, outputs, show_progress="full")
251
+ click_btn.click(on_click_link, click_inputs, outputs, show_progress="full")
252
+ new_tab_btn.click(on_new_tab, tab_inputs, outputs, show_progress="full")
253
+ close_tab_btn.click(on_close_tab, tab_inputs, outputs)
254
+ tab_selector.input(on_switch_tab, switch_inputs, outputs)
255
 
256
  demo.launch()