broadfield-dev committed on
Commit
10dbcf2
·
verified ·
1 Parent(s): 9b96fc7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -133
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  os.system("playwright install")
3
- # app.py (Final Version with Search Engines and Improved Formatting)
4
 
5
  import gradio as gr
6
  from playwright.async_api import async_playwright
@@ -17,7 +17,6 @@ REVOLVER = None
17
  LIVE_CONTEXTS = {}
18
  APP_STARTED = False
19
 
20
- # New: Search Engine Configuration
21
  SEARCH_ENGINES = {
22
  "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
23
  "Google": "https://www.google.com/search?q={query}",
@@ -41,92 +40,49 @@ class CredentialRevolver:
41
  self.proxies = self._parse_proxies(proxy_string)
42
  self.proxy_cycler = cycle(self.proxies) if self.proxies else None
43
  def _parse_proxies(self, proxy_string: str):
44
- proxies = []
45
  for line in proxy_string.strip().splitlines():
46
  try:
47
- parsed = urllib.parse.urlparse(f"//{line.strip()}")
48
- server = f"{parsed.scheme or 'http'}://{parsed.hostname}:{parsed.port}"
49
  proxies.append({"server": server, "username": parsed.username, "password": parsed.password})
50
  except: pass
51
  return proxies
52
  def get_next(self): return next(self.proxy_cycler) if self.proxy_cycler else None
53
  def count(self): return len(self.proxies) if self.proxies else 0
54
 
55
- # --- 3. CORE ASYNC LOGIC with NEW FORMATTER ---
56
-
57
- def _format_html_to_markdown(soup):
58
- """
59
- NEW: Intelligently converts a BeautifulSoup object to formatted Markdown.
60
- """
61
- text_parts = []
62
-
63
- # Use a more specific main content area if available
64
- content_node = soup.find('main') or soup.find('body')
65
- if not content_node:
66
- return "Could not find main body content."
67
-
68
- for element in content_node.find_all(recursive=False):
69
- # Ignore common non-content sections
70
- if element.name in ['nav', 'footer', 'header', 'aside', 'form', 'script', 'style']:
71
- continue
72
- text_parts.append(_process_element_to_markdown(element))
73
-
74
- return '\n'.join(filter(None, text_parts))
75
-
76
  def _process_element_to_markdown(element):
77
- """NEW: Recursive helper to process each element into Markdown."""
78
- if isinstance(element, NavigableString):
79
- return element.strip()
80
-
81
- if element.name is None:
82
- return ''
83
-
84
- # Get the inner text by recursively processing children
85
  inner_text = ''.join(_process_element_to_markdown(child) for child in element.children)
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- # Apply formatting based on tag name
88
- if element.name in ['p', 'div', 'article', 'section']:
89
- return f"\n{inner_text.strip()}\n"
90
- if element.name == 'h1':
91
- return f"\n# {inner_text.strip()}\n"
92
- if element.name == 'h2':
93
- return f"\n## {inner_text.strip()}\n"
94
- if element.name == 'h3':
95
- return f"\n### {inner_text.strip()}\n"
96
- if element.name in ['h4', 'h5', 'h6']:
97
- return f"\n#### {inner_text.strip()}\n"
98
- if element.name == 'li':
99
- return f"* {inner_text.strip()}\n"
100
- if element.name in ['ul', 'ol']:
101
- return f"\n{inner_text}\n"
102
- if element.name in ['strong', 'b']:
103
- return f"**{inner_text.strip()}**"
104
- if element.name in ['em', 'i']:
105
- return f"*{inner_text.strip()}*"
106
- if element.name in ['pre', 'code']:
107
- return f"\n```\n{inner_text.strip()}\n```\n"
108
- if element.name == 'a':
109
- href = element.get('href', '')
110
- return f"[{inner_text.strip()}]({href})"
111
- if element.name == 'hr':
112
- return "\n---\n"
113
-
114
- return inner_text # For other tags like span, etc.
115
 
116
  async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
117
- """Uses Playwright to navigate and the new formatter to parse."""
118
  log = f"▶️ Navigating to {url}..."; live_page = LIVE_CONTEXTS[tab_state.id]["page"]
119
  try:
120
  await live_page.goto(url, wait_until='domcontentloaded', timeout=30000)
121
  tab_state.url = live_page.url; tab_state.title = await live_page.title() or "No Title"
122
  log += f"\n✅ Arrived at: {tab_state.url}"
123
-
124
- html_content = await live_page.content()
125
- soup = BeautifulSoup(html_content, 'lxml')
126
-
127
- # Use the new formatter
128
  tab_state.parsed_text = _format_html_to_markdown(soup)
129
-
130
  tab_state.links = [{'text': link.get_text(strip=True) or "[No Link Text]", 'url': urllib.parse.urljoin(tab_state.url, link['href'])} for link in soup.find_all('a', href=True) if link.get('href', '').startswith('http')]
131
  log += f"\n🔗 Found {len(tab_state.links)} links."
132
  except Exception as e:
@@ -136,19 +92,11 @@ async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
136
  return log
137
 
138
  async def handle_action(browser_state: BrowserState, search_engine: str, action: str, value=None):
139
- """Main logic handler, now with search engine selection."""
140
- log = ""; active_tab_state = browser_state.get_active_tab()
141
-
142
  if action == "go" and active_tab_state and value:
143
  is_url = urllib.parse.urlparse(value).scheme and urllib.parse.urlparse(value).netloc
144
- if is_url:
145
- url = value
146
- else: # It's a search query
147
- base_url = SEARCH_ENGINES.get(search_engine, SEARCH_ENGINES["DuckDuckGo"])
148
- url = base_url.format(query=urllib.parse.quote_plus(value))
149
  log = await _fetch_and_update_tab_state(active_tab_state, url)
150
-
151
- # --- Other actions remain the same ---
152
  elif action == "new_tab":
153
  tab_id, proxy_config = str(uuid.uuid4()), REVOLVER.get_next()
154
  context = await BROWSER.new_context(proxy=proxy_config)
@@ -156,7 +104,7 @@ async def handle_action(browser_state: BrowserState, search_engine: str, action:
156
  LIVE_CONTEXTS[tab_id] = {"context": context, "page": page}
157
  new_tab = TabState(tab_id, proxy_used=proxy_config['server'].split('@')[-1] if proxy_config else "Direct")
158
  browser_state.tabs.append(new_tab); browser_state.active_tab_id = tab_id
159
- log = await _fetch_and_update_tab_state(new_tab, "https://www.startpage.com/") # New default page
160
  elif action == "click" and active_tab_state and value is not None:
161
  try:
162
  link_index = int(value)
@@ -172,36 +120,39 @@ async def handle_action(browser_state: BrowserState, search_engine: str, action:
172
  else: log = "Cannot close the last tab."
173
  elif action == "switch_tab" and value is not None:
174
  browser_state.active_tab_id = value; log = f"Switched to tab."
 
175
  return browser_state, log
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  # --- 4. GRADIO UI AND EVENT HANDLING ---
178
  with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
179
  browser_state = gr.State(BrowserState())
180
- gr.Markdown("# 🛰️ Real Browser Demo v2")
181
-
182
  with gr.Row():
183
  with gr.Column(scale=4):
184
- with gr.Row():
185
- url_textbox = gr.Textbox(label="Enter URL or Search Query", interactive=True, scale=4)
186
- go_btn = gr.Button("Go", variant="primary", scale=1)
187
- with gr.Accordion("Page Content (Formatted)", open=True):
188
- page_content = gr.Markdown("Loading...")
189
  with gr.Column(scale=2):
190
- # NEW: Search Engine Selector
191
- search_engine_selector = gr.Radio(
192
- list(SEARCH_ENGINES.keys()), value="DuckDuckGo", label="Search Engine"
193
- )
194
- with gr.Row():
195
- new_tab_btn = gr.Button("➕ New Tab"); close_tab_btn = gr.Button("❌ Close Tab")
196
  tab_selector = gr.Radio(choices=[], label="Active Tabs", interactive=True)
197
  with gr.Accordion("Clickable Links", open=True):
198
  links_display = gr.Markdown("...");
199
- with gr.Row():
200
- click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1)
201
- click_btn = gr.Button("Click Link", scale=2)
202
-
203
  log_display = gr.Textbox(label="Status Log", interactive=False)
204
-
205
  all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]
206
 
207
  async def master_handler(current_state, search_engine, action, value=None):
@@ -213,44 +164,24 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
213
  print(f"✅ Playwright started. {REVOLVER.count()} proxies loaded."); APP_STARTED = True
214
 
215
  new_state, log = await handle_action(current_state, search_engine, action, value)
216
- ui_updates = update_ui_components(current_state) # Use current state for UI updates
217
 
218
- return (
219
- new_state,
220
- ui_updates[page_content],
221
- ui_updates[url_textbox],
222
- ui_updates[links_display],
223
- ui_updates[tab_selector],
224
- log
225
- )
226
 
227
- # Each event listener is its own async function.
228
- async def on_load(state, search_engine):
229
- return await master_handler(state, search_engine, "new_tab", None)
230
- async def on_go_click(state, search_engine, value):
231
- return await master_handler(state, search_engine, "go", value)
232
- async def on_click_link(state, search_engine, value):
233
- return await master_handler(state, search_engine, "click", value)
234
- async def on_new_tab(state, search_engine):
235
- return await master_handler(state, search_engine, "new_tab", None)
236
- async def on_close_tab(state, search_engine):
237
- return await master_handler(state, search_engine, "close_tab", None)
238
- async def on_switch_tab(state, search_engine, value):
239
- return await master_handler(state, search_engine, "switch_tab", value)
240
 
241
- # Wire up the event handlers
242
- go_inputs = [browser_state, search_engine_selector, url_textbox]
243
- click_inputs = [browser_state, search_engine_selector, click_num_box]
244
- tab_inputs = [browser_state, search_engine_selector]
245
- switch_inputs = [browser_state, search_engine_selector, tab_selector]
246
  outputs = [browser_state, *all_outputs]
247
-
248
- demo.load(on_load, tab_inputs, outputs)
249
- go_btn.click(on_go_click, go_inputs, outputs, show_progress="full")
250
- url_textbox.submit(on_go_click, go_inputs, outputs, show_progress="full")
251
- click_btn.click(on_click_link, click_inputs, outputs, show_progress="full")
252
- new_tab_btn.click(on_new_tab, tab_inputs, outputs, show_progress="full")
253
- close_tab_btn.click(on_close_tab, tab_inputs, outputs)
254
- tab_selector.input(on_switch_tab, switch_inputs, outputs)
255
 
256
  demo.launch()
 
1
  import os
2
  os.system("playwright install")
3
+ # app.py (Final, Working Async Version with All Bugs Fixed)
4
 
5
  import gradio as gr
6
  from playwright.async_api import async_playwright
 
17
  LIVE_CONTEXTS = {}
18
  APP_STARTED = False
19
 
 
20
  SEARCH_ENGINES = {
21
  "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
22
  "Google": "https://www.google.com/search?q={query}",
 
40
  self.proxies = self._parse_proxies(proxy_string)
41
  self.proxy_cycler = cycle(self.proxies) if self.proxies else None
42
  def _parse_proxies(self, proxy_string: str):
43
+ proxies = [];
44
  for line in proxy_string.strip().splitlines():
45
  try:
46
+ parsed = urllib.parse.urlparse(f"//{line.strip()}"); server = f"{parsed.scheme or 'http'}://{parsed.hostname}:{parsed.port}"
 
47
  proxies.append({"server": server, "username": parsed.username, "password": parsed.password})
48
  except: pass
49
  return proxies
50
  def get_next(self): return next(self.proxy_cycler) if self.proxy_cycler else None
51
  def count(self): return len(self.proxies) if self.proxies else 0
52
 
53
+ # --- 3. CORE ASYNC LOGIC & FORMATTING ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def _process_element_to_markdown(element):
55
+ if isinstance(element, NavigableString): return element.strip()
56
+ if element.name is None: return ''
 
 
 
 
 
 
57
  inner_text = ''.join(_process_element_to_markdown(child) for child in element.children)
58
+ if element.name in ['p', 'div', 'article', 'section']: return f"\n{inner_text.strip()}\n"
59
+ if element.name == 'h1': return f"\n# {inner_text.strip()}\n"
60
+ if element.name == 'h2': return f"\n## {inner_text.strip()}\n"
61
+ if element.name == 'h3': return f"\n### {inner_text.strip()}\n"
62
+ if element.name in ['h4', 'h5', 'h6']: return f"\n#### {inner_text.strip()}\n"
63
+ if element.name == 'li': return f"* {inner_text.strip()}\n"
64
+ if element.name in ['ul', 'ol']: return f"\n{inner_text}\n"
65
+ if element.name in ['strong', 'b']: return f"**{inner_text.strip()}**"
66
+ if element.name in ['em', 'i']: return f"*{inner_text.strip()}*"
67
+ if element.name in ['pre', 'code']: return f"\n```\n{inner_text.strip()}\n```\n"
68
+ if element.name == 'a': return f"[{inner_text.strip()}]({element.get('href', '')})"
69
+ if element.name == 'hr': return "\n---\n"
70
+ return inner_text
71
 
72
def _format_html_to_markdown(soup):
    """Convert the main content of a parsed page to Markdown.

    Uses the ``<main>`` element when present, otherwise ``<body>``. Common
    non-content sections (nav, footer, header, aside, form, script, style)
    are removed before conversion.

    The pruning is done on a copy of the content node: the caller keeps
    using the same soup afterwards to collect the page's links, and
    decomposing nodes in place would silently drop every link that lives in
    a nav/header/footer section.

    Args:
        soup: The parsed document (BeautifulSoup object).

    Returns:
        The Markdown text, or a fixed message when neither ``<main>`` nor
        ``<body>`` exists.
    """
    import copy  # local import: only needed by this function

    content_node = soup.find('main') or soup.find('body')
    if not content_node:
        return "Could not find main body content."
    working_node = copy.copy(content_node)
    for el in working_node.select('nav, footer, header, aside, form, script, style'):
        el.decompose()
    return _process_element_to_markdown(working_node)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
 
79
  log = f"▶️ Navigating to {url}..."; live_page = LIVE_CONTEXTS[tab_state.id]["page"]
80
  try:
81
  await live_page.goto(url, wait_until='domcontentloaded', timeout=30000)
82
  tab_state.url = live_page.url; tab_state.title = await live_page.title() or "No Title"
83
  log += f"\n✅ Arrived at: {tab_state.url}"
84
+ html_content = await live_page.content(); soup = BeautifulSoup(html_content, 'lxml')
 
 
 
 
85
  tab_state.parsed_text = _format_html_to_markdown(soup)
 
86
  tab_state.links = [{'text': link.get_text(strip=True) or "[No Link Text]", 'url': urllib.parse.urljoin(tab_state.url, link['href'])} for link in soup.find_all('a', href=True) if link.get('href', '').startswith('http')]
87
  log += f"\n🔗 Found {len(tab_state.links)} links."
88
  except Exception as e:
 
92
  return log
93
 
94
  async def handle_action(browser_state: BrowserState, search_engine: str, action: str, value=None):
95
+ active_tab_state = browser_state.get_active_tab()
 
 
96
  if action == "go" and active_tab_state and value:
97
  is_url = urllib.parse.urlparse(value).scheme and urllib.parse.urlparse(value).netloc
98
+ url = value if is_url else SEARCH_ENGINES.get(search_engine, SEARCH_ENGINES["DuckDuckGo"]).format(query=urllib.parse.quote_plus(value))
 
 
 
 
99
  log = await _fetch_and_update_tab_state(active_tab_state, url)
 
 
100
  elif action == "new_tab":
101
  tab_id, proxy_config = str(uuid.uuid4()), REVOLVER.get_next()
102
  context = await BROWSER.new_context(proxy=proxy_config)
 
104
  LIVE_CONTEXTS[tab_id] = {"context": context, "page": page}
105
  new_tab = TabState(tab_id, proxy_used=proxy_config['server'].split('@')[-1] if proxy_config else "Direct")
106
  browser_state.tabs.append(new_tab); browser_state.active_tab_id = tab_id
107
+ log = await _fetch_and_update_tab_state(new_tab, "https://www.startpage.com/")
108
  elif action == "click" and active_tab_state and value is not None:
109
  try:
110
  link_index = int(value)
 
120
  else: log = "Cannot close the last tab."
121
  elif action == "switch_tab" and value is not None:
122
  browser_state.active_tab_id = value; log = f"Switched to tab."
123
+ else: log = "No action taken."
124
  return browser_state, log
125
 
126
+ # ** CRITICAL BUG FIX: `NameError` is fixed by defining this function before it is called **
127
def update_ui_components(browser_state: BrowserState):
    """Build the component updates that reflect the currently active tab.

    Args:
        browser_state: The browser state whose active tab is displayed.

    Returns:
        A dict keyed by the module-level components ``page_content``,
        ``url_textbox``, ``links_display`` and ``tab_selector``. When there
        is no active tab the entries are placeholder/empty values.
    """
    active_tab = browser_state.get_active_tab()
    if not active_tab:
        return {page_content: gr.Markdown("No active tabs."), url_textbox: "", links_display: "", tab_selector: gr.Radio(choices=[])}

    def _shorten(text, limit):
        # Only add an ellipsis when something was actually cut off.
        return text if len(text) <= limit else f"{text[:limit]}..."

    def _link_label(text):
        # Link text comes from arbitrary pages; raw brackets would end the
        # Markdown link label early, so escape them.
        return text[:80].replace('[', '\\[').replace(']', '\\]')

    tab_choices = [
        (f"Tab {i}: {_shorten(t.title, 25)} ({t.proxy_used})", t.id)
        for i, t in enumerate(browser_state.tabs)
    ]

    if active_tab.links:
        # Only the first 25 links are listed to keep the panel readable.
        link_lines = '\n'.join(
            f"{i}. [{_link_label(link['text'])}]({link['url']})"
            for i, link in enumerate(active_tab.links[:25])
        )
    else:
        link_lines = "_No links found._"
    links_md = "### 🔗 Links on Page\n" + link_lines

    page_md = f"# {active_tab.title}\n**URL:** {active_tab.url}\n\n---\n\n{active_tab.parsed_text}"
    return {
        page_content: gr.Markdown(page_md),
        url_textbox: gr.Textbox(value=active_tab.url),
        links_display: gr.Markdown(links_md),
        tab_selector: gr.Radio(choices=tab_choices, value=active_tab.id, label="Active Tabs"),
    }
138
+
139
  # --- 4. GRADIO UI AND EVENT HANDLING ---
140
  with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
141
  browser_state = gr.State(BrowserState())
142
+ gr.Markdown("# 🛰️ Real Browser Demo v2.1")
 
143
  with gr.Row():
144
  with gr.Column(scale=4):
145
+ with gr.Row(): url_textbox = gr.Textbox(label="Enter URL or Search Query", interactive=True, scale=4); go_btn = gr.Button("Go", variant="primary", scale=1)
146
+ with gr.Accordion("Page Content (Formatted)", open=True): page_content = gr.Markdown("Loading...")
 
 
 
147
  with gr.Column(scale=2):
148
+ search_engine_selector = gr.Radio(list(SEARCH_ENGINES.keys()), value="DuckDuckGo", label="Search Engine")
149
+ with gr.Row(): new_tab_btn = gr.Button("➕ New Tab"); close_tab_btn = gr.Button("❌ Close Tab")
 
 
 
 
150
  tab_selector = gr.Radio(choices=[], label="Active Tabs", interactive=True)
151
  with gr.Accordion("Clickable Links", open=True):
152
  links_display = gr.Markdown("...");
153
+ with gr.Row(): click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1); click_btn = gr.Button("Click Link", scale=2)
 
 
 
154
  log_display = gr.Textbox(label="Status Log", interactive=False)
155
+
156
  all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]
157
 
158
  async def master_handler(current_state, search_engine, action, value=None):
 
164
  print(f"✅ Playwright started. {REVOLVER.count()} proxies loaded."); APP_STARTED = True
165
 
166
  new_state, log = await handle_action(current_state, search_engine, action, value)
167
+ ui_updates = update_ui_components(new_state)
168
 
169
+ return (new_state, ui_updates[page_content], ui_updates[url_textbox], ui_updates[links_display], ui_updates[tab_selector], log)
 
 
 
 
 
 
 
170
 
171
+ async def on_load(state, search_engine): return await master_handler(state, search_engine, "new_tab", None)
172
+ async def on_go_click(state, search_engine, value): return await master_handler(state, search_engine, "go", value)
173
+ async def on_click_link(state, search_engine, value): return await master_handler(state, search_engine, "click", value)
174
+ async def on_new_tab(state, search_engine): return await master_handler(state, search_engine, "new_tab", None)
175
+ async def on_close_tab(state, search_engine): return await master_handler(state, search_engine, "close_tab", None)
176
+ async def on_switch_tab(state, search_engine, value): return await master_handler(state, search_engine, "switch_tab", value)
 
 
 
 
 
 
 
177
 
 
 
 
 
 
178
  outputs = [browser_state, *all_outputs]
179
+ demo.load(on_load, [browser_state, search_engine_selector], outputs)
180
+ go_btn.click(on_go_click, [browser_state, search_engine_selector, url_textbox], outputs, show_progress="full")
181
+ url_textbox.submit(on_go_click, [browser_state, search_engine_selector, url_textbox], outputs, show_progress="full")
182
+ click_btn.click(on_click_link, [browser_state, search_engine_selector, click_num_box], outputs, show_progress="full")
183
+ new_tab_btn.click(on_new_tab, [browser_state, search_engine_selector], outputs, show_progress="full")
184
+ close_tab_btn.click(on_close_tab, [browser_state, search_engine_selector], outputs)
185
+ tab_selector.input(on_switch_tab, [browser_state, search_engine_selector, tab_selector], outputs)
 
186
 
187
  demo.launch()