broadfield-dev committed on
Commit
35ae779
·
verified ·
1 Parent(s): 8932042

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -148
app.py CHANGED
@@ -1,61 +1,37 @@
1
  import os
2
  os.system("playwright install")
 
 
3
  import gradio as gr
4
- from playwright.sync_api import sync_playwright, Error as PlaywrightError
5
  from bs4 import BeautifulSoup
6
  import urllib.parse
7
  import datetime
8
  import atexit
9
  import re
 
10
  from itertools import cycle
11
- import uuid # For generating unique tab IDs
12
-
13
- try:
14
- p = sync_playwright().start()
15
- browser = p.firefox.launch(headless=True, timeout=60000)
16
- print("✅ Playwright browser launched successfully.")
17
- except Exception as e:
18
- print(f"❌ Could not launch Playwright browser. Original error: {e}")
19
- exit()
20
-
21
- # This dictionary is the key to the solution. It maps a tab's unique ID
22
- # to its live, non-copyable Playwright Page and Context object.
23
- LIVE_CONTEXTS = {} # { tab_id: { "context": PlaywrightContext, "page": PlaywrightPage } }
24
-
25
- def cleanup():
26
- """Ensures all browser resources are closed when the app shuts down."""
27
- print(f"🧹 Cleaning up: Closing {len(LIVE_CONTEXTS)} browser contexts...")
28
- for tab_id, resources in LIVE_CONTEXTS.items():
29
- if not resources["context"].is_closed():
30
- resources["context"].close()
31
- browser.close()
32
- p.stop()
33
- atexit.register(cleanup)
34
 
 
 
 
 
 
 
35
 
 
36
  class TabState:
37
- """A plain data class representing a tab's state. Fully copyable."""
38
  def __init__(self, tab_id, proxy_used="Direct Connection"):
39
- self.id = tab_id
40
- self.url = "about:blank"
41
- self.title = "New Tab"
42
  self.parsed_text = "Welcome! Navigate to a URL or search to get started."
43
- self.links = []
44
- self.proxy_used = proxy_used
45
 
46
  class BrowserState:
47
- """A plain data class representing the browser's overall state."""
48
- def __init__(self):
49
- self.tabs = [] # A list of TabState objects
50
- self.active_tab_id = None
51
- # Add bookmarks, history etc. here if needed
52
-
53
- def get_active_tab(self) -> TabState | None:
54
- if not self.active_tab_id: return None
55
- return next((t for t in self.tabs if t.id == self.active_tab_id), None)
56
 
57
-
58
- class CredentialRevolver: # (This class is unchanged)
59
  def __init__(self, proxy_string: str):
60
  self.proxies = self._parse_proxies(proxy_string)
61
  if self.proxies: self.proxy_cycler = cycle(self.proxies)
@@ -69,142 +45,97 @@ class CredentialRevolver: # (This class is unchanged)
69
  def get_next(self): return next(self.proxy_cycler) if self.proxy_cycler else None
70
  def count(self): return len(self.proxies)
71
 
72
- proxy_list_str = os.getenv("PROXY_LIST", "")
73
- revolver = CredentialRevolver(proxy_list_str)
74
 
75
- def _fetch_and_update_tab_state(tab_state: TabState, url: str):
76
- """
77
- The core function. It uses the tab_state's ID to find the LIVE page,
78
- navigates it, and then updates the copyable tab_state object.
79
- """
80
  log = f"▶️ Navigating to {url}..."
81
  live_page = LIVE_CONTEXTS[tab_state.id]["page"]
82
-
83
  try:
84
- live_page.goto(url, wait_until='domcontentloaded', timeout=30000)
85
  tab_state.url = live_page.url
86
- tab_state.title = live_page.title() or "No Title"
87
  log += f"\n✅ Arrived at: {tab_state.url}"
88
-
89
- html_content = live_page.content()
90
  soup = BeautifulSoup(html_content, 'lxml')
91
  for script in soup(["script", "style", "nav", "footer"]): script.extract()
92
  tab_state.parsed_text = soup.get_text(separator='\n', strip=True)
93
-
94
- tab_state.links = []
95
- for link in soup.find_all('a', href=True):
96
- href = link['href']
97
- absolute_url = urllib.parse.urljoin(tab_state.url, href)
98
- if absolute_url.startswith('http') and not re.match(r'javascript:|mailto:', absolute_url):
99
- link_text = link.get_text(strip=True) or "[No Link Text]"
100
- tab_state.links.append({'text': link_text, 'url': absolute_url})
101
  log += f"\n🔗 Found {len(tab_state.links)} links."
102
  except PlaywrightError as e:
103
  error_message = str(e); tab_state.title = "Error"; tab_state.url = url
104
  tab_state.parsed_text = f"❌ Failed to load page.\n\nError: {error_message}"
105
  tab_state.links = []; log += f"\n❌ {error_message}"
106
-
107
  return log
108
 
109
- def handle_action(browser_state: BrowserState, action: str, value=None):
110
- """Main event handler. It modifies the browser_state and interacts with LIVE_CONTEXTS."""
111
- log = ""
112
- active_tab_state = browser_state.get_active_tab()
113
-
114
  if action == "new_tab":
115
  tab_id = str(uuid.uuid4())
116
- proxy_config = revolver.get_next()
117
-
118
- context = browser.new_context(proxy=proxy_config)
119
- page = context.new_page()
120
  LIVE_CONTEXTS[tab_id] = {"context": context, "page": page}
121
-
122
  new_tab = TabState(tab_id, proxy_used=proxy_config['server'] if proxy_config else "Direct")
123
  browser_state.tabs.append(new_tab)
124
  browser_state.active_tab_id = tab_id
125
-
126
- # Now navigate the new tab
127
- log = _fetch_and_update_tab_state(new_tab, "https://www.whatsmyip.org/")
128
-
129
  elif action == "go" and active_tab_state:
130
  url = value if (urllib.parse.urlparse(value).scheme and urllib.parse.urlparse(value).netloc) else f"https://duckduckgo.com/html/?q={urllib.parse.quote_plus(value)}"
131
- log = _fetch_and_update_tab_state(active_tab_state, url)
132
-
133
- elif action == "click" and active_tab_state:
134
  try:
135
  link_index = int(value)
136
  if 0 <= link_index < len(active_tab_state.links):
137
- link_url = active_tab_state.links[link_index]['url']
138
- log = _fetch_and_update_tab_state(active_tab_state, link_url)
139
  else: log = "Invalid link number."
140
  except: log = "Please enter a valid number to click."
141
-
142
  elif action == "close_tab" and active_tab_state:
143
- if len(browser_state.tabs) <= 1:
144
- log = "Cannot close the last tab."
145
- else:
146
  tab_to_close_id = browser_state.active_tab_id
147
-
148
- # Find and remove tab from state
149
  tab_index = browser_state.tabs.index(active_tab_state)
150
  browser_state.tabs.pop(tab_index)
151
-
152
- # Set new active tab
153
  new_index = tab_index - 1 if tab_index > 0 else 0
154
  browser_state.active_tab_id = browser_state.tabs[new_index].id
155
-
156
- # Close and remove live resources
157
  resources = LIVE_CONTEXTS.pop(tab_to_close_id)
158
- if not resources['context'].is_closed():
159
- resources['context'].close()
160
  log = f"💣 Tab closed."
161
-
162
- elif action == "switch_tab":
163
- try:
164
- # The value from the radio button is the tab_id itself
165
- browser_state.active_tab_id = value
166
- log = f"Switched to tab."
167
- except: log = "Invalid tab format."
168
-
169
- # Return the modified state object. Gradio will handle copying it for the UI update.
170
  return browser_state, log
171
 
172
-
173
- def update_ui_components(browser_state: BrowserState):
174
- """Generates all UI component values from the plain browser_state object."""
175
  active_tab = browser_state.get_active_tab()
176
- if not active_tab:
177
- return {
178
- page_content: gr.Markdown("No active tabs. Please create a new one."),
179
- url_textbox: "", links_display: "",
180
- tab_selector: gr.Radio(choices=[], label="Active Tabs"),
181
- }
182
-
183
- # Use the tab ID as the value for the radio button
184
  tab_choices = [(f"Tab {i}: {t.title[:25]}... (via {t.proxy_used})", t.id) for i, t in enumerate(browser_state.tabs)]
185
-
186
- links_md = "### 🔗 Links on Page\n"
187
- if active_tab.links:
188
- for i, link in enumerate(active_tab.links[:25]): links_md += f"{i}. [{link['text'][:80]}]({link['url']})\n"
189
- else: links_md += "_No links found._"
190
-
191
  return {
192
  page_content: gr.Markdown(f"# {active_tab.title}\n**URL:** {active_tab.url}\n\n---\n\n{active_tab.parsed_text[:2000]}..."),
193
- url_textbox: gr.Textbox(value=active_tab.url),
194
- links_display: gr.Markdown(links_md),
195
  tab_selector: gr.Radio(choices=tab_choices, value=active_tab.id, label="Active Tabs"),
196
  }
197
 
198
- # --- Gradio UI Layout ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
200
- # Initialize the state with our new, copyable BrowserState class
201
  browser_state = gr.State(BrowserState())
202
-
203
- gr.Markdown("# 🛰️ Real Browser Demo (with Proxies & State Fix)")
204
- gr.Markdown(f"This demo runs a real headless browser. **{revolver.count()} proxies loaded**.")
205
- # (The rest of the UI layout is the same)
206
- # ...
207
- # --- Gradio Interface Layout ---
208
  with gr.Row():
209
  with gr.Column(scale=3):
210
  url_textbox = gr.Textbox(label="URL or Search Term", interactive=True)
@@ -217,30 +148,42 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
217
  with gr.Accordion("Clickable Links", open=True):
218
  links_display = gr.Markdown("...")
219
  with gr.Row(): click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1); click_btn = gr.Button("Click Link", scale=2)
220
-
221
  all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]
222
 
223
- def master_handler(current_state, action, value):
224
- new_state, log = handle_action(current_state, action, value)
225
- # The update_ui_components function now only needs the state
226
  ui_updates = update_ui_components(new_state)
227
  ui_updates[log_display] = log
228
- # IMPORTANT: Return the new_state object to update gr.State
229
  return new_state, ui_updates
230
 
231
- # Initial load: create the first tab
232
- demo.load(
233
- lambda s: master_handler(s, "new_tab", None)[1], # Just return the UI updates
234
- inputs=[browser_state],
235
- outputs=list(all_outputs)
236
- )
237
-
238
- # Event listeners now call the master_handler
239
- go_btn.click(lambda s, v: master_handler(s, "go", v), [browser_state, url_textbox], [browser_state, *all_outputs], show_progress="full")
240
- url_textbox.submit(lambda s, v: master_handler(s, "go", v), [browser_state, url_textbox], [browser_state, *all_outputs], show_progress="full")
241
- click_btn.click(lambda s, v: master_handler(s, "click", v), [browser_state, click_num_box], [browser_state, *all_outputs], show_progress="full")
242
- new_tab_btn.click(lambda s: master_handler(s, "new_tab", None), [browser_state], [browser_state, *all_outputs], show_progress="full")
243
- close_tab_btn.click(lambda s: master_handler(s, "close_tab", None), [browser_state], [browser_state, *all_outputs])
244
- tab_selector.input(lambda s, v: master_handler(s, "switch_tab", v), [browser_state, tab_selector], [browser_state, *all_outputs])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
  demo.launch()
 
1
  import os
2
  os.system("playwright install")
3
+ # app.py (Final version with async API)
4
+
5
  import gradio as gr
6
+ from playwright.async_api import async_playwright, Error as PlaywrightError
7
  from bs4 import BeautifulSoup
8
  import urllib.parse
9
  import datetime
10
  import atexit
11
  import re
12
+ import os
13
  from itertools import cycle
14
+ import uuid
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
+ # --- 1. ASYNC GLOBAL RESOURCES ---
17
+ # P, BROWSER and REVOLVER are assigned by startup() and stay None until it has run.
18
+ P = None
19
+ BROWSER = None
20
+ REVOLVER = None
21
+ LIVE_CONTEXTS = {} # { tab_id: { "context": PlaywrightContext, "page": PlaywrightPage } }
22
 
23
+ # --- 2. PLAIN DATA STATE CLASSES (Unchanged) ---
24
  class TabState:
 
25
  def __init__(self, tab_id, proxy_used="Direct Connection"):
26
+ self.id, self.url, self.title = tab_id, "about:blank", "New Tab"
 
 
27
  self.parsed_text = "Welcome! Navigate to a URL or search to get started."
28
+ self.links, self.proxy_used = [], proxy_used
 
29
 
30
  class BrowserState:
31
+ def __init__(self): self.tabs, self.active_tab_id = [], None
32
+ def get_active_tab(self): return next((t for t in self.tabs if t.id == self.active_tab_id), None)
 
 
 
 
 
 
 
33
 
34
+ class CredentialRevolver: # Unchanged
 
35
  def __init__(self, proxy_string: str):
36
  self.proxies = self._parse_proxies(proxy_string)
37
  if self.proxies: self.proxy_cycler = cycle(self.proxies)
 
45
  def get_next(self): return next(self.proxy_cycler) if self.proxy_cycler else None
46
  def count(self): return len(self.proxies)
47
 
48
+ # --- 3. ASYNC LOGIC ---
49
+ # All functions interacting with Playwright are now `async def`
50
 
51
async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
    """Navigate the tab's live page to ``url`` and refresh ``tab_state`` from it.

    The live Playwright page is looked up in ``LIVE_CONTEXTS`` by
    ``tab_state.id``. On success the tab's url, title, visible text and link
    list are replaced; on a Playwright error the tab is put into an "Error"
    state instead of raising.

    Args:
        tab_state: The tab record to update in place.
        url: Address to load.

    Returns:
        A multi-line, human-readable log of what happened.

    Raises:
        KeyError: If no live page is registered for ``tab_state.id``.
    """
    log = f"▶️ Navigating to {url}..."
    live_page = LIVE_CONTEXTS[tab_state.id]["page"]
    try:
        await live_page.goto(url, wait_until='domcontentloaded', timeout=30000)
        # Read the final URL from the page: redirects may have changed it.
        tab_state.url = live_page.url
        tab_state.title = await live_page.title() or "No Title"
        log += f"\n✅ Arrived at: {tab_state.url}"
        html_content = await live_page.content()
        soup = BeautifulSoup(html_content, 'lxml')
        # Drop non-content elements before extracting text and links.
        for element in soup(["script", "style", "nav", "footer"]):
            element.extract()
        tab_state.parsed_text = soup.get_text(separator='\n', strip=True)

        links = []
        for anchor in soup.find_all('a', href=True):
            # Resolve first, then filter: testing the raw href for an "http"
            # prefix would throw away every relative link on the page.
            absolute_url = urllib.parse.urljoin(tab_state.url, anchor['href'])
            if not absolute_url.startswith(('http://', 'https://')):
                continue  # javascript:, mailto:, about:, ...
            links.append({
                'text': anchor.get_text(strip=True) or "[No Link Text]",
                'url': absolute_url,
            })
        tab_state.links = links
        log += f"\n🔗 Found {len(tab_state.links)} links."
    except PlaywrightError as e:
        error_message = str(e)
        tab_state.title = "Error"
        tab_state.url = url
        tab_state.parsed_text = f"❌ Failed to load page.\n\nError: {error_message}"
        tab_state.links = []
        log += f"\n❌ {error_message}"
    return log
70
 
71
async def handle_action(browser_state: "BrowserState", action: str, value=None):
    """Apply one UI action to ``browser_state`` and the live browser resources.

    Supported actions:
        ``"new_tab"``    open a fresh context/page and load the landing page.
        ``"go"``         load ``value`` as a URL, or search for it if it is not one.
        ``"click"``      follow the link at index ``value`` of the active tab.
        ``"close_tab"``  close the active tab (the last tab cannot be closed).
        ``"switch_tab"`` make the tab whose id is ``value`` the active one.

    Any other action, or an action that needs an active tab when there is
    none, leaves the state untouched and yields an empty log.

    Args:
        browser_state: State object mutated in place.
        action: One of the action names above.
        value: Action argument (URL/search text, link index, or tab id).

    Returns:
        ``(browser_state, log)`` where ``log`` is a human-readable message.
    """
    log = ""
    active_tab_state = browser_state.get_active_tab()

    if action == "new_tab":
        tab_id = str(uuid.uuid4())
        proxy_config = REVOLVER.get_next()
        context = await BROWSER.new_context(proxy=proxy_config)
        try:
            page = await context.new_page()
        except Exception:
            # Do not leave an unreachable context open if the page fails.
            await context.close()
            raise
        LIVE_CONTEXTS[tab_id] = {"context": context, "page": page}
        new_tab = TabState(tab_id, proxy_used=proxy_config['server'] if proxy_config else "Direct")
        browser_state.tabs.append(new_tab)
        browser_state.active_tab_id = tab_id
        log = await _fetch_and_update_tab_state(new_tab, "https://www.whatsmyip.org/")

    elif action == "go" and active_tab_state:
        query = "" if value is None else str(value).strip()
        if not query:
            log = "Please enter a URL or search term."
        else:
            parsed = urllib.parse.urlparse(query)
            if parsed.scheme and parsed.netloc:
                url = query
            else:
                # Not a full URL: treat the text as a search query.
                url = f"https://duckduckgo.com/html/?q={urllib.parse.quote_plus(query)}"
            log = await _fetch_and_update_tab_state(active_tab_state, url)

    elif action == "click" and active_tab_state and value is not None:
        # Only the conversion is guarded; navigation errors must not be
        # reported as "not a number".
        try:
            link_index = int(value)
        except (TypeError, ValueError, OverflowError):
            log = "Please enter a valid number to click."
        else:
            if 0 <= link_index < len(active_tab_state.links):
                log = await _fetch_and_update_tab_state(
                    active_tab_state, active_tab_state.links[link_index]['url']
                )
            else:
                log = "Invalid link number."

    elif action == "close_tab" and active_tab_state:
        if len(browser_state.tabs) > 1:
            tab_to_close_id = browser_state.active_tab_id
            tab_index = browser_state.tabs.index(active_tab_state)
            browser_state.tabs.pop(tab_index)
            # Activate the tab to the left, or the new first tab.
            new_index = max(tab_index - 1, 0)
            browser_state.active_tab_id = browser_state.tabs[new_index].id
            resources = LIVE_CONTEXTS.pop(tab_to_close_id, None)
            if resources is not None:
                await resources['context'].close()
            log = "💣 Tab closed."
        else:
            log = "Cannot close the last tab."

    elif action == "switch_tab" and value is not None:
        # Refuse ids that match no tab, so the active tab is never lost.
        if any(tab.id == value for tab in browser_state.tabs):
            browser_state.active_tab_id = value
            log = "Switched to tab."
        else:
            log = "Invalid tab selection."

    return browser_state, log
107
 
108
def _truncate(text, limit):
    """Return ``text`` cut to ``limit`` characters, adding "..." only if it was cut."""
    return text if len(text) <= limit else text[:limit] + "..."


def _link_label(text, limit=80):
    """Return link text that is safe inside a Markdown ``[label](url)``."""
    # Newlines and unescaped brackets would end the label early.
    flattened = " ".join(text.split())[:limit]
    return flattened.replace("[", "\\[").replace("]", "\\]")


def update_ui_components(browser_state: BrowserState):  # This function is not async
    """Build the Gradio component updates that display ``browser_state``.

    Args:
        browser_state: Current browser state; only read, never modified.

    Returns:
        A dict keyed by the output components (``page_content``,
        ``url_textbox``, ``links_display``, ``tab_selector``) holding their
        new values. With no active tab, the dict holds placeholder values.
    """
    active_tab = browser_state.get_active_tab()
    if not active_tab:
        return {
            page_content: gr.Markdown("No active tabs."),
            url_textbox: "",
            links_display: "",
            tab_selector: gr.Radio(choices=[]),
        }

    # Radio choices are (label, value) pairs; the value is the tab id.
    tab_choices = [
        (f"Tab {i}: {_truncate(t.title, 25)} (via {t.proxy_used})", t.id)
        for i, t in enumerate(browser_state.tabs)
    ]

    if active_tab.links:
        # Only the first 25 links are listed; the number shown is the index
        # the user enters to click the link.
        links_body = '\n'.join(
            f"{i}. [{_link_label(link['text'])}]({link['url']})"
            for i, link in enumerate(active_tab.links[:25])
        )
    else:
        links_body = "_No links found._"
    links_md = "### 🔗 Links on Page\n" + links_body

    page_md = (
        f"# {active_tab.title}\n**URL:** {active_tab.url}\n\n---\n\n"
        f"{_truncate(active_tab.parsed_text, 2000)}"
    )
    return {
        page_content: gr.Markdown(page_md),
        url_textbox: gr.Textbox(value=active_tab.url),
        links_display: gr.Markdown(links_md),
        tab_selector: gr.Radio(choices=tab_choices, value=active_tab.id, label="Active Tabs"),
    }
118
 
119
+ # --- 4. GRADIO UI AND STARTUP/SHUTDOWN EVENTS ---
120
async def startup():
    """Start Playwright, launch headless Firefox and load the proxy list.

    Assigns the module globals ``P``, ``BROWSER`` and ``REVOLVER``. Calling it
    again while a browser is already set is a no-op, so a second browser
    process is never launched.

    Raises:
        Exception: Whatever Playwright raises if the driver or browser cannot
            start; the driver is stopped first in that case.
    """
    global P, BROWSER, REVOLVER
    if BROWSER is not None:
        return
    print("🚀 App starting up...")
    playwright = await async_playwright().start()
    try:
        browser = await playwright.firefox.launch(headless=True)
    except Exception:
        # Do not leave the driver running if the browser launch fails.
        await playwright.stop()
        raise
    # Publish the globals only once both objects exist.
    P, BROWSER = playwright, browser
    proxy_list_str = os.getenv("PROXY_LIST", "")
    REVOLVER = CredentialRevolver(proxy_list_str)
    print(f"✅ Playwright started. {REVOLVER.count()} proxies loaded.")
128
+
129
async def shutdown():
    """Close the browser and stop Playwright, if they were started.

    The globals ``P`` and ``BROWSER`` are reset and ``LIVE_CONTEXTS`` is
    emptied before anything is closed, so no stale handle stays reachable
    even if closing fails. Playwright is stopped even when closing the
    browser raises.
    """
    global P, BROWSER
    print("🧹 App shutting down...")
    playwright, browser = P, BROWSER
    P = BROWSER = None
    # These entries belong to the browser being closed.
    LIVE_CONTEXTS.clear()
    try:
        if browser:
            await browser.close()
    finally:
        if playwright:
            await playwright.stop()
    print("✅ Playwright stopped.")
134
+
135
  with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
 
136
  browser_state = gr.State(BrowserState())
137
+ gr.Markdown("# 🛰️ Real Browser Demo (Async API)")
138
+ # UI Layout is the same...
 
 
 
 
139
  with gr.Row():
140
  with gr.Column(scale=3):
141
  url_textbox = gr.Textbox(label="URL or Search Term", interactive=True)
 
148
  with gr.Accordion("Clickable Links", open=True):
149
  links_display = gr.Markdown("...")
150
  with gr.Row(): click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1); click_btn = gr.Button("Click Link", scale=2)
151
+
152
  all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]
153
 
154
import asyncio

# Created on first use so the lock belongs to the event loop that serves requests.
_STARTUP_LOCK = None


async def _ensure_started():
    """Run ``startup()`` once, inside the event loop that serves the handlers.

    Playwright's async objects are tied to the loop they were created in.
    Starting them with ``asyncio.run(startup())`` before ``demo.launch()``
    would bind them to a loop that is already closed when the first request
    arrives, so startup is deferred to the first handler call instead.
    """
    global _STARTUP_LOCK
    if BROWSER is not None:
        return
    if _STARTUP_LOCK is None:
        _STARTUP_LOCK = asyncio.Lock()
    async with _STARTUP_LOCK:
        # Re-check: another request may have finished startup while we waited.
        if BROWSER is None:
            await startup()


async def master_handler(current_state, action, value):
    """Run one action and compute the matching UI updates.

    Args:
        current_state: The session's browser state.
        action: Action name understood by ``handle_action``.
        value: Argument for the action, or None.

    Returns:
        ``(new_state, ui_updates)`` where ``ui_updates`` maps output
        components, including ``log_display``, to their new values.
    """
    await _ensure_started()
    new_state, log = await handle_action(current_state, action, value)
    ui_updates = update_ui_components(new_state)
    ui_updates[log_display] = log
    return new_state, ui_updates


def _bind_action(action, takes_value=True):
    """Return an async Gradio callback that runs ``action``.

    The callback accepts exactly the inputs its listener supplies
    (``state, value`` or just ``state``). It returns a single dict keyed by
    output component, which is the form Gradio accepts for a variable set of
    outputs.
    """
    async def with_value(state, value):
        new_state, ui_updates = await master_handler(state, action, value)
        return {browser_state: new_state, **ui_updates}

    async def without_value(state):
        return await with_value(state, None)

    return with_value if takes_value else without_value


# Re-enter the Blocks context so the listeners are registered on `demo`.
with demo:
    event_outputs = [browser_state, *all_outputs]

    # Initial load: create the first tab and store it in the session state.
    demo.load(_bind_action("new_tab", takes_value=False), inputs=[browser_state], outputs=event_outputs)

    # Event listeners - Gradio awaits async callbacks itself.
    go_btn.click(_bind_action("go"), [browser_state, url_textbox], event_outputs, show_progress="full")
    url_textbox.submit(_bind_action("go"), [browser_state, url_textbox], event_outputs, show_progress="full")
    click_btn.click(_bind_action("click"), [browser_state, click_num_box], event_outputs, show_progress="full")
    new_tab_btn.click(_bind_action("new_tab", takes_value=False), [browser_state], event_outputs, show_progress="full")
    close_tab_btn.click(_bind_action("close_tab", takes_value=False), [browser_state], event_outputs)
    tab_selector.input(_bind_action("switch_tab"), [browser_state, tab_selector], event_outputs)
188
 
189
  demo.launch()