broadfield-dev committed on
Commit
224e219
·
verified ·
1 Parent(s): e8c80cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -65
app.py CHANGED
@@ -1,36 +1,68 @@
1
  import os
2
  os.system("playwright install")
3
- import os
4
  import re
5
  import urllib.parse
6
  import asyncio
7
- from typing import Dict
 
8
 
9
  import gradio as gr
10
  from bs4 import BeautifulSoup, NavigableString
11
  from playwright.async_api import async_playwright
12
 
13
- # --- 1. GLOBAL RESOURCES & CONFIGURATION ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- # This dictionary will hold the long-lived Playwright and Browser objects.
16
- # It starts empty and browsers are added on-demand.
17
  PLAYWRIGHT_STATE: Dict = {}
 
18
 
19
- # A comprehensive list of search engines
20
  SEARCH_ENGINES = {
21
- "DuckDuckGo": "https://duckduckgo.com/html/?q={query}", "Google": "https://www.google.com/search?q={query}",
22
- "Bing": "https://www.bing.com/search?q={query}", "Brave": "https://search.brave.com/search?q={query}",
23
- "Ecosia": "https://www.ecosia.org/search?q={query}", "Yahoo": "https://search.yahoo.com/search?p={query}",
24
- "Startpage": "https://www.startpage.com/sp/search?q={query}", "Qwant": "https://www.qwant.com/?q={query}",
25
- "Swisscows": "https://swisscows.com/web?query={query}", "You.com": "https://you.com/search?q={query}",
26
- "SearXNG": "https://searx.be/search?q={query}", "MetaGer": "https://metager.org/meta/meta.ger-en?eingabe={query}",
27
- "Yandex": "https://yandex.com/search/?text={query}", "Baidu": "https://www.baidu.com/s?wd={query}",
 
 
 
 
 
 
 
28
  "Perplexity": "https://www.perplexity.ai/search?q={query}"
29
  }
30
 
31
- # --- 2. ADVANCED HTML-TO-MARKDOWN CONVERTER (Unchanged) ---
32
  class HTML_TO_MARKDOWN_CONVERTER:
33
- # ... [The class code is identical and correct] ...
34
  def __init__(self, soup: BeautifulSoup, base_url: str):
35
  self.soup = soup
36
  self.base_url = base_url
@@ -51,7 +83,7 @@ class HTML_TO_MARKDOWN_CONVERTER:
51
  self._cleanup_html()
52
  content_node = self.soup.find('main') or self.soup.find('article') or self.soup.find('body')
53
  if not content_node:
54
- return "Could not find main content."
55
  md = self._process_node(content_node)
56
  return re.sub(r'\n{3,}', '\n\n', md).strip()
57
 
@@ -88,91 +120,87 @@ class HTML_TO_MARKDOWN_CONVERTER:
88
  return f"\n\n![{alt}]({full_src})\n\n"
89
  return inner_md
90
 
91
-
92
- # --- 3. CORE API FUNCTION (WITH LAZY LOADING) ---
93
-
94
  async def perform_web_browse(query: str, browser_name: str, search_engine: str):
95
- """
96
- A stateless function that takes a query, browser, and search engine,
97
- then returns the parsed content of the resulting page.
98
- It launches and caches browsers on-demand.
99
- """
100
- # Step 1: Initialize Playwright process itself if not already running.
101
  if "playwright" not in PLAYWRIGHT_STATE:
102
- print("🚀 First request received, starting Playwright process...")
103
  PLAYWRIGHT_STATE["playwright"] = await async_playwright().start()
104
- print("✅ Playwright process is running.")
105
 
106
- # Step 2: Check if the *specific browser requested* has been launched.
107
- browser_key = browser_name.lower()
108
  if browser_key not in PLAYWRIGHT_STATE:
109
- print(f"🚀 Launching '{browser_key}' for the first time...")
110
  try:
111
  p = PLAYWRIGHT_STATE["playwright"]
112
- if browser_key == 'firefox':
113
- browser_instance = await p.firefox.launch(headless=True)
114
- elif browser_key == 'chromium':
115
- browser_instance = await p.chromium.launch(headless=True)
116
- elif browser_key == 'webkit':
117
- browser_instance = await p.webkit.launch(headless=True)
118
- else:
119
- raise ValueError(f"Invalid browser name: {browser_name}")
120
  PLAYWRIGHT_STATE[browser_key] = browser_instance
121
- print(f"✅ '{browser_key}' is now running and cached.")
122
  except Exception as e:
123
- error_message = str(e).splitlines()[0]
124
- print(f"❌ Failed to launch '{browser_key}': {error_message}")
125
- return {"status": "error", "query": query, "error_message": f"Failed to launch browser '{browser_key}'. Your system might be missing dependencies. Error: {error_message}"}
126
 
127
  browser_instance = PLAYWRIGHT_STATE[browser_key]
128
 
129
- # Step 3: Determine URL
130
- is_url = urllib.parse.urlparse(query).scheme in ['http', 'https']
131
- if is_url:
132
  url = query
133
  else:
134
  search_url_template = SEARCH_ENGINES.get(search_engine)
135
  if not search_url_template:
136
- return {"error": f"Invalid search engine: '{search_engine}'."}
137
  url = search_url_template.format(query=urllib.parse.quote_plus(query))
138
 
139
- # Step 4: Create isolated context and browse
140
- context = await browser_instance.new_context(user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
 
 
 
 
 
 
 
 
 
 
141
  page = await context.new_page()
142
 
143
  try:
144
- print(f"Navigating to: {url} using {browser_name}...")
145
- await page.goto(url, wait_until='domcontentloaded', timeout=30000)
 
 
 
 
 
 
146
  final_url, title = page.url, await page.title() or "No Title"
147
- print(f"Arrived at: {final_url}")
148
 
149
- soup = BeautifulSoup(await page.content(), 'lxml')
 
 
150
  converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
151
  markdown_text = converter.convert()
 
152
 
153
- print("Content parsed successfully.")
154
- return {"status": "success", "query": query, "final_url": final_url, "page_title": title, "markdown_content": markdown_text}
 
 
155
  except Exception as e:
156
- error_message = str(e).splitlines()[0]
157
- print(f"An error occurred: {error_message}")
158
- return {"status": "error", "query": query, "error_message": error_message}
159
  finally:
160
- if page: await page.close()
161
- if context: await context.close()
162
- print("Session context closed.")
163
 
164
-
165
- # --- 4. GRADIO INTERFACE & API LAUNCH (Unchanged) ---
166
  with gr.Blocks(title="Web Browse API", theme=gr.themes.Soft()) as demo:
167
- # ... UI definition is identical ...
168
  gr.Markdown("# Web Browse API")
169
- gr.Markdown("This interface exposes a stateless API endpoint (`/api/web_browse`) to fetch and parse web content.")
170
- query_input = gr.Textbox(label="URL or Search Query", placeholder="e.g., https://openai.com or 'history of artificial intelligence'")
 
 
171
  with gr.Row():
172
  browser_input = gr.Dropdown(label="Browser", choices=["firefox", "chromium", "webkit"], value="firefox", scale=1)
173
  search_engine_input = gr.Dropdown(label="Search Engine (for non-URL queries)", choices=sorted(list(SEARCH_ENGINES.keys())), value="DuckDuckGo", scale=2)
 
174
  submit_button = gr.Button("Browse", variant="primary")
175
  output_json = gr.JSON(label="API Result")
 
176
  submit_button.click(fn=perform_web_browse, inputs=[query_input, browser_input, search_engine_input], outputs=output_json, api_name="web_browse")
177
 
178
  if __name__ == "__main__":
 
1
  import os
2
  os.system("playwright install")
 
3
  import re
4
  import urllib.parse
5
  import asyncio
6
+ from typing import Dict, Optional
7
+ from itertools import cycle
8
 
9
  import gradio as gr
10
  from bs4 import BeautifulSoup, NavigableString
11
  from playwright.async_api import async_playwright
12
 
13
class CredentialRevolver:
    """Round-robin pool of HTTP proxies parsed from a newline-separated string.

    Accepted line formats are ``host:port`` and ``user:pass@host:port``;
    lines that cannot be parsed into a host and port are silently skipped.
    """

    def __init__(self, proxy_string: str):
        # Parse once up front; an empty pool makes get_next() always return None.
        self.proxies = self._parse_proxies(proxy_string)
        self.proxy_cycler = cycle(self.proxies) if self.proxies else None

    def _parse_proxies(self, proxy_string: str):
        """Return Playwright-style proxy dicts parsed from *proxy_string*."""
        entries = []
        if not proxy_string:
            return entries
        for raw_line in proxy_string.strip().splitlines():
            try:
                # The "//" prefix makes urlparse treat the line as a netloc,
                # so credentials, host, and port are all split out for us.
                parsed = urllib.parse.urlparse(f"//{raw_line.strip()}")
                host, port = parsed.hostname, parsed.port  # .port may raise ValueError
            except Exception:
                continue  # best-effort: skip malformed lines
            if not host or not port:
                continue
            entry = {"server": f"http://{host}:{port}"}
            if parsed.username:
                entry["username"] = parsed.username
            if parsed.password:
                entry["password"] = parsed.password
            entries.append(entry)
        return entries

    def get_next(self) -> Optional[Dict]:
        """Return the next proxy dict in rotation, or None if no proxies loaded."""
        if self.proxy_cycler is None:
            return None
        return next(self.proxy_cycler)

    def count(self) -> int:
        """Number of proxies successfully parsed at construction time."""
        return len(self.proxies)
43
 
 
 
44
# Long-lived Playwright resources, populated lazily on first request.
# Keys: "playwright" (the driver) plus one entry per launched engine
# ("firefox" / "chromium" / "webkit").
PLAYWRIGHT_STATE: Dict = {}
# Rotating proxy pool read from the PROXY_LIST env var (newline-separated;
# may be empty, in which case requests go out directly).
REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))

# Search-engine URL templates; {query} is replaced with the URL-encoded query
# for non-URL inputs to perform_web_browse.
SEARCH_ENGINES = {
    "Google": "https://www.google.com/search?q={query}",
    "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
    "Bing": "https://www.bing.com/search?q={query}",
    "Brave": "https://search.brave.com/search?q={query}",
    "Ecosia": "https://www.ecosia.org/search?q={query}",
    "Yahoo": "https://search.yahoo.com/search?p={query}",
    "Startpage": "https://www.startpage.com/sp/search?q={query}",
    "Qwant": "https://www.qwant.com/?q={query}",
    "Swisscows": "https://swisscows.com/web?query={query}",
    "You.com": "https://you.com/search?q={query}",
    "SearXNG": "https://searx.be/search?q={query}",
    "MetaGer": "https://metager.org/meta/meta.ger-en?eingabe={query}",
    "Yandex": "https://yandex.com/search/?text={query}",
    "Baidu": "https://www.baidu.com/s?wd={query}",
    "Perplexity": "https://www.perplexity.ai/search?q={query}"
}
64
 
 
65
  class HTML_TO_MARKDOWN_CONVERTER:
 
66
  def __init__(self, soup: BeautifulSoup, base_url: str):
67
  self.soup = soup
68
  self.base_url = base_url
 
83
  self._cleanup_html()
84
  content_node = self.soup.find('main') or self.soup.find('article') or self.soup.find('body')
85
  if not content_node:
86
+ return ""
87
  md = self._process_node(content_node)
88
  return re.sub(r'\n{3,}', '\n\n', md).strip()
89
 
 
120
  return f"\n\n![{alt}]({full_src})\n\n"
121
  return inner_md
122
 
 
 
 
123
async def perform_web_browse(query: str, browser_name: str, search_engine: str):
    """Fetch *query* (a direct URL or search terms) and return a JSON-serializable
    dict describing the result.

    Browsers are launched lazily and cached in PLAYWRIGHT_STATE. Each request
    runs in a fresh, isolated context — optionally routed through the next
    proxy from REVOLVER — which is always closed before returning.

    Returns a dict with "status": "success" (final_url, page_title,
    http_status, proxy_used, markdown_content) or "status": "error"
    (error_message).
    """
    browser_key = browser_name.lower()

    # Start the Playwright driver process on first use only.
    if "playwright" not in PLAYWRIGHT_STATE:
        PLAYWRIGHT_STATE["playwright"] = await async_playwright().start()

    # Launch and cache the requested browser engine on demand.
    if browser_key not in PLAYWRIGHT_STATE:
        try:
            p = PLAYWRIGHT_STATE["playwright"]
            if browser_key == 'firefox':
                browser_instance = await p.firefox.launch(headless=True)
            elif browser_key == 'chromium':
                browser_instance = await p.chromium.launch(headless=True)
            elif browser_key == 'webkit':
                browser_instance = await p.webkit.launch(headless=True)
            else:
                raise ValueError(f"Invalid browser name: {browser_name}")
            PLAYWRIGHT_STATE[browser_key] = browser_instance
        except Exception as e:
            return {"status": "error", "query": query, "error_message": f"Failed to launch '{browser_key}'. Error: {str(e).splitlines()[0]}"}

    browser_instance = PLAYWRIGHT_STATE[browser_key]

    # Decide whether the query is a direct URL or needs a search engine.
    # Parse once (the original parsed the query twice on the same line).
    parsed_query = urllib.parse.urlparse(query)
    if parsed_query.scheme in ['http', 'https'] and '.' in parsed_query.netloc:
        url = query
    else:
        search_url_template = SEARCH_ENGINES.get(search_engine)
        if not search_url_template:
            return {"status": "error", "query": query, "error_message": f"Invalid search engine: '{search_engine}'."}
        url = search_url_template.format(query=urllib.parse.quote_plus(query))

    proxy_config = REVOLVER.get_next()
    proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"

    context_args = {
        'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'java_script_enabled': True,
        'ignore_https_errors': True
    }
    if proxy_config:
        context_args['proxy'] = proxy_config

    context = await browser_instance.new_context(**context_args)
    page = None
    try:
        # new_page() lives inside the try so the context is closed even if it
        # fails — the original created the page before the try and leaked the
        # context (and relied on a fragile "'page' in locals()" check).
        page = await context.new_page()
        response = await page.goto(url, wait_until='domcontentloaded', timeout=45000)

        # Some engines render results client-side; wait for them to settle.
        current_url = page.url
        if "google.com" in current_url:
            await page.wait_for_selector('div#rso, div#search, body[jsmodel]', timeout=15000)
        elif "perplexity.ai" in current_url or "you.com" in current_url:
            await page.wait_for_timeout(4000)

        final_url, title = page.url, await page.title() or "No Title"

        html_content = await page.content()
        soup = BeautifulSoup(html_content, 'lxml')

        converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
        markdown_text = converter.convert()
        status_code = response.status if response else "N/A"

        return {
            "status": "success", "query": query, "final_url": final_url, "page_title": title,
            "http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
        }
    except Exception as e:
        return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
    finally:
        # Always tear down per-request resources, page first, then context.
        if page is not None and not page.is_closed():
            await page.close()
        await context.close()
 
190
 
 
 
191
# Gradio UI; doubles as the definition of the /api/web_browse endpoint.
with gr.Blocks(title="Web Browse API", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Web Browse API")
    gr.Markdown(f"This interface exposes a stateless API endpoint (`/api/web_browse`) to fetch and parse web content. {REVOLVER.count()} proxies loaded.")

    # Free-form input: either a full URL or plain search terms.
    query_input = gr.Textbox(label="URL or Search Query", placeholder="e.g., https://gradio.app or 'how does gradio work'")

    with gr.Row():
        browser_input = gr.Dropdown(label="Browser", choices=["firefox", "chromium", "webkit"], value="firefox", scale=1)
        search_engine_input = gr.Dropdown(label="Search Engine (for non-URL queries)", choices=sorted(list(SEARCH_ENGINES.keys())), value="DuckDuckGo", scale=2)

    submit_button = gr.Button("Browse", variant="primary")
    output_json = gr.JSON(label="API Result")

    # api_name exposes this handler at /api/web_browse for programmatic clients.
    submit_button.click(fn=perform_web_browse, inputs=[query_input, browser_input, search_engine_input], outputs=output_json, api_name="web_browse")
205
 
206
  if __name__ == "__main__":