broadfield-dev committed
Commit 56976c0 · verified · 1 Parent(s): 3880e24

Update app.py

Files changed (1)
  1. app.py +22 -29
app.py CHANGED
@@ -40,21 +40,21 @@ PLAYWRIGHT_STATE: Dict = {}
 REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
 
 SEARCH_ENGINES = {
-    "Google": ("https://www.google.com/search?q={query}&hl=en", '#rso, #search, #botstuff'),
-    "DuckDuckGo": ("https://duckduckgo.com/html/?q={query}", '#links'),
-    "Bing": ("https://www.bing.com/search?q={query}", '#b_results'),
-    "Brave": ("https://search.brave.com/search?q={query}", '#results'),
-    "Ecosia": ("https://www.ecosia.org/search?q={query}", 'main[role="main"]'),
-    "Yahoo": ("https://search.yahoo.com/search?p={query}", '#web'),
-    "Startpage": ("https://www.startpage.com/sp/search?q={query}", '#main'),
-    "Qwant": ("https://www.qwant.com/?q={query}", '[data-testid="web-results"]'),
-    "Swisscows": ("https://swisscows.com/web?query={query}", '.web-results'),
-    "You.com": ("https://you.com/search?q={query}", '#search-results'),
-    "SearXNG": ("https://searx.be/search?q={query}", '#results'),
-    "MetaGer": ("https://metager.org/meta/meta.ger-en?eingabe={query}", '#results'),
-    "Yandex": ("https://yandex.com/search/?text={query}", '#search-result'),
-    "Baidu": ("https://www.baidu.com/s?wd={query}", '#content_left'),
-    "Perplexity": ("https://www.perplexity.ai/search?q={query}", 'div[class*="prose"]'),
+    "Google": "https://www.google.com/search?q={query}&hl=en",
+    "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
+    "Bing": "https://www.bing.com/search?q={query}",
+    "Brave": "https://search.brave.com/search?q={query}",
+    "Ecosia": "https://www.ecosia.org/search?q={query}",
+    "Yahoo": "https://search.yahoo.com/search?p={query}",
+    "Startpage": "https://www.startpage.com/sp/search?q={query}",
+    "Qwant": "https://www.qwant.com/?q={query}",
+    "Swisscows": "https://swisscows.com/web?query={query}",
+    "You.com": "https://you.com/search?q={query}",
+    "SearXNG": "https://searx.be/search?q={query}",
+    "MetaGer": "https://metager.org/meta/meta.ger-en?eingabe={query}",
+    "Yandex": "https://yandex.com/search/?text={query}",
+    "Baidu": "https://www.baidu.com/s?wd={query}",
+    "Perplexity": "https://www.perplexity.ai/search?q={query}",
 }
 
 class HTML_TO_MARKDOWN_CONVERTER:
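
The per-engine CSS selectors are gone; each entry is now just a URL template. A minimal sketch of how such a template can be expanded into a request URL, assuming the same urllib.parse.quote_plus encoding used later in the diff (the build_search_url helper is illustrative, not part of app.py):

```python
import urllib.parse

# Simplified mapping after this commit: engine name -> URL template only,
# no per-engine CSS content selector.
SEARCH_ENGINES = {
    "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
    "Bing": "https://www.bing.com/search?q={query}",
}

def build_search_url(engine: str, query: str) -> str:
    """Illustrative helper: look up the template and URL-encode the query."""
    template = SEARCH_ENGINES.get(engine)
    if template is None:
        raise ValueError(f"Invalid search engine: '{engine}'")
    return template.format(query=urllib.parse.quote_plus(query))

# build_search_url("Bing", "playwright anti-bot")
# -> "https://www.bing.com/search?q=playwright+anti-bot"
```
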
@@ -130,13 +130,11 @@ async def perform_web_browse(query: str, browser_name: str, search_engine_name:
 
     if is_direct_url:
         url = query
-        content_selector = 'body'
     else:
-        engine_data = SEARCH_ENGINES.get(search_engine_name)
-        if not engine_data:
+        url_template = SEARCH_ENGINES.get(search_engine_name)
+        if not url_template:
             return {"status": "error", "query": query, "error_message": f"Invalid search engine: '{search_engine_name}'."}
-        url, content_selector = engine_data
-        url = url.format(query=urllib.parse.quote_plus(query))
+        url = url_template.format(query=urllib.parse.quote_plus(query))
 
     proxy_config = REVOLVER.get_next()
     proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"
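
Proxy selection is unchanged: each request takes the next credential from the revolver and falls back to a direct connection. A tiny sketch of the reporting logic above, assuming get_next() yields either a Playwright-style proxy dict with a "server" key or None:

```python
from typing import Dict, Optional

def describe_connection(proxy_config: Optional[Dict]) -> str:
    # Same ternary as in the hunk above: report the proxy server in use,
    # or "Direct Connection" when no proxy was returned.
    return proxy_config["server"] if proxy_config else "Direct Connection"

print(describe_connection({"server": "http://127.0.0.1:8080"}))  # http://127.0.0.1:8080
print(describe_connection(None))                                 # Direct Connection
```
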
@@ -148,17 +146,12 @@ async def perform_web_browse(query: str, browser_name: str, search_engine_name:
     page = await context.new_page()
 
     try:
-        response = await page.goto(url, wait_until='commit', timeout=20000)
-
-        await asyncio.wait([
-            page.wait_for_load_state('domcontentloaded', timeout=15000),
-            page.wait_for_selector(content_selector, timeout=15000),
-        ], return_when=asyncio.FIRST_COMPLETED)
+        response = await page.goto(url, wait_until='domcontentloaded', timeout=25000)
 
         html_content = await page.content()
 
-        if any(phrase in html_content for phrase in ["unusual traffic", "CAPTCHA", "prove you are human", "before you continue"]):
-            raise Exception("Anti-bot measure detected. Try another search engine or proxy.")
+        if any(phrase in html_content for phrase in ["unusual traffic", "CAPTCHA", "are you human", "not a robot"]):
+            raise Exception(f"Anti-bot measure detected on {page.url}. Try another search engine or proxy.")
 
         final_url, title = page.url, await page.title() or "No Title"
         soup = BeautifulSoup(html_content, 'lxml')
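
Navigation is simpler as well: a single goto with wait_until='domcontentloaded' replaces the earlier commit-then-race approach, and the anti-bot check just scans the HTML for suspect phrases. A self-contained sketch of that flow, assuming a stock async Playwright Chromium launch (the fetch_html helper and the bare launch are illustrative; app.py builds its context with proxies and a choice of browsers):

```python
import asyncio
from playwright.async_api import async_playwright

ANTI_BOT_PHRASES = ["unusual traffic", "CAPTCHA", "are you human", "not a robot"]

async def fetch_html(url: str) -> str:
    """Navigate with a single domcontentloaded wait (no per-engine selector),
    then raise if the page looks like an anti-bot interstitial."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=25000)
            html = await page.content()
            if any(phrase in html for phrase in ANTI_BOT_PHRASES):
                raise Exception(f"Anti-bot measure detected on {page.url}.")
            return html
        finally:
            await browser.close()

# asyncio.run(fetch_html("https://www.bing.com/search?q=test"))
```
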
@@ -170,7 +163,7 @@ async def perform_web_browse(query: str, browser_name: str, search_engine_name:
     except Exception as e:
         error_message = str(e).splitlines()[0]
         if "Timeout" in error_message:
-            return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Page failed to load or find content for '{query}'. The site may be slow, blocking automation, or the content selector '{content_selector}' was not found."}
+            return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation Timeout: The page for '{query}' took too long to load."}
         return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": error_message}
     finally:
         if 'page' in locals() and not page.is_closed(): await page.close()
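
On timeouts the function now returns a generic navigation-timeout message rather than one tied to a content selector. A hedged usage sketch of consuming the structured result, assuming perform_web_browse can be imported from app.py (only the keys visible in this diff are relied on):

```python
import asyncio
from app import perform_web_browse  # assumed module layout

async def main() -> None:
    # Arguments mirror the signature in the hunk headers: query, browser_name,
    # search_engine_name; the concrete values here are examples only.
    result = await perform_web_browse("latest playwright release", "chromium", "DuckDuckGo")
    if result.get("status") == "error":
        # Error paths above always carry error_message; proxy_used is set
        # once a connection attempt has been made.
        print(f"[{result.get('proxy_used', 'n/a')}] {result['error_message']}")
    else:
        print(result)

asyncio.run(main())
```
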
 