Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -40,21 +40,21 @@ PLAYWRIGHT_STATE: Dict = {}
|
|
40 |
REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
|
41 |
|
42 |
SEARCH_ENGINES = {
|
43 |
-
"Google":
|
44 |
-
"DuckDuckGo":
|
45 |
-
"Bing":
|
46 |
-
"Brave":
|
47 |
-
"Ecosia":
|
48 |
-
"Yahoo":
|
49 |
-
"Startpage":
|
50 |
-
"Qwant":
|
51 |
-
"Swisscows":
|
52 |
-
"You.com":
|
53 |
-
"SearXNG":
|
54 |
-
"MetaGer":
|
55 |
-
"Yandex":
|
56 |
-
"Baidu":
|
57 |
-
"Perplexity":
|
58 |
}
|
59 |
|
60 |
class HTML_TO_MARKDOWN_CONVERTER:
|
@@ -130,13 +130,11 @@ async def perform_web_browse(query: str, browser_name: str, search_engine_name:
|
|
130 |
|
131 |
if is_direct_url:
|
132 |
url = query
|
133 |
-
content_selector = 'body'
|
134 |
else:
|
135 |
-
|
136 |
-
if not
|
137 |
return {"status": "error", "query": query, "error_message": f"Invalid search engine: '{search_engine_name}'."}
|
138 |
-
url
|
139 |
-
url = url.format(query=urllib.parse.quote_plus(query))
|
140 |
|
141 |
proxy_config = REVOLVER.get_next()
|
142 |
proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"
|
@@ -148,17 +146,12 @@ async def perform_web_browse(query: str, browser_name: str, search_engine_name:
|
|
148 |
page = await context.new_page()
|
149 |
|
150 |
try:
|
151 |
-
response = await page.goto(url, wait_until='
|
152 |
-
|
153 |
-
await asyncio.wait([
|
154 |
-
page.wait_for_load_state('domcontentloaded', timeout=15000),
|
155 |
-
page.wait_for_selector(content_selector, timeout=15000),
|
156 |
-
], return_when=asyncio.FIRST_COMPLETED)
|
157 |
|
158 |
html_content = await page.content()
|
159 |
|
160 |
-
if any(phrase in html_content for phrase in ["unusual traffic", "CAPTCHA", "
|
161 |
-
raise Exception("Anti-bot measure detected. Try another search engine or proxy.")
|
162 |
|
163 |
final_url, title = page.url, await page.title() or "No Title"
|
164 |
soup = BeautifulSoup(html_content, 'lxml')
|
@@ -170,7 +163,7 @@ async def perform_web_browse(query: str, browser_name: str, search_engine_name:
|
|
170 |
except Exception as e:
|
171 |
error_message = str(e).splitlines()[0]
|
172 |
if "Timeout" in error_message:
|
173 |
-
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"
|
174 |
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": error_message}
|
175 |
finally:
|
176 |
if 'page' in locals() and not page.is_closed(): await page.close()
|
|
|
40 |
REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
|
41 |
|
42 |
SEARCH_ENGINES = {
|
43 |
+
"Google": "https://www.google.com/search?q={query}&hl=en",
|
44 |
+
"DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
|
45 |
+
"Bing": "https://www.bing.com/search?q={query}",
|
46 |
+
"Brave": "https://search.brave.com/search?q={query}",
|
47 |
+
"Ecosia": "https://www.ecosia.org/search?q={query}",
|
48 |
+
"Yahoo": "https://search.yahoo.com/search?p={query}",
|
49 |
+
"Startpage": "https://www.startpage.com/sp/search?q={query}",
|
50 |
+
"Qwant": "https://www.qwant.com/?q={query}",
|
51 |
+
"Swisscows": "https://swisscows.com/web?query={query}",
|
52 |
+
"You.com": "https://you.com/search?q={query}",
|
53 |
+
"SearXNG": "https://searx.be/search?q={query}",
|
54 |
+
"MetaGer": "https://metager.org/meta/meta.ger-en?eingabe={query}",
|
55 |
+
"Yandex": "https://yandex.com/search/?text={query}",
|
56 |
+
"Baidu": "https://www.baidu.com/s?wd={query}",
|
57 |
+
"Perplexity": "https://www.perplexity.ai/search?q={query}",
|
58 |
}
|
59 |
|
60 |
class HTML_TO_MARKDOWN_CONVERTER:
|
|
|
130 |
|
131 |
if is_direct_url:
|
132 |
url = query
|
|
|
133 |
else:
|
134 |
+
url_template = SEARCH_ENGINES.get(search_engine_name)
|
135 |
+
if not url_template:
|
136 |
return {"status": "error", "query": query, "error_message": f"Invalid search engine: '{search_engine_name}'."}
|
137 |
+
url = url_template.format(query=urllib.parse.quote_plus(query))
|
|
|
138 |
|
139 |
proxy_config = REVOLVER.get_next()
|
140 |
proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"
|
|
|
146 |
page = await context.new_page()
|
147 |
|
148 |
try:
|
149 |
+
response = await page.goto(url, wait_until='domcontentloaded', timeout=25000)
|
|
|
|
|
|
|
|
|
|
|
150 |
|
151 |
html_content = await page.content()
|
152 |
|
153 |
+
if any(phrase in html_content for phrase in ["unusual traffic", "CAPTCHA", "are you human", "not a robot"]):
|
154 |
+
raise Exception(f"Anti-bot measure detected on {page.url}. Try another search engine or proxy.")
|
155 |
|
156 |
final_url, title = page.url, await page.title() or "No Title"
|
157 |
soup = BeautifulSoup(html_content, 'lxml')
|
|
|
163 |
except Exception as e:
|
164 |
error_message = str(e).splitlines()[0]
|
165 |
if "Timeout" in error_message:
|
166 |
+
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation Timeout: The page for '{query}' took too long to load."}
|
167 |
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": error_message}
|
168 |
finally:
|
169 |
if 'page' in locals() and not page.is_closed(): await page.close()
|