Update routers/searchterm.py

routers/searchterm.py  (+71 -63)
CHANGED
@@ -6,7 +6,7 @@ import httpx
 import aiohttp
 import trafilatura
 from urllib.parse import urlparse
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional
 from fastapi import APIRouter, HTTPException, Body
 from newspaper import Article
 
@@ -33,7 +33,6 @@ BLOCKED_DOMAINS = {"reddit.com", "www.reddit.com", "old.reddit.com",
                    "quora.com", "www.quora.com"}
 
 MAX_TEXT_LENGTH = 4000
-EXTRACTION_CONCURRENCY = int(os.getenv("EXTRACTION_CONCURRENCY", "6"))
 
 
 def is_blocked_domain(url: str) -> bool:
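Note: this hunk drops the env-tunable EXTRACTION_CONCURRENCY knob; the replacement code later in this diff hard-codes its concurrency limits as asyncio semaphores (20 for searches, 50 for extractions). If that tunability is still wanted, a sketch of the equivalent, reusing the os.getenv pattern of the removed line (the SEARCH_CONCURRENCY and EXTRACT_CONCURRENCY variable names here are hypothetical, not part of this commit):

    # Sketch only: keep the new semaphore limits environment-tunable.
    SEARCH_CONCURRENCY = int(os.getenv("SEARCH_CONCURRENCY", "20"))
    EXTRACT_CONCURRENCY = int(os.getenv("EXTRACT_CONCURRENCY", "50"))
    search_semaphore = asyncio.Semaphore(SEARCH_CONCURRENCY)
    extract_semaphore = asyncio.Semaphore(EXTRACT_CONCURRENCY)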
@@ -44,16 +43,6 @@ def is_blocked_domain(url: str) -> bool:
     return False
 
 
-def get_site_name(url: str) -> str:
-    try:
-        host = urlparse(url).netloc
-        if host.startswith('www.'):
-            host = host[4:]
-        return host
-    except Exception:
-        return url
-
-
 def clamp_text(text: str) -> str:
     if not text:
         return ""
@@ -71,29 +60,35 @@ def get_realistic_headers() -> Dict[str, str]:
     }
 
 
-async def search_brave_term(client: httpx.AsyncClient, term: str) -> List[str]:
+async def search_brave_term(client: httpx.AsyncClient, term: str) -> List[Dict[str, str]]:
     params = {"q": term, "count": 10, "safesearch": "off", "summary": "false"}
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        resp = await client.get(BRAVE_SEARCH_URL, headers=BRAVE_HEADERS, params=params)
+        if resp.status_code != 200:
+            return []
+
+        data = resp.json()
+        results: List[Dict[str, str]] = []
+
+        if "web" in data and "results" in data["web"]:
+            for item in data["web"]["results"]:
+                url = item.get("url")
+                age = item.get("age", "Unknown")
+
+                if url and not is_blocked_domain(url):
+                    results.append({"url": url, "age": age})
+
+        return results
+    except Exception:
+        return []
 
 
 async def extract_article_text(url: str, session: aiohttp.ClientSession) -> str:
     try:
         art = Article(url)
         art.config.browser_user_agent = random.choice(USER_AGENTS)
-        art.config.request_timeout =
+        art.config.request_timeout = 8
         art.config.number_threads = 1
 
         art.download()
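For context, the new search_brave_term body references BRAVE_SEARCH_URL and BRAVE_HEADERS, which this diff does not define; they presumably live earlier in the module. A minimal sketch of what they would look like against Brave's Web Search API (the endpoint path and the X-Subscription-Token header are Brave's documented values; the BRAVE_API_KEY env var name is an assumption):

    # Assumed module-level constants (not shown in this diff).
    BRAVE_SEARCH_URL = "https://api.search.brave.com/res/v1/web/search"
    BRAVE_HEADERS = {
        "Accept": "application/json",
        "X-Subscription-Token": os.getenv("BRAVE_API_KEY", ""),  # hypothetical env var name
    }

The parser then walks data["web"]["results"], which matches the shape Brave documents for web results, and drops anything on a blocked domain before it ever reaches extraction.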
@@ -105,10 +100,10 @@ async def extract_article_text(url: str, session: aiohttp.ClientSession) -> str:
         pass
 
     try:
-        await asyncio.sleep(random.uniform(0.
+        await asyncio.sleep(random.uniform(0.1, 0.3))
 
         headers = get_realistic_headers()
-        async with session.get(url, headers=headers, timeout=
+        async with session.get(url, headers=headers, timeout=12) as resp:
             if resp.status != 200:
                 return ""
 
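The bare timeout=12 works because aiohttp coerces a plain number passed to a request's timeout into a total timeout; if the intent should be spelled out, the explicit equivalent is:

    # Equivalent explicit form: a bare number is treated as the total timeout in seconds.
    async with session.get(url, headers=headers,
                           timeout=aiohttp.ClientTimeout(total=12)) as resp:
        ...

The 0.1-0.3 s jittered sleep before each fallback fetch spreads requests out so bursts from the gathered tasks look less bot-like to target sites.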
@@ -135,41 +130,54 @@ async def search_terms(payload: Dict[str, List[str]] = Body(...)) -> Dict[str, Any]:
         raise HTTPException(status_code=400, detail="Campo 'terms' é obrigatório e deve ser uma lista.")
 
     used_urls = set()
-
-
-
-
-
-
-
-
-
-
-
-
-            continue
-
-        found_valid = False
-
-
-            continue
-
-            async with semaphore:
-                text = await extract_article_text(url, session)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    search_semaphore = asyncio.Semaphore(20)
+    extract_semaphore = asyncio.Semaphore(50)
+
+    async def search_with_limit(client, term):
+        async with search_semaphore:
+            return await search_brave_term(client, term)
+
+    async def process_term(session, term, search_results):
+        async with extract_semaphore:
+            for result in search_results:
+                url = result["url"]
+                age = result["age"]
+
+                if url in used_urls:
+                    continue
+
+                text = await extract_article_text(url, session)
+                if text:
+                    used_urls.add(url)
+                    return {
+                        "term": term,
+                        "age": age,
+                        "url": url,
+                        "text": text
+                    }
+        return None
+
+    connector = aiohttp.TCPConnector(limit=100, limit_per_host=15)
+    timeout = aiohttp.ClientTimeout(total=15)
+
+    async with httpx.AsyncClient(
+        timeout=15.0,
+        limits=httpx.Limits(max_connections=100, max_keepalive_connections=25)
+    ) as http_client:
+        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
+
+            search_tasks = [search_with_limit(http_client, term) for term in terms]
+            search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
+
+            process_tasks = []
+            for term, results in zip(terms, search_results):
+                if isinstance(results, list) and results:
+                    process_tasks.append(process_term(session, term, results))
+
+            if process_tasks:
+                processed_results = await asyncio.gather(*process_tasks, return_exceptions=True)
+                final_results = [r for r in processed_results if r is not None and not isinstance(r, Exception)]
+            else:
+                final_results = []
+
+            return {"results": final_results}
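Taken together, the rewrite replaces the old sequential per-term loop with a fan-out: all Brave searches run first (capped at 20 concurrent), then one extraction task per term runs (capped at 50), each returning the first non-blocked URL whose text extracts successfully. One design caveat worth noting: used_urls is checked before an await and only updated after it, so two terms sharing a candidate URL can both pass the dedup check before either adds the URL. A quick way to exercise the endpoint once deployed (the base URL and route path below are assumptions; the real prefix depends on how this APIRouter is mounted):

    import asyncio
    import httpx

    async def main():
        async with httpx.AsyncClient(timeout=60.0) as client:
            # Hypothetical local URL and path; adjust to the actual app setup.
            resp = await client.post(
                "http://localhost:8000/searchterm",
                json={"terms": ["fastapi background tasks", "httpx connection pooling"]},
            )
            # Expected shape: {"results": [{"term", "age", "url", "text"}, ...]}
            print(resp.json())

    asyncio.run(main())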