habulaj committed
Commit
05d2c32
verified
Parent: 4ffe0a9

Update routers/searchterm.py

Files changed (1)
  1. routers/searchterm.py +71 -63
routers/searchterm.py CHANGED
@@ -6,7 +6,7 @@ import httpx
 import aiohttp
 import trafilatura
 from urllib.parse import urlparse
-from typing import List, Dict, Any, Optional, Tuple
+from typing import List, Dict, Any, Optional
 from fastapi import APIRouter, HTTPException, Body
 from newspaper import Article
 
@@ -33,7 +33,6 @@ BLOCKED_DOMAINS = {"reddit.com", "www.reddit.com", "old.reddit.com",
                    "quora.com", "www.quora.com"}
 
 MAX_TEXT_LENGTH = 4000
-EXTRACTION_CONCURRENCY = int(os.getenv("EXTRACTION_CONCURRENCY", "6"))
 
 
 def is_blocked_domain(url: str) -> bool:
@@ -44,16 +43,6 @@ def is_blocked_domain(url: str) -> bool:
     return False
 
 
-def get_site_name(url: str) -> str:
-    try:
-        host = urlparse(url).netloc
-        if host.startswith('www.'):
-            host = host[4:]
-        return host
-    except Exception:
-        return url
-
-
 def clamp_text(text: str) -> str:
     if not text:
         return ""
@@ -71,29 +60,35 @@ def get_realistic_headers() -> Dict[str, str]:
     }
 
 
-async def search_brave_term(client: httpx.AsyncClient, term: str) -> List[str]:
+async def search_brave_term(client: httpx.AsyncClient, term: str) -> List[Dict[str, str]]:
     params = {"q": term, "count": 10, "safesearch": "off", "summary": "false"}
 
-    resp = await client.get(BRAVE_SEARCH_URL, headers=BRAVE_HEADERS, params=params)
-    if resp.status_code != 200:
-        return []
+    try:
+        resp = await client.get(BRAVE_SEARCH_URL, headers=BRAVE_HEADERS, params=params)
+        if resp.status_code != 200:
+            return []
 
-    data = resp.json()
-    urls: List[str] = []
-    if "web" in data and "results" in data["web"]:
-        for item in data["web"]["results"]:
-            u = item.get("url")
-            if u and not is_blocked_domain(u):
-                urls.append(u)
+        data = resp.json()
+        results: List[Dict[str, str]] = []
+
+        if "web" in data and "results" in data["web"]:
+            for item in data["web"]["results"]:
+                url = item.get("url")
+                age = item.get("age", "Unknown")
+
+                if url and not is_blocked_domain(url):
+                    results.append({"url": url, "age": age})
 
-    return urls
+        return results
+    except Exception:
+        return []
 
 
 async def extract_article_text(url: str, session: aiohttp.ClientSession) -> str:
     try:
         art = Article(url)
         art.config.browser_user_agent = random.choice(USER_AGENTS)
-        art.config.request_timeout = 10
+        art.config.request_timeout = 8
         art.config.number_threads = 1
 
         art.download()
@@ -105,10 +100,10 @@ async def extract_article_text(url: str, session: aiohttp.ClientSession) -> str:
         pass
 
     try:
-        await asyncio.sleep(random.uniform(0.5, 1.5))
+        await asyncio.sleep(random.uniform(0.1, 0.3))
 
         headers = get_realistic_headers()
-        async with session.get(url, headers=headers, timeout=15) as resp:
+        async with session.get(url, headers=headers, timeout=12) as resp:
             if resp.status != 200:
                 return ""
 
@@ -135,41 +130,54 @@ async def search_terms(payload: Dict[str, List[str]] = Body(...)) -> Dict[str, Any]:
         raise HTTPException(status_code=400, detail="Campo 'terms' é obrigatório e deve ser uma lista.")
 
     used_urls = set()
-    results: List[Dict[str, str]] = []
-
-    async with httpx.AsyncClient(timeout=15.0) as http_client:
-        async with aiohttp.ClientSession() as session:
-            semaphore = asyncio.Semaphore(EXTRACTION_CONCURRENCY)
-
-            for idx, term in enumerate(terms):
-                try:
-                    urls = await search_brave_term(http_client, term)
-                except Exception:
-                    if idx < len(terms) - 1:
-                        await asyncio.sleep(2)
-                    continue
-
-                found_valid = False
-
-                for url in urls:
-                    if url in used_urls:
-                        continue
-
-                    async with semaphore:
-                        text = await extract_article_text(url, session)
-
-                    if text:
-                        used_urls.add(url)
-                        results.append({
-                            "term": term,
-                            "site": get_site_name(url),
-                            "url": url,
-                            "text": text
-                        })
-                        found_valid = True
-                        break
-
-                if idx < len(terms) - 1:
-                    await asyncio.sleep(1)
-
-    return {"results": results}
+    search_semaphore = asyncio.Semaphore(20)
+    extract_semaphore = asyncio.Semaphore(50)
+
+    async def search_with_limit(client, term):
+        async with search_semaphore:
+            return await search_brave_term(client, term)
+
+    async def process_term(session, term, search_results):
+        async with extract_semaphore:
+            for result in search_results:
+                url = result["url"]
+                age = result["age"]
+
+                if url in used_urls:
+                    continue
+
+                text = await extract_article_text(url, session)
+                if text:
+                    used_urls.add(url)
+                    return {
+                        "term": term,
+                        "age": age,
+                        "url": url,
+                        "text": text
+                    }
+            return None
+
+    connector = aiohttp.TCPConnector(limit=100, limit_per_host=15)
+    timeout = aiohttp.ClientTimeout(total=15)
+
+    async with httpx.AsyncClient(
+        timeout=15.0,
+        limits=httpx.Limits(max_connections=100, max_keepalive_connections=25)
+    ) as http_client:
+        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
+
+            search_tasks = [search_with_limit(http_client, term) for term in terms]
+            search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
+
+            process_tasks = []
+            for term, results in zip(terms, search_results):
+                if isinstance(results, list) and results:
+                    process_tasks.append(process_term(session, term, results))
+
+            if process_tasks:
+                processed_results = await asyncio.gather(*process_tasks, return_exceptions=True)
+                final_results = [r for r in processed_results if r is not None and not isinstance(r, Exception)]
+            else:
+                final_results = []
+
+    return {"results": final_results}
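For a quick smoke test of the updated handler, here is a minimal client sketch. The request body shape ({"terms": [...]}) and the response fields (term, age, url, text) come straight from the diff; the route path /search-terms and the localhost base URL are assumptions, since the route decorator sits outside the visible hunks.

# Minimal smoke test for the updated endpoint (path and host are assumed).
import asyncio

import httpx


async def main() -> None:
    payload = {"terms": ["fastapi async", "brave search api"]}
    async with httpx.AsyncClient(timeout=30.0) as client:
        # NOTE: "/search-terms" is a hypothetical route; the decorator is not
        # visible in this diff. Adjust to the router's actual path.
        resp = await client.post("http://localhost:8000/search-terms", json=payload)
        resp.raise_for_status()
        for item in resp.json()["results"]:
            # Each result now carries "age" (from Brave) where "site" used to be.
            print(item["term"], item["age"], item["url"], item["text"][:80])


if __name__ == "__main__":
    asyncio.run(main())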