Update routers/searchterm.py

routers/searchterm.py  (+71 -63)
CHANGED
@@ -6,7 +6,7 @@ import httpx
 import aiohttp
 import trafilatura
 from urllib.parse import urlparse
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional
 from fastapi import APIRouter, HTTPException, Body
 from newspaper import Article
 
@@ -33,7 +33,6 @@ BLOCKED_DOMAINS = {"reddit.com", "www.reddit.com", "old.reddit.com",
                    "quora.com", "www.quora.com"}
 
 MAX_TEXT_LENGTH = 4000
-EXTRACTION_CONCURRENCY = int(os.getenv("EXTRACTION_CONCURRENCY", "6"))
 
 
 def is_blocked_domain(url: str) -> bool:
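Note: this hunk drops the env-tunable EXTRACTION_CONCURRENCY knob; the replacement code later in this diff hard-codes its concurrency limits as asyncio semaphores (20 for searches, 50 for extractions). If that tunability is still wanted, a sketch of the equivalent, reusing the os.getenv pattern of the removed line (the SEARCH_CONCURRENCY and EXTRACT_CONCURRENCY variable names here are hypothetical, not part of this commit):

    # Sketch only: keep the new semaphore limits environment-tunable.
    SEARCH_CONCURRENCY = int(os.getenv("SEARCH_CONCURRENCY", "20"))
    EXTRACT_CONCURRENCY = int(os.getenv("EXTRACT_CONCURRENCY", "50"))
    search_semaphore = asyncio.Semaphore(SEARCH_CONCURRENCY)
    extract_semaphore = asyncio.Semaphore(EXTRACT_CONCURRENCY)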
@@ -44,16 +43,6 @@ def is_blocked_domain(url: str) -> bool:
     return False
 
 
-def get_site_name(url: str) -> str:
-    try:
-        host = urlparse(url).netloc
-        if host.startswith('www.'):
-            host = host[4:]
-        return host
-    except Exception:
-        return url
-
-
 def clamp_text(text: str) -> str:
     if not text:
         return ""
@@ -71,29 +60,35 @@ def get_realistic_headers() -> Dict[str, str]:
     }
 
 
-async def search_brave_term(client: httpx.AsyncClient, term: str) -> List[str]:
+async def search_brave_term(client: httpx.AsyncClient, term: str) -> List[Dict[str, str]]:
     params = {"q": term, "count": 10, "safesearch": "off", "summary": "false"}
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        resp = await client.get(BRAVE_SEARCH_URL, headers=BRAVE_HEADERS, params=params)
+        if resp.status_code != 200:
+            return []
+
+        data = resp.json()
+        results: List[Dict[str, str]] = []
+
+        if "web" in data and "results" in data["web"]:
+            for item in data["web"]["results"]:
+                url = item.get("url")
+                age = item.get("age", "Unknown")
+
+                if url and not is_blocked_domain(url):
+                    results.append({"url": url, "age": age})
+
+        return results
+    except Exception:
+        return []
 
 
 async def extract_article_text(url: str, session: aiohttp.ClientSession) -> str:
     try:
         art = Article(url)
         art.config.browser_user_agent = random.choice(USER_AGENTS)
-        art.config.request_timeout =
+        art.config.request_timeout = 8
         art.config.number_threads = 1
 
         art.download()
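For context, the new search_brave_term body references BRAVE_SEARCH_URL and BRAVE_HEADERS, which this diff does not define; they presumably live earlier in the module. A minimal sketch of what they would look like against Brave's Web Search API (the endpoint path and the X-Subscription-Token header are Brave's documented values; the BRAVE_API_KEY env var name is an assumption):

    # Assumed module-level constants (not shown in this diff).
    BRAVE_SEARCH_URL = "https://api.search.brave.com/res/v1/web/search"
    BRAVE_HEADERS = {
        "Accept": "application/json",
        "X-Subscription-Token": os.getenv("BRAVE_API_KEY", ""),  # hypothetical env var name
    }

The parser then walks data["web"]["results"], which matches the shape Brave documents for web results, and drops anything on a blocked domain before it ever reaches extraction.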
@@ -105,10 +100,10 @@ async def extract_article_text(url: str, session: aiohttp.ClientSession) -> str:
         pass
 
     try:
-        await asyncio.sleep(random.uniform(0.
+        await asyncio.sleep(random.uniform(0.1, 0.3))
 
         headers = get_realistic_headers()
-        async with session.get(url, headers=headers, timeout=
+        async with session.get(url, headers=headers, timeout=12) as resp:
             if resp.status != 200:
                 return ""
 
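The bare timeout=12 works because aiohttp coerces a plain number passed to a request's timeout into a total timeout; if the intent should be spelled out, the explicit equivalent is:

    # Equivalent explicit form: a bare number is treated as the total timeout in seconds.
    async with session.get(url, headers=headers,
                           timeout=aiohttp.ClientTimeout(total=12)) as resp:
        ...

The 0.1-0.3 s jittered sleep before each fallback fetch spreads requests out so bursts from the gathered tasks look less bot-like to target sites.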
@@ -135,41 +130,54 @@ async def search_terms(payload: Dict[str, List[str]] = Body(...)) -> Dict[str, Any]:
         raise HTTPException(status_code=400, detail="Campo 'terms' é obrigatório e deve ser uma lista.")
 
     used_urls = set()
-
-
-
-
-
-
-
-
-
-
-
-
-            continue
-
-        found_valid = False
-
-
-            continue
-
-            async with semaphore:
-                text = await extract_article_text(url, session)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    search_semaphore = asyncio.Semaphore(20)
+    extract_semaphore = asyncio.Semaphore(50)
+
+    async def search_with_limit(client, term):
+        async with search_semaphore:
+            return await search_brave_term(client, term)
+
+    async def process_term(session, term, search_results):
+        async with extract_semaphore:
+            for result in search_results:
+                url = result["url"]
+                age = result["age"]
+
+                if url in used_urls:
+                    continue
+
+                text = await extract_article_text(url, session)
+                if text:
+                    used_urls.add(url)
+                    return {
+                        "term": term,
+                        "age": age,
+                        "url": url,
+                        "text": text
+                    }
+        return None
+
+    connector = aiohttp.TCPConnector(limit=100, limit_per_host=15)
+    timeout = aiohttp.ClientTimeout(total=15)
+
+    async with httpx.AsyncClient(
+        timeout=15.0,
+        limits=httpx.Limits(max_connections=100, max_keepalive_connections=25)
+    ) as http_client:
+        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
+
+            search_tasks = [search_with_limit(http_client, term) for term in terms]
+            search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
+
+            process_tasks = []
+            for term, results in zip(terms, search_results):
+                if isinstance(results, list) and results:
+                    process_tasks.append(process_term(session, term, results))
+
+            if process_tasks:
+                processed_results = await asyncio.gather(*process_tasks, return_exceptions=True)
+                final_results = [r for r in processed_results if r is not None and not isinstance(r, Exception)]
+            else:
+                final_results = []
+
+            return {"results": final_results}
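Taken together, the rewrite replaces the old sequential per-term loop with a fan-out: all Brave searches run first (capped at 20 concurrent), then one extraction task per term runs (capped at 50), each returning the first non-blocked URL whose text extracts successfully. One design caveat worth noting: used_urls is checked before an await and only updated after it, so two terms sharing a candidate URL can both pass the dedup check before either adds the URL. A quick way to exercise the endpoint once deployed (the base URL and route path below are assumptions; the real prefix depends on how this APIRouter is mounted):

    import asyncio
    import httpx

    async def main():
        async with httpx.AsyncClient(timeout=60.0) as client:
            # Hypothetical local URL and path; adjust to the actual app setup.
            resp = await client.post(
                "http://localhost:8000/searchterm",
                json={"terms": ["fastapi background tasks", "httpx connection pooling"]},
            )
            # Expected shape: {"results": [{"term", "age", "url", "text"}, ...]}
            print(resp.json())

    asyncio.run(main())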