Spaces:
Running
Running
Update routers/searchterm.py
Browse files- routers/searchterm.py +102 -1
routers/searchterm.py
CHANGED
@@ -5,10 +5,16 @@ import asyncio
|
|
5 |
import httpx
|
6 |
import aiohttp
|
7 |
import trafilatura
|
|
|
|
|
|
|
|
|
8 |
from urllib.parse import urlparse
|
9 |
from typing import List, Dict, Any, Optional
|
10 |
from fastapi import APIRouter, HTTPException, Body
|
|
|
11 |
from newspaper import Article
|
|
|
12 |
|
13 |
router = APIRouter()
|
14 |
|
@@ -34,6 +40,13 @@ BLOCKED_DOMAINS = {"reddit.com", "www.reddit.com", "old.reddit.com",
|
|
34 |
|
35 |
MAX_TEXT_LENGTH = 4000
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
def is_blocked_domain(url: str) -> bool:
|
39 |
try:
|
@@ -60,6 +73,44 @@ def get_realistic_headers() -> Dict[str, str]:
|
|
60 |
}
|
61 |
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
async def search_brave_term(client: httpx.AsyncClient, term: str) -> List[Dict[str, str]]:
|
64 |
params = {"q": term, "count": 10, "safesearch": "off", "summary": "false"}
|
65 |
|
@@ -180,4 +231,54 @@ async def search_terms(payload: Dict[str, List[str]] = Body(...)) -> Dict[str, A
|
|
180 |
else:
|
181 |
final_results = []
|
182 |
|
183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
import httpx
|
6 |
import aiohttp
|
7 |
import trafilatura
|
8 |
+
import json
|
9 |
+
import uuid
|
10 |
+
import time
|
11 |
+
from pathlib import Path
|
12 |
from urllib.parse import urlparse
|
13 |
from typing import List, Dict, Any, Optional
|
14 |
from fastapi import APIRouter, HTTPException, Body
|
15 |
+
from fastapi.responses import FileResponse
|
16 |
from newspaper import Article
|
17 |
+
from threading import Timer
|
18 |
|
19 |
router = APIRouter()
|
20 |
|
|
|
40 |
|
41 |
MAX_TEXT_LENGTH = 4000
|
42 |
|
43 |
+
# Directory where generated result files are written.
TEMP_DIR = Path("/tmp")
TEMP_DIR.mkdir(exist_ok=True)

# Registry of live temporary files, keyed by file id. Entries are added by
# create_temp_file and removed by delete_temp_file / download_temp_file.
temp_files: Dict[str, Dict[str, Any]] = {}
|
49 |
+
|
50 |
|
51 |
def is_blocked_domain(url: str) -> bool:
|
52 |
try:
|
|
|
73 |
}
|
74 |
|
75 |
|
76 |
+
def delete_temp_file(file_id: str, file_path: Path):
    """Delete a temporary file and drop it from the ``temp_files`` registry.

    Used as the ``threading.Timer`` callback scheduled by
    ``create_temp_file``. Filesystem errors are reported on stdout rather
    than raised, so a failed removal does not end the timer thread with an
    unhandled traceback.

    Args:
        file_id: Key of the file in the module-level ``temp_files`` registry.
        file_path: Location of the file on disk.
    """
    # Release the registry entry first so it is dropped even when the unlink
    # below fails; otherwise the id would stay listed indefinitely.
    temp_files.pop(file_id, None)

    try:
        file_path.unlink()
    except FileNotFoundError:
        # Already gone (removed externally or by a concurrent call): the
        # desired end state is reached, so this is not an error.
        pass
    except OSError as e:
        print(f"Erro ao remover arquivo temporário: {e}")
        return

    print(f"Arquivo temporário removido: {file_path}")
|
85 |
+
|
86 |
+
|
87 |
+
def create_temp_file(data: Dict[str, Any]) -> Dict[str, Any]:
    """Write ``data`` as JSON to a temporary file and schedule its removal.

    The file is created under ``TEMP_DIR`` with a random UUID in its name,
    registered in ``temp_files``, and deleted by ``delete_temp_file`` after
    24 hours.

    Args:
        data: JSON-serializable payload to store.

    Returns:
        A dict with ``file_id``, the relative ``download_url`` and
        ``expires_in_hours``.

    Raises:
        TypeError, ValueError: If ``data`` cannot be serialized to JSON.
        OSError: If the file cannot be written.
    """
    file_id = str(uuid.uuid4())
    file_path = TEMP_DIR / f"fontes_{file_id}.txt"

    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
    except Exception:
        # Nothing is registered yet, so no timer would ever clean up a
        # partially written file; remove it before propagating the error.
        try:
            file_path.unlink()
        except OSError:
            pass
        raise

    # Schedule removal in 24 hours (86400 seconds).
    timer = Timer(86400, delete_temp_file, args=[file_id, file_path])
    # Timer threads are non-daemon by default and would keep the process
    # alive at shutdown until every pending timer has fired.
    timer.daemon = True

    # Register before starting the timer so the callback always finds the entry.
    temp_files[file_id] = {
        "path": file_path,
        "created_at": time.time(),
        "timer": timer
    }
    timer.start()

    return {
        "file_id": file_id,
        "download_url": f"/download-temp/{file_id}",
        "expires_in_hours": 24
    }
|
112 |
+
|
113 |
+
|
114 |
async def search_brave_term(client: httpx.AsyncClient, term: str) -> List[Dict[str, str]]:
|
115 |
params = {"q": term, "count": 10, "safesearch": "off", "summary": "false"}
|
116 |
|
|
|
231 |
else:
|
232 |
final_results = []
|
233 |
|
234 |
+
# Cria o JSON final
|
235 |
+
result_data = {"results": final_results}
|
236 |
+
|
237 |
+
# Cria arquivo temporário
|
238 |
+
temp_file_info = create_temp_file(result_data)
|
239 |
+
|
240 |
+
return {
|
241 |
+
"message": "Dados salvos em arquivo temporário",
|
242 |
+
"total_results": len(final_results),
|
243 |
+
"file_info": temp_file_info
|
244 |
+
}
|
245 |
+
|
246 |
+
|
247 |
+
@router.get("/download-temp/{file_id}")
async def download_temp_file(file_id: str):
    """Serve a previously generated temporary file as an attachment.

    Args:
        file_id: Identifier returned by ``create_temp_file``.

    Returns:
        A ``FileResponse`` that downloads the file as ``fontes.txt``.

    Raises:
        HTTPException: 404 when the id is not registered, or when the
            registered file no longer exists on disk.
    """
    # Single lookup instead of "in" followed by a subscript: the cleanup
    # timer runs on another thread and may pop the entry in between, which
    # would surface as a KeyError (HTTP 500) rather than a 404.
    file_info = temp_files.get(file_id)
    if file_info is None:
        raise HTTPException(status_code=404, detail="Arquivo não encontrado ou expirado")

    file_path = file_info["path"]

    if not file_path.exists():
        # The file vanished without the registry being updated; drop the
        # stale entry so later requests take the fast 404 path.
        temp_files.pop(file_id, None)
        raise HTTPException(status_code=404, detail="Arquivo não encontrado")

    return FileResponse(
        path=str(file_path),
        filename="fontes.txt",
        media_type="text/plain",
        headers={"Content-Disposition": "attachment; filename=fontes.txt"}
    )
|
266 |
+
|
267 |
+
|
268 |
+
@router.get("/temp-files/status")
async def get_temp_files_status():
    """Report age, remaining lifetime and on-disk presence of each temp file.

    Debug endpoint.

    NOTE(review): this lists every file id, and a file id is all that
    ``/download-temp/{file_id}`` requires to serve a file — confirm the
    route is not reachable by untrusted clients.

    Returns:
        ``{"temp_files": {file_id: {"age_hours", "remaining_hours", "exists"}}}``
    """
    status = {}
    current_time = time.time()

    # Iterate over a snapshot: cleanup timers remove entries from another
    # thread, and resizing a dict while iterating it raises RuntimeError.
    for file_id, info in list(temp_files.items()):
        age_hours = (current_time - info["created_at"]) / 3600
        # 24 matches the lifetime scheduled by create_temp_file (86400 s).
        remaining_hours = max(0, 24 - age_hours)

        status[file_id] = {
            "age_hours": round(age_hours, 2),
            "remaining_hours": round(remaining_hours, 2),
            "exists": info["path"].exists()
        }

    return {"temp_files": status}
|