import os
import importlib.util
from pathlib import Path
import re
import json
import time
import logging
import asyncio
import aiohttp
from typing import Optional, Dict, Any
from fastapi import FastAPI, APIRouter, HTTPException
from urllib.parse import quote

# IMPORTANT: configure environment variables and PyTorch BEFORE any import that uses PyTorch.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["OMP_NUM_THREADS"] = "2"
os.environ["MKL_NUM_THREADS"] = "2"

# Configure PyTorch before importing any module that uses it.
import torch

torch.set_num_threads(2)

# Interop threads can only be set once per process; guard against re-configuration.
if not hasattr(torch, '_interop_threads_set'):
    try:
        torch.set_num_interop_threads(1)
        torch._interop_threads_set = True
    except RuntimeError as e:
        if "cannot set number of interop threads" in str(e):
            print(f"Warning: Could not set interop threads: {e}")
        else:
            raise

# Supabase config
SUPABASE_URL = "https://iiwbixdrrhejkthxygak.supabase.co"
SUPABASE_KEY = os.getenv("SUPA_KEY")
SUPABASE_ROLE_KEY = os.getenv("SUPA_SERVICE_KEY")

if not SUPABASE_KEY or not SUPABASE_ROLE_KEY:
    raise ValueError("❌ SUPA_KEY or SUPA_SERVICE_KEY not set in environment!")

SUPABASE_HEADERS = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
    "Content-Type": "application/json"
}

SUPABASE_ROLE_HEADERS = {
    "apikey": SUPABASE_ROLE_KEY,
    "Authorization": f"Bearer {SUPABASE_ROLE_KEY}",
    "Content-Type": "application/json"
}

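# Note (from the call sites below): the anon-key headers are used for reads
# (fetch_brazil_interest_news), while the service-role headers are used for
# writes (update_news_rewrite).
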
# Rewrite API URL
REWRITE_API_URL = "https://habulaj-newapi-clone.hf.space/rewrite-news"

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
log = logging.getLogger("news-analyze-api")

http_session: Optional[aiohttp.ClientSession] = None

async def get_http_session() -> aiohttp.ClientSession:
    """Return the shared aiohttp session, creating it lazily on first use."""
    global http_session
    if http_session is None:
        connector = aiohttp.TCPConnector(
            limit=20,
            limit_per_host=10,
            ttl_dns_cache=300,
            use_dns_cache=True,
            keepalive_timeout=30,
            enable_cleanup_closed=True
        )
        timeout = aiohttp.ClientTimeout(total=30, connect=5)
        http_session = aiohttp.ClientSession(
            connector=connector,
            timeout=timeout,
            headers={'User-Agent': 'NewsAnalyzeAPI/1.0 (https://example.com/contact)'}
        )
    return http_session

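# Usage sketch (assumes callers run inside the event loop, as all handlers
# below do; "https://example.com" is a placeholder URL):
#
#     session = await get_http_session()
#     async with session.get("https://example.com") as resp:
#         body = await resp.text()
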
def load_inference_module():
    """Dynamically load the inference.py module from a set of known paths."""
    try:
        # Assume inference.py lives next to this file; adjust the path as needed.
        inference_path = Path(__file__).parent / "inference.py"
        if not inference_path.exists():
            # Try other likely locations.
            possible_paths = [
                Path(__file__).parent.parent / "inference.py",
                Path("./inference.py"),
                Path("../inference.py")
            ]
            for path in possible_paths:
                if path.exists():
                    inference_path = path
                    break
            else:
                raise FileNotFoundError("inference.py not found")
        spec = importlib.util.spec_from_file_location("inference", inference_path)
        inference_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(inference_module)
        return inference_module
    except Exception as e:
        log.error(f"Failed to load inference.py: {e}")
        return None

# Load the module at startup.
inference_module = load_inference_module()

async def rewrite_article_direct(content: str) -> Optional[Dict[str, Any]]:
    """Rewrite the article by calling the inference.py function directly."""
    try:
        if not inference_module:
            log.error("inference module not loaded, falling back to HTTP API")
            return await rewrite_article_http(content)
        log.info(f"Rewriting article directly: {len(content)} characters")

        # Build a lightweight stand-in for the NewsRequest model expected by inference.py.
        class NewsRequest:
            def __init__(self, content: str):
                self.content = content

        news_request = NewsRequest(content)

        # Call rewrite_news directly.
        result = await inference_module.rewrite_news(news_request)

        # Convert the result object to a dictionary.
        rewritten_data = {
            "title": result.title,
            "subhead": result.subhead,
            "content": result.content,
            "title_instagram": getattr(result, "title_instagram", ""),
            "content_instagram": getattr(result, "content_instagram", "")
        }

        # Basic validation: every field must be present and non-empty.
        required_keys = ["title", "subhead", "content", "title_instagram", "content_instagram"]
        if all(key in rewritten_data and rewritten_data[key].strip() for key in required_keys):
            log.info("Article rewritten successfully (direct call)")
            return {
                "success": True,
                "data": rewritten_data,
                "raw_response": str(rewritten_data),
                "status_code": 200,
                "method": "direct_call"
            }
        else:
            log.error("Direct rewrite returned an incomplete response")
            return {
                "success": False,
                "error": "Incomplete response",
                "data": rewritten_data,
                "raw_response": str(rewritten_data),
                "status_code": 200,
                "method": "direct_call",
                "missing_keys": [key for key in required_keys if not rewritten_data.get(key, "").strip()]
            }
    except Exception as e:
        log.error(f"Direct rewrite failed: {e}")
        log.info("Falling back to HTTP API")
        return await rewrite_article_http(content)

async def rewrite_article_http(content: str) -> Optional[Dict[str, Any]]:
    """Rewrite the article via the HTTP API (original code path)."""
    try:
        session = await get_http_session()
        payload = {"content": content}
        log.info(f"Sending article for rewrite (HTTP): {len(content)} characters")

        # Longer timeout for the HTTP API: 2 minutes.
        timeout = aiohttp.ClientTimeout(total=120, connect=10)
        async with session.post(
            REWRITE_API_URL,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=timeout
        ) as response:
            log.info(f"HTTP response status: {response.status}")

            # Capture the full response body.
            response_text = await response.text()
            log.info(f"Full HTTP response body: {response_text}")

            if response.status == 200:
                try:
                    rewritten_data = json.loads(response_text)

                    # Basic response validation.
                    required_keys = ["title", "subhead", "content", "title_instagram", "content_instagram"]
                    if all(key in rewritten_data for key in required_keys):
                        log.info("Article rewritten successfully (HTTP)")
                        return {
                            "success": True,
                            "data": rewritten_data,
                            "raw_response": response_text,
                            "status_code": response.status,
                            "method": "http_call"
                        }
                    else:
                        log.error(f"Incomplete HTTP response. Keys found: {list(rewritten_data.keys())}")
                        return {
                            "success": False,
                            "error": "Incomplete response",
                            "data": rewritten_data,
                            "raw_response": response_text,
                            "status_code": response.status,
                            "method": "http_call",
                            "missing_keys": [key for key in required_keys if key not in rewritten_data]
                        }
                except json.JSONDecodeError as e:
                    log.error(f"Failed to parse JSON: {e}")
                    return {
                        "success": False,
                        "error": f"Invalid JSON: {e}",
                        "raw_response": response_text,
                        "status_code": response.status,
                        "method": "http_call"
                    }
            else:
                log.error(f"HTTP API error: {response.status}")
                return {
                    "success": False,
                    "error": f"HTTP {response.status}",
                    "raw_response": response_text,
                    "status_code": response.status,
                    "method": "http_call"
                }
    except asyncio.TimeoutError:
        log.error("HTTP API timed out")
        return {
            "success": False,
            "error": "Timeout",
            "raw_response": "Timeout occurred",
            "status_code": 0,
            "method": "http_call"
        }
    except Exception as e:
        log.error(f"HTTP API error: {e}")
        return {
            "success": False,
            "error": str(e),
            "raw_response": "Exception occurred",
            "status_code": 0,
            "method": "http_call"
        }

async def rewrite_article(content: str) -> Optional[Dict[str, Any]]:
    """Rewrite the article: try the direct call first, then fall back to HTTP."""
    result = await rewrite_article_direct(content)
    # If the direct call failed (and did not already fall back internally), try HTTP.
    if not result or (not result.get("success") and result.get("method") == "direct_call"):
        log.info("Direct call failed, trying HTTP API")
        result = await rewrite_article_http(content)
    return result

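# Both code paths return the same envelope, so callers can branch on one shape.
# Illustrative successful result (keys taken from the functions above):
#
#     {
#         "success": True,
#         "data": {"title": ..., "subhead": ..., "content": ...,
#                  "title_instagram": ..., "content_instagram": ...},
#         "raw_response": "...",
#         "status_code": 200,
#         "method": "direct_call"  # or "http_call"
#     }
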
async def fetch_brazil_interest_news():
    """Fetch one news item with brazil_interest=true and an empty title_pt."""
    try:
        session = await get_http_session()
        url = f"{SUPABASE_URL}/rest/v1/news"
        params = {
            "brazil_interest": "eq.true",
            "title_pt": "is.null",
            "limit": "1",
            "order": "created_at.asc"
        }
        async with session.get(url, headers=SUPABASE_HEADERS, params=params) as response:
            if response.status != 200:
                raise HTTPException(status_code=500, detail="Failed to fetch news item")
            data = await response.json()
            if not data:
                raise HTTPException(status_code=404, detail="No news item with brazil_interest=true and empty title_pt available")
            return data[0]
    except HTTPException:
        # Let deliberate HTTP errors (404, 500 above) propagate unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Supabase error: {e}")

async def update_news_rewrite(news_id: int, rewritten_data: Dict[str, str]):
    """Update the news row with the rewritten fields, including the Instagram ones."""
    try:
        session = await get_http_session()
        url = f"{SUPABASE_URL}/rest/v1/news"
        params = {"id": f"eq.{news_id}"}
        payload = {
            "title_pt": rewritten_data.get("title", ""),
            "text_pt": rewritten_data.get("content", ""),
            "subhead_pt": rewritten_data.get("subhead", ""),
            "title_instagram": rewritten_data.get("title_instagram", ""),
            "content_instagram": rewritten_data.get("content_instagram", "")
        }
        async with session.patch(url, headers=SUPABASE_ROLE_HEADERS, json=payload, params=params) as response:
            if response.status not in [200, 201, 204]:
                response_text = await response.text()
                log.error(f"Failed to update news item - Status: {response.status}, Response: {response_text}")
                raise HTTPException(status_code=500, detail=f"Failed to update news item - Status: {response.status}")
            log.info(f"News item {news_id} updated successfully - Status: {response.status}")
    except HTTPException:
        raise
    except Exception as e:
        log.error(f"Failed to update news item {news_id}: {e}")
        raise HTTPException(status_code=500, detail=f"Update failed: {e}")

def fix_wikipedia_image_url(url: str) -> str:
    """Convert a protocol-relative Wikipedia thumbnail URL into the full-size image URL."""
    if not url or not url.startswith('//upload.wikimedia.org'):
        return url
    url = 'https:' + url
    # Thumbnail URLs have a /thumb/ segment plus a size-prefixed filename; strip both.
    url = url.replace('/thumb/', '/')
    parts = url.split('/')
    if len(parts) >= 2:
        filename = parts[-1]
        if 'px-' in filename:
            filename = filename.split('px-', 1)[1]
            base_parts = parts[:-2]
            url = '/'.join(base_parts) + '/' + filename
    return url

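# Worked example of the transformation above (URL hypothetical):
#   //upload.wikimedia.org/wikipedia/commons/thumb/a/ab/Foo.jpg/330px-Foo.jpg
#   -> https://upload.wikimedia.org/wikipedia/commons/a/ab/Foo.jpg
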
def extract_birth_death_years(description: str) -> tuple[Optional[int], Optional[int]]:
    """Extract birth/death years from a Wikipedia short description such as '(1926–2022)' or '(born 1946)'."""
    if not description:
        return None, None
    pattern = r'\((?:born\s+)?(\d{4})(?:[–-](\d{4}))?\)'
    match = re.search(pattern, description)
    if match:
        birth_year = int(match.group(1))
        death_year = int(match.group(2)) if match.group(2) else None
        if death_year is None:
            # No death year in the description: assume the subject died this year
            # (replaces a hardcoded 2025 in the original).
            death_year = time.localtime().tm_year
        return birth_year, death_year
    return None, None

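# Examples of the pattern above (descriptions hypothetical):
#   "American actor (1926–2022)"   -> (1926, 2022)
#   "Brazilian singer (born 1946)" -> (1946, <current year>)
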
async def fetch_wikipedia_info(entity_name: str) -> Optional[Dict[str, Any]]:
    """Look up an entity on Wikipedia and return its title, life years, and image URL."""
    try:
        session = await get_http_session()
        url = "https://en.wikipedia.org/w/rest.php/v1/search/title"
        params = {'q': entity_name, 'limit': 1}
        async with session.get(url, params=params) as response:
            if response.status != 200:
                return None
            data = await response.json()
            if not data.get('pages'):
                return None
            page = data['pages'][0]
            title = page.get('title', '')
            description = page.get('description', '')
            thumbnail = page.get('thumbnail', {})
            birth_year, death_year = extract_birth_death_years(description)
            image_url = thumbnail.get('url', '') if thumbnail else ''
            if image_url:
                image_url = fix_wikipedia_image_url(image_url)
            return {
                'title': title,
                'birth_year': birth_year,
                'death_year': death_year,
                'image_url': image_url
            }
    except Exception as e:
        log.error(f"Wikipedia lookup failed: {e}")
        return None

def generate_poster_url(name: str, birth: int, death: int, image_url: str) -> str:
    """Build the 'in memoriam' poster URL for a death-related story."""
    base_url = "https://habulaj-newapi-clone.hf.space/cover/memoriam"
    params = f"?image_url={quote(image_url)}&name={quote(name)}&birth={birth}&death={death}"
    return base_url + params

def generate_news_poster_url(image_url: str, headline: str) -> str:
    """Build the poster URL for regular (non-death) news."""
    base_url = "https://habulaj-newapi-clone.hf.space/cover/news"
    params = f"?image_url={quote(image_url)}&headline={quote(headline)}"
    return base_url + params

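# Illustrative output (arguments hypothetical; quote() percent-encodes them):
#   generate_news_poster_url("https://example.com/a.jpg", "Big news")
#   -> "https://habulaj-newapi-clone.hf.space/cover/news?image_url=https%3A//example.com/a.jpg&headline=Big%20news"
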
async def generate_poster_analysis(news_data: Dict[str, Any], rewritten_result: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Generate poster data for a news item."""
    try:
        result = {}
        image_url = news_data.get("image", "")

        # Death-related stories get an 'in memoriam' poster.
        if news_data.get("death_related") is True and news_data.get("entity_name"):
            wikipedia_info = await fetch_wikipedia_info(news_data["entity_name"])
            if wikipedia_info:
                result["wikipedia_info"] = wikipedia_info
                # Only generate a memoriam poster when both years are known.
                if (wikipedia_info.get("death_year") and
                        wikipedia_info.get("birth_year")):
                    poster_url = generate_poster_url(
                        wikipedia_info["title"],
                        wikipedia_info["birth_year"],
                        wikipedia_info["death_year"],
                        wikipedia_info.get("image_url", image_url)
                    )
                    result["poster"] = poster_url

        # Otherwise, generate a regular news poster.
        if "poster" not in result and image_url:
            # Prefer the rewritten headline; fall back to the original title.
            headline_to_use = news_data.get("title_en", "")
            if (rewritten_result and
                    rewritten_result.get("success") and
                    rewritten_result.get("data") and
                    rewritten_result["data"].get("title")):
                headline_to_use = rewritten_result["data"]["title"]
            news_poster_url = generate_news_poster_url(image_url, headline_to_use)
            result["poster"] = news_poster_url

        return result
    except Exception as e:
        log.error(f"Poster generation failed: {e}")
        return {}

app = FastAPI(title="News Analyze API")
router = APIRouter()

# The source defined this handler without a route decorator; the "/analyze"
# path below is an assumption so the router actually exposes the endpoint.
@router.get("/analyze")
async def analyze_endpoint():
    # Fetch a news item with brazil_interest=true and empty title_pt.
    news_data = await fetch_brazil_interest_news()
    title_en = news_data.get("title_en", "")
    text_en = news_data.get("text_en", "")
    news_id = news_data.get("id")
    if not title_en.strip() or not text_en.strip():
        raise HTTPException(status_code=400, detail="title_en and text_en must not be empty.")

    # Run the rewrite (direct call first, then HTTP fallback).
    rewritten_result = await rewrite_article(text_en)

    # Log the full rewrite result.
    log.info(f"Full rewrite result: {json.dumps(rewritten_result, indent=2)}")

    # Persist to the database if the rewrite succeeded.
    if rewritten_result and rewritten_result.get("success") and rewritten_result.get("data"):
        await update_news_rewrite(news_id, rewritten_result["data"])

    # Generate poster data.
    poster_analysis = await generate_poster_analysis(news_data, rewritten_result)

    # Assemble the final result.
    result = {
        "news_id": news_id,
        "title_en": title_en,
        "text_en": text_en,
        "rewrite_result": rewritten_result,
        "death_related": news_data.get("death_related", False),
        "entity_name": news_data.get("entity_name", ""),
        "entity_type": news_data.get("entity_type", ""),
        "image": news_data.get("image", "")
    }

    # Merge poster information if available.
    if poster_analysis:
        result.update(poster_analysis)
    return result

app.include_router(router)

# The source defined this handler without registering it; the on_event hook is
# assumed so the shared session is actually closed on shutdown.
@app.on_event("shutdown")
async def shutdown_event():
    """Close the shared HTTP session when the app shuts down."""
    global http_session
    if http_session:
        await http_session.close()

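# Minimal local-run sketch; assumes uvicorn is installed, and the host/port
# values are illustrative (7860 is the usual Hugging Face Spaces port).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)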