# newapi-clone/routers/getnews.py
import os
import re
import httpx
from typing import List, Dict
from bs4 import BeautifulSoup
from fastapi import APIRouter, HTTPException
router = APIRouter()
# 🎯 IMDb GraphQL
GRAPHQL_URL = "https://api.graphql.imdb.com"
HEADERS = {"Content-Type": "application/json"}
QUERY = """
query GetNews($first: Int!) {
movieNews: news(first: $first, category: MOVIE) {
edges {
node {
id
articleTitle { plainText }
externalUrl
date
text { plaidHtml }
image { url }
}
}
}
tvNews: news(first: $first, category: TV) {
edges {
node {
id
articleTitle { plainText }
externalUrl
date
text { plaidHtml }
image { url }
}
}
}
}
"""
# 🔧 Supabase Config
SUPABASE_URL = "https://iiwbixdrrhejkthxygak.supabase.co"
SUPABASE_KEY = os.getenv("SUPA_KEY")
SUPABASE_ROLE_KEY = os.getenv("SUPA_SERVICE_KEY")
if not SUPABASE_KEY or not SUPABASE_ROLE_KEY:
    raise ValueError("❌ SUPA_KEY or SUPA_SERVICE_KEY not set in environment!")

SUPABASE_HEADERS = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
    "Content-Type": "application/json"
}

SUPABASE_ROLE_HEADERS = {
    "apikey": SUPABASE_ROLE_KEY,
    "Authorization": f"Bearer {SUPABASE_ROLE_KEY}",
    "Content-Type": "application/json"
}
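# SUPABASE_HEADERS (SUPA_KEY) is used for the read query below; SUPABASE_ROLE_HEADERS
# (SUPA_SERVICE_KEY, the service-role key) is used for inserts, since service-role
# requests bypass Supabase row-level security.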
# 🧼 HTML Cleanup
def clean_html(raw_html: str) -> str:
    """Strip HTML tags and normalize whitespace around punctuation and brackets."""
    text = BeautifulSoup(raw_html or "", "html.parser").get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)              # collapse runs of whitespace
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)  # drop spaces before punctuation
    text = re.sub(r"\(\s+", "(", text)
    text = re.sub(r"\s+\)", ")", text)
    text = re.sub(r"\[\s+", "[", text)
    text = re.sub(r"\s+\]", "]", text)
    text = re.sub(r"\{\s+", "{", text)
    text = re.sub(r"\s+\}", "}", text)
    return text.strip()
# 🚀 Main endpoint
@router.get("/news")
async def get_news(first: int = 20) -> List[Dict]:
    payload = {
        "query": QUERY,
        "variables": {"first": first}
    }

    async with httpx.AsyncClient(timeout=10.0) as client:
        # Fetch news from IMDb
        response = await client.post(GRAPHQL_URL, headers=HEADERS, json=payload)
        if response.status_code != 200:
            raise HTTPException(status_code=502, detail="Error reaching the IMDb API")

        data = response.json().get("data")
        if not data:
            raise HTTPException(status_code=500, detail="Invalid response from the IMDb API")

        combined = []
        for category_key in ["movieNews", "tvNews"]:
            for edge in data.get(category_key, {}).get("edges", []):
                node = edge.get("node", {})
                image_data = node.get("image")
                combined.append({
                    "news_id": node.get("id"),
                    "title": (node.get("articleTitle") or {}).get("plainText"),
                    "url": node.get("externalUrl"),
                    "date": node.get("date"),
                    "text": clean_html((node.get("text") or {}).get("plaidHtml")),
                    "image": image_data.get("url") if image_data else None,
                    "category": category_key.replace("News", "").upper()
                })

        # 📌 Check which IDs already exist in Supabase
        all_ids = [item["news_id"] for item in combined]
        existing_ids = []
        ids_chunks = [all_ids[i:i + 1000] for i in range(0, len(all_ids), 1000)]  # keeps the query URL short
        for chunk in ids_chunks:
            query_ids = ",".join([f'"{nid}"' for nid in chunk])
            url = f"{SUPABASE_URL}/rest/v1/news_extraction?select=news_id&news_id=in.({query_ids})"
            r = await client.get(url, headers=SUPABASE_HEADERS)
            if r.status_code == 200:
                existing_ids.extend([item["news_id"] for item in r.json()])

        # 🔎 Keep only the articles not stored yet
        new_entries = [item for item in combined if item["news_id"] not in existing_ids]

        # 🧾 Insert the new articles (batched)
        if new_entries:
            insert_url = f"{SUPABASE_URL}/rest/v1/news_extraction"
            await client.post(insert_url, headers=SUPABASE_ROLE_HEADERS, json=new_entries)

    # 🔃 Sort by date, newest first (missing dates sort last)
    combined.sort(key=lambda x: x.get("date") or "", reverse=True)
    return combined
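
# A minimal usage sketch (hypothetical main.py, not part of this file); the module
# path "routers.getnews" is an assumption based on the file location above:
#
#     from fastapi import FastAPI
#     from routers.getnews import router as news_router
#
#     app = FastAPI()
#     app.include_router(news_router)
#     # GET /news?first=20 -> combined MOVIE/TV articles, newest first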