import os
import re
import json
import logging
import asyncio
import aiohttp
import random
from typing import Optional, Dict, Any
from fastapi import FastAPI, APIRouter, HTTPException
from google import genai
from google.genai import types
from newspaper import Article
import trafilatura

# Supabase Config
SUPABASE_URL = "https://iiwbixdrrhejkthxygak.supabase.co"
SUPABASE_KEY = os.getenv("SUPA_KEY")
SUPABASE_ROLE_KEY = os.getenv("SUPA_SERVICE_KEY")
if not SUPABASE_KEY or not SUPABASE_ROLE_KEY:
    raise ValueError("❌ SUPA_KEY or SUPA_SERVICE_KEY not set in environment!")
SUPABASE_HEADERS = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
    "Content-Type": "application/json"
}
SUPABASE_ROLE_HEADERS = {
    "apikey": SUPABASE_ROLE_KEY,
    "Authorization": f"Bearer {SUPABASE_ROLE_KEY}",
    "Content-Type": "application/json"
}

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
log = logging.getLogger("news-filter-api")

http_session: Optional[aiohttp.ClientSession] = None

# Pool of realistic User-Agent strings to rotate through
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59'
]

async def get_http_session():
    global http_session
    if http_session is None:
        connector = aiohttp.TCPConnector(
            limit=20,
            limit_per_host=10,
            ttl_dns_cache=300,
            use_dns_cache=True,
            keepalive_timeout=30,
            enable_cleanup_closed=True
        )
        timeout = aiohttp.ClientTimeout(total=30, connect=5)
        http_session = aiohttp.ClientSession(
            connector=connector,
            timeout=timeout
        )
    return http_session

def get_realistic_headers():
    """Retorna headers realistas para evitar bloqueios"""
    return {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0'
    }

async def extract_article_text(url: str) -> str:
    """Extrai o texto completo de uma notícia usando newspaper3k com fallback para trafilatura"""
    try:
        # Method 1: newspaper3k with realistic headers
        try:
            article = Article(url)
            article.config.browser_user_agent = random.choice(USER_AGENTS)
            article.config.request_timeout = 10
            article.config.number_threads = 1
            
            article.download()
            article.parse()
            
            if article.text and len(article.text.strip()) > 100:
                return article.text.strip()
            
        except Exception:
            pass
        
        # Method 2: trafilatura as a fallback
        session = await get_http_session()
        headers = get_realistic_headers()
        
        # Add a small delay to look more human
        await asyncio.sleep(random.uniform(1, 3))
        
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                html = await response.text()
                extracted_text = trafilatura.extract(html)
                
                if extracted_text and len(extracted_text.strip()) > 100:
                    return extracted_text.strip()
        
        return ""
        
    except Exception as e:
        log.error(f"Erro ao extrair texto da URL {url}: {str(e)}")
        return ""

async def fetch_unused_news():
    """Busca uma notícia não usada do Supabase"""
    try:
        session = await get_http_session()
        url = f"{SUPABASE_URL}/rest/v1/news_extraction"
        params = {
            "used": "eq.false",
            "limit": "1",
            "order": "created_at.asc"
        }
        
        async with session.get(url, headers=SUPABASE_HEADERS, params=params) as response:
            if response.status != 200:
                raise HTTPException(status_code=500, detail="Erro ao buscar notícia")
            
            data = await response.json()
            if not data:
                raise HTTPException(status_code=404, detail="No news available")
            
            return data[0]
    except HTTPException:
        # Re-raise as-is so the 404 above is not swallowed into a 500
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Supabase error: {str(e)}")

async def fetch_last_50_titles():
    """Busca os últimos 50 títulos da tabela news ordenados por created_at"""
    try:
        session = await get_http_session()
        url = f"{SUPABASE_URL}/rest/v1/news"
        params = {
            "select": "title_pt",
            "limit": "50",
            "order": "created_at.desc"
        }
        
        async with session.get(url, headers=SUPABASE_HEADERS, params=params) as response:
            if response.status != 200:
                log.warning("Erro ao buscar títulos anteriores")
                return []
            
            data = await response.json()
            titles = [item.get("title_pt", "") for item in data if item.get("title_pt")]
            return titles
    except Exception as e:
        log.warning(f"Erro ao buscar últimos títulos: {str(e)}")
        return []

async def insert_news_to_db(title: str, text: str, news_id: str, url: str, image_url: str, filters: dict):
    """Insere notícia na tabela news com dados originais e filtros"""
    try:
        session = await get_http_session()
        supabase_url = f"{SUPABASE_URL}/rest/v1/news"
        
        payload = {
            "title_en": title,
            "text_en": text,
            "news_id": news_id,
            "url": url,
            "image": image_url,
            "death_related": filters.get("death_related", False),
            "political_related": filters.get("political_related", False),
            "woke_related": filters.get("woke_related", False),
            "spoilers": filters.get("spoilers", False),
            "sensitive_theme": filters.get("sensitive_theme", False),
            "contains_video": filters.get("contains_video", False),
            "is_news_content": filters.get("is_news_content", True),
            "relevance": filters.get("relevance", ""),
            "brazil_interest": filters.get("brazil_interest", False),
            "breaking_news": filters.get("breaking_news", False),
            "audience_age_rating": filters.get("audience_age_rating", ""),
            "regional_focus": filters.get("regional_focus", ""),
            "country_focus": filters.get("country_focus", ""),
            "ideological_alignment": filters.get("ideological_alignment", ""),
            "entity_type": filters.get("entity_type", ""),
            "entity_name": filters.get("entity_name", ""),
            "duplication": filters.get("duplication", False)
        }
        
        async with session.post(supabase_url, headers=SUPABASE_ROLE_HEADERS, json=payload) as response:
            if response.status not in [200, 201]:
                response_text = await response.text()
                raise HTTPException(status_code=500, detail=f"Erro ao inserir notícia: {response.status} - {response_text}")
            
    except HTTPException:
        # Re-raise as-is so the detailed status above is preserved
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Insert error: {str(e)}")

async def mark_news_as_used(news_id: str):
    """Marca notícia como usada - SEMPRE deve funcionar para evitar loops infinitos"""
    try:
        session = await get_http_session()
        url = f"{SUPABASE_URL}/rest/v1/news_extraction"
        params = {"news_id": f"eq.{news_id}"}
        
        payload = {"used": True}
        
        async with session.patch(url, headers=SUPABASE_ROLE_HEADERS, json=payload, params=params) as response:
            if response.status not in [200, 201, 204]:
                log.warning(f"Erro ao marcar {news_id} como usada, mas continuando...")
            
    except Exception as e:
        log.warning(f"Erro ao atualizar notícia {news_id}: {str(e)}")

def extract_json(text: str) -> str:
    """Extract the outermost brace-delimited span from the model output (greedy: first '{' to last '}')."""
    match = re.search(r'\{.*\}', text, flags=re.DOTALL)
    return match.group(0) if match else text
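
# Illustrative behavior (examples added here, not part of the original source):
# the greedy match spans from the first '{' to the last '}'.
#   extract_json('noise {"a": 1} trailing')  -> '{"a": 1}'
#   extract_json('{"a": 1} and {"b": 2}')    -> '{"a": 1} and {"b": 2}'  (not valid JSON!)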

def ensure_filter_order(filter_dict: Dict[str, Any]) -> Dict[str, Any]:
    ordered_keys = [
        "death_related", "political_related", "woke_related", "spoilers", 
        "sensitive_theme", "contains_video", "is_news_content", "relevance",
        "brazil_interest", "breaking_news", "audience_age_rating", "regional_focus",
        "country_focus", "ideological_alignment", "entity_type", "entity_name", "duplication"
    ]
    
    return {key: filter_dict[key] for key in ordered_keys if key in filter_dict}
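
# Illustrative behavior (hypothetical input, added for clarity): keys come back
# in the canonical order above and unknown keys are dropped.
#   ensure_filter_order({"relevance": "high", "death_related": False, "extra": 1})
#       -> {"death_related": False, "relevance": "high"}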

async def filter_news(title: str, content: str, last_titles: list) -> dict:
    try:
        client = genai.Client(
            api_key=os.environ.get("GEMINI_API_KEY"),
        )

        model = "gemini-2.5-flash-lite"

        # System instructions
        SYSTEM_INSTRUCTIONS = """
        Analyze the news title and content, and return the filters in JSON format with the defined fields.
        Please respond ONLY with the JSON filter, do NOT add any explanations, system messages, or extra text.

        - death_related (true | false): Whether the news involves the real-life death of a person. Does not include fictional character deaths or deaths within stories.
        - political_related (true | false): Related to real-world politics (governments, elections, politicians, or official decisions). Not about political storylines in fiction.
        - woke_related (true | false): Involves social issues like inclusion, diversity, racism, gender, LGBTQIA+, etc.
        - spoilers (true | false): Reveals important plot points (e.g., character deaths, endings, major twists).
        - sensitive_theme (true | false): Covers sensitive or disturbing topics like suicide, abuse, violence, or tragedy.
        - contains_video (true | false): The news includes an embedded video (e.g., trailer, teaser, interview, video report).
        - is_news_content (true | false): Whether the content is actual news reporting. True for breaking news, announcements, factual reports. False for reviews, opinion pieces, lists, rankings, recommendations, critiques, analysis, or editorial content.
        - relevance ("low" | "medium" | "high" | "viral"): The expected public interest or impact of the news.
        - brazil_interest (true | false): Whether the topic is directly related to Brazil or relevant to Brazilian audiences.
        - breaking_news (true | false): The content is urgent or part of a recent and unfolding event.
        - audience_age_rating ("L" | 10 | 12 | 14 | 16 | 18): Content rating based on Brazilian standards.
        - regional_focus ("global" | "americas" | "europe" | "asia" | "africa" | "middle_east" | "oceania"): The main geographic region the news relates to.
        - country_focus (ISO 3166-1 alpha-2 code like "br", "us", "fr", "jp" or null): The specific country the news is about, if applicable.
        - ideological_alignment ("left" | "center-left" | "center" | "center-right" | "right" | "apolitical"): The perceived political bias of the article.
        - entity_type ("movie" | "series" | "event" | "person" | "place" | "other"): The type of main subject mentioned in the news.
        - entity_name (string): The name of the person, title, event, or topic the article is primarily about.
        - duplication (true | false): Whether the current news is a duplicate or highly similar to any of the previously published news titles.
        """

        # Format the most recent titles for inclusion in the prompt (capped at 25)
        last_titles_formatted = "\n- ".join(last_titles[:25]) if last_titles else "No previous titles available"

        # Example 1 - GENUINE NEWS (renewal announcement)
        EXAMPLE_INPUT_1 = """Title: 'The Gilded Age' Renewed for Season 4 at HBO — Everything We Know So Far
Content: The Gilded Age will return. HBO announced on Monday, July 28, that the series has been renewed for Season 4. This comes after the release of Season 3 Episode 6 on Sunday, July 27. There are two episodes left to go in the third season. The Season 3 finale will air on Sunday, August 10, on HBO. According to HBO, total premiere-night viewing for the third season has grown for five consecutive weeks, culminating in a 20 percent growth compared to last season. Fan engagement has also climbed, with social chatter rising nearly 60 percent week over week. The show has also received its most critical acclaim to date with Season 3, its highest-stakes season so far. In the July 27 episode, the series that's known for its low stakes but high-camp drama, a character was seemingly killed off in violent (for The Gilded Age) fashion. The show is already Emmy-winning. Production designer Bob Shaw took home an Emmy for
Last titles:
- 'Quarteto Fantástico: Primeiros Passos' dispara para arrecadar US$ 118 milhões nas bilheterias dos EUA e US$ 218 milhões globalmente
- Bilheteria: 'Quarteto Fantástico: Primeiros Passos' sobe para US$ 218 milhões globalmente, 'Superman' e 'F1' ultrapassam US$ 500 milhões
- Reboot de 'Quarteto Fantástico' da Marvel ultrapassa US$ 200 milhões globalmente"""

        EXAMPLE_OUTPUT_1 = """{"death_related": false,"political_related": false,"woke_related": false,"spoilers": false,"sensitive_theme": false,"contains_video": false,"is_news_content": true,"relevance": "medium","brazil_interest": true,"breaking_news": true,"audience_age_rating": 14,"regional_focus": "americas","country_focus": "us","ideological_alignment": "apolitical","entity_type": "series","entity_name": "The Gilded Age","duplication": false}"""

        # Example 2 - DUPLICATE (same story already covered in the recent titles)
        EXAMPLE_INPUT_2 = """Title: ‘My Best Friend’s Wedding’ Sequel in the Works: ‘Materialists,’ ‘Past Lives’ Director Celine Song to Write Screenplay
Content: A sequel to the Julia Roberts romantic comedy “My Best Friend’s Wedding” is in early development at Sony Pictures. The studio has tapped “Materialists” and “Past Lives” writer-director Celine Song to pen a screenplay for the project, though she is not in talks to helm the feature. 
Last titles:
- Sequência de "The Batman" ganha data de lançamento oficial da Warner Bros
- Sequência de "The Batman" de Robert Pattinson tem data oficial de lançamento para 2026
- Warner Bros. define data de lançamento da sequência de "The Batman" para 2026
- Sequência de 'O Casamento do Meu Melhor Amigo' terá roteiro da diretora de 'Vidas Passadas'"""

        EXAMPLE_OUTPUT_2 = """{"death_related": false,"political_related": false,"woke_related": false,"spoilers": false,"sensitive_theme": false,"contains_video": false,"is_news_content": true,"relevance": "medium","brazil_interest": true,"breaking_news": false,"audience_age_rating": 10,"regional_focus": "americas","country_focus": "us","ideological_alignment": "apolitical","entity_type": "movie","entity_name": "My Best Friend’s Wedding","duplication": true}"""

        # Example 3 - SPOILERS + DUPLICATE (spoiler-heavy story already in the recent titles)
        EXAMPLE_INPUT_3 = """Title: 9-1-1: Death of main character shakes series, which gets new date for the 9th season
Content: The 9-1-1 universe was permanently redefined after one of the most shocking events in its history. The show's eighth season bid farewell to one of its pillars with the death of Captain Bobby Nash, played by Peter Krause, in episode 15. Now, with the renewal for a ninth season confirmed, ABC has announced a schedule change: the premiere has been moved up to Thursday, October 9, 2025. Bobby Nash's death, the first of a main cast member, leaves a leadership vacuum in Battalion 118 and sets the main narrative arc for the new episodes. Peter Krause's departure had already been signaled, but the impact of his absence will be the driving force behind the next season, which will have 18 episodes. Showrunner Tim Minear had previously stated that, despite the death, the character would still appear in specific moments in the eighth season finale, fulfilling his promise.
Last titles:
- The Batman 2 ganha data oficial de lançamento para 2026 na Warner Bros
- Datas de estreia da ABC no outono de 2025: '9-1-1', 'Nashville' e 'Grey's Anatomy' antecipadas
- Warner Bros. anuncia sequência de 'The Batman' para 2026"""

        EXAMPLE_OUTPUT_3 = """{"death_related": false,"political_related": false,"woke_related": false,"spoilers": true,"sensitive_theme": false,"contains_video": false,"is_news_content": true,"relevance": "high","brazil_interest": true,"breaking_news": true,"audience_age_rating": 14,"regional_focus": "global","country_focus": null,"ideological_alignment": "apolitical","entity_type": "series","entity_name": "9-1-1","duplication": true}"""

        # Example 4 - GENUINE DEATH NEWS
        EXAMPLE_INPUT_4 = """Title: Julian McMahon, 'Fantastic Four,' 'Nip/Tuck' and 'FBI: Most Wanted' Star, Dies at 56
Content: Julian McMahon, the suave Australian actor best known for his performances on "FBI: Most Wanted," "Charmed," "Nip/Tuck" and the early aughts "Fantastic Four" films, died Wednesday in Florida. He was 56 and died after a battle with cancer. McMahon's death was confirmed through his reps, who shared a statement from his wife, Kelly McMahon, in remembrance of her husband. "With an open heart, I wish to share with the world that my beloved husband, Julian McMahon, died peacefully this week after a valiant effort to overcome cancer," she said. "Julian loved life. He loved his family. He loved his friends. He loved his work, and he loved his fans. His deepest wish was to bring joy into as many lives as possible. We ask for support during this time to allow our family to grieve in privacy. And we wish for all of those to whom Julian brought joy, to continue to find joy in life. We are grateful for the memories."
Last titles:
- Mortes de Celebridades em 2025: Estrelas que Perdemos Este Ano
- Programas de TV Cancelados em 2025: Quais Séries Foram Canceladas
- Atores Australianos que Estão Fazendo Sucesso em Hollywood"""

        EXAMPLE_OUTPUT_4 = """{"death_related": true,"political_related": false,"woke_related": false,"spoilers": false,"sensitive_theme": true,"contains_video": false,"is_news_content": true,"relevance": "medium","brazil_interest": true,"breaking_news": true,"audience_age_rating": 14,"regional_focus": "americas","country_focus": "au","ideological_alignment": "apolitical","entity_type": "person","entity_name": "Julian McMahon","duplication": false}"""

        # Few-shot conversation structure with multiple examples
        contents = [
            # Example 1: user sends a news item
            types.Content(
                role="user",
                parts=[
                    types.Part.from_text(text=EXAMPLE_INPUT_1)
                ]
            ),
            # Example 1: model responds in the expected format
            types.Content(
                role="model",
                parts=[
                    types.Part.from_text(text=EXAMPLE_OUTPUT_1)
                ]
            ),
            # Example 2: user sends a story that duplicates a recent title
            types.Content(
                role="user",
                parts=[
                    types.Part.from_text(text=EXAMPLE_INPUT_2)
                ]
            ),
            # Example 2: model flags the duplication
            types.Content(
                role="model",
                parts=[
                    types.Part.from_text(text=EXAMPLE_OUTPUT_2)
                ]
            ),
            # Example 3: user sends a spoiler-heavy duplicate
            types.Content(
                role="user",
                parts=[
                    types.Part.from_text(text=EXAMPLE_INPUT_3)
                ]
            ),
            # Example 3: model flags spoilers and duplication
            types.Content(
                role="model",
                parts=[
                    types.Part.from_text(text=EXAMPLE_OUTPUT_3)
                ]
            ),
            # Example 4: user sends a genuine death news item
            types.Content(
                role="user",
                parts=[
                    types.Part.from_text(text=EXAMPLE_INPUT_4)
                ]
            ),
            # Example 4: model flags death_related and sensitive_theme
            types.Content(
                role="model",
                parts=[
                    types.Part.from_text(text=EXAMPLE_OUTPUT_4)
                ]
            ),
            # Finally, the actual news item to analyze
            types.Content(
                role="user",
                parts=[
                    types.Part.from_text(text=f"""Title: {title}
Content: {content}
Last titles:
- {last_titles_formatted}""")
                ]
            )
        ]

        # Tools: ground the model with Google Search
        tools = [
            types.Tool(google_search=types.GoogleSearch())
        ]

        config = types.GenerateContentConfig(
            system_instruction=SYSTEM_INSTRUCTIONS,
            tools=tools,
            response_mime_type="text/plain",
            max_output_tokens=4096,
            temperature=0.8,
        )

        response_text = ""
        for chunk in client.models.generate_content_stream(
            model=model,
            contents=contents,
            config=config
        ):
            if chunk.text:
                response_text += chunk.text
        
        json_result = extract_json(response_text)

        try:
            parsed = json.loads(json_result)
        except json.JSONDecodeError as e:
            raise ValueError("Model returned invalid JSON") from e

        ALLOWED_KEYS = {
            "death_related", "political_related", "woke_related", "spoilers", 
            "sensitive_theme", "contains_video", "is_news_content", "relevance",
            "brazil_interest", "breaking_news", "audience_age_rating", "regional_focus",
            "country_focus", "ideological_alignment", "entity_type", "entity_name", "duplication"
        }

        clean_filter = {key: parsed[key] for key in ALLOWED_KEYS if key in parsed}
        clean_filter = ensure_filter_order(clean_filter)

        return {"filter": clean_filter}

    except Exception as e:
        raise ValueError(f"Erro na filtragem: {str(e)}")

def should_skip_insertion(filters: dict) -> tuple[bool, str]:
    """
    Verifica se a notícia deve ser pulada (não inserida na tabela news).
    Retorna (should_skip, reason)
    """

    # Condition 1: duplication is true → always skip
    if filters.get("duplication", False):
        return True, "duplicação detectada"

    # Condition 2: is_news_content is false → skip
    if not filters.get("is_news_content", True):
        return True, "conteúdo não é notícia (review, lista, crítica, etc.)"

    # Condition 3: brazil_interest is false → skip
    if not filters.get("brazil_interest", True):
        return True, "baixo interesse para o Brasil (brazil_interest=false)"

    # Condition 4: relevance is low or missing → skip
    if filters.get("relevance", "") not in {"medium", "high", "viral"}:
        return True, f"relevância insuficiente (relevance={filters.get('relevance')})"

    # Passed every check; safe to insert
    return False, ""
    
app = FastAPI(title="News Filter API")
router = APIRouter()

@router.post("/filter")
async def filter_endpoint():
    news_data = None
    news_id = None
    
    try:
        # Fetch one unused news item from Supabase
        news_data = await fetch_unused_news()
        
        title = news_data.get("title", "")
        url = news_data.get("url", "")
        news_id = news_data.get("news_id", "")
        image_url = news_data.get("image", "")
        
        if not title.strip() or not url.strip():
            raise ValueError("Title e URL não podem estar vazios")
        
        log.info(f"Processando notícia {news_id}: {title}")
        
        # Fetch the 50 most recent titles
        last_titles = await fetch_last_50_titles()
        
        # Extract the full article text from the URL
        full_text = await extract_article_text(url)
        
        if not full_text.strip():
            raise ValueError("Não foi possível extrair texto da URL")
        
        # Run the filter analysis with the recent titles
        filter_result = await filter_news(title, full_text, last_titles)
        
        # Check whether insertion should be skipped
        should_skip, skip_reason = should_skip_insertion(filter_result["filter"])
        
        if should_skip:
            # Only mark as used; do not insert into the news table
            await mark_news_as_used(news_id)
            log.info(f"Notícia {news_id} pulada devido a: {skip_reason}")
            
            return {
                "filter": filter_result["filter"],
                "title_en": title,
                "text_en": full_text,
                "news_id": news_id,
                "url": url,
                "image": image_url,
                "last_titles": last_titles,
                "skipped": True,
                "skip_reason": skip_reason
            }
        else:
            # Insert into the news table with filters
            await insert_news_to_db(title, full_text, news_id, url, image_url, filter_result["filter"])
            
            # Mark as used (success)
            await mark_news_as_used(news_id)
            
            log.info(f"Notícia {news_id} processada e inserida com sucesso")
            
            return {
                "filter": filter_result["filter"],
                "title_en": title,
                "text_en": full_text,
                "news_id": news_id,
                "url": url,
                "image": image_url,
                "last_titles": last_titles,
                "skipped": False
            }
        
    except Exception as e:
        error_msg = str(e)
        log.error(f"Erro no processamento da notícia {news_id}: {error_msg}")
        
        # ALWAYS mark as used on error to avoid infinite reprocessing loops
        if news_id:
            await mark_news_as_used(news_id)
        
        # Map the error type to an HTTP response (strings must match the messages raised above)
        if "No news available" in error_msg:
            raise HTTPException(status_code=404, detail=error_msg)
        elif "Title and URL cannot be empty" in error_msg:
            raise HTTPException(status_code=400, detail=error_msg)
        elif "Could not extract text" in error_msg:
            raise HTTPException(status_code=400, detail=error_msg)
        else:
            raise HTTPException(status_code=500, detail=f"Internal error: {error_msg}")

app.include_router(router)

@app.on_event("shutdown")
async def shutdown_event():
    global http_session
    if http_session:
        await http_session.close()
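
# Local run sketch (assumption: uvicorn is available; it is not imported above
# and the original deployment method is not shown). SUPA_KEY, SUPA_SERVICE_KEY
# and GEMINI_API_KEY must be set in the environment; trigger processing with
# e.g. `curl -X POST http://localhost:8000/filter`.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)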