habulaj committed
Commit 6658bef · verified · 1 Parent(s): bcf557b

Update app.py

Files changed (1):
  1. app.py +69 -78
app.py CHANGED
@@ -4,12 +4,14 @@ import re
 import time
 import logging
 import os
-from transformers import AutoTokenizer, GenerationConfig
-from peft import AutoPeftModelForCausalLM
 import gc
 import json
+from transformers import AutoTokenizer, GenerationConfig
+from peft import AutoPeftModelForCausalLM
+from unsloth.chat_templates import get_chat_template
+from unsloth import FastLanguageModel
 
-# -------- CONFIGS DE OTIMIZAÇÃO --------
+# -------- CONFIGURAÇÕES DE OTIMIZAÇÃO --------
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["OMP_NUM_THREADS"] = "2"
 os.environ["MKL_NUM_THREADS"] = "2"
@@ -20,43 +22,37 @@ torch.set_num_interop_threads(1)
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
 log = logging.getLogger("news-filter")
 
-# -------- MODELO E TOKENIZER --------
-model_name = "habulaj/filterinstruct"
+# -------- MODELO --------
+model_name = "habulaj/filterinstruct3b"
 log.info("🚀 Carregando modelo e tokenizer...")
 
-tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, padding_side="left")
+tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")
+
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 
-# Aplica chat template
-def get_chat_template(tokenizer, chat_template="llama-3.1"):
-    tokenizer.chat_template = chat_template
-    return tokenizer
-
-tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")
-
 model = AutoPeftModelForCausalLM.from_pretrained(
     model_name,
     device_map="cpu",
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
-    use_cache=True,
     trust_remote_code=True
 )
+FastLanguageModel.for_inference(model)
 model.eval()
-log.info("✅ Modelo carregado em modo eval.")
+log.info("✅ Modelo carregado (modo eval).")
 
-# -------- CONFIG DE GERAÇÃO --------
 generation_config = GenerationConfig(
     max_new_tokens=128,
-    temperature=1.2,
-    top_p=0.95,
-    do_sample=True,
+    temperature=1.0,
+    do_sample=False,
+    num_beams=1,
     use_cache=True,
     eos_token_id=tokenizer.eos_token_id,
     pad_token_id=tokenizer.eos_token_id,
     repetition_penalty=1.1,
-    no_repeat_ngram_size=2,
+    length_penalty=1.0
 )
 
 # -------- FASTAPI --------
@@ -66,12 +62,34 @@ app = FastAPI(title="News Filter JSON API")
 def read_root():
     return {"message": "News Filter JSON API is running!", "docs": "/docs"}
 
-# -------- INFERÊNCIA COM TEMPLATE --------
-def infer_filter(title, content):
-    log.info(f"🧠 Iniciando inferência para: {title}")
-    start_time = time.time()
+@app.get("/filter")
+def get_filter(
+    title: str = Query(..., description="News title"),
+    content: str = Query(..., description="News content")
+):
+    try:
+        result = infer_filter(title, content)
+        try:
+            return {"result": json.loads(result)}
+        except json.JSONDecodeError:
+            return {"result": result, "warning": "Returned as string due to JSON parsing error"}
+    except HTTPException as he:
+        raise he
+    except Exception as e:
+        log.exception("❌ Erro inesperado:")
+        raise HTTPException(status_code=500, detail="Internal server error during inference.")
+
+@app.on_event("startup")
+async def warmup():
+    log.info("🔥 Executando warmup...")
+    try:
+        infer_filter("Test title", "Test content")
+        log.info("✅ Warmup concluído.")
+    except Exception as e:
+        log.warning(f"⚠️ Warmup falhou: {e}")
 
-    # Histórico com exemplos
+# -------- INFERÊNCIA --------
+def infer_filter(title, content):
     messages = [
         {
             "role": "user",
@@ -85,7 +103,21 @@ Content: "Lucasfilm confirmed a new Star Wars movie set to release in 2026, dire
         },
         {
             "role": "assistant",
-            "content": '{ "death_related": false, "relevance": "high", "global_interest": true, "entity_type": "movie", "entity_name": "Star Wars", "is_promotional": false, "potential_for_viral": true, "urgency_level": "medium", "has_video_content": false }'
+            "content": '{ "death_related": false, "relevance": "high", "global_interest": true, "entity_type": "movie", "entity_name": "Star Wars", "breaking_news": true, "has_video_content": false }'
+        },
+        {
+            "role": "user",
+            "content": """Analyze the news title and content, and return the filters in JSON format with the defined fields.
+
+Please respond ONLY with the JSON filter, do NOT add any explanations, system messages, or extra text.
+
+Title: "Legendary Musician Carlos Mendes Dies at 78"
+Content: "Carlos Mendes, the internationally acclaimed Brazilian guitarist and composer known for blending traditional bossa nova with modern jazz, has died at the age of 78."
+"""
+        },
+        {
+            "role": "assistant",
+            "content": '{ "death_related": true, "relevance": "high", "global_interest": true, "entity_type": "person", "entity_name": "Carlos Mendes", "breaking_news": true, "has_video_content": false }'
         },
         {
             "role": "user",
@@ -99,74 +131,33 @@ Content: "{content}"
         }
     ]
 
-    # Tokenização com chat template
+    log.info(f"🧠 Inferência iniciada para: {title}")
+    start_time = time.time()
+
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
        add_generation_prompt=True,
-        return_tensors="pt"
+        return_tensors="pt",
    ).to("cpu")
 
     with torch.no_grad(), torch.inference_mode():
         outputs = model.generate(
             input_ids=inputs,
             generation_config=generation_config,
-            return_dict_in_generate=False
         )
 
-    # Remove o prompt da saída
     prompt_text = tokenizer.decode(inputs[0], skip_special_tokens=True)
-    decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    generated_only = decoded_text[len(prompt_text):].strip()
-
-    json_result = extract_json(generated_only)
+    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    generated = full_output[len(prompt_text):].strip()
 
+    json_str = extract_json(generated)
     duration = time.time() - start_time
     log.info(f"✅ JSON extraído em {duration:.2f}s")
-    log.info(json_result)
-
-    # Limpeza
-    del outputs, inputs
-    gc.collect()
-
-    return json_result
+    return json_str
 
 def extract_json(text):
-    match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', text, re.DOTALL)
+    match = re.search(r'\{.*?\}', text, flags=re.DOTALL)
     if match:
-        json_str = match.group(0)
-        # Correções mínimas
-        json_str = re.sub(r"'\s*:\s*'([^']*)'", r'": "\1"', json_str)
-        json_str = re.sub(r"'", '"', json_str)
-        json_str = re.sub(r"\bTrue\b", "true", json_str)
-        json_str = re.sub(r"\bFalse\b", "false", json_str)
-        return json_str.strip()
-    return text.strip()
-
-# -------- ENDPOINT --------
-@app.get("/filter")
-def get_filter(
-    title: str = Query(..., description="News title"),
-    content: str = Query(..., description="News content")
-):
-    try:
-        json_output = infer_filter(title, content)
-        try:
-            parsed = json.loads(json_output)
-            return {"result": parsed}
-        except json.JSONDecodeError:
-            log.error("❌ JSON inválido ao fazer parse.")
-            return {"result": json_output, "warning": "Raw JSON string returned due to parse error"}
-    except Exception as e:
-        log.exception("❌ Erro inesperado:")
-        raise HTTPException(status_code=500, detail="Erro interno durante a inferência.")
-
-# -------- WARMUP --------
-@app.on_event("startup")
-async def warmup():
-    log.info("🔥 Warmup iniciado...")
-    try:
-        infer_filter("Test title", "Test content")
-        log.info("✅ Warmup concluído.")
-    except Exception as e:
-        log.warning(f"⚠️ Warmup falhou: {e}")
+        return match.group(0)
+    return text
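
A note on the simplified extract_json: the removed pattern tolerated one level of nested braces, while the new non-greedy \{.*?\} stops at the first closing brace. For the flat, single-level JSON the few-shot examples elicit the two behave the same, but a nested object would be truncated. A small illustrative sketch (the sample strings are invented for this comparison, not taken from the commit):

import re

flat = 'noise { "relevance": "high", "breaking_news": true } trailing'
nested = 'noise { "relevance": "high", "meta": { "source": "wire" } } trailing'

old_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'  # pattern removed by this commit
new_pattern = r'\{.*?\}'                          # pattern added by this commit

print(re.search(old_pattern, flat, re.DOTALL).group(0))    # whole flat object
print(re.search(new_pattern, flat, re.DOTALL).group(0))    # whole flat object (same result)
print(re.search(old_pattern, nested, re.DOTALL).group(0))  # whole nested object
print(re.search(new_pattern, nested, re.DOTALL).group(0))  # cut off at the first '}', i.e. the inner object's close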
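
For quick manual testing, a minimal sketch of how the /filter endpoint could be called once the app is running; the uvicorn command, host and port are assumptions for illustration and are not part of the commit:

#   uvicorn app:app --host 0.0.0.0 --port 7860   (assumed way of serving the app)
import requests  # used only for this example

resp = requests.get(
    "http://localhost:7860/filter",  # assumed local URL
    params={
        "title": "New Star Wars Movie Announced",
        "content": "Lucasfilm confirmed a new Star Wars movie set to release in 2026.",
    },
)
print(resp.json())  # {"result": {...}} on success; a "warning" key is added when the model output is not valid JSON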
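
Alternatively, the endpoint and the startup warmup can be exercised in-process with FastAPI's TestClient (a test-only sketch, assuming the module is saved as app.py; the model must still be downloadable for this to run):

from fastapi.testclient import TestClient
from app import app  # assumes this file is importable as app.py

# Entering the client as a context manager also fires the @app.on_event("startup") warmup.
with TestClient(app) as client:
    r = client.get("/filter", params={"title": "Test title", "content": "Test content"})
    print(r.status_code, r.json())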