Spaces:

habulaj
/

filter

Sleeping

App Files Files Community

habulaj committed on Jul 8

Commit

2c0cc11

verified ·

1 Parent(s): caa0753

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -9

app.py CHANGED Viewed

@@ -8,12 +8,25 @@ from peft import AutoPeftModelForCausalLM
 model_name = "habulaj/filter"
 print("Carregando tokenizer e modelo (CPU)...")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoPeftModelForCausalLM.from_pretrained(
     model_name,
-    device_map="cpu",  # Force CPU
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # float16 when CUDA is available, otherwise float32
 )
-model.eval()  # evaluation mode
 # -------- FASTAPI --------
 app = FastAPI(title="News Filter JSON API")
@@ -23,8 +36,13 @@ app = FastAPI(title="News Filter JSON API")
 def read_root():  # Returns a fixed status payload that points callers at the docs path.
     return {"message": "News Filter JSON API is running!", "docs": "/docs"}
-# Inference function
 def infer_filter(title, content):
     prompt = f"""Analyze the news title and content, and return the filters in JSON format with the defined fields.
 Please respond ONLY with the JSON filter, do NOT add any explanations, system messages, or extra text.
@@ -33,17 +51,28 @@ Title: "{title}"
 Content: "{content}"
 """
-    inputs = tokenizer(prompt, return_tensors="pt")
     input_ids = inputs.input_ids.to("cpu")
     with torch.no_grad():
         outputs = model.generate(
             input_ids=input_ids,
-            max_new_tokens=128,
-            temperature=1.2,
             do_sample=True,
             top_p=0.9,
             eos_token_id=tokenizer.eos_token_id,
         )
     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
@@ -54,7 +83,11 @@ Content: "{content}"
     # Extract the JSON object: greedy match from the first "{" to the last "}" (DOTALL lets it span newlines)
     match = re.search(r"\{.*\}", generated, re.DOTALL)
     if match:
-        return match.group(0)
     else:
         return "⚠️ Failed to extract JSON. Output:\n" + generated
@@ -66,6 +99,11 @@ def get_filter(
 ):
     try:
         json_output = infer_filter(title, content)
-        return {"filter": json_output}
     except Exception as e:
         raise HTTPException(status_code=422, detail=str(e))

 model_name = "habulaj/filter"
 print("Carregando tokenizer e modelo (CPU)...")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Performance-oriented loading options
 model = AutoPeftModelForCausalLM.from_pretrained(
     model_name,
+    device_map="cpu",
+    torch_dtype=torch.float32,  # float32 weights; the author's stated rationale is CPU speed (not measured here)
+    low_cpu_mem_usage=True,     # intended to reduce memory use while loading
 )
+model.eval()
+# Compile the model (PyTorch 2.0+); if torch.compile raises, the uncompiled model is kept.
+# NOTE(review): PyTorch documents mode="reduce-overhead" in terms of CUDA graphs,
+# so its benefit with device_map="cpu" is unclear — confirm with a benchmark.
+try:
+    model = torch.compile(model, mode="reduce-overhead")
+    print("✅ Modelo compilado com torch.compile")
+except Exception as e:
+    print(f"⚠️ torch.compile não disponível: {e}")
+# In-memory cache of earlier results, filled and read by infer_filter
+prompt_cache = {}
 # -------- FASTAPI --------
 app = FastAPI(title="News Filter JSON API")
 def read_root():  # Returns a fixed status payload that points callers at the docs path.
     return {"message": "News Filter JSON API is running!", "docs": "/docs"}
+# Inference function (speed-tuned variant)
 def infer_filter(title, content):
+    # Simple cache key built from the first 50 chars of title and the first 100 of content.
+    # NOTE(review): inputs that share those prefixes map to the same key and will
+    # receive each other's cached result — confirm this is acceptable.
+    cache_key = hash((title[:50], content[:100]))
+    if cache_key in prompt_cache:
+        return prompt_cache[cache_key]
     prompt = f"""Analyze the news title and content, and return the filters in JSON format with the defined fields.
 Please respond ONLY with the JSON filter, do NOT add any explanations, system messages, or extra text.
 Content: "{content}"
 """
+    # Tokenization settings
+    inputs = tokenizer(
+        prompt,
+        return_tensors="pt",
+        truncation=True,
+        max_length=512,  # Caps the input length
+        padding=False    # Do not pad the input
+    )
     input_ids = inputs.input_ids.to("cpu")
     with torch.no_grad():
+        # Generation settings chosen for speed
         outputs = model.generate(
             input_ids=input_ids,
+            max_new_tokens=100,      # Reduced from 128 to 100
+            temperature=1.0,         # Lowered from 1.2; do_sample=True below still samples, so output is not deterministic
             do_sample=True,
             top_p=0.9,
+            num_beams=1,            # Single beam (no beam search); with do_sample=True this samples rather than decoding greedily
+            early_stopping=True,    # NOTE(review): documented as a beam-search stopping option; likely a no-op with num_beams=1 — confirm
             eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.eos_token_id,
         )
     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
     # Extract the JSON object: greedy match from the first "{" to the last "}" (DOTALL lets it span newlines)
     match = re.search(r"\{.*\}", generated, re.DOTALL)
     if match:
+        result = match.group(0)
+        # Cache the result; once 100 entries are stored no new ones are added (no eviction in the visible code)
+        if len(prompt_cache) < 100:
+            prompt_cache[cache_key] = result
+        return result
     else:
         return "⚠️ Failed to extract JSON. Output:\n" + generated
 ):
     try:
         json_output = infer_filter(title, content)
+        import json
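+        # Return the parsed JSON value instead of the raw string.
+        # NOTE(review): `import json` sits inside this try block, which makes `json` a function local.
+        # If infer_filter raises first, evaluating `except json.JSONDecodeError` fails with
+        # UnboundLocalError before the generic handler is reached — move the import to module level.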
+        return json.loads(json_output)
+    except json.JSONDecodeError:
+        # If parsing fails, return the raw model text wrapped as {"raw_output": ...}
+        return {"raw_output": json_output}
     except Exception as e:
         raise HTTPException(status_code=422, detail=str(e))