Spaces:

habulaj
/

filter

Sleeping

App Files Files Community

habulaj committed on Jul 8

Commit

cda1138

verified ·

1 Parent(s): 8efc71c

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -53

app.py CHANGED Viewed

@@ -3,49 +3,72 @@ import torch
 import re
 import time
 import logging
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 # -------- LOGGING CONFIG --------
-logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.FileHandler("inference.log"), logging.StreamHandler()])  # Log both to inference.log and to the console stream
 log = logging.getLogger("news-filter")
 # -------- LOAD MODEL --------
 model_name = "habulaj/filterinstruct"
 log.info("🚀 Carregando modelo e tokenizer...")
-# 8-bit quantization settings for CPU
-# BitsAndBytesConfig is primarily meant for GPU, but can be used to signal the intent to quantize.
-# On CPU, actual quantization may depend on support in the model and in the transformers library.
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True,
-    # bnb_4bit_compute_dtype=torch.float32, # Not needed for 8-bit and may cause problems on CPU
-    # bnb_4bit_quant_type="nf4", # Not needed for 8-bit
-    # bnb_4bit_use_double_quant=True, # Not needed for 8-bit
 )
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-# Load the model with the quantization settings
-# Author's expectation: if the model does not support 8-bit on CPU, it falls back to float32 (NOTE(review): unverified).
-model = AutoModelForCausalLM.from_pretrained(
     model_name,
     device_map="cpu",
-    torch_dtype=torch.float32, # Keep float32 to ensure CPU compatibility
     low_cpu_mem_usage=True,
-    quantization_config=quantization_config, # Apply the quantization settings
 )
 model.eval()
 log.info("✅ Modelo carregado (eval mode).")
-# CPU optimization: try to enable contiguous memory format (best effort; any failure is only logged)
-try:
-    torch.backends.cpu.enable_contiguous_memory_format()  # NOTE(review): confirm this attribute exists in the installed torch; otherwise the except branch always runs
-    log.info("✅ Formato de memória contígua para CPU habilitado.")
-except Exception as e:
-    log.warning(f"⚠️ Não foi possível habilitar formato de memória contígua para CPU: {e}")
 try:
-    model = torch.compile(model, mode="reduce-overhead")  # Best effort: on failure the uncompiled model stays in use
     log.info("✅ Modelo compilado com torch.compile.")
 except Exception as e:
     log.warning(f"⚠️ torch.compile não disponível: {e}")
@@ -57,50 +80,72 @@ app = FastAPI(title="News Filter JSON API")
 def read_root():
     return {"message": "News Filter JSON API is running!", "docs": "/docs"}
-# -------- INFERENCE (pre-commit version, kept here as the removed side of the diff) --------
 def infer_filter(title, content):  # Builds the prompt, generates up to 100 new tokens, returns the matched {...} span as a string or raises HTTPException(404)
-    prompt = f"""Analyze the news title and content, and return the filters in strict JSON format.\n\nUse only double quotes for all property names and string values. Use lowercase `true` and `false` for booleans. Do not include any explanations, labels, or comments.\n\nTitle: "{title}"\nContent: "{content}"\n"""
     log.info(f"🧠 Inferência iniciada para: {title}")
     start_time = time.time()
     inputs = tokenizer(
         prompt,
         return_tensors="pt",
         truncation=True,
-        max_length=512,
-        padding=True,
     )
-    input_ids = inputs.input_ids.to("cpu")
-    attention_mask = inputs.attention_mask.to("cpu")
     with torch.no_grad():
-        outputs = model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            max_new_tokens=100,
-            temperature=1.0,
-            do_sample=False,
-            top_k=50,
-            no_repeat_ngram_size=2,
-            num_beams=1,
-            eos_token_id=tokenizer.eos_token_id,
-            pad_token_id=tokenizer.eos_token_id,
-        )
-    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    generated = decoded[len(prompt):].strip()  # Drops the echoed prompt by character count; NOTE(review): assumes the decoded text starts with the prompt verbatim (not true once the prompt is truncated)
     log.info("📤 Resultado gerado:")
     log.info(generated)
-    match = re.search(r"\\{.*\\}", generated, re.DOTALL)  # NOTE(review): as written, \\{ and \\} each require a literal backslash before the brace — confirm this is not an escaping artifact
     if match:
         duration = time.time() - start_time
         log.info(f"✅ JSON extraído em {duration:.2f}s")
-        return match.group(0)
     else:
         log.warning("⚠️ Falha ao extrair JSON.")
         raise HTTPException(status_code=404, detail="Unable to extract JSON from model output.")
 # -------- API --------
@@ -114,7 +159,19 @@ def get_filter(
         import json
         return json.loads(json_output)
     except HTTPException as he:
-        raise he  # já tratado
     except Exception as e:
         log.exception("❌ Erro inesperado:")
-        raise HTTPException(status_code=404, detail="Invalid or malformed JSON output from model.")

 import re
 import time
 import logging
+import os
+from transformers import AutoTokenizer, LlamaForCausalLM, GenerationConfig
+from peft import AutoPeftModelForCausalLM
+import gc
+# -------- OPTIMIZATION SETTINGS --------
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Disable parallelism inside the Hugging Face tokenizers library
+os.environ["OMP_NUM_THREADS"] = "2"  # Sized for the host's 2 vCPUs; NOTE(review): torch is imported above this line, so the variable may be read too late — confirm
+os.environ["MKL_NUM_THREADS"] = "2"  # Same cap for MKL
+torch.set_num_threads(2)  # Intra-op thread count
+torch.set_num_interop_threads(1)  # Inter-op thread count
 # -------- LOGGING CONFIG --------
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")  # No explicit handlers: the FileHandler("inference.log") of the previous revision was dropped
 log = logging.getLogger("news-filter")
 # -------- LOAD MODEL --------
 model_name = "habulaj/filterinstruct"
 log.info("🚀 Carregando modelo e tokenizer...")
+# Tokenizer setup
+tokenizer = AutoTokenizer.from_pretrained(
+    model_name,
+    use_fast=True,  # Use the fast tokenizer when one is available
+    padding_side="left"  # Pad on the left; only matters for padded batches (infer_filter tokenizes with padding=False)
 )
+# Fall back to the EOS token when the tokenizer defines no pad token
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+# Model setup (loaded through PEFT's auto class, on CPU)
+model = AutoPeftModelForCausalLM.from_pretrained(
     model_name,
     device_map="cpu",
+    torch_dtype=torch.bfloat16,  # Author expects bfloat16 to beat float32 on modern CPUs; NOTE(review): depends on native bf16 support — benchmark on the target host
     low_cpu_mem_usage=True,
+    use_cache=True,  # Keep the model's internal cache enabled
+    trust_remote_code=True  # NOTE(review): allows code shipped in the model repo to run; confirm it is actually required
 )
 model.eval()
 log.info("✅ Modelo carregado (eval mode).")
+# Generation settings shared by every infer_filter call
+generation_config = GenerationConfig(
+    max_new_tokens=100,  # Upper bound on newly generated tokens
+    temperature=1.0,  # Sampling parameter; not used while do_sample=False
+    do_sample=False,  # Deterministic (non-sampling) decoding
+    num_beams=1,  # Single beam, i.e. no beam search
+    use_cache=True,
+    eos_token_id=tokenizer.eos_token_id,
+    pad_token_id=tokenizer.eos_token_id,  # EOS doubles as the padding id
+    no_repeat_ngram_size=2,  # NOTE(review): forbids any repeated bigram, which JSON punctuation/keywords may need — verify it does not break the output
+    repetition_penalty=1.1,
+    length_penalty=1.0  # NOTE(review): beam-search parameter; presumably inert with num_beams=1
+)
+# torch.compile with tuned settings (best effort: on failure the uncompiled model stays in use)
 try:
+    model = torch.compile(
+        model,
+        mode="reduce-overhead",
+        fullgraph=True,  # NOTE(review): turns graph breaks into errors; compilation is deferred, so they would surface on first use, outside this try — confirm
+        dynamic=False  # Static shapes only; NOTE(review): prompt lengths vary per request, so expect recompilation
+    )
     log.info("✅ Modelo compilado com torch.compile.")
 except Exception as e:
     log.warning(f"⚠️ torch.compile não disponível: {e}")
 def read_root():
     return {"message": "News Filter JSON API is running!", "docs": "/docs"}
+# -------- INFERENCE OTIMIZADA --------
 def infer_filter(title, content):
+    # Prompt mais conciso para reduzir tokens
+    prompt = f"""Analyze and return JSON filters:
+Title: "{title}"
+Content: "{content}"
+"""
     log.info(f"🧠 Inferência iniciada para: {title}")
     start_time = time.time()
+    # Tokenização otimizada
     inputs = tokenizer(
         prompt,
         return_tensors="pt",
         truncation=True,
+        max_length=384,  # Reduzido de 512 para acelerar
+        padding=False,   # Sem padding desnecessário
+        add_special_tokens=True,
     )
+    input_ids = inputs.input_ids
+    attention_mask = inputs.attention_mask
+    # Geração otimizada
     with torch.no_grad():
+        with torch.inference_mode():  # Modo de inferência mais rápido
+            outputs = model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                generation_config=generation_config,
+                # Parâmetros adicionais de otimização
+                early_stopping=True,
+                num_return_sequences=1,
+                output_scores=False,
+                return_dict_in_generate=False,
+            )
+    # Decodificação otimizada
+    generated_tokens = outputs[0][len(input_ids[0]):]
+    generated = tokenizer.decode(
+        generated_tokens,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=True
+    )
     log.info("📤 Resultado gerado:")
     log.info(generated)
+    # Regex otimizada
+    match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', generated, re.DOTALL)
     if match:
         duration = time.time() - start_time
+        json_result = match.group(0)
         log.info(f"✅ JSON extraído em {duration:.2f}s")
+        # Limpeza de memória
+        del outputs, generated_tokens, inputs
+        gc.collect()
+        return json_result
     else:
         log.warning("⚠️ Falha ao extrair JSON.")
+        # Limpeza de memória mesmo em caso de erro
+        del outputs, generated_tokens, inputs
+        gc.collect()
         raise HTTPException(status_code=404, detail="Unable to extract JSON from model output.")
 # -------- API --------
         import json
         return json.loads(json_output)
     except HTTPException as he:
+        raise he
     except Exception as e:
         log.exception("❌ Erro inesperado:")
+        raise HTTPException(status_code=404, detail="Invalid or malformed JSON output from model.")
+# -------- WARMUP (OPCIONAL) --------
+@app.on_event("startup")
+async def warmup():
+    """Faz um warmup do modelo para otimizar as primeiras execuções"""
+    log.info("🔥 Executando warmup...")
+    try:
+        # Exemplo simples para warmup
+        infer_filter("Test title", "Test content")
+        log.info("✅ Warmup concluído.")
+    except Exception as e:
+        log.warning(f"⚠️ Warmup falhou: {e}")