Spaces:

habulaj
/

filtergradio

Sleeping

App Files Files Community

habulaj committed on Jul 9

Commit

b6db17e

verified ·

1 Parent(s): c1b201d

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -139

app.py CHANGED Viewed

@@ -10,15 +10,10 @@ import os
 import gc
 from typing import Dict, Any, Optional, List, Tuple
 import psutil
-import threading
-import concurrent.futures
 from contextlib import contextmanager
-import numpy as np
-# -------- CONFIGURAÇÕES AVANÇADAS DE OTIMIZAÇÃO --------
-# Configuração de CPU baseada no hardware disponível
 num_cores = psutil.cpu_count(logical=False)
-num_threads = min(4, num_cores)  # Limite para evitar oversubscription
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["OMP_NUM_THREADS"] = str(num_threads)
@@ -29,53 +24,36 @@ os.environ["NUMEXPR_NUM_THREADS"] = str(num_threads)
 torch.set_num_threads(num_threads)
 torch.set_num_interop_threads(1)
-# Configurações avançadas para otimização
 torch.backends.mkl.enabled = True
 torch.backends.mkldnn.enabled = True
 torch.backends.quantized.engine = 'qnnpack'
-# Configuração de flushing para memória
-torch.cuda.empty_cache = lambda: None  # Evita chamadas desnecessárias
-# -------- LOGGING CONFIG --------
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
 log = logging.getLogger("news-filter-optimized")
-# Configuração global para usar CPU
 device = "cpu"
 torch.set_default_device(device)
-# -------- OTIMIZAÇÕES DE MEMÓRIA --------
 @contextmanager
 def memory_efficient_context():
-    """Context manager para otimizar uso de memória durante inferência"""
     try:
-        # Força garbage collection antes da operação
         gc.collect()
         yield
     finally:
-        # Limpa memória após a operação
         gc.collect()
 class OptimizedTokenizerWrapper:
-    """Wrapper otimizado para tokenizer com cache de operações comuns"""
     def __init__(self, tokenizer):
         self.tokenizer = tokenizer
-        self._encode_cache = {}
-        self._decode_cache = {}
         self._template_cache = {}
     def apply_chat_template(self, messages, **kwargs):
-        """Versão otimizada do chat template com cache"""
-        # Cria key baseada no conteúdo da mensagem
         content = messages[0]['content'] if messages else ""
-        key = hash(content[:100])  # Usa apenas primeiros 100 chars para key
         if key not in self._template_cache:
             result = self.tokenizer.apply_chat_template(messages, **kwargs)
-            # Limita cache a 100 entradas
             if len(self._template_cache) > 100:
                 self._template_cache.clear()
             self._template_cache[key] = result
@@ -83,91 +61,80 @@ class OptimizedTokenizerWrapper:
         return self._template_cache[key]
     def decode(self, *args, **kwargs):
-        """Versão otimizada do decode"""
         return self.tokenizer.decode(*args, **kwargs)
     def __getattr__(self, name):
-        """Proxy para outros métodos do tokenizer"""
         return getattr(self.tokenizer, name)
-# -------- CONFIGURAÇÃO DE MODELO COM OTIMIZAÇÕES AVANÇADAS --------
-print("🚀 Carregando modelo e tokenizer com otimizações avançadas...")
-log.info("🚀 Carregando modelo e tokenizer com otimizações avançadas...")
-# Configurações de otimização para carregamento do modelo
 model_config = {
     "device_map": device,
-    "torch_dtype": torch.float16,  # Mudança para float16 (mais rápido em algumas CPUs)
     "low_cpu_mem_usage": True,
     "use_cache": True,
     "trust_remote_code": True,
-    "attn_implementation": "eager",  # Implementação mais rápida para CPU
 }
-# Carrega modelo com configurações otimizadas
 model = AutoPeftModelForCausalLM.from_pretrained(
     "habulaj/filterinstruct180",
     **model_config
 )
-# Configuração otimizada do tokenizer
 tokenizer = AutoTokenizer.from_pretrained(
     "habulaj/filterinstruct180",
     use_fast=True,
     padding_side="left",
-    model_max_length=1024,  # Limite explícito para evitar sequências muito longas
-    clean_up_tokenization_spaces=False,  # Mais rápido
 )
-# Otimizações de tokenizer
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
-# Wrapper otimizado para tokenizer
 tokenizer = OptimizedTokenizerWrapper(tokenizer)
-# -------- OTIMIZAÇÕES DE MODELO --------
-# Modo de avaliação com otimizações
 model.eval()
-# Otimizações específicas para inferência
 for param in model.parameters():
     param.requires_grad = False
-# Compila o modelo para otimização (se disponível)
 try:
     model = torch.compile(model, mode="reduce-overhead")
-    log.info("✅ Modelo compilado com torch.compile")
 except Exception as e:
     log.warning(f"⚠️ Torch compile não disponível: {e}")
-# Otimização de fusão de operações
 if hasattr(model, 'fuse_linear_layers'):
     model.fuse_linear_layers()
-log.info("✅ Modelo carregado com otimizações avançadas.")
-# -------- CONFIGURAÇÃO DE TEMPLATE E GERAÇÃO --------
-# Chat template otimizado (sem formatação desnecessária)
-tokenizer.tokenizer.chat_template = """{% for message in messages %}{% if message['role'] == 'user' %}{% if loop.first %}<|begin_of_text|><|start_header_id|>user<|end_header_id|>{{ message['content'] }}<|eot_id|>{% else %}<|start_header_id|>user<|end_header_id|>{{ message['content'] }}<|eot_id|>{% endif %}{% elif message['role'] == 'assistant' %}<|start_header_id|>assistant<|end_header_id|>{{ message['content'] }}<|eot_id|>{% endif %}{% endfor %}{% if add_generation_prompt %}<|start_header_id|>assistant<|end_header_id|>{% endif %}"""
-# Configuração otimizada de geração
 generation_config = GenerationConfig(
-    max_new_tokens=150,  # Reduzido para acelerar
-    temperature=0.8,     # Reduzido para mais determinismo
-    do_sample=False,     # Desativado para maximum speed
     use_cache=True,
     eos_token_id=tokenizer.eos_token_id,
     pad_token_id=tokenizer.eos_token_id,
     repetition_penalty=1.1,
     length_penalty=1.0,
-    num_beams=1,  # Força greedy decoding
     early_stopping=True,
 )
-# -------- FUNÇÕES OTIMIZADAS --------
 def extract_json_optimized(text: str) -> str:
-    """Extração otimizada de JSON com regex compilado"""
     if not hasattr(extract_json_optimized, 'pattern'):
         extract_json_optimized.pattern = re.compile(r'\{.*?\}', re.DOTALL)
@@ -175,8 +142,6 @@ def extract_json_optimized(text: str) -> str:
     return match.group(0) if match else text
 def preprocess_input_optimized(title: str, content: str) -> List[Dict[str, str]]:
-    """Preprocessamento otimizado de entrada"""
-    # Trunca entradas muito longas para acelerar processamento
     max_title_length = 100
     max_content_length = 500
@@ -195,26 +160,22 @@ Content: "{content}"
     }]
 def analyze_news_optimized(title: str, content: str) -> str:
-    """Versão ultra-otimizada da análise de notícias"""
     try:
         with memory_efficient_context():
             start_time = time.time()
-            # Prepara entrada otimizada
             messages = preprocess_input_optimized(title, content)
-            # Tokenização otimizada
             inputs = tokenizer.apply_chat_template(
                 messages,
                 tokenize=True,
                 add_generation_prompt=True,
                 return_tensors="pt",
-                padding=False,  # Sem padding desnecessário
                 truncation=True,
                 max_length=1024,
             )
-            # Inferência otimizada com múltiplas optimizações
             with torch.no_grad(), torch.inference_mode():
                 with torch.autocast(device_type='cpu', dtype=torch.float16):
                     outputs = model.generate(
@@ -226,10 +187,9 @@ def analyze_news_optimized(title: str, content: str) -> str:
                         output_attentions=False,
                         return_dict_in_generate=False,
                         use_cache=True,
-                        do_sample=False,  # Greedy para máxima velocidade
                     )
-            # Decodificação otimizada
             generated_tokens = outputs[0][inputs.shape[1]:]
             generated_text = tokenizer.decode(
                 generated_tokens,
@@ -237,17 +197,13 @@ def analyze_news_optimized(title: str, content: str) -> str:
                 clean_up_tokenization_spaces=False
             )
-            # Extração otimizada de JSON
             json_result = extract_json_optimized(generated_text)
-            # Logging de performance
             duration = time.time() - start_time
             log.info(f"✅ Análise concluída em {duration:.2f}s")
-            # Limpeza de memória otimizada
             del outputs, inputs, generated_tokens
-            # Validação de JSON otimizada
             try:
                 parsed_json = json.loads(json_result)
                 return json.dumps(parsed_json, indent=2, ensure_ascii=False)
@@ -258,26 +214,19 @@ def analyze_news_optimized(title: str, content: str) -> str:
         log.exception("❌ Erro durante análise:")
         return f"Erro durante a análise: {str(e)}"
-# -------- WARMUP OTIMIZADO --------
 def warmup_optimized():
-    """Warmup otimizado com múltiplas execuções"""
-    log.info("🔥 Executando warmup otimizado...")
     try:
-        # Múltiplas execuções de warmup para otimizar cache
         for i in range(3):
             result = analyze_news_optimized(f"Test title {i}", f"Test content {i}")
             log.info(f"Warmup {i+1}/3 concluído")
-        # Força garbage collection após warmup
         gc.collect()
-        log.info("✅ Warmup otimizado concluído")
     except Exception as e:
         log.warning(f"⚠️ Warmup falhou: {e}")
-# -------- INTERFACE OTIMIZADA --------
 def create_optimized_interface():
-    """Interface otimizada para melhor performance"""
     with gr.Blocks(
         title="Analisador de Notícias - Ultra Otimizado",
         theme=gr.themes.Monochrome(),
@@ -291,19 +240,10 @@ def create_optimized_interface():
             padding: 15px;
             margin: 10px 0;
         }
-        .status-success {
-            color: #28a745;
-            font-weight: bold;
-        }
-        .status-error {
-            color: #dc3545;
-            font-weight: bold;
-        }
         """
     ) as demo:
         gr.Markdown("# 🚀 Analisador de Notícias - Ultra Otimizado")
-        gr.Markdown("🔥 Versão otimizada para máxima performance em CPU")
         with gr.Row():
             with gr.Column(scale=1):
@@ -319,9 +259,8 @@ def create_optimized_interface():
                     max_lines=6
                 )
-                analyze_btn = gr.Button("⚡ Analisar Notícia (Otimizado)", variant="primary")
-                # Exemplos
                 with gr.Row():
                     example_btn1 = gr.Button("📻 Exemplo 1", size="sm")
                     example_btn2 = gr.Button("⚽ Exemplo 2", size="sm")
@@ -335,15 +274,12 @@ def create_optimized_interface():
                     show_copy_button=True
                 )
-                # Status com informações de performance
-                with gr.Row():
-                    status = gr.Textbox(
-                        label="Status",
-                        value="⚡ Pronto para análise ultra-rápida",
-                        interactive=False
-                    )
-        # Função otimizada para análise
         def analyze_with_status(title: str, content: str) -> Tuple[str, str]:
             if not title.strip() or not content.strip():
                 return "❌ Preencha todos os campos", "Erro: Campos obrigatórios não preenchidos"
@@ -357,65 +293,39 @@ def create_optimized_interface():
             except Exception as e:
                 return f"❌ Erro: {str(e)}", f"Erro: {str(e)}"
-        # Exemplos otimizados
         examples = [
             ("Legendary Musician Carlos Mendes Dies at 78", "Carlos Mendes, the internationally acclaimed Brazilian guitarist and composer known for blending traditional bossa nova with modern jazz, has died at the age of 78."),
             ("Brazil Defeats Argentina 2-1 in Copa America Final", "In a thrilling match at the Maracana Stadium, Brazil secured victory over Argentina with goals from Neymar and Vinicius Jr. The match was watched by over 200 million viewers worldwide."),
             ("Tech Giant Announces Major Layoffs Affecting 10,000 Employees", "The technology company announced significant workforce reductions citing economic uncertainty and changing market conditions. The layoffs will affect multiple departments across different regions.")
         ]
-        # Event handlers
         analyze_btn.click(
             fn=analyze_with_status,
             inputs=[title_input, content_input],
             outputs=[status, output]
         )
-        for i, (title, content) in enumerate(examples):
-            locals()[f'example_btn{i+1}'].click(
-                fn=lambda t=title, c=content: (t, c),
-                outputs=[title_input, content_input]
-            )
-        # Informações de otimização
-        with gr.Accordion("⚡ Otimizações Aplicadas", open=False):
-            gr.Markdown(f"""
-            **Otimizações de Hardware:**
-            - Threads otimizadas: {num_threads} threads para {num_cores} cores
-            - MKL/BLAS otimizado para operações matemáticas
-            - Floating point otimizado (float16 com autocast)
-            - Torch.compile ativado (se disponível)
-            **Otimizações de Modelo:**
-            - Modo de inferência com torch.inference_mode()
-            - Cache de tokenização inteligente
-            - Processamento sem gradientes
-            - Fusão de camadas lineares
-            - Greedy decoding para máxima velocidade
-            **Otimizações de Memória:**
-            - Garbage collection otimizado
-            - Context manager para gestão de memória
-            - Limpeza automática de tensores
-            - Limite de tamanho de entrada
-            **Otimizações de I/O:**
-            - Regex compilado para extração JSON
-            - Preprocessamento otimizado
-            - Cache inteligente de operações
-            - Múltiplas execuções de warmup
-            ⚡ **Resultado esperado:** 30-50% mais rápido que a versão anterior
-            """)
     return demo
-# -------- EXECUÇÃO PRINCIPAL --------
 if __name__ == "__main__":
-    # Executa warmup otimizado
     warmup_optimized()
-    print("🚀 Iniciando interface ultra-otimizada...")
     demo = create_optimized_interface()
     demo.launch(
         share=False,
@@ -423,5 +333,5 @@ if __name__ == "__main__":
         server_port=7860,
         show_error=True,
         max_threads=num_threads,
-        show_api=False,  # Desativa API para economizar recursos
     )

 import gc
 from typing import Dict, Any, Optional, List, Tuple
 import psutil
 from contextlib import contextmanager
 num_cores = psutil.cpu_count(logical=False)
+num_threads = min(4, num_cores)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["OMP_NUM_THREADS"] = str(num_threads)
 torch.set_num_threads(num_threads)
 torch.set_num_interop_threads(1)
 torch.backends.mkl.enabled = True
 torch.backends.mkldnn.enabled = True
 torch.backends.quantized.engine = 'qnnpack'
+torch.cuda.empty_cache = lambda: None
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
 log = logging.getLogger("news-filter-optimized")
 device = "cpu"
 torch.set_default_device(device)
 @contextmanager
 def memory_efficient_context():
     """Run the wrapped ``with`` body between two explicit garbage collections.

     ``gc.collect()`` is called once before control is handed to the body and
     once more when the body exits. The second call sits in ``finally``, so it
     also runs when the body raises.
     """
     try:
         gc.collect()
         yield
     finally:
         gc.collect()
 class OptimizedTokenizerWrapper:
     """Proxy around a tokenizer that memoizes ``apply_chat_template`` results.

     ``apply_chat_template`` results are cached per distinct call (full
     message list plus keyword arguments). ``decode`` is forwarded unchanged,
     and every other attribute read is delegated to the wrapped tokenizer.
     Attribute *writes* are not delegated; set them on ``.tokenizer`` directly.
     """

     # Upper bound on cached template results; the cache is reset when full.
     _MAX_CACHE_ENTRIES = 100

     def __init__(self, tokenizer):
         """Store the wrapped ``tokenizer`` and start with an empty cache."""
         self.tokenizer = tokenizer
         self._template_cache = {}

     @staticmethod
     def _cache_key(messages, kwargs):
         """Build a hashable key covering the whole call, not just a prefix."""
         # repr() keeps the key hashable even when messages/kwargs hold
         # unhashable values (dicts, lists), and it covers the full content,
         # so prompts that merely share a common prefix never collide.
         return (
             repr(messages),
             tuple(sorted((name, repr(value)) for name, value in kwargs.items())),
         )

     def apply_chat_template(self, messages, **kwargs):
         """Return the wrapped tokenizer's chat-template output, cached.

         Args:
             messages: Chat messages, forwarded as-is to the wrapped tokenizer.
             **kwargs: Forwarded as-is; they are part of the cache key because
                 they change the result (e.g. ``tokenize``, ``return_tensors``).

         Returns:
             Whatever the wrapped ``apply_chat_template`` returned for this
             exact call. A cache hit returns the same object as before, so
             callers must not mutate it in place.
         """
         key = self._cache_key(messages, kwargs)
         try:
             return self._template_cache[key]
         except KeyError:
             pass

         result = self.tokenizer.apply_chat_template(messages, **kwargs)
         # Crude bound: drop everything once the limit is reached.
         if len(self._template_cache) >= self._MAX_CACHE_ENTRIES:
             self._template_cache.clear()
         self._template_cache[key] = result
         return result

     def decode(self, *args, **kwargs):
         """Forward ``decode`` to the wrapped tokenizer unchanged."""
         return self.tokenizer.decode(*args, **kwargs)

     def __getattr__(self, name):
         """Delegate attribute reads that miss on the wrapper to the tokenizer."""
         # __getattr__ only runs when normal lookup fails. If ``tokenizer``
         # itself is missing (instance built without __init__, e.g. during
         # copy/unpickle), delegating would recurse forever; fail cleanly.
         if name == "tokenizer":
             raise AttributeError(name)
         return getattr(self.tokenizer, name)
+print("🚀 Carregando modelo...")
+log.info("🚀 Carregando modelo...")
 model_config = {
     "device_map": device,
+    "torch_dtype": torch.float16,
     "low_cpu_mem_usage": True,
     "use_cache": True,
     "trust_remote_code": True,
+    "attn_implementation": "eager",
 }
 model = AutoPeftModelForCausalLM.from_pretrained(
     "habulaj/filterinstruct180",
     **model_config
 )
 tokenizer = AutoTokenizer.from_pretrained(
     "habulaj/filterinstruct180",
     use_fast=True,
     padding_side="left",
+    model_max_length=1024,
+    clean_up_tokenization_spaces=False,
 )
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 tokenizer = OptimizedTokenizerWrapper(tokenizer)
 model.eval()
 for param in model.parameters():
     param.requires_grad = False
 try:
     model = torch.compile(model, mode="reduce-overhead")
+    log.info("✅ Modelo compilado")
 except Exception as e:
     log.warning(f"⚠️ Torch compile não disponível: {e}")
 if hasattr(model, 'fuse_linear_layers'):
     model.fuse_linear_layers()
+log.info("✅ Modelo carregado")
+tokenizer.tokenizer.chat_template = """{% for message in messages %}{% if message['role'] == 'user' %}{% if loop.first %}<|begin_of_text|><|start_header_id|>user<|end_header_id|>
+{{ message['content'] }}<|eot_id|>{% else %}<|start_header_id|>user<|end_header_id|>
+{{ message['content'] }}<|eot_id|>{% endif %}{% elif message['role'] == 'assistant' %}<|start_header_id|>assistant<|end_header_id|>
+{{ message['content'] }}<|eot_id|>{% endif %}{% endfor %}{% if add_generation_prompt %}<|start_header_id|>assistant<|end_header_id|>
+{% endif %}"""
 # Generation settings object, built once at module import time.
 # eos_token_id is used for both end-of-sequence and padding.
 generation_config = GenerationConfig(
+    max_new_tokens=150,
     # NOTE(review): do_sample=False below selects non-sampled decoding, so
     # temperature presumably has no effect here (transformers may warn about
     # the combination) — confirm and drop one of the two.
+    temperature=0.8,
+    do_sample=False,
     use_cache=True,
     eos_token_id=tokenizer.eos_token_id,
     pad_token_id=tokenizer.eos_token_id,
     repetition_penalty=1.1,
     # NOTE(review): length_penalty and early_stopping are documented as
     # beam-search options; with num_beams=1 they look inert — confirm.
     length_penalty=1.0,
+    num_beams=1,
     early_stopping=True,
 )
 def extract_json_optimized(text: str) -> str:
     if not hasattr(extract_json_optimized, 'pattern'):
         extract_json_optimized.pattern = re.compile(r'\{.*?\}', re.DOTALL)
     return match.group(0) if match else text
 def preprocess_input_optimized(title: str, content: str) -> List[Dict[str, str]]:
     max_title_length = 100
     max_content_length = 500
     }]
 def analyze_news_optimized(title: str, content: str) -> str:
     try:
         with memory_efficient_context():
             start_time = time.time()
             messages = preprocess_input_optimized(title, content)
             inputs = tokenizer.apply_chat_template(
                 messages,
                 tokenize=True,
                 add_generation_prompt=True,
                 return_tensors="pt",
+                padding=False,
                 truncation=True,
                 max_length=1024,
             )
             with torch.no_grad(), torch.inference_mode():
                 with torch.autocast(device_type='cpu', dtype=torch.float16):
                     outputs = model.generate(
                         output_attentions=False,
                         return_dict_in_generate=False,
                         use_cache=True,
+                        do_sample=False,
                     )
             generated_tokens = outputs[0][inputs.shape[1]:]
             generated_text = tokenizer.decode(
                 generated_tokens,
                 clean_up_tokenization_spaces=False
             )
             json_result = extract_json_optimized(generated_text)
             duration = time.time() - start_time
             log.info(f"✅ Análise concluída em {duration:.2f}s")
             del outputs, inputs, generated_tokens
             try:
                 parsed_json = json.loads(json_result)
                 return json.dumps(parsed_json, indent=2, ensure_ascii=False)
         log.exception("❌ Erro durante análise:")
         return f"Erro durante a análise: {str(e)}"
 def warmup_optimized():
     """Run three throwaway analyses as a warm-up, then collect garbage.

     Each pass calls ``analyze_news_optimized`` with a distinct dummy title
     and content and discards the returned text. Any exception raised along
     the way is logged as a warning and swallowed, so a failed warm-up never
     propagates to the caller.
     """
     log.info("🔥 Executando warmup...")
     try:
         for i in range(3):
             # Only the side effect of running the pipeline matters here;
             # the result is intentionally not kept.
             analyze_news_optimized(f"Test title {i}", f"Test content {i}")
             log.info(f"Warmup {i+1}/3 concluído")
         gc.collect()
         log.info("✅ Warmup concluído")
     except Exception as e:
         log.warning(f"⚠️ Warmup falhou: {e}")
 def create_optimized_interface():
     with gr.Blocks(
         title="Analisador de Notícias - Ultra Otimizado",
         theme=gr.themes.Monochrome(),
             padding: 15px;
             margin: 10px 0;
         }
         """
     ) as demo:
         gr.Markdown("# 🚀 Analisador de Notícias - Ultra Otimizado")
         with gr.Row():
             with gr.Column(scale=1):
                     max_lines=6
                 )
+                analyze_btn = gr.Button("⚡ Analisar Notícia", variant="primary")
                 with gr.Row():
                     example_btn1 = gr.Button("📻 Exemplo 1", size="sm")
                     example_btn2 = gr.Button("⚽ Exemplo 2", size="sm")
                     show_copy_button=True
                 )
+                status = gr.Textbox(
+                    label="Status",
+                    value="⚡ Pronto para análise",
+                    interactive=False
+                )
         def analyze_with_status(title: str, content: str) -> Tuple[str, str]:
             if not title.strip() or not content.strip():
                 return "❌ Preencha todos os campos", "Erro: Campos obrigatórios não preenchidos"
             except Exception as e:
                 return f"❌ Erro: {str(e)}", f"Erro: {str(e)}"
         examples = [
             ("Legendary Musician Carlos Mendes Dies at 78", "Carlos Mendes, the internationally acclaimed Brazilian guitarist and composer known for blending traditional bossa nova with modern jazz, has died at the age of 78."),
             ("Brazil Defeats Argentina 2-1 in Copa America Final", "In a thrilling match at the Maracana Stadium, Brazil secured victory over Argentina with goals from Neymar and Vinicius Jr. The match was watched by over 200 million viewers worldwide."),
             ("Tech Giant Announces Major Layoffs Affecting 10,000 Employees", "The technology company announced significant workforce reductions citing economic uncertainty and changing market conditions. The layoffs will affect multiple departments across different regions.")
         ]
         analyze_btn.click(
             fn=analyze_with_status,
             inputs=[title_input, content_input],
             outputs=[status, output]
         )
+        example_btn1.click(
+            fn=lambda: examples[0],
+            outputs=[title_input, content_input]
+        )
+        example_btn2.click(
+            fn=lambda: examples[1],
+            outputs=[title_input, content_input]
+        )
+        example_btn3.click(
+            fn=lambda: examples[2],
+            outputs=[title_input, content_input]
+        )
     return demo
 if __name__ == "__main__":
     warmup_optimized()
+    print("🚀 Iniciando interface...")
     demo = create_optimized_interface()
     demo.launch(
         share=False,
         server_port=7860,
         show_error=True,
         max_threads=num_threads,
+        show_api=False,
     )