Spaces:

DHEIVER
/

PDFQAApp

Runtime error

App Files Files Community

DHEIVER committed on Jan 28

Commit

5992817

verified ·

1 Parent(s): 0ed0b30

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -43

app.py CHANGED Viewed

@@ -9,19 +9,48 @@ from nltk.tokenize import sent_tokenize
 import fitz  # PyMuPDF
 import logging
 from tqdm import tqdm
 # Configurar logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Baixar recursos necessários do NLTK
-try:
-    nltk.download('punkt', quiet=True)
-except Exception as e:
-    logger.warning(f"Erro ao baixar recursos NLTK: {e}")
 class PDFQuestionAnswering:
     def __init__(self):
         # Usar modelo multilíngue mais avançado
         self.model_name = "deepset/roberta-base-squad2"
         try:
@@ -36,12 +65,45 @@ class PDFQuestionAnswering:
             logger.error(f"Erro ao carregar o modelo: {e}")
             raise
     def extract_text_from_pdf(self, pdf_file: str) -> Tuple[str, Dict[int, str]]:
         """
-        Extrai texto do PDF usando PyMuPDF para melhor precisão
-        Retorna o texto completo e um dicionário mapeando números de página para texto
         """
         try:
             doc = fitz.open(pdf_file)
             full_text = ""
             page_text = {}
@@ -52,55 +114,75 @@ class PDFQuestionAnswering:
                 page_text[page_num] = text
                 full_text += text + "\n"
             return full_text, page_text
         except Exception as e:
-            logger.error(f"Erro na extração do PDF: {e}")
-            raise
-        finally:
-            if 'doc' in locals():
-                doc.close()
     def preprocess_text(self, text: str) -> str:
         """
         Pré-processa o texto removendo caracteres especiais e formatação indesejada
         """
-        # Remover quebras de linha extras
-        text = re.sub(r'\n+', ' ', text)
-        # Remover espaços múltiplos
-        text = re.sub(r'\s+', ' ', text)
-        # Remover caracteres especiais
-        text = re.sub(r'[^\w\s.,!?-]', '', text)
-        return text.strip()
-    def split_into_chunks(self, text: str, max_length: int = 512) -> List[str]:
-        """
-        Divide o texto em chunks menores respeitando limites de sentenças
-        """
-        sentences = sent_tokenize(text)
-        chunks = []
-        current_chunk = ""
-        for sentence in sentences:
-            if len(current_chunk) + len(sentence) < max_length:
-                current_chunk += sentence + " "
-            else:
-                chunks.append(current_chunk.strip())
-                current_chunk = sentence + " "
-        if current_chunk:
-            chunks.append(current_chunk.strip())
-        return chunks
     def get_best_answer(self, question: str, chunks: List[str]) -> Dict:
         """
         Obtém a melhor resposta considerando todos os chunks de texto
         """
         try:
             answers = []
             for chunk in chunks:
-                result = self.nlp(question=question, context=chunk)
-                answers.append(result)
             # Ordenar por score
             best_answer = max(answers, key=lambda x: x['score'])
@@ -112,16 +194,36 @@ class PDFQuestionAnswering:
             }
         except Exception as e:
             logger.error(f"Erro ao processar resposta: {e}")
-            return {'answer': "Desculpe, não consegui processar sua pergunta.", 'score': 0, 'context': ""}
     def answer_question(self, pdf_file: gr.File, question: str) -> Dict:
         """
         Processa o PDF e responde à pergunta
         """
         try:
             # Extrair texto do PDF
             full_text, page_text = self.extract_text_from_pdf(pdf_file.name)
             # Pré-processar texto
             processed_text = self.preprocess_text(full_text)
@@ -202,4 +304,5 @@ def create_interface():
 if __name__ == "__main__":
     # Criar e iniciar a interface
     demo = create_interface()
-    demo.launch(share=True, debug=True)

 import fitz  # PyMuPDF
 import logging
 from tqdm import tqdm
+import os
 # Configurar logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+class NLTKDownloader:
+    @staticmethod
+    def download_nltk_resources():
+        """
+        Download recursos NLTK necessários e configura o diretório de dados
+        """
+        try:
+            # Configurar diretório de dados NLTK
+            nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
+            if not os.path.exists(nltk_data_dir):
+                os.makedirs(nltk_data_dir)
+            nltk.data.path.append(nltk_data_dir)
+            # Lista de recursos necessários
+            resources = ['punkt']
+            for resource in resources:
+                try:
+                    nltk.data.find(f'tokenizers/{resource}')
+                    logger.info(f"Recurso NLTK '{resource}' já está instalado")
+                except LookupError:
+                    logger.info(f"Baixando recurso NLTK '{resource}'...")
+                    nltk.download(resource, download_dir=nltk_data_dir, quiet=True)
+            return True
+        except Exception as e:
+            logger.error(f"Erro ao baixar recursos NLTK: {e}")
+            return False
 class PDFQuestionAnswering:
     def __init__(self):
+        # Inicializar recursos NLTK
+        if not NLTKDownloader.download_nltk_resources():
+            logger.warning("Alguns recursos NLTK podem não estar disponíveis")
         # Usar modelo multilíngue mais avançado
         self.model_name = "deepset/roberta-base-squad2"
         try:
             logger.error(f"Erro ao carregar o modelo: {e}")
             raise
+    def split_text_simple(self, text: str, max_length: int = 512) -> List[str]:
+        """
+        Método alternativo de divisão de texto caso o NLTK não esteja disponível
+        """
+        words = text.split()
+        chunks = []
+        current_chunk = []
+        current_length = 0
+        for word in words:
+            if current_length + len(word) + 1 <= max_length:
+                current_chunk.append(word)
+                current_length += len(word) + 1
+            else:
+                chunks.append(' '.join(current_chunk))
+                current_chunk = [word]
+                current_length = len(word) + 1
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+        return chunks
+    def split_into_chunks(self, text: str, max_length: int = 512) -> List[str]:
+        """
+        Divide o texto em chunks menores, com fallback para método simples
+        """
+        try:
+            return [chunk for chunk in self.split_text_simple(text, max_length)]
+        except Exception as e:
+            logger.warning(f"Erro ao dividir texto com NLTK: {e}. Usando método simples.")
+            return self.split_text_simple(text, max_length)
     def extract_text_from_pdf(self, pdf_file: str) -> Tuple[str, Dict[int, str]]:
         """
+        Extrai texto do PDF com fallback para PyPDF2
         """
         try:
+            # Tentar primeiro com PyMuPDF
             doc = fitz.open(pdf_file)
             full_text = ""
             page_text = {}
                 page_text[page_num] = text
                 full_text += text + "\n"
+            doc.close()
             return full_text, page_text
         except Exception as e:
+            logger.warning(f"Erro com PyMuPDF: {e}. Tentando PyPDF2...")
+            try:
+                # Fallback para PyPDF2
+                with open(pdf_file, "rb") as file:
+                    reader = PyPDF2.PdfReader(file)
+                    full_text = ""
+                    page_text = {}
+                    for i, page in enumerate(reader.pages):
+                        text = page.extract_text()
+                        page_text[i] = text
+                        full_text += text + "\n"
+                return full_text, page_text
+            except Exception as e2:
+                logger.error(f"Erro ao extrair texto do PDF: {e2}")
+                raise
     def preprocess_text(self, text: str) -> str:
         """
         Normalize extracted text by dropping unwanted symbols and collapsing whitespace.

         Letters (including accented ones), digits, underscores, whitespace
         and the punctuation ``. , ! ? -`` are kept; every other character is
         removed. Line breaks and runs of whitespace become a single space.

         Args:
             text: Raw text to clean.

         Returns:
             The cleaned, stripped text. If ``text`` is not a string the
             problem is logged and the input is returned unchanged.
         """
         try:
             # Keep the hyphen LAST in the class so it is a literal character.
             # Written as `?-á` it formed a range (U+003F..U+00E1) that kept
             # symbols such as @ [ ] { } and stripped real hyphens. For str
             # patterns \w already matches accented letters, so no explicit
             # accent list is needed.
             text = re.sub(r'[^\w\s.,!?-]', '', text)
             # Collapse whitespace (line breaks included) after the removal so
             # that deleted symbols do not leave double spaces behind.
             text = re.sub(r'\s+', ' ', text)
             return text.strip()
         except TypeError as e:
             # re.sub raises TypeError for non-string input; stay best-effort.
             logger.warning(f"Erro no pré-processamento: {e}")
             return text  # Return the original value on error
     def get_best_answer(self, question: str, chunks: List[str]) -> Dict:
         """
         Obtém a melhor resposta considerando todos os chunks de texto
         """
         try:
+            if not chunks:
+                return {
+                    'answer': "Não foi possível processar o texto do documento.",
+                    'score': 0,
+                    'context': ""
+                }
             answers = []
             for chunk in chunks:
+                if not chunk.strip():
+                    continue
+                try:
+                    result = self.nlp(question=question, context=chunk)
+                    answers.append(result)
+                except Exception as e:
+                    logger.warning(f"Erro ao processar chunk: {e}")
+                    continue
+            if not answers:
+                return {
+                    'answer': "Não foi possível encontrar uma resposta no documento.",
+                    'score': 0,
+                    'context': ""
+                }
             # Ordenar por score
             best_answer = max(answers, key=lambda x: x['score'])
             }
         except Exception as e:
             logger.error(f"Erro ao processar resposta: {e}")
+            return {
+                'answer': "Ocorreu um erro ao processar sua pergunta.",
+                'score': 0,
+                'context': ""
+            }
     def answer_question(self, pdf_file: gr.File, question: str) -> Dict:
         """
         Processa o PDF e responde à pergunta
         """
         try:
+            if not pdf_file or not question:
+                return {
+                    'answer': "Por favor, forneça um arquivo PDF e uma pergunta.",
+                    'score': 0,
+                    'confidence': "0%",
+                    'context': ""
+                }
             # Extrair texto do PDF
             full_text, page_text = self.extract_text_from_pdf(pdf_file.name)
+            if not full_text.strip():
+                return {
+                    'answer': "Não foi possível extrair texto do PDF fornecido.",
+                    'score': 0,
+                    'confidence': "0%",
+                    'context': ""
+                }
             # Pré-processar texto
             processed_text = self.preprocess_text(full_text)
 if __name__ == "__main__":
     # Build the interface and start the app
     demo = create_interface()
     # Disable server-side rendering. Gradio's launch() keyword for this is
     # `ssr_mode`; `ssr` is not a launch() parameter and is rejected as an
     # unexpected keyword argument at startup.
     demo.launch(share=False, debug=True, ssr_mode=False)