Spaces:

DHEIVER
/

PDFQAApp

Runtime error

App Files Files Community

DHEIVER committed on Jan 28

Commit

870c41d

verified ·

1 Parent(s): 9247698

Update app.py

Browse files

Files changed (1) hide show

app.py +202 -35

app.py CHANGED Viewed

@@ -1,38 +1,205 @@
 import gradio as gr
-from transformers import pipeline
 import PyPDF2
-# Carregar o modelo de linguagem gratuito da Hugging Face
-nlp = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
-def extract_text_from_pdf(pdf_file):
-    with open(pdf_file.name, "rb") as file:
-        reader = PyPDF2.PdfFileReader(file)
-        text = ""
-        for page_num in range(reader.numPages):
-            page = reader.getPage(page_num)
-            text += page.extract_text()
-    return text
-def answer_question(pdf_file, question):
-    # Extrair texto do PDF
-    context = extract_text_from_pdf(pdf_file)
-    # Usar o modelo para responder a pergunta
-    result = nlp(question=question, context=context)
-    return result['answer']
-# Interface Gradio
-iface = gr.Interface(
-    fn=answer_question,
-    inputs=[
-        gr.File(label="Carregar PDF"),
-        gr.Textbox(label="Pergunta")
-    ],
-    outputs=gr.Textbox(label="Resposta"),
-    title="QA sobre PDF",
-    description="Carregue um PDF e faça perguntas sobre o conteúdo."
-)
-# Iniciar a interface
-iface.launch()

 import gradio as gr
+from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
 import PyPDF2
+import torch
+import re
+from typing import List, Dict, Tuple
+import nltk
+from nltk.tokenize import sent_tokenize
+import fitz  # PyMuPDF
+import logging
+from tqdm import tqdm
+# Configure logging once for the whole module.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Fetch the NLTK sentence-tokenizer data used by sent_tokenize().
+# This is best effort: a failure is only logged so the app can still start.
+# Both resource names are requested because older NLTK releases load 'punkt'
+# while newer ones load 'punkt_tab'.
+# nltk.download() reports failure by returning False rather than raising,
+# so the return value is checked in addition to catching exceptions.
+for _nltk_resource in ('punkt', 'punkt_tab'):
+    try:
+        if not nltk.download(_nltk_resource, quiet=True):
+            logger.warning(f"Erro ao baixar recursos NLTK: {_nltk_resource}")
+    except Exception as e:
+        logger.warning(f"Erro ao baixar recursos NLTK: {e}")
+class PDFQuestionAnswering:
+    """Answer questions about a PDF with an extractive QA model.
+
+    The PDF text is extracted with PyMuPDF, normalised, split into
+    sentence-aligned chunks, and every chunk is passed to a Hugging Face
+    ``question-answering`` pipeline. The highest-scoring span is returned.
+    """
+    def __init__(self):
+        """Load the tokenizer, the model and the QA pipeline.
+
+        Raises:
+            Exception: any error raised while loading is logged and re-raised
+                unchanged.
+        """
+        # RoBERTa checkpoint fine-tuned on SQuAD 2.0.
+        # NOTE(review): the previous comment called this model multilingual;
+        # its model card lists English only - confirm before relying on it
+        # for Portuguese documents.
+        self.model_name = "deepset/roberta-base-squad2"
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_name)
+            # device=0 is the first CUDA GPU, -1 runs the pipeline on CPU.
+            self.nlp = pipeline('question-answering',
+                              model=self.model,
+                              tokenizer=self.tokenizer,
+                              device=0 if torch.cuda.is_available() else -1)
+            logger.info(f"Modelo {self.model_name} carregado com sucesso")
+        except Exception as e:
+            logger.error(f"Erro ao carregar o modelo: {e}")
+            raise
+    def extract_text_from_pdf(self, pdf_file: str) -> Tuple[str, Dict[int, str]]:
+        """Extract the text of every page of a PDF using PyMuPDF.
+
+        Args:
+            pdf_file: path of the PDF file to open.
+
+        Returns:
+            A tuple ``(full_text, page_text)``: ``full_text`` is all page
+            texts concatenated, each followed by a newline; ``page_text`` maps
+            the zero-based page index to that page's text.
+
+        Raises:
+            Exception: any error raised while opening or reading the PDF is
+                logged and re-raised unchanged.
+        """
+        try:
+            # The context manager closes the document on success and on error,
+            # replacing the fragile "'doc' in locals()" check.
+            with fitz.open(pdf_file) as doc:
+                page_text = {
+                    page_num: page.get_text("text")
+                    for page_num, page in enumerate(doc)
+                }
+        except Exception as e:
+            logger.error(f"Erro na extração do PDF: {e}")
+            raise
+        full_text = "".join(text + "\n" for text in page_text.values())
+        return full_text, page_text
+    def preprocess_text(self, text: str) -> str:
+        """Normalise extracted text for the QA model.
+
+        Drops every character that is not a word character, whitespace or one
+        of ``. , ! ? -``, then collapses each whitespace run (newlines
+        included) into a single space and trims both ends.
+        """
+        # Strip symbols first: removing them after collapsing whitespace would
+        # leave double spaces behind ("a @ b" -> "a  b").
+        text = re.sub(r'[^\w\s.,!?-]', '', text)
+        # \s+ also covers runs of newlines, so one pass is enough.
+        text = re.sub(r'\s+', ' ', text)
+        return text.strip()
+    def split_into_chunks(self, text: str, max_length: int = 512) -> List[str]:
+        """Split text into chunks that respect sentence boundaries.
+
+        Sentences are packed greedily into a chunk while its length stays
+        below ``max_length`` characters. A single sentence longer than
+        ``max_length`` becomes a chunk of its own (it is not cut).
+
+        Args:
+            text: the text to split.
+            max_length: soft upper bound, in characters, for one chunk.
+
+        Returns:
+            The list of non-empty chunks, in document order.
+        """
+        try:
+            sentences = sent_tokenize(text)
+        except LookupError as e:
+            # The NLTK tokenizer data is downloaded on a best-effort basis;
+            # fall back to a plain punctuation split when it is missing.
+            logger.warning(f"Tokenizador de sentenças do NLTK indisponível: {e}")
+            sentences = re.split(r'(?<=[.!?])\s+', text)
+        chunks = []
+        current_chunk = ""
+        for sentence in sentences:
+            if not sentence.strip():
+                continue
+            if len(current_chunk) + len(sentence) < max_length:
+                current_chunk += sentence + " "
+            else:
+                # Guard against emitting an empty chunk when the very first
+                # sentence is already longer than max_length.
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+                current_chunk = sentence + " "
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+        return chunks
+    def get_best_answer(self, question: str, chunks: List[str]) -> Dict:
+        """Run the QA pipeline on every chunk and keep the best-scoring answer.
+
+        Args:
+            question: the user's question.
+            chunks: candidate context passages.
+
+        Returns:
+            A dict with keys ``'answer'``, ``'score'`` and ``'context'``.
+            ``'context'`` is the chunk the answer was extracted from. When no
+            chunk can be processed, a fallback dict with an apology message,
+            score 0 and empty context is returned instead of raising.
+        """
+        fallback = {'answer': "Desculpe, não consegui processar sua pergunta.", 'score': 0, 'context': ""}
+        try:
+            best_answer = None
+            for chunk in chunks:
+                # Skip blank chunks rather than passing an empty context on.
+                if not chunk.strip():
+                    continue
+                result = self.nlp(question=question, context=chunk)
+                # Strict '>' keeps the earliest chunk on ties, like max().
+                if best_answer is None or result['score'] > best_answer['score']:
+                    # The pipeline result carries no 'context' entry, so the
+                    # originating chunk is recorded here explicitly.
+                    best_answer = {
+                        'answer': result['answer'],
+                        'score': result['score'],
+                        'context': chunk
+                    }
+            if best_answer is None:
+                logger.error("Erro ao processar resposta: nenhum texto disponível")
+                return fallback
+            return best_answer
+        except Exception as e:
+            logger.error(f"Erro ao processar resposta: {e}")
+            return fallback
+    def answer_question(self, pdf_file, question: str) -> Dict:
+        """Extract the PDF text and answer the question about it.
+
+        Args:
+            pdf_file: the uploaded file as delivered by the Gradio file
+                input: either a path string or an object whose ``name``
+                attribute holds the path.
+            question: the user's question.
+
+        Returns:
+            A dict with keys ``'answer'``, ``'score'``, ``'context'`` and
+            ``'confidence'`` (the score formatted as a percentage). Any
+            failure is logged and reported through a generic error dict with
+            the same keys; this method does not raise.
+        """
+        try:
+            # Accept both a plain path and a file-like wrapper with .name.
+            pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
+            # Extract text from the PDF (the per-page mapping is not needed here).
+            full_text, _page_text = self.extract_text_from_pdf(pdf_path)
+            # Normalise, chunk, then query the model.
+            processed_text = self.preprocess_text(full_text)
+            chunks = self.split_into_chunks(processed_text)
+            result = self.get_best_answer(question, chunks)
+            # Human-readable confidence derived from the model score.
+            result['confidence'] = f"{result['score']*100:.2f}%"
+            return result
+        except Exception as e:
+            logger.error(f"Erro ao processar pergunta: {e}")
+            return {
+                'answer': "Ocorreu um erro ao processar sua pergunta.",
+                'score': 0,
+                'confidence': "0%",
+                'context': ""
+            }
+def create_interface():
+    """Build and return the Gradio Blocks UI for the PDF QA system.
+
+    One ``PDFQuestionAnswering`` instance is created per call and shared by
+    the click handler. The layout is a header, a row with an input column
+    (PDF upload, question box, submit button) and an output column (answer,
+    confidence, context), followed by usage tips.
+    """
+    qa_system = PDFQuestionAnswering()
+    def process_question(pdf, question):
+        # Map the result dict onto the three output boxes, in display order.
+        result = qa_system.answer_question(pdf, question)
+        return tuple(result[key] for key in ('answer', 'confidence', 'context'))
+    with gr.Blocks(title="Sistema Avançado de QA sobre PDFs") as iface:
+        # Page header.
+        gr.Markdown("""
+        # Sistema de Perguntas e Respostas sobre PDFs
+        Este sistema utiliza um modelo de linguagem avançado para responder perguntas sobre documentos PDF.
+        Carregue um PDF e faça suas perguntas!
+        """)
+        with gr.Row():
+            # Left column: user inputs.
+            with gr.Column():
+                pdf_input = gr.File(label="Carregar PDF", file_types=[".pdf"])
+                question_input = gr.Textbox(label="Sua Pergunta", placeholder="Digite sua pergunta aqui...")
+                submit_btn = gr.Button("Obter Resposta", variant="primary")
+            # Right column: model outputs.
+            with gr.Column():
+                answer_output = gr.Textbox(label="Resposta")
+                confidence_output = gr.Textbox(label="Confiança da Resposta")
+                context_output = gr.Textbox(label="Contexto da Resposta", lines=5)
+        # Wire the button to the handler.
+        input_components = [pdf_input, question_input]
+        output_components = [answer_output, confidence_output, context_output]
+        submit_btn.click(fn=process_question, inputs=input_components, outputs=output_components)
+        # Usage tips shown below the form.
+        gr.Markdown("""
+        ### Dicas de Uso
+        - Faça perguntas específicas e diretas
+        - O sistema funciona melhor com PDFs bem formatados
+        - A confiança indica o quanto o sistema está seguro da resposta
+        """)
+    return iface
+if __name__ == "__main__":
+    # Script entry point: build the Blocks UI and start the Gradio server.
+    demo = create_interface()
+    # NOTE(review): the page header reports "Runtime error" for this Space.
+    # share=True asks Gradio for a public share link, which is presumably
+    # unnecessary on a hosted Space - confirm whether it should be dropped.
+    demo.launch(share=True, debug=True)