import gradio as gr
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import PyPDF2
import torch
import re
from typing import List, Dict, Tuple
import nltk
from nltk.tokenize import sent_tokenize
import fitz  # PyMuPDF
import logging
from tqdm import tqdm

# Configure logging for the whole application.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Fetch the NLTK data that sent_tokenize() needs.
# Recent NLTK releases look up the "punkt_tab" resource instead of the
# pickled "punkt" models, so both are requested to work on old and new
# installations alike.  nltk.download() reports most failures by returning
# False rather than raising, so the return value is checked as well.
for _nltk_resource in ('punkt', 'punkt_tab'):
    try:
        if not nltk.download(_nltk_resource, quiet=True):
            logger.warning(f"Erro ao baixar recursos NLTK: {_nltk_resource}")
    except Exception as e:
        # Best effort: the data may already be installed locally.
        logger.warning(f"Erro ao baixar recursos NLTK: {e}")

class PDFQuestionAnswering:
    """Extractive question answering over the text of a PDF document.

    The PDF text is extracted with PyMuPDF, cleaned, split into
    sentence-aligned chunks, and every chunk is passed to a Hugging Face
    ``question-answering`` pipeline. The highest-scoring answer wins.
    """

    def __init__(self):
        """Load the tokenizer and model and build the QA pipeline.

        Raises:
            Exception: any error raised while loading is logged and re-raised.
        """
        # NOTE(review): the original comment described this checkpoint as
        # multilingual; it is a SQuAD 2.0 fine-tune and its model card appears
        # to list English only -- confirm before using it on other languages.
        self.model_name = "deepset/roberta-base-squad2"
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_name)
            self.nlp = pipeline('question-answering',
                                model=self.model,
                                tokenizer=self.tokenizer,
                                # First GPU when CUDA is available, otherwise CPU.
                                device=0 if torch.cuda.is_available() else -1)
            logger.info(f"Modelo {self.model_name} carregado com sucesso")
        except Exception as e:
            logger.error(f"Erro ao carregar o modelo: {e}")
            raise

    @staticmethod
    def _error_result(message: str) -> Dict:
        """Build a result dict, in the shape callers expect, for a failed request."""
        return {
            'answer': message,
            'score': 0,
            'confidence': "0%",
            'context': ""
        }

    def extract_text_from_pdf(self, pdf_file: str) -> Tuple[str, Dict[int, str]]:
        """Extract the text of every page of a PDF using PyMuPDF.

        Args:
            pdf_file: path of the PDF file to open.

        Returns:
            A tuple ``(full_text, page_text)``: the text of all pages, each
            followed by a newline, and a dict mapping the 0-based page index
            to that page's text.

        Raises:
            Exception: any error raised while opening or reading the document
                is logged and re-raised.
        """
        try:
            # The context manager closes the document even when reading fails.
            with fitz.open(pdf_file) as doc:
                page_text = {
                    page_num: page.get_text("text")
                    for page_num, page in enumerate(doc)
                }
        except Exception as e:
            logger.error(f"Erro na extração do PDF: {e}")
            raise

        full_text = "".join(text + "\n" for text in page_text.values())
        return full_text, page_text

    def preprocess_text(self, text: str) -> str:
        """Normalise extracted text before it is split into chunks.

        Every character that is not a word character, whitespace or one of
        ``. , ! ? -`` is removed, then each run of whitespace (newlines
        included) is collapsed to a single space.

        Args:
            text: raw text extracted from the PDF.

        Returns:
            The cleaned text without leading or trailing whitespace.
        """
        # Strip the unwanted characters first: removing them can leave two
        # spaces side by side, which the whitespace collapse then cleans up.
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def split_into_chunks(self, text: str, max_length: int = 512) -> List[str]:
        """Split text into chunks that end on sentence boundaries.

        Args:
            text: the text to split.
            max_length: size limit of a chunk, measured in characters (not
                model tokens). A single sentence longer than the limit is
                kept whole in a chunk of its own.

        Returns:
            The list of non-empty chunks, in document order.
        """
        chunks = []
        current_chunk = ""

        for sentence in sent_tokenize(text):
            if len(current_chunk) + len(sentence) < max_length:
                current_chunk += sentence + " "
                continue
            # The buffer is still empty when the very first sentence already
            # exceeds the limit; never emit an empty chunk in that case.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

    def get_best_answer(self, question: str, chunks: List[str]) -> Dict:
        """Run the QA pipeline on every chunk and keep the best answer.

        Args:
            question: the question to answer.
            chunks: candidate context passages; blank ones are skipped.

        Returns:
            A dict with the keys ``answer``, ``score`` and ``context`` (the
            chunk the answer was found in). When there is nothing to search
            or the pipeline fails, a fallback message with score 0 and an
            empty context is returned instead of raising.
        """
        fallback = {
            'answer': "Desculpe, não consegui processar sua pergunta.",
            'score': 0,
            'context': ""
        }
        try:
            best_answer = None
            best_chunk = ""
            for chunk in chunks:
                if not chunk or not chunk.strip():
                    # An empty context makes the pipeline raise.
                    continue
                result = self.nlp(question=question, context=chunk)
                if best_answer is None or result['score'] > best_answer['score']:
                    best_answer, best_chunk = result, chunk

            if best_answer is None:
                return fallback

            return {
                'answer': best_answer['answer'],
                'score': best_answer['score'],
                # The pipeline result carries no 'context' entry, so the chunk
                # that produced the answer is tracked here instead.
                'context': best_chunk
            }
        except Exception as e:
            logger.error(f"Erro ao processar resposta: {e}")
            return fallback

    def answer_question(self, pdf_file: object, question: str) -> Dict:
        """Answer a question about the content of an uploaded PDF.

        Args:
            pdf_file: the uploaded file, either a path string or an object
                whose ``name`` attribute holds the path; ``None`` when
                nothing was uploaded.
            question: the question to answer.

        Returns:
            A dict with the keys ``answer``, ``score``, ``confidence`` (the
            score formatted as a percentage) and ``context``. Invalid input
            and processing errors yield an explanatory ``answer`` with score
            0 instead of raising.
        """
        # Check for str first so that a plain path is used as-is.
        if isinstance(pdf_file, str):
            pdf_path = pdf_file
        else:
            pdf_path = getattr(pdf_file, "name", None)
        if not pdf_path:
            return self._error_result("Por favor, carregue um arquivo PDF.")
        if not question or not question.strip():
            return self._error_result("Por favor, digite uma pergunta.")

        try:
            # Only the concatenated text is needed; the per-page map is unused here.
            full_text, _ = self.extract_text_from_pdf(pdf_path)
            processed_text = self.preprocess_text(full_text)
            chunks = self.split_into_chunks(processed_text)

            result = self.get_best_answer(question, chunks)
            result['confidence'] = f"{result['score']*100:.2f}%"
            return result
        except Exception as e:
            logger.error(f"Erro ao processar pergunta: {e}")
            return self._error_result("Ocorreu um erro ao processar sua pergunta.")

def create_interface():
    """Build the Gradio Blocks interface of the PDF question-answering app.

    A single PDFQuestionAnswering instance is created and shared by every
    request handled through the returned interface.

    Returns:
        The assembled ``gr.Blocks`` object, not yet launched.
    """
    engine = PDFQuestionAnswering()

    def process_question(pdf, question):
        """Answer the question and return the values for the three output boxes."""
        outcome = engine.answer_question(pdf, question)
        return tuple(outcome[key] for key in ('answer', 'confidence', 'context'))

    with gr.Blocks(title="Sistema Avançado de QA sobre PDFs") as app:
        # Introductory text shown at the top of the page.
        gr.Markdown("""
        # Sistema de Perguntas e Respostas sobre PDFs
        
        Este sistema utiliza um modelo de linguagem avançado para responder perguntas sobre documentos PDF.
        Carregue um PDF e faça suas perguntas!
        """)

        with gr.Row():
            # Left column: the inputs and the submit button.
            with gr.Column():
                upload = gr.File(label="Carregar PDF", file_types=[".pdf"])
                question_box = gr.Textbox(
                    label="Sua Pergunta",
                    placeholder="Digite sua pergunta aqui...",
                )
                ask_button = gr.Button("Obter Resposta", variant="primary")

            # Right column: answer, confidence and the supporting context.
            with gr.Column():
                answer_box = gr.Textbox(label="Resposta")
                confidence_box = gr.Textbox(label="Confiança da Resposta")
                context_box = gr.Textbox(label="Contexto da Resposta", lines=5)

        ask_button.click(
            fn=process_question,
            inputs=[upload, question_box],
            outputs=[answer_box, confidence_box, context_box],
        )

        # Usage tips shown below the form.
        gr.Markdown("""
        ### Dicas de Uso
        - Faça perguntas específicas e diretas
        - O sistema funciona melhor com PDFs bem formatados
        - A confiança indica o quanto o sistema está seguro da resposta
        """)

    return app

if __name__ == "__main__":
    # Build the interface and start the Gradio server when run as a script.
    # NOTE(review): share=True presumably requests a publicly reachable Gradio
    # link and debug=True keeps the process attached with verbose errors --
    # confirm both are intended outside local testing.
    demo = create_interface()
    demo.launch(share=True, debug=True)