import os
from typing import List, Tuple

import faiss
import gradio as gr
import PyPDF2
from huggingface_hub import InferenceClient
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

client = InferenceClient("google/gemma-3-27b-it")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
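# The chat model runs remotely via the Hugging Face Inference API, while
# all-MiniLM-L6-v2 runs locally and produces 384-dimensional embeddings
# that back the FAISS index built below.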


class AdvancedPDFKnowledgeBase:
    """PDF knowledge base with hybrid retrieval: dense recall (FAISS)
    followed by lexical reranking (BM25)."""

    def __init__(self):
        self.chunks = []
        self.chunk_embeddings = None
        self.index = None
        self.bm25 = None

    def _split_into_chunks(self, text: str, chunk_size: int = 500) -> List[str]:
        # Split the text into fixed-size, non-overlapping windows of whole words.
        words = text.split()
        return [' '.join(words[i:i + chunk_size])
                for i in range(0, len(words), chunk_size)]
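
    # Note: the windows above are disjoint. A common variant adds some overlap
    # (say 50 words) between consecutive chunks so that an answer spanning a
    # chunk boundary is not split across two chunks.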

    def load_pdfs(self, pdf_files: List[gr.File]) -> str:
        if not pdf_files:
            return "No PDFs selected."
        self.chunks = []
        for file in pdf_files:
            with open(file.name, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                text = ""
                for page in pdf_reader.pages:
                    # extract_text() may return None for pages without a text layer.
                    text += (page.extract_text() or "") + "\n"
            chunks = self._split_into_chunks(text)
            for chunk in chunks:
                self.chunks.append({
                    'filename': os.path.basename(file.name),
                    'content': chunk
                })

        if not self.chunks:
            return "No PDFs found."

        # Dense index: embed every chunk and add the vectors to an exact
        # (brute-force) L2 FAISS index.
        contents = [chunk['content'] for chunk in self.chunks]
        self.chunk_embeddings = embedder.encode(contents, convert_to_numpy=True)
        dimension = self.chunk_embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(self.chunk_embeddings)
        # Lexical index: BM25 over whitespace-tokenized chunks.
        tokenized_chunks = [chunk['content'].split() for chunk in self.chunks]
        self.bm25 = BM25Okapi(tokenized_chunks)
        return f"Loaded {len(self.chunks)} chunks from {len(set(c['filename'] for c in self.chunks))} PDFs."

    def get_relevant_context(self, query: str, k: int = 5, rerank_k: int = 3) -> str:
        if self.index is None or not self.chunks:
            return "No documents loaded yet."

        query_embedding = embedder.encode([query], convert_to_numpy=True)
        distances, indices = self.index.search(query_embedding, k)
        candidates = [self.chunks[idx] for idx in indices[0]]

        # BM25 scores are computed over the whole corpus; the candidates'
        # scores are looked up by their corpus indices.
        tokenized_query = query.split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        candidate_scores = [(candidates[i], bm25_scores[indices[0][i]])
                            for i in range(len(candidates))]
        candidate_scores.sort(key=lambda x: x[1], reverse=True)

        top_chunks = candidate_scores[:rerank_k]
        context = ""
        for chunk, score in top_chunks:
            context += f"**Document**: {chunk['filename']}\n"
            context += f"**Excerpt**: {chunk['content'][:500]}...\n"
            context += f"**BM25 Score**: {score:.2f}\n\n"
        return context
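
    # A hypothetical refinement would be to fuse the two rankings instead of
    # reranking purely by BM25, e.g. reciprocal rank fusion (RRF):
    #
    #     def _rrf_score(dense_rank: int, lexical_rank: int, c: int = 60) -> float:
    #         # Higher is better; c damps the influence of low-ranked items.
    #         return 1.0 / (c + dense_rank) + 1.0 / (c + lexical_rank)
    #
    # This sketch is not wired into the app; it only illustrates the idea.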


knowledge_base = AdvancedPDFKnowledgeBase()


def respond(
    message: str,
    history: List[Tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    k_initial: int,
    k_final: int
):
    if not knowledge_base.chunks:
        yield "Please load the PDFs first.", "", ""
        return

    context = knowledge_base.get_relevant_context(message, k_initial, k_final)

    rag_prompt = f"""Use the context from the documents to answer:

{context}

Question: {message}

Answer based on the context when it is relevant."""

    messages = [{"role": "system", "content": system_message}]

    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": rag_prompt})

    response = ""
    try:
        # Stream tokens from the model, yielding the growing answer so the UI
        # can update incrementally.
        for message_chunk in client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            token = message_chunk.choices[0].delta.content
            if token:
                response += token
                yield response, context, ""
    except Exception as e:
        yield f"Error generating the response: {str(e)}", context, ""


def load_pdfs(pdf_files: List[gr.File]):
    # Thin wrapper so the Gradio button can call into the knowledge base.
    status = knowledge_base.load_pdfs(pdf_files)
    return status


with gr.Blocks(title="Advanced RAG with PDFs", theme=gr.themes.Soft()) as demo:
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("# RAG Chatbot with PDFs")
            gr.Markdown("Drag and drop your PDFs below, or click to select them.")

        with gr.Column(scale=1):
            load_status = gr.Textbox(label="Load Status", interactive=False)

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation", height=400)
            msg = gr.Textbox(label="Your question", placeholder="Type your question here...")
            submit_btn = gr.Button("Send")

        with gr.Column(scale=1):
            context_box = gr.Markdown(label="Retrieved Context", value="The retrieved context will appear here after you ask a question.")

    with gr.Accordion("Settings", open=False):
        with gr.Row():
            with gr.Column():
                pdf_upload = gr.File(label="Upload PDFs", file_types=[".pdf"], file_count="multiple", interactive=True)
                load_btn = gr.Button("Load PDFs")

            with gr.Column():
                system_msg = gr.Textbox(
                    label="System Message",
                    value="You are a helpful assistant that answers based on PDF documents."
                )
                max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max Tokens")
                temperature = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature")
                top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")

        with gr.Row():
            k_initial = gr.Slider(1, 20, value=5, step=1, label="Initial Candidates (FAISS)")
            k_final = gr.Slider(1, 10, value=3, step=1, label="Final Results (BM25)")

    def submit_message(message, history, system_message, max_tokens, temperature, top_p, k_initial, k_final):
        history = history or []
        # Keep a snapshot of the prior turns for respond(), then stream by
        # rewriting the last (user, assistant) pair in place; appending a new
        # pair per streamed token would duplicate the exchange in the chat.
        prior_history = list(history)
        history.append((message, ""))
        for response, context, _ in respond(message, prior_history, system_message, max_tokens, temperature, top_p, k_initial, k_final):
            history[-1] = (message, response)
            yield history, context, ""

    submit_btn.click(
        submit_message,
        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p, k_initial, k_final],
        outputs=[chatbot, context_box, msg]
    )
    msg.submit(
        submit_message,
        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p, k_initial, k_final],
        outputs=[chatbot, context_box, msg]
    )
    load_btn.click(
        load_pdfs,
        inputs=[pdf_upload],
        outputs=[load_status]
    )


if __name__ == "__main__":
    demo.launch()
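
# To run locally (package names are the usual PyPI ones; adjust as needed):
#   pip install gradio huggingface_hub PyPDF2 sentence-transformers faiss-cpu rank_bm25
#   python <this_file>.py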