Update app.py
Browse files
app.py
CHANGED
@@ -3,23 +3,30 @@ from huggingface_hub import InferenceClient
|
|
3 |
import PyPDF2
|
4 |
from sentence_transformers import SentenceTransformer
|
5 |
import numpy as np
|
6 |
-
|
7 |
import os
|
8 |
from typing import List, Tuple
|
|
|
9 |
|
10 |
-
# Inicialização do cliente
|
11 |
client = InferenceClient("google/gemma-3-27b-it")
|
12 |
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
13 |
|
14 |
-
# Classe para gerenciar
|
15 |
-
class
|
16 |
def __init__(self):
|
17 |
-
self.
|
18 |
-
self.
|
|
|
|
|
19 |
|
|
|
|
|
|
|
|
|
|
|
20 |
def load_pdfs(self, pdf_directory: str):
|
21 |
-
|
22 |
-
self.documents = []
|
23 |
for filename in os.listdir(pdf_directory):
|
24 |
if filename.endswith('.pdf'):
|
25 |
pdf_path = os.path.join(pdf_directory, filename)
|
@@ -28,37 +35,51 @@ class PDFKnowledgeBase:
|
|
28 |
text = ""
|
29 |
for page in pdf_reader.pages:
|
30 |
text += page.extract_text() + "\n"
|
31 |
-
self.
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
-
def get_relevant_context(self, query: str, k: int = 3) -> str:
|
41 |
-
|
42 |
-
if self.embeddings is None or len(self.documents) == 0:
|
43 |
return "Nenhum documento carregado ainda."
|
44 |
|
45 |
-
query_embedding = embedder.encode(query, convert_to_numpy=True)
|
46 |
-
|
|
|
47 |
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
50 |
|
51 |
-
|
52 |
context = ""
|
53 |
-
for
|
54 |
-
context += f"Documento
|
55 |
-
context += f"Trecho
|
56 |
-
|
57 |
return context
|
58 |
|
59 |
# Inicializa a base de conhecimento
|
60 |
-
knowledge_base =
|
61 |
|
|
|
62 |
def respond(
|
63 |
message: str,
|
64 |
history: List[Tuple[str, str]],
|
@@ -66,30 +87,27 @@ def respond(
|
|
66 |
max_tokens: int,
|
67 |
temperature: float,
|
68 |
top_p: float,
|
69 |
-
pdf_directory: str
|
|
|
|
|
70 |
):
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
|
75 |
-
|
76 |
-
context = knowledge_base.get_relevant_context(message)
|
77 |
|
78 |
-
|
79 |
-
rag_prompt = f"""Você é Grok 3, criado por xAI. Use o seguinte contexto dos documentos para responder à pergunta:
|
80 |
|
81 |
{context}
|
82 |
|
83 |
-
Pergunta
|
84 |
-
|
85 |
-
Responda de forma clara e precisa, utilizando o contexto quando relevante."""
|
86 |
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
|
92 |
-
# Adiciona histórico se existir
|
93 |
for user_msg, assistant_msg in history:
|
94 |
if user_msg:
|
95 |
messages.append({"role": "user", "content": user_msg})
|
@@ -106,27 +124,73 @@ Responda de forma clara e precisa, utilizando o contexto quando relevante."""
|
|
106 |
):
|
107 |
token = message_chunk.choices[0].delta.content
|
108 |
response += token
|
109 |
-
yield response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
-
#
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
|
|
126 |
|
127 |
if __name__ == "__main__":
|
128 |
-
# Crie um diretório 'pdfs' e coloque seus PDFs lá
|
129 |
if not os.path.exists("./pdfs"):
|
130 |
os.makedirs("./pdfs")
|
131 |
-
|
132 |
demo.launch()
|
|
|
3 |
import PyPDF2
|
4 |
from sentence_transformers import SentenceTransformer
|
5 |
import numpy as np
|
6 |
+
import faiss
|
7 |
import os
|
8 |
from typing import List, Tuple
|
9 |
+
from rank_bm25 import BM25Okapi
|
10 |
|
11 |
+
# Inicialização do cliente e modelos
|
12 |
client = InferenceClient("google/gemma-3-27b-it")
|
13 |
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
14 |
|
15 |
+
# Classe para gerenciar a base de conhecimento
|
16 |
+
class AdvancedPDFKnowledgeBase:
|
17 |
def __init__(self):
|
18 |
+
self.chunks = []
|
19 |
+
self.chunk_embeddings = None
|
20 |
+
self.index = None
|
21 |
+
self.bm25 = None
|
22 |
|
23 |
+
def _split_into_chunks(self, text: str, chunk_size: int = 500) -> List[str]:
|
24 |
+
words = text.split()
|
25 |
+
return [' '.join(words[i:i + chunk_size])
|
26 |
+
for i in range(0, len(words), chunk_size)]
|
27 |
+
|
28 |
def load_pdfs(self, pdf_directory: str):
|
29 |
+
self.chunks = []
|
|
|
30 |
for filename in os.listdir(pdf_directory):
|
31 |
if filename.endswith('.pdf'):
|
32 |
pdf_path = os.path.join(pdf_directory, filename)
|
|
|
35 |
text = ""
|
36 |
for page in pdf_reader.pages:
|
37 |
text += page.extract_text() + "\n"
|
38 |
+
chunks = self._split_into_chunks(text)
|
39 |
+
for chunk in chunks:
|
40 |
+
self.chunks.append({
|
41 |
+
'filename': filename,
|
42 |
+
'content': chunk
|
43 |
+
})
|
44 |
+
|
45 |
+
if not self.chunks:
|
46 |
+
return "Nenhum PDF encontrado."
|
47 |
|
48 |
+
contents = [chunk['content'] for chunk in self.chunks]
|
49 |
+
self.chunk_embeddings = embedder.encode(contents, convert_to_numpy=True)
|
50 |
+
dimension = self.chunk_embeddings.shape[1]
|
51 |
+
self.index = faiss.IndexFlatL2(dimension)
|
52 |
+
self.index.add(self.chunk_embeddings)
|
53 |
+
tokenized_chunks = [chunk['content'].split() for chunk in self.chunks]
|
54 |
+
self.bm25 = BM25Okapi(tokenized_chunks)
|
55 |
+
return f"Carregados {len(self.chunks)} chunks de {len(set(c['filename'] for c in self.chunks))} PDFs."
|
56 |
|
57 |
+
def get_relevant_context(self, query: str, k: int = 5, rerank_k: int = 3) -> str:
    """Return a markdown context string with the chunks most relevant to *query*.

    Hybrid retrieval: FAISS (dense, L2) selects up to *k* candidate chunks,
    which are re-ranked lexically with BM25; the top *rerank_k* survive.

    Returns a notice string when no documents have been loaded yet.
    """
    if self.index is None or not self.chunks:
        return "Nenhum documento carregado ainda."

    # BUG FIX: never ask FAISS for more neighbours than there are indexed
    # vectors — it pads the result with -1 sentinels, and chunks[-1] /
    # bm25_scores[-1] would then silently point at the wrong chunk.
    k = min(k, len(self.chunks))

    query_embedding = embedder.encode([query], convert_to_numpy=True)
    _distances, indices = self.index.search(query_embedding, k)
    # Defensive: drop any -1 padding that may still appear.
    candidate_ids = [int(idx) for idx in indices[0] if idx >= 0]

    # Re-rank the dense candidates with BM25 scores over the same corpus
    # indices (bm25 was built over self.chunks in load order).
    tokenized_query = query.split()
    bm25_scores = self.bm25.get_scores(tokenized_query)
    candidate_scores = [(self.chunks[idx], bm25_scores[idx]) for idx in candidate_ids]
    candidate_scores.sort(key=lambda pair: pair[1], reverse=True)

    context = ""
    for chunk, score in candidate_scores[:rerank_k]:
        context += f"**Documento**: {chunk['filename']}\n"
        context += f"**Trecho**: {chunk['content'][:500]}...\n"
        context += f"**Score BM25**: {score:.2f}\n\n"
    return context
|
78 |
|
79 |
# Inicializa a base de conhecimento
|
80 |
+
knowledge_base = AdvancedPDFKnowledgeBase()
|
81 |
|
82 |
+
# Função principal de resposta
|
83 |
def respond(
|
84 |
message: str,
|
85 |
history: List[Tuple[str, str]],
|
|
|
87 |
max_tokens: int,
|
88 |
temperature: float,
|
89 |
top_p: float,
|
90 |
+
pdf_directory: str,
|
91 |
+
k_initial: int,
|
92 |
+
k_final: int
|
93 |
):
|
94 |
+
if not knowledge_base.chunks:
|
95 |
+
yield "Por favor, carregue os PDFs primeiro.", "", ""
|
96 |
+
return
|
97 |
|
98 |
+
context = knowledge_base.get_relevant_context(message, k_initial, k_final)
|
|
|
99 |
|
100 |
+
rag_prompt = f"""Você é Grok 3, criado por xAI. Use o contexto dos documentos para responder:
|
|
|
101 |
|
102 |
{context}
|
103 |
|
104 |
+
Pergunta: {message}
|
|
|
|
|
105 |
|
106 |
+
Responda com base no contexto quando relevante."""
|
107 |
+
|
108 |
+
messages = [{"role": "system", "content": system_message},
|
109 |
+
{"role": "user", "content": rag_prompt}]
|
110 |
|
|
|
111 |
for user_msg, assistant_msg in history:
|
112 |
if user_msg:
|
113 |
messages.append({"role": "user", "content": user_msg})
|
|
|
124 |
):
|
125 |
token = message_chunk.choices[0].delta.content
|
126 |
response += token
|
127 |
+
yield response, context, ""
|
128 |
+
|
129 |
+
# Thin wrapper used as the Gradio "load PDFs" button callback.
def load_pdfs(pdf_directory: str):
    """Index every PDF under *pdf_directory* into the global knowledge base.

    Returns the status string produced by the knowledge base.
    """
    return knowledge_base.load_pdfs(pdf_directory)
|
133 |
+
|
134 |
+
# Custom Gradio interface: chat column + retrieved-context panel + settings.
with gr.Blocks(title="RAG Avançado com PDFs") as demo:
    gr.Markdown("# Chatbot RAG com PDFs")
    gr.Markdown("Carregue PDFs e faça perguntas baseadas nos documentos.")

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversa")
            msg = gr.Textbox(label="Sua pergunta", placeholder="Digite sua pergunta aqui...")
            submit_btn = gr.Button("Enviar")

        with gr.Column(scale=1):
            context_box = gr.Markdown(label="Contexto Recuperado", value="Contexto aparecerá aqui após a pergunta.")

    with gr.Accordion("Configurações", open=False):
        with gr.Row():
            with gr.Column():
                pdf_dir = gr.Textbox(label="Diretório dos PDFs", value="./pdfs")
                load_btn = gr.Button("Carregar PDFs")
                load_status = gr.Textbox(label="Status do Carregamento", interactive=False)

            with gr.Column():
                system_msg = gr.Textbox(
                    label="Mensagem do Sistema",
                    value="Você é um assistente útil que responde com base em documentos PDF."
                )
                max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max Tokens")
                temperature = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature")
                top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")

        with gr.Row():
            k_initial = gr.Slider(1, 20, value=5, step=1, label="Candidatos Iniciais (FAISS)")
            k_final = gr.Slider(1, 10, value=3, step=1, label="Resultados Finais (BM25)")

    # Stream the assistant's answer into the chat history.
    # BUG FIX: the previous version called history.append((message, response))
    # on EVERY streamed chunk, so a single question produced one duplicated
    # chat entry per token; the trailing yield also raised NameError on
    # `context` whenever respond() yielded nothing. We now keep exactly one
    # (message, response) pair and refresh its text as tokens arrive.
    def submit_message(message, history, system_message, max_tokens, temperature, top_p, pdf_directory, k_initial, k_final):
        history = history or []
        response, context = "", ""
        for response, context, _ in respond(message, history, system_message, max_tokens, temperature, top_p, pdf_directory, k_initial, k_final):
            yield history + [(message, response)], context, ""
        # Final state — also covers the case where respond() yields nothing.
        yield history + [(message, response)], context, ""

    # Event wiring
    submit_btn.click(
        submit_message,
        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p, pdf_dir, k_initial, k_final],
        outputs=[chatbot, context_box, msg]
    )
    msg.submit(
        submit_message,
        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p, pdf_dir, k_initial, k_final],
        outputs=[chatbot, context_box, msg]
    )
    load_btn.click(
        load_pdfs,
        inputs=[pdf_dir],
        outputs=[load_status]
    )
|
192 |
|
193 |
if __name__ == "__main__":
    # Ensure the default PDF directory exists before the UI starts.
    # exist_ok=True avoids the check-then-create race of the previous
    # os.path.exists() + os.makedirs() pair.
    os.makedirs("./pdfs", exist_ok=True)
    demo.launch()
|