DHEIVER committed on
Commit e2c21c6 · verified · 1 Parent(s): e37e070

Update app.py

Files changed (1)
  1. app.py +126 -62
app.py CHANGED
@@ -3,23 +3,30 @@ from huggingface_hub import InferenceClient
  import PyPDF2
  from sentence_transformers import SentenceTransformer
  import numpy as np
- from sklearn.metrics.pairwise import cosine_similarity
  import os
  from typing import List, Tuple

- # Initialize the inference client and embedding model
  client = InferenceClient("google/gemma-3-27b-it")
  embedder = SentenceTransformer('all-MiniLM-L6-v2')

- # Class to manage the PDF knowledge base
- class PDFKnowledgeBase:
      def __init__(self):
-         self.documents = []
-         self.embeddings = None

      def load_pdfs(self, pdf_directory: str):
-         """Load all PDFs from a directory"""
-         self.documents = []
          for filename in os.listdir(pdf_directory):
              if filename.endswith('.pdf'):
                  pdf_path = os.path.join(pdf_directory, filename)
@@ -28,37 +35,51 @@ class PDFKnowledgeBase:
                  text = ""
                  for page in pdf_reader.pages:
                      text += page.extract_text() + "\n"
-                 self.documents.append({
-                     'filename': filename,
-                     'content': text
-                 })

-         # Generate embeddings for all documents
-         contents = [doc['content'] for doc in self.documents]
-         self.embeddings = embedder.encode(contents, convert_to_numpy=True)

-     def get_relevant_context(self, query: str, k: int = 3) -> str:
-         """Retrieve the k most relevant documents for the query"""
-         if self.embeddings is None or len(self.documents) == 0:
              return "No documents loaded yet."

-         query_embedding = embedder.encode(query, convert_to_numpy=True)
-         similarities = cosine_similarity([query_embedding], self.embeddings)[0]

-         # Get the indices of the k most similar documents
-         top_k_indices = np.argsort(similarities)[-k:][::-1]

-         # Build the relevant context
          context = ""
-         for idx in top_k_indices:
-             context += f"Document: {self.documents[idx]['filename']}\n"
-             context += f"Excerpt: {self.documents[idx]['content'][:500]}...\n\n"
-
          return context

  # Initialize the knowledge base
- knowledge_base = PDFKnowledgeBase()

  def respond(
      message: str,
      history: List[Tuple[str, str]],
@@ -66,30 +87,27 @@ def respond(
      max_tokens: int,
      temperature: float,
      top_p: float,
-     pdf_directory: str
  ):
-     # Load the PDFs if they have not been loaded yet
-     if not knowledge_base.documents:
-         knowledge_base.load_pdfs(pdf_directory)

-     # Get relevant context from the knowledge base
-     context = knowledge_base.get_relevant_context(message)

-     # Build the prompt with the RAG context
-     rag_prompt = f"""You are Grok 3, created by xAI. Use the following context from the documents to answer the question:

  {context}

- User question: {message}
-
- Answer clearly and precisely, using the context when relevant."""

-     messages = [
-         {"role": "system", "content": system_message},
-         {"role": "user", "content": rag_prompt}
-     ]

-     # Add history if present
      for user_msg, assistant_msg in history:
          if user_msg:
              messages.append({"role": "user", "content": user_msg})
@@ -106,27 +124,73 @@ Answer clearly and precisely, using the context when relevant."""
  ):
      token = message_chunk.choices[0].delta.content
      response += token
-     yield response

- # Gradio interface
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a helpful assistant that answers based on PDF documents.",
-                    label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                   label="Top-p (nucleus sampling)"),
-         gr.Textbox(value="./pdfs", label="PDF directory"),
-     ],
-     title="RAG Chatbot with PDFs",
-     description="Ask questions and get answers based on loaded PDF documents."
- )

  if __name__ == "__main__":
-     # Create a 'pdfs' directory and put your PDFs there
      if not os.path.exists("./pdfs"):
          os.makedirs("./pdfs")
-
      demo.launch()
  import PyPDF2
  from sentence_transformers import SentenceTransformer
  import numpy as np
+ import faiss
  import os
  from typing import List, Tuple
+ from rank_bm25 import BM25Okapi

+ # Initialize the client and models
  client = InferenceClient("google/gemma-3-27b-it")
  embedder = SentenceTransformer('all-MiniLM-L6-v2')

+ # Class to manage the knowledge base
+ class AdvancedPDFKnowledgeBase:
      def __init__(self):
+         self.chunks = []
+         self.chunk_embeddings = None
+         self.index = None
+         self.bm25 = None

+     def _split_into_chunks(self, text: str, chunk_size: int = 500) -> List[str]:
+         words = text.split()
+         return [' '.join(words[i:i + chunk_size])
+                 for i in range(0, len(words), chunk_size)]
+
      def load_pdfs(self, pdf_directory: str):
+         self.chunks = []
          for filename in os.listdir(pdf_directory):
              if filename.endswith('.pdf'):
                  pdf_path = os.path.join(pdf_directory, filename)

                  text = ""
                  for page in pdf_reader.pages:
                      text += page.extract_text() + "\n"
+                 chunks = self._split_into_chunks(text)
+                 for chunk in chunks:
+                     self.chunks.append({
+                         'filename': filename,
+                         'content': chunk
+                     })
+
+         if not self.chunks:
+             return "No PDFs found."

+         contents = [chunk['content'] for chunk in self.chunks]
+         self.chunk_embeddings = embedder.encode(contents, convert_to_numpy=True)
+         dimension = self.chunk_embeddings.shape[1]
+         self.index = faiss.IndexFlatL2(dimension)
+         self.index.add(self.chunk_embeddings)
+         tokenized_chunks = [chunk['content'].split() for chunk in self.chunks]
+         self.bm25 = BM25Okapi(tokenized_chunks)
+         return f"Loaded {len(self.chunks)} chunks from {len(set(c['filename'] for c in self.chunks))} PDFs."
+     def get_relevant_context(self, query: str, k: int = 5, rerank_k: int = 3) -> str:
+         if self.index is None or not self.chunks:
              return "No documents loaded yet."

+         query_embedding = embedder.encode([query], convert_to_numpy=True)
+         distances, indices = self.index.search(query_embedding, k)
+         candidates = [self.chunks[idx] for idx in indices[0]]

+         tokenized_query = query.split()
+         bm25_scores = self.bm25.get_scores(tokenized_query)
+         candidate_scores = [(candidates[i], bm25_scores[indices[0][i]])
+                             for i in range(len(candidates))]
+         candidate_scores.sort(key=lambda x: x[1], reverse=True)

+         top_chunks = candidate_scores[:rerank_k]
          context = ""
+         for chunk, score in top_chunks:
+             context += f"**Document**: {chunk['filename']}\n"
+             context += f"**Excerpt**: {chunk['content'][:500]}...\n"
+             context += f"**BM25 score**: {score:.2f}\n\n"
          return context
  # Initialize the knowledge base
+ knowledge_base = AdvancedPDFKnowledgeBase()

+ # Main response function
  def respond(
      message: str,
      history: List[Tuple[str, str]],

      max_tokens: int,
      temperature: float,
      top_p: float,
+     pdf_directory: str,
+     k_initial: int,
+     k_final: int
  ):
+     if not knowledge_base.chunks:
+         yield "Please load the PDFs first.", "", ""
+         return

+     context = knowledge_base.get_relevant_context(message, k_initial, k_final)

+     rag_prompt = f"""You are Grok 3, created by xAI. Use the context from the documents to answer:

  {context}

+ Question: {message}
+
+ Answer based on the context when relevant."""
+
+     messages = [{"role": "system", "content": system_message},
+                 {"role": "user", "content": rag_prompt}]

      for user_msg, assistant_msg in history:
          if user_msg:
              messages.append({"role": "user", "content": user_msg})
  ):
      token = message_chunk.choices[0].delta.content
      response += token
+     yield response, context, ""
+
+ # Function to load PDFs
+ def load_pdfs(pdf_directory: str):
+     status = knowledge_base.load_pdfs(pdf_directory)
+     return status
+
+ # Custom Gradio interface
+ with gr.Blocks(title="Advanced RAG with PDFs") as demo:
+     gr.Markdown("# RAG Chatbot with PDFs")
+     gr.Markdown("Load PDFs and ask questions based on the documents.")
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             chatbot = gr.Chatbot(label="Conversation")
+             msg = gr.Textbox(label="Your question", placeholder="Type your question here...")
+             submit_btn = gr.Button("Send")
+
+         with gr.Column(scale=1):
+             context_box = gr.Markdown(label="Retrieved Context", value="The retrieved context will appear here after you ask a question.")
+
+     with gr.Accordion("Settings", open=False):
+         with gr.Row():
+             with gr.Column():
+                 pdf_dir = gr.Textbox(label="PDF directory", value="./pdfs")
+                 load_btn = gr.Button("Load PDFs")
+                 load_status = gr.Textbox(label="Load status", interactive=False)
+
+             with gr.Column():
+                 system_msg = gr.Textbox(
+                     label="System message",
+                     value="You are a helpful assistant that answers based on PDF documents."
+                 )
+                 max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max Tokens")
+                 temperature = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature")
+                 top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
+
+         with gr.Row():
+             k_initial = gr.Slider(1, 20, value=5, step=1, label="Initial candidates (FAISS)")
+             k_final = gr.Slider(1, 10, value=3, step=1, label="Final results (BM25)")
+
+     # Function to update the chat; streams into a single history entry
+     # instead of appending a new (question, answer) pair per token
+     def submit_message(message, history, system_message, max_tokens, temperature, top_p, pdf_directory, k_initial, k_final):
+         history = history or []
+         stream = respond(message, list(history), system_message, max_tokens, temperature, top_p, pdf_directory, k_initial, k_final)
+         history.append((message, ""))
+         for response, context, _ in stream:
+             history[-1] = (message, response)
+             yield history, context, ""

+     # Event wiring
+     submit_btn.click(
+         submit_message,
+         inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p, pdf_dir, k_initial, k_final],
+         outputs=[chatbot, context_box, msg]
+     )
+     msg.submit(
+         submit_message,
+         inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p, pdf_dir, k_initial, k_final],
+         outputs=[chatbot, context_box, msg]
+     )
+     load_btn.click(
+         load_pdfs,
+         inputs=[pdf_dir],
+         outputs=[load_status]
+     )

  if __name__ == "__main__":
      if not os.path.exists("./pdfs"):
          os.makedirs("./pdfs")
      demo.launch()
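
For reference, the retrieval change in this commit is a two-stage hybrid search: FAISS proposes the k nearest chunks by embedding distance, then BM25 reranks those candidates lexically. Below is a minimal, self-contained sketch of that step in isolation, assuming the same libraries the commit imports (sentence-transformers, faiss, rank-bm25); the corpus and query are made up for illustration.

import faiss
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

# Hypothetical three-document corpus, standing in for the PDF chunks
docs = [
    "Invoices must be approved within 30 days.",
    "The warranty covers manufacturing defects only.",
    "Refund requests require the original receipt.",
]

embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Stage 1 setup: exact L2 index over the chunk embeddings, as in load_pdfs
embeddings = embedder.encode(docs, convert_to_numpy=True).astype(np.float32)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Stage 2 setup: BM25 over whitespace-tokenized chunks, as in load_pdfs
bm25 = BM25Okapi([d.split() for d in docs])

def retrieve(query: str, k: int = 3, rerank_k: int = 2):
    # FAISS recall: k nearest chunks by embedding distance
    q = embedder.encode([query], convert_to_numpy=True).astype(np.float32)
    _, indices = index.search(q, k)
    # BM25 rerank: reorder only those candidates by lexical score
    scores = bm25.get_scores(query.split())
    ranked = sorted(indices[0], key=lambda i: scores[i], reverse=True)
    return [(docs[i], float(scores[i])) for i in ranked[:rerank_k]]

print(retrieve("How do I get a refund?"))

One consequence of this design, visible in get_relevant_context above, is that BM25 only reorders what FAISS already recalled: a chunk the dense search misses can never be surfaced by its lexical score alone.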