DHEIVER committed
Commit 43ba6e3 · verified · 1 Parent(s): 5c221dc

Update app.py

Files changed (1): app.py +190 -180
app.py CHANGED
@@ -1,217 +1,227 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
  import PyPDF2
- from sentence_transformers import SentenceTransformer
  import numpy as np
- import faiss
- from typing import List, Tuple
- from rank_bm25 import BM25Okapi

- # Client and model initialization
- client = InferenceClient("google/gemma-3-27b-it")
- embedder = SentenceTransformer('all-MiniLM-L6-v2')

- # Class that manages the knowledge base
- class AdvancedPDFKnowledgeBase:
      def __init__(self):
-         self.chunks = []
-         self.chunk_embeddings = None
-         self.index = None
-         self.bm25 = None
-
-     def _split_into_chunks(self, text: str, chunk_size: int = 500) -> List[str]:
-         # Drop duplicate or near-duplicate lines
-         lines = text.split("\n")
-         unique_lines = []
-         seen = set()
-         for line in lines:
-             if line.strip() and line not in seen:
-                 unique_lines.append(line)
-                 seen.add(line)
-
-         # Join the unique lines back into a single text
-         cleaned_text = "\n".join(unique_lines)
-
-         # Split the cleaned text into chunks
-         words = cleaned_text.split()
-         return [' '.join(words[i:i + chunk_size])
-                 for i in range(0, len(words), chunk_size)]
-
-     def load_pdfs(self, pdf_files: List[gr.File]) -> str:
-         self.chunks = []
-         for file in pdf_files:
-             with open(file.name, 'rb') as pdf_file:
-                 pdf_reader = PyPDF2.PdfReader(pdf_file)
                  text = ""
-                 for page in pdf_reader.pages:
                      text += page.extract_text() + "\n"
-                 chunks = self._split_into_chunks(text)
-                 for chunk in chunks:
-                     self.chunks.append({
-                         'filename': file.name.split('/')[-1],
-                         'content': chunk
-                     })
-
-         if not self.chunks:
-             return "Nenhum PDF encontrado."
-
-         contents = [chunk['content'] for chunk in self.chunks]
-         self.chunk_embeddings = embedder.encode(contents, convert_to_numpy=True)
-         dimension = self.chunk_embeddings.shape[1]
-         self.index = faiss.IndexFlatL2(dimension)
-         self.index.add(self.chunk_embeddings)
-         tokenized_chunks = [chunk['content'].split() for chunk in self.chunks]
-         self.bm25 = BM25Okapi(tokenized_chunks)
-         return f"Carregados {len(self.chunks)} chunks de {len(set(c['filename'] for c in self.chunks))} PDFs."
-
-     def get_relevant_context(self, query: str, k: int = 5, rerank_k: int = 3) -> str:
-         if self.index is None or not self.chunks:
-             return "Nenhum documento carregado ainda."
-
-         query_embedding = embedder.encode([query], convert_to_numpy=True)
-         distances, indices = self.index.search(query_embedding, k)
-         candidates = [self.chunks[idx] for idx in indices[0]]
-
-         tokenized_query = query.split()
-         bm25_scores = self.bm25.get_scores(tokenized_query)
-         candidate_scores = [(candidates[i], bm25_scores[indices[0][i]])
-                             for i in range(len(candidates))]
-         candidate_scores.sort(key=lambda x: x[1], reverse=True)
-
-         top_chunks = candidate_scores[:rerank_k]
-         context = ""
-         for chunk, score in top_chunks:
-             context += f"**Documento**: {chunk['filename']}\n"
-             context += f"**Trecho**: {chunk['content'][:500]}...\n"
-             context += f"**Score BM25**: {score:.2f}\n\n"
-         return context

- # Initialize the knowledge base
- knowledge_base = AdvancedPDFKnowledgeBase()

- # Main response function
- def respond(
-     message: str,
-     history: List[Tuple[str, str]],
-     system_message: str,
-     max_tokens: int,
-     temperature: float,
-     top_p: float,
-     k_initial: int,
-     k_final: int
- ):
-     if not knowledge_base.chunks:
-         yield "Por favor, carregue os PDFs primeiro.", "", ""
-         return
-
-     # Fetch the relevant context
-     context = knowledge_base.get_relevant_context(message, k_initial, k_final)
-
-     # Build the RAG prompt
-     rag_prompt = f"""Você é Grok 3, criado por xAI. Use o contexto dos documentos para responder:
- {context}
- Pergunta: {message}
- Responda com base no contexto quando relevante."""
-
-     # Start the message list
-     messages = [{"role": "system", "content": system_message}]
-
-     # Append the conversation history, alternating "user" and "assistant"
-     for user_msg, assistant_msg in history:
-         messages.append({"role": "user", "content": user_msg})
-         messages.append({"role": "assistant", "content": assistant_msg})
-
-     # Append the new user message
-     messages.append({"role": "user", "content": rag_prompt})
-
-     response = ""
-     try:
-         for message_chunk in client.chat_completion(
-             messages=messages,
-             max_tokens=max_tokens,
-             stream=True,
-             temperature=temperature,
-             top_p=top_p,
-         ):
-             token = message_chunk.choices[0].delta.content
-             if token:
-                 response += token
-             yield response, context, ""
-     except Exception as e:
-         yield f"Erro ao gerar resposta: {str(e)}", context, ""
-
- # PDF loading helper
- def load_pdfs(pdf_files: List[gr.File]):
-     if not pdf_files:
-         return "Nenhum arquivo selecionado."
-     status = knowledge_base.load_pdfs(pdf_files)
-     return status
-
- # Custom Gradio interface
- with gr.Blocks(title="RAG Avançado com PDFs", theme=gr.themes.Soft()) as demo:
      with gr.Row():
          with gr.Column(scale=2):
-             gr.Markdown("# Chatbot RAG com PDFs")
-             gr.Markdown("Arraste e solte seus PDFs abaixo ou clique para selecionar.")
-
-         with gr.Column(scale=1):
-             load_status = gr.Textbox(label="Status do Carregamento", interactive=False)
-
      with gr.Row():
          with gr.Column(scale=2):
-             chatbot = gr.Chatbot(label="Conversa", height=400)
-             msg = gr.Textbox(label="Sua pergunta", placeholder="Digite sua pergunta aqui...")
-             submit_btn = gr.Button("Enviar")

          with gr.Column(scale=1):
-             context_box = gr.Markdown(label="Contexto Recuperado")
-
-     with gr.Accordion("Configurações", open=False):
-         with gr.Row():
-             with gr.Column():
-                 pdf_upload = gr.File(label="Carregar PDFs", file_types=[".pdf", ".txt"], file_count="multiple", interactive=True)
-                 load_btn = gr.Button("Carregar PDFs")
-
-             with gr.Column():
-                 system_msg = gr.Textbox(
-                     label="Mensagem do Sistema",
-                     value="Você é um assistente útil que responde com base em documentos PDF."
-                 )
-                 max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max Tokens")
-                 temperature = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature")
-                 top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
-
-         with gr.Row():
-             k_initial = gr.Slider(1, 20, value=5, step=1, label="Candidatos Iniciais (FAISS)")
-             k_final = gr.Slider(1, 10, value=3, step=1, label="Resultados Finais (BM25)")
-
-     # Chat update callback
-     def submit_message(message, history, system_message, max_tokens, temperature, top_p, k_initial, k_final):
          history = history or []
-         return_history = history.copy()
-         return_history.append((message, ""))
-         yield return_history, "", ""
-
-         for response, context, _ in respond(message, history, system_message, max_tokens, temperature, top_p, k_initial, k_final):
-             return_history[-1] = (message, response)
-             yield return_history, context, ""
-
-     # Event wiring
-     submit_btn.click(
-         submit_message,
-         inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p, k_initial, k_final],
-         outputs=[chatbot, context_box, msg]
-     )
-     msg.submit(
-         submit_message,
-         inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p, k_initial, k_final],
-         outputs=[chatbot, context_box, msg]
-     )
-     load_btn.click(
-         load_pdfs,
          inputs=[pdf_upload],
          outputs=[load_status]
      )

  if __name__ == "__main__":
      demo.launch()
 
  import gradio as gr
  import PyPDF2
+ from transformers import AutoTokenizer, AutoModel
+ from sentence_transformers import util
+ import torch
  import numpy as np
+ import os
+ import re
+ from typing import List, Dict, Any
+ import requests

+ # Directory where the PDFs are stored
+ PDF_DIR = "pdf_data"
+ os.makedirs(PDF_DIR, exist_ok=True)

+ # Small model for embeddings
+ tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+ model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+
+ # Embedding helper
+ def get_embeddings(texts):
+     inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
+     with torch.no_grad():
+         outputs = model(**inputs)
+     embeddings = outputs.last_hidden_state.mean(dim=1)
+     return embeddings / embeddings.norm(dim=1, keepdim=True)
+
+ # Simplified RAG class
+ class SimpleRAG:
      def __init__(self):
+         self.documents = []
+         self.embeddings = None

+     def clear(self):
+         self.documents = []
+         self.embeddings = None

+     def process_text(self, text):
+         # Clean the text and split it into paragraphs
+         text = re.sub(r'\s+', ' ', text).strip()
+         paragraphs = [p for p in text.split('\n') if len(p) > 50]
+         return paragraphs

+     def load_pdf(self, file_obj):
+         try:
+             # Save the file
+             file_path = os.path.join(PDF_DIR, file_obj.name)
+             with open(file_path, 'wb') as f:
+                 f.write(file_obj.read())
+
+             # Extract the text
+             with open(file_path, 'rb') as f:
+                 pdf = PyPDF2.PdfReader(f)
                  text = ""
+                 for page in pdf.pages:
                      text += page.extract_text() + "\n"

+             # Split the text into chunks
+             chunks = self.process_text(text)
+
+             # Add them to the knowledge base
+             filename = os.path.basename(file_path)
+             doc_chunks = [{"source": filename, "content": chunk} for chunk in chunks]
+             self.documents.extend(doc_chunks)
+
+             # Recompute the embeddings
+             if self.documents:
+                 contents = [doc["content"] for doc in self.documents]
+                 self.embeddings = get_embeddings(contents)
+
+             return f"Carregado: {filename} ({len(chunks)} segmentos)"
+         except Exception as e:
+             return f"Erro ao processar PDF: {str(e)}"
+
+     def search(self, query, top_k=3):
+         if not self.documents or self.embeddings is None:
+             return []
+
+         # Embed the query
+         query_embedding = get_embeddings([query])
+
+         # Cosine similarity against every stored chunk
+         similarities = util.pytorch_cos_sim(query_embedding, self.embeddings)[0]

+         # Keep the top_k most similar chunks
+         top_results = torch.topk(similarities, min(top_k, len(self.documents)))

+         results = []
+         for score, idx in zip(top_results.values, top_results.indices):
+             results.append({
+                 "score": score.item(),
+                 "document": self.documents[idx]
+             })
+
+         return results

+ # Initialize the RAG store
+ rag = SimpleRAG()

+ # LLM settings
+ LLM_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
+ headers = {"Authorization": "Bearer hf_XXXXXXXXXXXXXXXXXXXXXXX"}  # Replace with your API key
+
+ def query_llm(prompt):
+     payload = {
+         "inputs": prompt,
+         "parameters": {
+             "max_new_tokens": 512,
+             "temperature": 0.7,
+             "top_p": 0.95
+         }
+     }

+     try:
+         response = requests.post(LLM_API_URL, headers=headers, json=payload)
+         return response.json()[0]["generated_text"]
+     except Exception as e:
+         return f"Erro ao consultar o LLM: {str(e)}"
+
+ # Query handler
+ def process_query(query, history):
+     # Retrieve the relevant documents
+     results = rag.search(query)

+     if not results:
+         return "Por favor, carregue alguns PDFs primeiro.", "Nenhum documento disponível."

+     # Format the context
+     context = ""
+     for i, result in enumerate(results):
+         context += f"[{i+1}] Fonte: {result['document']['source']}\n"
+         context += f"Trecho: {result['document']['content'][:300]}...\n"
+         context += f"Relevância: {result['score']:.2f}\n\n"

+     # Build the prompt
+     prompt = f"""<s>[INST]Você é um assistente de IA especializado em responder perguntas usando apenas
+ o contexto fornecido. Considere apenas as informações nos documentos abaixo. Se a resposta não
+ puder ser derivada do contexto, diga que não possui informações suficientes.
+
+ CONTEXTO:
+ {context}
+
+ PERGUNTA: {query}[/INST]"""
+
+     # Call the model
+     response = query_llm(prompt)
+
+     # Extract the actual answer (strip the echoed prompt)
+     actual_response = response.split("[/INST]")[-1].strip()
+
+     return actual_response, context

+ # Gradio interface
+ with gr.Blocks(title="RAG PDF Simplificado") as demo:
      with gr.Row():
          with gr.Column(scale=2):
+             gr.Markdown("# RAG PDF Simplificado")
+
+     with gr.Row():
+         with gr.Column():
+             pdf_upload = gr.File(
+                 label="Carregar PDF",
+                 file_types=[".pdf"],
+                 file_count="single"
+             )
+             load_status = gr.Textbox(label="Status", interactive=False)
+             clear_btn = gr.Button("Limpar Base de Conhecimento")

      with gr.Row():
          with gr.Column(scale=2):
+             chatbot = gr.Chatbot(label="Conversa")
+             query_input = gr.Textbox(
+                 label="Sua pergunta",
+                 placeholder="Digite sua pergunta sobre os documentos..."
+             )
+             query_btn = gr.Button("Enviar")

          with gr.Column(scale=1):
+             context_display = gr.Textbox(
+                 label="Contexto Recuperado",
+                 interactive=False,
+                 lines=10
+             )
+
+     # Callback functions
+     def upload_pdf(file):
+         if file is None:
+             return "Nenhum arquivo selecionado."
+         return rag.load_pdf(file)
+
+     def clear_knowledge_base():
+         rag.clear()
+         return "Base de conhecimento limpa."
+
+     def submit_query(query, history):
          history = history or []
+         response, context = process_query(query, history)
+         history.append((query, response))
+         return history, "", context
+
+     # Events
+     pdf_upload.upload(
+         upload_pdf,
          inputs=[pdf_upload],
          outputs=[load_status]
      )
+
+     clear_btn.click(
+         clear_knowledge_base,
+         inputs=[],
+         outputs=[load_status]
+     )
+
+     query_btn.click(
+         submit_query,
+         inputs=[query_input, chatbot],
+         outputs=[chatbot, query_input, context_display]
+     )
+
+     query_input.submit(
+         submit_query,
+         inputs=[query_input, chatbot],
+         outputs=[chatbot, query_input, context_display]
+     )

  if __name__ == "__main__":
      demo.launch()
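
A note on the new get_embeddings: it mean-pools last_hidden_state over every token position, so padding tokens dilute short texts whenever a batch mixes lengths. The all-MiniLM-L6-v2 model card pools with the attention mask instead. A minimal sketch of that variant (get_embeddings_masked is a hypothetical name; tokenizer and model are loaded exactly as in the commit):

    import torch
    from transformers import AutoTokenizer, AutoModel

    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

    def get_embeddings_masked(texts):
        inputs = tokenizer(texts, padding=True, truncation=True,
                           return_tensors="pt", max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # Zero out padding positions before averaging, then L2-normalize
        # the same way the committed helper does.
        mask = inputs["attention_mask"].unsqueeze(-1).float()
        summed = (outputs.last_hidden_state * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)
        embeddings = summed / counts
        return embeddings / embeddings.norm(dim=1, keepdim=True)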
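
process_text has a quiet bug: re.sub(r'\s+', ' ', text) also consumes the newlines that the very next statement splits on, so text.split('\n') returns a single element and each PDF collapses into one oversized chunk. A sketch that normalizes spaces while keeping line boundaries intact:

    import re

    def process_text(text):
        # Collapse runs of spaces/tabs only; r'\s+' would also eat '\n',
        # leaving split('\n') nothing to split on.
        text = re.sub(r'[ \t]+', ' ', text).strip()
        return [line.strip() for line in text.split('\n') if len(line.strip()) > 50]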
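
load_pdf also treats the upload as an open stream (file_obj.read()) with a bare filename (file_obj.name), but depending on the Gradio version a gr.File callback receives a tempfile wrapper or a plain path string whose .name is already an absolute temporary path. A hedged, path-based save helper (save_upload is a hypothetical name; PDF_DIR as defined in the commit):

    import os
    import shutil

    PDF_DIR = "pdf_data"

    def save_upload(file_obj, dest_dir=PDF_DIR):
        # Copy by path, which works whether Gradio hands over a tempfile
        # wrapper (has .name) or a bare path string.
        src_path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
        dest_path = os.path.join(dest_dir, os.path.basename(src_path))
        shutil.copy(src_path, dest_path)
        return dest_path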
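
query_llm trusts response.json()[0]["generated_text"], yet the hosted Inference API reports failures (model still loading, invalid token, rate limits) as a dict with an "error" key, and by default it echoes the prompt back, which is why process_query has to split on "[/INST]". A defensive variant using the documented return_full_text parameter (query_llm_safe is a hypothetical name; LLM_API_URL and headers as in the commit):

    import requests

    LLM_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
    headers = {"Authorization": "Bearer hf_XXXXXXXXXXXXXXXXXXXXXXX"}

    def query_llm_safe(prompt):
        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": 512,
                "temperature": 0.7,
                "top_p": 0.95,
                # Don't echo the prompt, so no manual "[/INST]" split is needed.
                "return_full_text": False,
            },
        }
        response = requests.post(LLM_API_URL, headers=headers, json=payload, timeout=60)
        data = response.json()
        # Failures arrive as a dict, successes as a list of dicts.
        if isinstance(data, dict) and "error" in data:
            raise RuntimeError(f"Inference API error: {data['error']}")
        return data[0]["generated_text"].strip()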
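
Finally, the retrieval path can be exercised without Gradio or the LLM, which helps when tuning top_k or the chunking threshold. A minimal smoke test, assuming SimpleRAG and get_embeddings from the new version (the sample documents are invented):

    rag = SimpleRAG()
    rag.documents = [
        {"source": "manual.pdf", "content": "A bomba deve ser calibrada mensalmente."},
        {"source": "manual.pdf", "content": "O filtro deve ser trocado a cada seis meses."},
    ]
    rag.embeddings = get_embeddings([d["content"] for d in rag.documents])

    for hit in rag.search("Com que frequência calibrar a bomba?", top_k=1):
        print(f"{hit['score']:.2f}  {hit['document']['content']}")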