Update app.py
app.py
CHANGED
@@ -1,217 +1,227 @@
import gradio as gr
-from huggingface_hub import InferenceClient
import PyPDF2
-from sentence_transformers import SentenceTransformer
import numpy as np
-import faiss
-from rank_bm25 import BM25Okapi

    def __init__(self):
-        self.chunks = []
-        self.chunk_embeddings = None
-        self.index = None
-        self.bm25 = None

-        unique_lines = []
-        seen = set()
-        for line in lines:
-            if line.strip() and line not in seen:
-                unique_lines.append(line)
-                seen.add(line)

            text = ""
                text += page.extract_text() + "\n"
-            chunks = self._split_into_chunks(text)
-            for chunk in chunks:
-                self.chunks.append({
-                    'filename': file.name.split('/')[-1],
-                    'content': chunk
-                })

-        if not self.chunks:
-            return "Nenhum PDF encontrado."

-        contents = [chunk['content'] for chunk in self.chunks]
-        self.chunk_embeddings = embedder.encode(contents, convert_to_numpy=True)
-        dimension = self.chunk_embeddings.shape[1]
-        self.index = faiss.IndexFlatL2(dimension)
-        self.index.add(self.chunk_embeddings)
-        tokenized_chunks = [chunk['content'].split() for chunk in self.chunks]
-        self.bm25 = BM25Okapi(tokenized_chunks)
-        return f"Carregados {len(self.chunks)} chunks de {len(set(c['filename'] for c in self.chunks))} PDFs."

-    def get_relevant_context(self, query: str, k: int = 5, rerank_k: int = 3) -> str:
-        if self.index is None or not self.chunks:
-            return "Nenhum documento carregado ainda."

-        candidate_scores = [(candidates[i], bm25_scores[indices[0][i]])
-                            for i in range(len(candidates))]
-        candidate_scores.sort(key=lambda x: x[1], reverse=True)

-    # Get the relevant context
-    context = knowledge_base.get_relevant_context(message, k_initial, k_final)

-        for message_chunk in client.chat_completion(
-            messages=messages,
-            max_tokens=max_tokens,
-            stream=True,
-            temperature=temperature,
-            top_p=top_p,
-        ):
-            token = message_chunk.choices[0].delta.content
-            if token:
-                response += token
-                yield response, context, ""
-    except Exception as e:
-        yield f"Erro ao gerar resposta: {str(e)}", context, ""

-# Gradio interface
    with gr.Row():
        with gr.Column(scale=2):
-        with gr.Column(scale=1):
-            load_status = gr.Textbox(label="Status do Carregamento", interactive=False)
    with gr.Row():
        with gr.Column(scale=2):
        with gr.Column(scale=1):
-            with gr.Row():
-                k_initial = gr.Slider(1, 20, value=5, step=1, label="Candidatos Iniciais (FAISS)")
-                k_final = gr.Slider(1, 10, value=3, step=1, label="Resultados Finais (BM25)")

-    # Function to update the chat
-    def submit_message(message, history, system_message, max_tokens, temperature, top_p, k_initial, k_final):
        history = history or []

-    # Event connections
-    submit_btn.click(
-        submit_message,
-        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p, k_initial, k_final],
-        outputs=[chatbot, context_box, msg]
-    )
-    msg.submit(
-        submit_message,
-        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p, k_initial, k_final],
-        outputs=[chatbot, context_box, msg]
-    )
-    load_btn.click(
-        load_pdfs,
        inputs=[pdf_upload],
        outputs=[load_status]
    )

if __name__ == "__main__":
    demo.launch()
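The removed implementation retrieved chunks with a dense FAISS index and then reranked the candidates with BM25. A minimal, self-contained sketch of that retrieval pattern (illustrative only, not the original code; the model name, the sample documents, and the hybrid_search helper are assumptions):

# Sketch: dense retrieval with FAISS, then BM25 reranking of the candidates.
import faiss
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # assumed model
docs = ["first example chunk", "second example chunk", "third example chunk"]

embeddings = embedder.encode(docs, convert_to_numpy=True)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
bm25 = BM25Okapi([d.split() for d in docs])

def hybrid_search(query, k=5, rerank_k=3):
    # FAISS: take the k nearest chunks by L2 distance.
    query_emb = embedder.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_emb, min(k, len(docs)))
    # BM25: rescore those candidates lexically and keep the best rerank_k.
    bm25_scores = bm25.get_scores(query.split())
    ranked = sorted(indices[0], key=lambda i: bm25_scores[i], reverse=True)
    return [docs[i] for i in ranked[:rerank_k]]

print(hybrid_search("example query"))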
import gradio as gr
import PyPDF2
+from transformers import AutoTokenizer, AutoModel
+from sentence_transformers import util
+import torch
import numpy as np
+import os
+import re
+from typing import List, Dict, Any
+import requests

+# Directory for storing the PDFs
+PDF_DIR = "pdf_data"
+os.makedirs(PDF_DIR, exist_ok=True)

+# Simple model for embeddings
+tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+
+# Function to generate embeddings
+def get_embeddings(texts):
+    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    embeddings = outputs.last_hidden_state.mean(dim=1)
+    return embeddings / embeddings.norm(dim=1, keepdim=True)
+
+# Simplified RAG class
+class SimpleRAG:
    def __init__(self):
+        self.documents = []
+        self.embeddings = None

+    def clear(self):
+        self.documents = []
+        self.embeddings = None

+    def process_text(self, text):
+        # Split into paragraphs first, then normalize whitespace inside each one
+        # (collapsing whitespace before splitting would remove the newlines used to split).
+        paragraphs = [re.sub(r'\s+', ' ', p).strip() for p in text.split('\n')]
+        return [p for p in paragraphs if len(p) > 50]

+    def load_pdf(self, file_obj):
+        try:
+            # Save the file
+            file_path = os.path.join(PDF_DIR, file_obj.name)
+            with open(file_path, 'wb') as f:
+                f.write(file_obj.read())
+
+            # Extract the text
+            with open(file_path, 'rb') as f:
+                pdf = PyPDF2.PdfReader(f)
                text = ""
+                for page in pdf.pages:
                    text += page.extract_text() + "\n"

+            # Split the text into chunks
+            chunks = self.process_text(text)
+
+            # Add to the knowledge base
+            filename = os.path.basename(file_path)
+            doc_chunks = [{"source": filename, "content": chunk} for chunk in chunks]
+            self.documents.extend(doc_chunks)
+
+            # Recompute the embeddings
+            if self.documents:
+                contents = [doc["content"] for doc in self.documents]
+                self.embeddings = get_embeddings(contents)
+
+            return f"Carregado: {filename} ({len(chunks)} segmentos)"
+        except Exception as e:
+            return f"Erro ao processar PDF: {str(e)}"
+
+    def search(self, query, top_k=3):
+        if not self.documents or self.embeddings is None:
+            return []
+
+        # Compute the query embedding
+        query_embedding = get_embeddings([query])
+
+        # Compute cosine similarity against the stored chunks
+        similarities = util.pytorch_cos_sim(query_embedding, self.embeddings)[0]

+        # Find the top_k most similar chunks
+        top_results = torch.topk(similarities, min(top_k, len(self.documents)))

+        results = []
+        for score, idx in zip(top_results.values, top_results.indices):
+            results.append({
+                "score": score.item(),
+                "document": self.documents[idx]
+            })
+
+        return results

+# Initialize the RAG
+rag = SimpleRAG()

+# LLM configuration
+LLM_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
+headers = {"Authorization": "Bearer hf_XXXXXXXXXXXXXXXXXXXXXXX"}  # Replace with your API key
+
+def query_llm(prompt):
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "max_new_tokens": 512,
+            "temperature": 0.7,
+            "top_p": 0.95
+        }
+    }

+    try:
+        response = requests.post(LLM_API_URL, headers=headers, json=payload)
+        return response.json()[0]["generated_text"]
+    except Exception as e:
+        return f"Erro ao consultar o LLM: {str(e)}"
+
+# Function to process the query
+def process_query(query, history):
+    # Retrieve the relevant documents
+    results = rag.search(query)

+    if not results:
+        return "Por favor, carregue alguns PDFs primeiro.", "Nenhum documento disponível."

+    # Format the context
+    context = ""
+    for i, result in enumerate(results):
+        context += f"[{i+1}] Fonte: {result['document']['source']}\n"
+        context += f"Trecho: {result['document']['content'][:300]}...\n"
+        context += f"Relevância: {result['score']:.2f}\n\n"

+    # Build the prompt
+    prompt = f"""<s>[INST]Você é um assistente de IA especializado em responder perguntas usando apenas
+o contexto fornecido. Considere apenas as informações nos documentos abaixo. Se a resposta não
+puder ser derivada do contexto, diga que não possui informações suficientes.

+CONTEXTO:
+{context}

+PERGUNTA: {query}[/INST]"""
+
+    # Query the model
+    response = query_llm(prompt)
+
+    # Extract the actual answer (dropping the echoed prompt)
+    actual_response = response.split("[/INST]")[-1].strip()
+
+    return actual_response, context

+# Gradio interface
+with gr.Blocks(title="RAG PDF Simplificado") as demo:
    with gr.Row():
        with gr.Column(scale=2):
+            gr.Markdown("# RAG PDF Simplificado")
+
+    with gr.Row():
+        with gr.Column():
+            pdf_upload = gr.File(
+                label="Carregar PDF",
+                file_types=[".pdf"],
+                file_count="single"
+            )
+            load_status = gr.Textbox(label="Status", interactive=False)
+            clear_btn = gr.Button("Limpar Base de Conhecimento")

    with gr.Row():
        with gr.Column(scale=2):
+            chatbot = gr.Chatbot(label="Conversa")
+            query_input = gr.Textbox(
+                label="Sua pergunta",
+                placeholder="Digite sua pergunta sobre os documentos..."
+            )
+            query_btn = gr.Button("Enviar")

        with gr.Column(scale=1):
+            context_display = gr.Textbox(
+                label="Contexto Recuperado",
+                interactive=False,
+                lines=10
+            )
+
+    # Callback functions
+    def upload_pdf(file):
+        if file is None:
+            return "Nenhum arquivo selecionado."
+        return rag.load_pdf(file)
+
+    def clear_knowledge_base():
+        rag.clear()
+        return "Base de conhecimento limpa."
+
+    def submit_query(query, history):
        history = history or []
+        response, context = process_query(query, history)
+        history.append((query, response))
+        return history, "", context
+
+    # Events
+    pdf_upload.upload(
+        upload_pdf,
        inputs=[pdf_upload],
        outputs=[load_status]
    )
+
+    clear_btn.click(
+        clear_knowledge_base,
+        inputs=[],
+        outputs=[load_status]
+    )
+
+    query_btn.click(
+        submit_query,
+        inputs=[query_input, chatbot],
+        outputs=[chatbot, query_input, context_display]
+    )
+
+    query_input.submit(
+        submit_query,
+        inputs=[query_input, chatbot],
+        outputs=[chatbot, query_input, context_display]
+    )

if __name__ == "__main__":
    demo.launch()
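The new query_llm ships with a placeholder Bearer token hardcoded in app.py. A minimal sketch of reading the key from the environment instead (the HF_TOKEN variable name is an assumption; on a Space it would typically be stored as a repository secret):

import os
import requests

LLM_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
# Take the token from the environment rather than committing it to app.py.
headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}

def query_llm(prompt):
    payload = {"inputs": prompt, "parameters": {"max_new_tokens": 512, "temperature": 0.7, "top_p": 0.95}}
    response = requests.post(LLM_API_URL, headers=headers, json=payload, timeout=60)
    response.raise_for_status()  # surface HTTP errors instead of parsing an error body
    return response.json()[0]["generated_text"]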