Build error
RAG with prompting
- .gitignore +1 -0
- __pycache__/mi_prompt.cpython-311.pyc +0 -0
- app.py +109 -141
- mi_prompt.py +21 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+.env
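Ignoring .env keeps credentials out of the repo: app.py calls load_dotenv() at startup, and the old code read os.getenv("OPENAI_API_KEY"). A hypothetical example of what the ignored file would contain (placeholder value, not from this commit):

OPENAI_API_KEY=sk-your-key-here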
__pycache__/mi_prompt.cpython-311.pyc
ADDED
Binary file (1.01 kB)
app.py
CHANGED
@@ -2,185 +2,153 @@ import os
 import streamlit as st
 from dotenv import load_dotenv
 
-# …
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
-# Embeddings and …
 from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
 from langchain_community.vectorstores import FAISS
 
-# …
-from …
-from …
-from langchain.…
-from …
 
-# Load …
 load_dotenv()
-os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # Sometimes needed on Windows or in specific environments
 
-# …
 embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
 
-# …
 # Helper functions
-# …
-def …
-    """
-    Read each PDF and concatenate its text.
-    """
     text = ""
-    for …
-        pdf_reader = PdfReader(…
         for page in pdf_reader.pages:
             text += page.extract_text() or ""
     return text
 
-def …
-    """
-    Split the text into chunks to index it in FAISS.
-    """
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    …
-
-def …
-    """
-    Create …
-    """
-    …
-
-def get_conversational_chain(tool, question):
-    """
-    Generate the answer to the question using the retrieval tool.
-    """
-    api_key = os.getenv("OPENAI_API_KEY")
-
-    # LLM model (adapt model_name to what you have available)
-    llm = ChatOpenAI(
-        model_name="gpt-4o-mini",  # or "gpt-3.5-turbo", etc.
-        temperature=0.4,
-        api_key=api_key
-    )
-
-    # Prompt template
-    prompt = ChatPromptTemplate.from_messages([
-        (
-            "system",
-            """You are a helpful assistant. Answer the question as completely as possible
-            using only the available context. If the answer is not in the context,
-            say: "answer is not available in the context"."""
-        ),
-        ("placeholder", "{chat_history}"),
-        ("human", "{input}"),
-        ("placeholder", "{agent_scratchpad}"),
-    ])
-
-    # Build the agent with the tool and run it
-    agent = create_tool_calling_agent(llm, tools=[tool], prompt=prompt)
-    agent_executor = AgentExecutor(agent=agent, tools=[tool], verbose=False)
-    response = agent_executor.invoke({"input": question})
-    return response["output"]
-
-def generate_answer(user_question):
-    """
-    Use the vector store in session_state and return the answer.
-    """
-    # Check whether FAISS is loaded
-    if "faiss_db" not in st.session_state or st.session_state["faiss_db"] is None:
-        return "No PDF(s) processed. Please upload and process a PDF."
-
-    # Build the retrieval tool
-    db = st.session_state["faiss_db"]
-    retriever = db.as_retriever()
-    retrieval_tool = create_retriever_tool(
-        retriever,
-        name="pdf_extractor",
-        description="This tool gives answers to queries from the PDF(s)."
-    )
-
-    # Get the final answer through the conversational chain
-    answer = get_conversational_chain(retrieval_tool, user_question)
-    return answer
-
-# -----------------------------------------------------------
 # Main application
-# …
 def main():
-    st.set_page_config(page_title="Chat PDF", layout="wide")
-    st.header("RAG-based Chat with …
 
-    # …
     if "messages" not in st.session_state:
         st.session_state["messages"] = []
 
-    # …
-    if "faiss_db" not in st.session_state:
-        st.session_state["faiss_db"] = None
-
-    # ----------------------------------------------------------------
-    # SIDEBAR: upload and process PDFs
-    # ----------------------------------------------------------------
     with st.sidebar:
         st.title("Menu:")
-        …
-            "Upload your …
             accept_multiple_files=True
         )
 
         if st.button("Process PDFs"):
-            if …
-                with st.spinner("Processing..."):
-                    # …
-                    raw_text = …
-
-                    # …
-                    …
             else:
-                st.warning("No …
 
-        # …
-        if st.button("…
-            st.session_state["…
-            st.…
 
-
-    # MAIN CHAT
-    # ----------------------------------------------------------------
-    st.subheader("Chat")
 
-    # …
     for msg in st.session_state["messages"]:
-        # If you want a simple format:
         st.write(f"**{msg['role'].capitalize()}:** {msg['content']}")
-
-        # with st.chat_message("user"):
-        #     st.write(msg["content"])
-        # else:
-        #     with st.chat_message("assistant"):
-        #         st.write(msg["content"])
-
-    # User chat input
-    user_input = st.text_input("Type your question here...")
 
     if user_input:
-        …
 
 if __name__ == "__main__":
     main()
 import streamlit as st
 from dotenv import load_dotenv
 
+# PDF reader
 from PyPDF2 import PdfReader
+
+# Text splitter
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
+# Embeddings and VectorStore
 from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
 from langchain_community.vectorstores import FAISS
 
+# LangChain pieces for RAG
+from langchain.chains import ConversationalRetrievalChain
+from langchain.chat_models import ChatOpenAI
+from langchain.memory import ConversationBufferMemory
+from mi_prompt import tu_prompt_personalizado
+
 
+# Load .env if you need it
 load_dotenv()
 
+# One-off tweak (optional, for some Windows environments)
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+
+# spaCy embeddings (you can swap in OpenAIEmbeddings, etc.)
 embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
 
+# ---------------------------------------------
 # Helper functions
+# ---------------------------------------------
+def read_pdfs(pdf_files):
+    """Read each PDF and concatenate its text."""
     text = ""
+    for pdf_file in pdf_files:
+        pdf_reader = PdfReader(pdf_file)
         for page in pdf_reader.pages:
             text += page.extract_text() or ""
     return text
 
+def chunk_text(text):
+    """Split the text into chunks."""
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    chunks = text_splitter.split_text(text)
+    return chunks
+
+def create_vectorstore(chunks):
+    """Build the FAISS VectorStore from the list of chunks."""
+    vectorstore = FAISS.from_texts(chunks, embedding=embeddings)
+    return vectorstore
+
+# ---------------------------------------------
 # Main application
+# ---------------------------------------------
 def main():
+    st.set_page_config(page_title="Chat PDF (RAG)", layout="wide")
+    st.header("RAG-based Chat with PDFs")
+
+    # Initialize the conversation state for the app
+    if "conversation_chain" not in st.session_state:
+        st.session_state["conversation_chain"] = None
 
+    # Keep the chat history in session_state (for the UI)
     if "messages" not in st.session_state:
         st.session_state["messages"] = []
 
+    # Sidebar: upload PDFs and process them
     with st.sidebar:
         st.title("Menu:")
+        uploaded_pdfs = st.file_uploader(
+            "Upload your PDFs and click 'Process PDFs'.",
             accept_multiple_files=True
         )
 
         if st.button("Process PDFs"):
+            if uploaded_pdfs:
+                with st.spinner("Processing and indexing..."):
+                    # 1) Read the PDFs
+                    raw_text = read_pdfs(uploaded_pdfs)
+
+                    # 2) Split the text into chunks
+                    text_chunks = chunk_text(raw_text)
+
+                    # 3) Build the FAISS VectorStore
+                    vectorstore = create_vectorstore(text_chunks)
+
+                    # 4) Build the conversational retrieval chain
+                    #    - ConversationalRetrievalChain handles questions + context
+                    llm = ChatOpenAI(
+                        model_name="gpt-4o-mini",  # or "gpt-4", depending on your access
+                        temperature=0
+                    )
+
+                    # Memory for the conversation
+                    memory = ConversationBufferMemory(
+                        memory_key="chat_history",
+                        return_messages=True
+                    )
+
+                    # Build the RAG chain:
+                    conversation_chain = ConversationalRetrievalChain.from_llm(
+                        llm=llm,
+                        retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
+                        memory=memory,
+                        # Optionally, tune how the question is combined with the documents:
+                        combine_docs_chain_kwargs={"prompt": tu_prompt_personalizado},
+                    )
+
+                    # Store the chain in session_state
+                    st.session_state["conversation_chain"] = conversation_chain
+                    st.success("PDFs processed and VectorStore created!")
             else:
+                st.warning("You didn't upload any PDF")
 
+        # Reset button
+        if st.button("Reset VectorStore"):
+            st.session_state["conversation_chain"] = None
+            st.session_state["messages"] = []
+            st.info("Vector store reset. Upload new PDFs if you wish.")
 
+    st.subheader("Chat with your PDFs")
 
+    # Show previous history
     for msg in st.session_state["messages"]:
         st.write(f"**{msg['role'].capitalize()}:** {msg['content']}")
+
+    # User input
+    user_input = st.text_input("Ask a question about the PDF(s)...")
 
     if user_input:
+        if st.session_state["conversation_chain"] is None:
+            st.warning("No PDFs processed. Upload and process at least one PDF.")
+        else:
+            # Save the user's message in the history
+            st.session_state["messages"].append({"role": "user", "content": user_input})
+
+            # Use the conversational chain to get an answer
+            response = st.session_state["conversation_chain"]({
+                "question": user_input
+            })
+
+            # By default, ConversationalRetrievalChain returns the output under the "answer" key
+            answer = response["answer"]
+
+            # Append the answer to the history
+            st.session_state["messages"].append({"role": "assistant", "content": answer})
+
+            # Show the answer
+            st.write(f"**Assistant:** {answer}")
 
 if __name__ == "__main__":
     main()
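For quick testing outside Streamlit, the same chain can be driven from a plain script. A minimal sketch using the same dependencies as app.py; the sample chunks and question below are made-up placeholders, not content from this Space:

import os
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain_community.vectorstores import FAISS

load_dotenv()  # expects OPENAI_API_KEY in .env

# Index a couple of placeholder chunks instead of real PDF text
embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
vectorstore = FAISS.from_texts(
    ["FAISS indexes dense vectors for similarity search.",
     "RAG retrieves relevant chunks before the LLM answers."],
    embedding=embeddings,
)

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model_name="gpt-4o-mini", temperature=0),
    retriever=vectorstore.as_retriever(search_kwargs={"k": 2}),
    memory=memory,
)

result = chain({"question": "What does RAG do before answering?"})
print(result["answer"])  # ConversationalRetrievalChain puts the answer under "answer"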
mi_prompt.py
ADDED
@@ -0,0 +1,21 @@
+from langchain.prompts import PromptTemplate
+
+tu_prompt_personalizado = PromptTemplate(
+    input_variables=["context", "question"],
+    template="""
+You are a world-renowned scientist in the subject matter of the uploaded PDFs.
+Answer the question as clearly and in as much detail as possible, relying EXCLUSIVELY
+on the information provided in the following "Context".
+
+When possible, point out the references or relevant passages of the context that led you to
+your answer (but avoid copying very long passages). If you cannot find the answer in the context,
+say that you do not have enough information, or offer clearer guidance on how to find it.
+
+Context:
+{context}
+
+Question: {question}
+
+Answer:
+""",
+)
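Since the template declares input_variables=["context", "question"], it can be sanity-checked by rendering it with dummy values before wiring it into the chain (the values below are illustrative placeholders):

from mi_prompt import tu_prompt_personalizado

# Fill the two declared variables and inspect the final prompt text
rendered = tu_prompt_personalizado.format(
    context="(retrieved chunks would appear here)",
    question="What is the main conclusion of the document?",
)
print(rendered)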