# Streamlit app: chat with PDF documents via RAG (LangChain + FAISS + spaCy embeddings).
import os

import streamlit as st
from dotenv import load_dotenv
# PDF reader
from PyPDF2 import PdfReader
# Text splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Embeddings and vector store
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain_community.vectorstores import FAISS
# LangChain components for RAG
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from mi_prompt import tu_prompt_personalizado

# Load .env so credentials (e.g. OPENAI_API_KEY) are available
load_dotenv()

# Workaround for duplicate-OpenMP-runtime crashes on some Windows environments
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# spaCy-based embeddings (swap for OpenAIEmbeddings etc. if preferred)
embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
# --------------------------------------------- | |
# Funciones auxiliares | |
# --------------------------------------------- | |
def read_pdfs(pdf_files):
    """Read every uploaded PDF and return their concatenated text.

    Args:
        pdf_files: iterable of file-like objects accepted by PyPDF2's PdfReader.

    Returns:
        str: all extracted page text, concatenated in page order. Pages where
        extraction yields nothing (extract_text() may return None) contribute
        an empty string instead of raising.
    """
    parts = []
    for pdf_file in pdf_files:
        reader = PdfReader(pdf_file)
        for page in reader.pages:
            # Collect into a list and join once — avoids quadratic str +=
            parts.append(page.extract_text() or "")
    return "".join(parts)
def chunk_text(text):
    """Split *text* into overlapping chunks suitable for vector indexing."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return splitter.split_text(text)
def create_vectorstore(chunks):
    """Build a FAISS vector store from the given text chunks, using the
    module-level spaCy embeddings."""
    return FAISS.from_texts(chunks, embedding=embeddings)
# --------------------------------------------- | |
# Aplicaci贸n principal | |
# --------------------------------------------- | |
def main():
    """Streamlit entry point: sidebar to upload/index PDFs plus a RAG chat UI.

    Side effects: configures the page, mutates st.session_state
    ("conversation_chain", "messages"), and renders all widgets.
    """
    st.set_page_config(page_title="Chat PDF (RAG)", layout="wide")
    st.header("RAG-based Chat con PDFs")

    # Conversation chain (retriever + LLM + memory); built once PDFs are processed
    if "conversation_chain" not in st.session_state:
        st.session_state["conversation_chain"] = None
    # Chat history rendered in the UI
    if "messages" not in st.session_state:
        st.session_state["messages"] = []

    # Sidebar: upload PDFs and build the index
    with st.sidebar:
        st.title("Menú:")
        uploaded_pdfs = st.file_uploader(
            "Sube tus PDFs y haz clic en 'Procesar PDFs'.",
            accept_multiple_files=True
        )
        if st.button("Procesar PDFs"):
            if uploaded_pdfs:
                with st.spinner("Procesando e indexando..."):
                    # 1) Read PDFs
                    raw_text = read_pdfs(uploaded_pdfs)
                    # 2) Split into chunks
                    text_chunks = chunk_text(raw_text)
                    # 3) Build the FAISS vector store
                    vectorstore = create_vectorstore(text_chunks)
                    # 4) Conversational retrieval chain: question + retrieved context
                    llm = ChatOpenAI(
                        model_name="gpt-4o-mini",  # or "gpt-4", depending on access
                        temperature=0
                    )
                    # Memory keyed the way ConversationalRetrievalChain expects
                    memory = ConversationBufferMemory(
                        memory_key="chat_history",
                        return_messages=True
                    )
                    conversation_chain = ConversationalRetrievalChain.from_llm(
                        llm=llm,
                        retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
                        memory=memory,
                        # Custom prompt used when combining the retrieved documents
                        combine_docs_chain_kwargs={"prompt": tu_prompt_personalizado},
                    )
                    # Persist the chain across Streamlit reruns
                    st.session_state["conversation_chain"] = conversation_chain
                st.success("¡PDFs procesados y VectorStore creado!")
            else:
                st.warning("No subiste ningún PDF")

        # Reset button: drop the index and the chat history
        if st.button("Reiniciar VectorStore"):
            st.session_state["conversation_chain"] = None
            st.session_state["messages"] = []
            st.info("Base vectorial reiniciada. Sube nuevos PDFs si lo deseas.")

    st.subheader("Chat con tus PDFs")

    # Replay previous turns
    for msg in st.session_state["messages"]:
        st.write(f"**{msg['role'].capitalize()}:** {msg['content']}")

    # User question
    user_input = st.text_input("Haz una pregunta sobre el/los PDF(s)...")
    if user_input:
        if st.session_state["conversation_chain"] is None:
            st.warning("No hay PDFs procesados. Sube y procesa al menos un PDF.")
        else:
            # Record the user's message, then run the chain
            st.session_state["messages"].append({"role": "user", "content": user_input})
            response = st.session_state["conversation_chain"]({
                "question": user_input
            })
            # ConversationalRetrievalChain returns its output under "answer" by default
            answer = response["answer"]
            st.session_state["messages"].append({"role": "assistant", "content": answer})
            st.write(f"**Asistente:** {answer}")
if __name__ == "__main__":
    main()