Spaces:

Segizu
/

PDF_CHATBOT

Build error

File size: 6,881 Bytes

import os
import streamlit as st
from dotenv import load_dotenv

# Lectura y procesamiento de PDFs
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embeddings y VectorStores
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain_community.vectorstores import FAISS

# LLM y Herramientas
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import AgentExecutor, create_tool_calling_agent

# Load environment variables
load_dotenv()
# NOTE(review): this flag is set after the FAISS/spaCy imports above; confirm it
# still takes effect in the environments where it is needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # Sometimes needed on Windows or in specific environments

# Initialize the spaCy-backed embeddings used when building the vector store
embeddings = SpacyEmbeddings(model_name="en_core_web_sm")

# -----------------------------------------------------------
# Funciones auxiliares
# -----------------------------------------------------------
def pdf_read(pdf_docs):
    """
    Read every PDF and return the text of all their pages as a single string.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string; page texts are joined without any separator.
    """
    page_texts = (
        page.extract_text() or ""
        for document in pdf_docs
        for page in PdfReader(document).pages
    )
    return "".join(page_texts)

def get_chunks(text):
    """
    Split the text into chunks so it can be indexed in FAISS.

    Uses a recursive character splitter configured with ``chunk_size=1000``
    and ``chunk_overlap=200`` and returns whatever ``split_text`` produces.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    ).split_text(text)

def create_vector_store(text_chunks):
    """
    Build and return a FAISS vector store from the text chunks.

    The chunks are embedded with the module-level ``embeddings`` object.
    """
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)

def get_conversational_chain(tool, question):
    """
    Answer the question with a tool-calling agent that can use the given tool.

    Args:
        tool: Retrieval tool offered to the agent.
        question: User question, passed to the agent as ``input``.

    Returns:
        The ``output`` entry of the agent executor's response.
    """
    # Chat model (change model_name to whichever model you have access to)
    chat_model = ChatOpenAI(
        model_name="gpt-4o-mini",  # Or "gpt-3.5-turbo", etc.
        temperature=0.4,
        api_key=os.getenv("OPENAI_API_KEY"),
    )

    # Prompt layout: system instructions, optional history, user input, scratchpad
    agent_prompt = ChatPromptTemplate.from_messages([
        (
            "system",
            """Eres un asistente útil. Responde la pregunta de la forma más completa posible 
            utilizando solo el contexto disponible. Si la respuesta no está en el contexto, 
            di: "answer is not available in the context"."""
        ),
        ("placeholder", "{chat_history}"),
        ("human", "{input}"),
        ("placeholder", "{agent_scratchpad}"),
    ])

    # Build the agent around the tool, wrap it in an executor and run it once
    executor = AgentExecutor(
        agent=create_tool_calling_agent(chat_model, tools=[tool], prompt=agent_prompt),
        tools=[tool],
        verbose=False,
    )
    return executor.invoke({"input": question})["output"]

def generate_answer(user_question):
    """
    Answer the question using the vector store kept in ``st.session_state``.

    Returns a fixed notice when no vector store is available (the ``faiss_db``
    entry is missing or ``None``); otherwise wraps the store's retriever in a
    retrieval tool and returns the result of ``get_conversational_chain``.
    """
    vector_db = st.session_state.get("faiss_db")
    if vector_db is None:
        return "No hay PDF(s) procesado(s). Por favor, carga y procesa algún PDF."

    # Expose the store's retriever to the agent as a tool
    pdf_tool = create_retriever_tool(
        vector_db.as_retriever(),
        name="pdf_extractor",
        description="This tool gives answers to queries from the PDF(s).",
    )
    return get_conversational_chain(pdf_tool, user_question)

# -----------------------------------------------------------
# Aplicación principal
# -----------------------------------------------------------
def _render_sidebar():
    """Draw the sidebar: PDF upload, indexing into FAISS and vector-store reset."""
    with st.sidebar:
        st.title("Menú:")
        pdf_docs = st.file_uploader(
            "Sube tus archivos PDF y haz clic en 'Procesar PDFs'.",
            type=["pdf"],  # reject non-PDF uploads before they reach PdfReader
            accept_multiple_files=True,
        )

        if st.button("Procesar PDFs"):
            if pdf_docs:
                with st.spinner("Procesando..."):
                    # Read the PDFs and split their text into chunks
                    text_chunks = get_chunks(pdf_read(pdf_docs))
                    # An empty chunk list (e.g. scanned PDFs without a text
                    # layer) cannot be indexed, so only build the store when
                    # there is something to embed; any previous store is kept.
                    if text_chunks:
                        st.session_state["faiss_db"] = create_vector_store(text_chunks)
                if text_chunks:
                    st.success("¡Hecho! Se han indexado los PDF.")
                else:
                    st.error("No se pudo extraer texto de los PDF seleccionados.")
            else:
                st.warning("No has seleccionado ningún PDF.")

        # Lets the user drop the current vector store and upload other PDFs
        if st.button("Borrar vector store"):
            st.session_state["faiss_db"] = None
            st.info("Vector store borrado. Ahora puedes subir nuevos PDFs.")


def _render_chat():
    """Draw the chat area: message history followed by the question form."""
    st.subheader("Chat")

    # Reserve the history area above the input now, but fill it only after the
    # new question has been answered so the latest exchange shows up in the
    # same run instead of one rerun late.
    history_area = st.container()

    # A form with clear_on_submit empties the text box after each question, so
    # unrelated reruns (e.g. sidebar button clicks) do not resubmit the previous
    # question and trigger another LLM call.
    with st.form("chat_form", clear_on_submit=True):
        user_input = st.text_input("Escribe tu pregunta aquí...")
        submitted = st.form_submit_button("Enviar")

    if submitted and user_input.strip():
        st.session_state["messages"].append({"role": "user", "content": user_input})
        answer = generate_answer(user_input)
        st.session_state["messages"].append({"role": "assistant", "content": answer})

    with history_area:
        for msg in st.session_state["messages"]:
            st.write(f"**{msg['role'].capitalize()}:** {msg['content']}")


def main():
    """
    Streamlit entry point for the PDF chat application.

    Configures the page, makes sure the ``messages`` and ``faiss_db`` entries
    exist in ``st.session_state``, and then renders the sidebar (PDF upload
    and indexing) and the main chat area.
    """
    st.set_page_config(page_title="Chat PDF", layout="wide")
    st.header("RAG-based Chat con PDF")

    # Initialize the message history on first run
    if "messages" not in st.session_state:
        st.session_state["messages"] = []

    # Initialize the vector store slot (None until PDFs are processed)
    if "faiss_db" not in st.session_state:
        st.session_state["faiss_db"] = None

    _render_sidebar()
    _render_chat()

# Call main() only when this module is executed as the main script
if __name__ == "__main__":
    main()