Spaces:

Segizu
/

PDF_CHATBOT

Build error

App Files Files Community

PDF_CHATBOT / app.py

Segizu

ptompting

f585950 5 months ago

raw

history blame

6.88 kB

	import os
	import streamlit as st
	from dotenv import load_dotenv

	# Lectura y procesamiento de PDFs
	from PyPDF2 import PdfReader
	from langchain.text_splitter import RecursiveCharacterTextSplitter

	# Embeddings y VectorStores
	from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
	from langchain_community.vectorstores import FAISS

	# LLM y Herramientas
	from langchain_openai import ChatOpenAI
	from langchain_core.prompts import ChatPromptTemplate
	from langchain.tools.retriever import create_retriever_tool
	from langchain.agents import AgentExecutor, create_tool_calling_agent

	# Load environment variables
	load_dotenv()
	os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # Sometimes needed on Windows or in specific environments

	# Initialise the spaCy-based embedding model shared by the whole module.
	# NOTE(review): "en_core_web_sm" looks like an English pipeline while the UI
	# strings are Spanish - confirm it suits the documents expected here.
	embeddings = SpacyEmbeddings(model_name="en_core_web_sm")

	# -----------------------------------------------------------
	# Funciones auxiliares
	# -----------------------------------------------------------
	def pdf_read(pdf_docs):
	    """
	    Return the text of every page of every PDF, joined into one string.

	    Each item of ``pdf_docs`` is handed to ``PdfReader``. A page whose
	    ``extract_text()`` result is falsy contributes an empty string. Page
	    texts are concatenated in order with no separator between them.
	    """
	    page_texts = (
	        page.extract_text() or ""
	        for document in pdf_docs
	        for page in PdfReader(document).pages
	    )
	    return "".join(page_texts)

	def get_chunks(text, chunk_size=1000, chunk_overlap=200):
	    """
	    Split ``text`` into overlapping chunks for indexing in FAISS.

	    Args:
	        text: The full text to split.
	        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter`` as
	            ``chunk_size``. Defaults to 1000, the value previously
	            hard-coded here.
	        chunk_overlap: Value forwarded as ``chunk_overlap``. Defaults to
	            200, the value previously hard-coded here.

	    Returns:
	        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
	        ``text``.
	    """
	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )
	    return splitter.split_text(text)

	def create_vector_store(text_chunks):
	    """
	    Build a FAISS vector store from the given text chunks.

	    The chunks are embedded with the module-level ``embeddings`` object and
	    the resulting store is returned to the caller.
	    """
	    return FAISS.from_texts(text_chunks, embedding=embeddings)

	def get_conversational_chain(tool, question):
	    """
	    Answer ``question`` with a tool-calling agent that can use ``tool``.

	    A ``ChatOpenAI`` model is configured with the key read from the
	    ``OPENAI_API_KEY`` environment variable, an agent is built around the
	    single supplied tool, and the ``"output"`` entry of the agent's
	    response is returned.
	    """
	    # System instructions: answer only from the retrieved context.
	    system_message = """Eres un asistente útil. Responde la pregunta de la forma más completa posible
	utilizando solo el contexto disponible. Si la respuesta no está en el contexto,
	di: "answer is not available in the context"."""

	    # Chat model (adjust model_name to whatever is available to you).
	    chat_model = ChatOpenAI(
	        model_name="gpt-4o-mini", # Or "gpt-3.5-turbo", etc.
	        temperature=0.4,
	        api_key=os.getenv("OPENAI_API_KEY")
	    )

	    # Prompt layout: system message, optional history, user input, scratchpad.
	    chat_prompt = ChatPromptTemplate.from_messages([
	        ("system", system_message),
	        ("placeholder", "{chat_history}"),
	        ("human", "{input}"),
	        ("placeholder", "{agent_scratchpad}"),
	    ])

	    # Build the agent around the tool and run it once for this question.
	    tool_agent = create_tool_calling_agent(chat_model, tools=[tool], prompt=chat_prompt)
	    executor = AgentExecutor(agent=tool_agent, tools=[tool], verbose=False)
	    result = executor.invoke({"input": question})
	    return result["output"]

	def generate_answer(user_question):
	    """
	    Answer ``user_question`` from the vector store kept in session state.

	    Returns a fixed notice when ``st.session_state`` has no ``"faiss_db"``
	    entry or the entry is ``None``. Otherwise wraps the store's retriever in
	    a retrieval tool and returns the result of
	    ``get_conversational_chain``.
	    """
	    # Guard: nothing has been indexed yet.
	    has_store = (
	        "faiss_db" in st.session_state
	        and st.session_state["faiss_db"] is not None
	    )
	    if not has_store:
	        return "No hay PDF(s) procesado(s). Por favor, carga y procesa algún PDF."

	    # Expose the store's retriever to the agent as a tool.
	    pdf_tool = create_retriever_tool(
	        st.session_state["faiss_db"].as_retriever(),
	        name="pdf_extractor",
	        description="This tool gives answers to queries from the PDF(s)."
	    )

	    # Delegate to the agent and hand its answer back unchanged.
	    return get_conversational_chain(pdf_tool, user_question)

	# -----------------------------------------------------------
	# Aplicación principal
	# -----------------------------------------------------------
	def main():
	    """
	    Render the Streamlit page: a PDF upload sidebar and a chat area.

	    Session state keys used:
	        ``"messages"``: list of ``{"role", "content"}`` dicts (chat history).
	        ``"faiss_db"``: the current vector store, or ``None`` when empty.

	    The question box lives in a form with ``clear_on_submit=True`` and the
	    question is only processed when the form is submitted. This prevents
	    the same question from being re-asked (and re-appended to the history)
	    every time another widget triggers a script rerun. The history is drawn
	    into a container created before the form, but filled after the new
	    question has been answered, so the latest exchange is visible on the
	    same run while staying above the input box.
	    """
	    st.set_page_config(page_title="Chat PDF", layout="wide")
	    st.header("RAG-based Chat con PDF")

	    # Initialise the chat history in session state if it does not exist yet.
	    if "messages" not in st.session_state:
	        st.session_state["messages"] = []

	    # Initialise the vector store slot (None until PDFs are processed).
	    if "faiss_db" not in st.session_state:
	        st.session_state["faiss_db"] = None

	    # ----------------------------------------------------------------
	    # SIDEBAR: upload and process PDFs
	    # ----------------------------------------------------------------
	    with st.sidebar:
	        st.title("Menú:")
	        pdf_docs = st.file_uploader(
	            "Sube tus archivos PDF y haz clic en 'Procesar PDFs'.",
	            type=["pdf"],  # only PDFs can be parsed by pdf_read
	            accept_multiple_files=True,
	        )

	        if st.button("Procesar PDFs"):
	            if pdf_docs:
	                with st.spinner("Procesando..."):
	                    # Read the PDFs and split their text into chunks.
	                    raw_text = pdf_read(pdf_docs)
	                    text_chunks = get_chunks(raw_text)
	                    # Only build the index when there is something to
	                    # index; PDFs without extractable text yield no chunks.
	                    if text_chunks:
	                        st.session_state["faiss_db"] = create_vector_store(text_chunks)
	                if text_chunks:
	                    st.success("¡Hecho! Se han indexado los PDF.")
	                else:
	                    st.warning("No se pudo extraer texto de los PDF seleccionados.")
	            else:
	                st.warning("No has seleccionado ningún PDF.")

	        # Option to drop the vector store so other PDFs can be uploaded.
	        if st.button("Borrar vector store"):
	            st.session_state["faiss_db"] = None
	            st.info("Vector store borrado. Ahora puedes subir nuevos PDFs.")

	    # ----------------------------------------------------------------
	    # MAIN CHAT
	    # ----------------------------------------------------------------
	    st.subheader("Chat")

	    # Reserve the history area above the input; it is filled further down,
	    # once any new question has been answered.
	    history_area = st.container()

	    # The form ensures a question is handled exactly once per submission.
	    with st.form("chat_form", clear_on_submit=True):
	        user_input = st.text_input("Escribe tu pregunta aquí...")
	        submitted = st.form_submit_button("Enviar")

	    if submitted and user_input.strip():
	        # Store the user's message.
	        st.session_state["messages"].append({"role": "user", "content": user_input})

	        # Generate the answer.
	        answer = generate_answer(user_input)

	        # Store the answer in the history.
	        st.session_state["messages"].append({"role": "assistant", "content": answer})

	    # Show the whole history, including the exchange that was just added.
	    with history_area:
	        for msg in st.session_state["messages"]:
	            st.write(f"{msg['role'].capitalize()}: {msg['content']}")

	# Entry point: call main() only when this file is executed as the main module.
	if __name__ == "__main__":
	main()