Segizu committed on
Commit f585950 · 1 Parent(s): 23cc930
Files changed (5)
  1. .env +1 -0
  2. app.py +154 -67
  3. faiss_db/index.faiss +0 -0
  4. faiss_db/index.pkl +3 -0
  5. requirements.txt +3 -1
.env ADDED
@@ -0,0 +1 @@
+OPENAI_API_KEY=sk-proj-[REDACTED]
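The rewritten app.py reads this key at startup through python-dotenv rather than hard-coding it. A minimal sketch of that lookup, assuming the .env file sits next to the script:

    import os
    from dotenv import load_dotenv

    load_dotenv()                          # copies .env entries into the process environment
    api_key = os.getenv("OPENAI_API_KEY")  # None if the variable is missing

Note that the key is committed here in plain text; rotating it and keeping .env out of version control would be the safer setup.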
app.py CHANGED
@@ -1,99 +1,186 @@
 import streamlit as st
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.tools.retriever import create_retriever_tool
-from dotenv import load_dotenv
-from langchain_anthropic import ChatAnthropic
-from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from langchain.agents import AgentExecutor, create_tool_calling_agent

-import os
-os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

 embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
-def pdf_read(pdf_doc):
     text = ""
-    for pdf in pdf_doc:
         pdf_reader = PdfReader(pdf)
         for page in pdf_reader.pages:
-            text += page.extract_text()
     return text

-
-
 def get_chunks(text):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    chunks = text_splitter.split_text(text)
-    return chunks

-
-def vector_store(text_chunks):
-
     vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
-    vector_store.save_local("faiss_db")
-
-
-def get_conversational_chain(tools,ques):
-    #os.environ["ANTHROPIC_API_KEY"]=os.getenv["ANTHROPIC_API_KEY"]
-    #llm = ChatAnthropic(model="claude-3-sonnet-20240229", temperature=0, api_key=os.getenv("ANTHROPIC_API_KEY"),verbose=True)
-    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, api_key="")
-    prompt = ChatPromptTemplate.from_messages(
-        [
             (
                 "system",
-                """You are a helpful assistant. Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
-                provided context just say, "answer is not available in the context", don't provide the wrong answer""",
             ),
             ("placeholder", "{chat_history}"),
             ("human", "{input}"),
             ("placeholder", "{agent_scratchpad}"),
-        ]
-    )
-    tool=[tools]
-    agent = create_tool_calling_agent(llm, tool, prompt)
-
-    agent_executor = AgentExecutor(agent=agent, tools=tool, verbose=True)
-    response=agent_executor.invoke({"input": ques})
-    print(response)
-    st.write("Reply: ", response['output'])
-
-
-
-def user_input(user_question):
-
-
-
-    new_db = FAISS.load_local("faiss_db", embeddings,allow_dangerous_deserialization=True)
-
-    retriever=new_db.as_retriever()
-    retrieval_chain= create_retriever_tool(retriever,"pdf_extractor","This tool is to give answer to queries from the pdf")
-    get_conversational_chain(retrieval_chain,user_question)
-
-
-
-
-
 def main():
-    st.set_page_config("Chat PDF")
-    st.header("RAG based Chat with PDF")

-    user_question = st.text_input("Ask a Question from the PDF Files")

-    if user_question:
-        user_input(user_question)

     with st.sidebar:
-        st.title("Menu:")
-        pdf_doc = st.file_uploader("Upload your PDF Files and Click on the Submit & Process Button", accept_multiple_files=True)
-        if st.button("Submit & Process"):
-            with st.spinner("Processing..."):
-                raw_text = pdf_read(pdf_doc)
-                text_chunks = get_chunks(raw_text)
-                vector_store(text_chunks)
-                st.success("Done")

 if __name__ == "__main__":
-    main()
+import os
 import streamlit as st
+from dotenv import load_dotenv
+
+# PDF reading and processing
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+# Embeddings and vector stores
 from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
 from langchain_community.vectorstores import FAISS
+
+# LLM and tools
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
 from langchain.tools.retriever import create_retriever_tool
 from langchain.agents import AgentExecutor, create_tool_calling_agent

+# Load environment variables
+load_dotenv()
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # Sometimes needed on Windows or in specific environments

+# Initialize the spaCy embeddings
 embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
+
+# -----------------------------------------------------------
+# Helper functions
+# -----------------------------------------------------------
+def pdf_read(pdf_docs):
+    """
+    Read each PDF and concatenate its text.
+    """
     text = ""
+    for pdf in pdf_docs:
         pdf_reader = PdfReader(pdf)
         for page in pdf_reader.pages:
+            text += page.extract_text() or ""
     return text

 def get_chunks(text):
+    """
+    Split the text into chunks for indexing in FAISS.
+    """
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    return text_splitter.split_text(text)

+def create_vector_store(text_chunks):
+    """
+    Build a FAISS vector store from the chunks.
+    """
     vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
+    return vector_store
+
+def get_conversational_chain(tool, question):
+    """
+    Generate the answer to the question using the retrieval tool.
+    """
+    api_key = os.getenv("OPENAI_API_KEY")
+
+    # LLM (adjust model_name to whatever you have available)
+    llm = ChatOpenAI(
+        model_name="gpt-4o-mini",  # or "gpt-3.5-turbo", etc.
+        temperature=0.4,
+        api_key=api_key
+    )
+
+    # Prompt template
+    prompt = ChatPromptTemplate.from_messages([
         (
             "system",
+            """You are a helpful assistant. Answer the question as completely as possible
+            using only the available context. If the answer is not in the context,
+            say: "answer is not available in the context"."""
         ),
         ("placeholder", "{chat_history}"),
         ("human", "{input}"),
         ("placeholder", "{agent_scratchpad}"),
+    ])
+
+    # Create the agent with the tool and run it
+    agent = create_tool_calling_agent(llm, tools=[tool], prompt=prompt)
+    agent_executor = AgentExecutor(agent=agent, tools=[tool], verbose=False)
+    response = agent_executor.invoke({"input": question})
+    return response["output"]
+
+def generate_answer(user_question):
+    """
+    Use the vector store held in session_state and return the answer.
+    """
+    # Check whether a FAISS store has been loaded
+    if "faiss_db" not in st.session_state or st.session_state["faiss_db"] is None:
+        return "No PDF(s) have been processed. Please upload and process a PDF first."
+
+    # Build the retrieval tool
+    db = st.session_state["faiss_db"]
+    retriever = db.as_retriever()
+    retrieval_tool = create_retriever_tool(
+        retriever,
+        name="pdf_extractor",
+        description="This tool gives answers to queries from the PDF(s)."
+    )
+
+    # Get the final answer from the conversational chain
+    answer = get_conversational_chain(retrieval_tool, user_question)
+    return answer
+
+# -----------------------------------------------------------
+# Main application
+# -----------------------------------------------------------
 def main():
+    st.set_page_config(page_title="Chat PDF", layout="wide")
+    st.header("RAG-based Chat with PDF")

+    # Initialize the message history in session_state if it does not exist
+    if "messages" not in st.session_state:
+        st.session_state["messages"] = []

+    # Initialize the vector store (None until one has been created)
+    if "faiss_db" not in st.session_state:
+        st.session_state["faiss_db"] = None

+    # ----------------------------------------------------------------
+    # SIDEBAR: upload and process PDFs
+    # ----------------------------------------------------------------
     with st.sidebar:
+        st.title("Menu:")
+        pdf_docs = st.file_uploader(
+            "Upload your PDF files and click 'Process PDFs'.",
+            accept_multiple_files=True
+        )
+
+        if st.button("Process PDFs"):
+            if pdf_docs:
+                with st.spinner("Processing..."):
+                    # Read the PDFs and split them into chunks
+                    raw_text = pdf_read(pdf_docs)
+                    text_chunks = get_chunks(raw_text)
+                    # Build the FAISS vector store and keep it in session_state
+                    new_vector_store = create_vector_store(text_chunks)
+                    st.session_state["faiss_db"] = new_vector_store
+                    st.success("Done! The PDFs have been indexed.")
+            else:
+                st.warning("You have not selected any PDFs.")
+
+        # Option to clear the vector store and upload other PDFs
+        if st.button("Clear vector store"):
+            st.session_state["faiss_db"] = None
+            st.info("Vector store cleared. You can now upload new PDFs.")
+
+    # ----------------------------------------------------------------
+    # MAIN CHAT
+    # ----------------------------------------------------------------
+    st.subheader("Chat")
+
+    # Show the previous messages from the history
+    for msg in st.session_state["messages"]:
+        # A simple format:
+        st.write(f"**{msg['role'].capitalize()}:** {msg['content']}")
+
+        # Or use the experimental chat component if your Streamlit version supports it:
+        # if msg["role"] == "user":
+        #     with st.chat_message("user"):
+        #         st.write(msg["content"])
+        # else:
+        #     with st.chat_message("assistant"):
+        #         st.write(msg["content"])
+
+    # User chat input
+    user_input = st.text_input("Type your question here...")
+
+    if user_input:
+        # Save the user's message
+        st.session_state["messages"].append({"role": "user", "content": user_input})
+
+        # Generate the answer
+        answer = generate_answer(user_input)
+
+        # Save the answer in the history
+        st.session_state["messages"].append({"role": "assistant", "content": answer})
+
+        # To force a refresh (optional on Streamlit 1.x).
+        # Leave it commented out if it causes problems or you do not need it.
+        #st.experimental_rerun()

 if __name__ == "__main__":
+    main()
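Since the rewrite keeps the FAISS store only in st.session_state, the index now disappears when the Streamlit session ends. If on-disk persistence is still wanted, the save_local/load_local pair from the previous version still applies; a minimal sketch, reusing the embeddings object defined above:

    # Persist the freshly built store next to the app
    st.session_state["faiss_db"].save_local("faiss_db")

    # Reload it in a later session; pickle deserialization must be allowed explicitly
    db = FAISS.load_local("faiss_db", embeddings, allow_dangerous_deserialization=True)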
faiss_db/index.faiss ADDED
Binary file (53.4 kB)
faiss_db/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1940baa6a5f93292bd16ac70eed201d34e88a1d855e08238d3eacf194a343f73
+size 147745
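Both index files are generated artifacts (index.pkl is tracked through Git LFS), and the rewritten app rebuilds its index per session anyway. A .gitignore along these lines, suggested rather than part of this commit, would keep the artifacts and the .env secret out of the repository:

    # .gitignore (suggested)
    .env
    faiss_db/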
requirements.txt CHANGED
@@ -8,4 +8,6 @@ langchain-anthropic
 langchain-openai
 faiss-cpu
 python-dotenv
-spacy
+spacy
+en-core-web-sm==3.5.0
+altair==4.2.2
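Note that en-core-web-sm is a spaCy model rather than a regular PyPI package, so the bare pin above may fail under plain pip. A common workaround, assuming the 3.5.0 model wheel is the one wanted, is to pin the wheel URL directly (or run python -m spacy download en_core_web_sm after installing):

    en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl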