PDFChat / app.py
lozanopastor's picture
Update app.py
0a89103 verified
raw
history blame
4.93 kB
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_community.embeddings import HuggingFaceEmbeddings # Using Hugging Face embeddings
from langchain.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import re
# Load environment variables (python-dotenv reads them from a local .env file
# when one is present) so GROQ_API_KEY is available through os.getenv.
load_dotenv()
# NOTE(review): the value returned here is discarded, so this statement has no
# effect; the key is actually read where the ChatGroq model is constructed.
os.getenv("GROQ_API_KEY")
def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        pdf_docs: Iterable of PDF sources accepted by ``PdfReader`` (the app
            passes the files returned by the Streamlit uploader). ``None`` or
            an empty iterable is treated as "no documents".

    Returns:
        One string containing the text of all pages, in document order and
        page order, joined without any separator. Empty when there are no
        documents or no page yields text.
    """
    page_texts = []
    for pdf in pdf_docs or ():
        pdf_reader = PdfReader(pdf)
        # Guard against a falsy result (e.g. a page without a text layer) so
        # that joining never fails on a non-string value.
        page_texts.extend(page.extract_text() or "" for page in pdf_reader.pages)
    # Join once instead of growing a string with += inside the loop.
    return "".join(page_texts)
def get_text_chunks(text, chunk_size=10000, chunk_overlap=1000):
    """Split extracted text into overlapping chunks for embedding.

    Args:
        text: The full text to split.
        chunk_size: Maximum size of each chunk, passed straight to
            ``RecursiveCharacterTextSplitter``. Defaults to the value the app
            has always used.
        chunk_overlap: Overlap between consecutive chunks, passed straight to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        A list of chunk strings; an empty list when ``text`` is empty.
    """
    # Nothing to split: skip building the splitter altogether.
    if not text:
        return []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)
def get_vector_store(text_chunks):
    """Embed the text chunks and persist them as a local FAISS index.

    The index is written to the ``faiss_index`` folder, which is the same
    location ``user_input`` loads it from.

    Args:
        text_chunks: Non-empty list of text chunks to embed.

    Raises:
        ValueError: If ``text_chunks`` is empty. An index cannot be built from
            zero texts, and failing here gives a clear message before the
            embedding model is loaded.
    """
    if not text_chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of text chunks.")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    vector_store.save_local("faiss_index")
def get_conversational_chain():
    """Build the "stuff" question-answering chain backed by a Groq chat model.

    The chain expects ``input_documents`` and ``question`` inputs; the
    documents are inserted into the prompt's ``{context}`` placeholder.

    Returns:
        The chain object produced by ``load_qa_chain``.
    """
    # NOTE(review): the "?" right after {context} looks like a stray character;
    # it is kept because changing it would alter the prompt sent to the model.
    template_text = (
        "\n"
        "Answer the question as detailed as possible from the provided context. If the answer is not in\n"
        'the provided context, just say, "answer is not available in the context." Do not provide incorrect answers.\n'
        "Context:\n"
        "{context}?\n"
        "Question:\n"
        "{question}\n"
        "Answer:\n"
    )

    # DeepSeek R1 distilled Llama 70B served through Groq; the API key is read
    # from the environment at call time.
    llm = ChatGroq(
        groq_api_key=os.getenv("GROQ_API_KEY"),
        model_name="deepseek-r1-distill-llama-70b",
        temperature=0.3,
    )
    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=template_text,
    )
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
def _split_thought_process(text):
    """Split a model reply into its ``<think>`` reasoning and the visible answer.

    Args:
        text: Raw model output, optionally containing one
            ``<think>...</think>`` block.

    Returns:
        A ``(thought_process, clean_response)`` tuple, both stripped of
        surrounding whitespace. ``thought_process`` is ``""`` when no complete
        ``<think>...</think>`` block is present.
    """
    match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if match is None:
        return "", text.strip()
    # Cut the block out by its matched span. Rebuilding the tag from the
    # stripped inner text would not match the original whenever the block
    # contains leading/trailing whitespace, leaving the reasoning in the reply.
    remainder = text[:match.start()] + text[match.end():]
    return match.group(1).strip(), remainder.strip()


def user_input(user_question):
    """Answer a question from the saved FAISS index and render the reply.

    Loads the ``faiss_index`` folder, retrieves the chunks most similar to
    the question, runs the QA chain on them, and writes the model's reasoning
    (inside an expander) and its answer to the Streamlit page.

    Args:
        user_question: The question typed by the user.

    Returns:
        None. Output is rendered through Streamlit; if no index has been
        built yet a warning is shown and nothing else happens.
    """
    # The index folder only exists after PDFs have been processed at least once.
    if not os.path.isdir("faiss_index"):
        st.warning("Please upload and process your PDF files before asking a question.")
        return

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # SECURITY: allow_dangerous_deserialization=True lets the loader unpickle
    # the stored index metadata. That is only acceptable because this app
    # writes the folder itself; never point it at an index from an untrusted
    # source.
    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    docs = new_db.similarity_search(user_question)

    chain = get_conversational_chain()
    response = chain(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True
    )
    output_text = response['output_text']

    # Debugging: Print the original response
    print("Original Response:", output_text)

    thought_process, clean_response = _split_thought_process(output_text)

    # Debugging: Print the cleaned response
    print("Cleaned Response:", clean_response)

    # Display the model's thought process in the expander
    with st.expander("Model Thought Process"):
        st.write(thought_process)
    st.markdown(f"### Reply:\n{clean_response}")
def main():
    """Render the Streamlit app: PDF upload/indexing in the sidebar, Q&A in the page body.

    The sidebar collects PDF files and, on "Submit & Process", extracts their
    text, chunks it and builds the FAISS index. The main area takes a question
    and delegates to ``user_input`` to display the answer.
    """
    st.set_page_config(page_title="Chat PDF", page_icon=":books:", layout="wide")
    st.title("Chat with PDF using DeepSeek Ai")

    st.sidebar.header("Upload & Process PDF Files")
    st.sidebar.markdown(
        "Using DeepSeek R1 model for advanced conversational capabilities.")

    with st.sidebar:
        pdf_docs = st.file_uploader(
            "Upload your PDF files:",
            accept_multiple_files=True,
            type=["pdf"]
        )
        if st.button("Submit & Process"):
            if not pdf_docs:
                # Nothing uploaded: indexing an empty document set would fail.
                st.warning("Please upload at least one PDF file before processing.")
            else:
                with st.spinner("Processing your files..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    # Only build the index when there is something to embed.
                    if text_chunks:
                        get_vector_store(text_chunks)
                if text_chunks:
                    st.success("PDFs processed and indexed successfully!")
                else:
                    st.error("No extractable text was found in the uploaded PDFs.")

    st.markdown(
        "### Ask Questions from Your PDF Files :mag:\n"
        "Once you upload and process your PDFs, type your questions below."
    )
    user_question = st.text_input("Enter your question:", placeholder="What do you want to know?")
    if user_question:
        with st.spinner("Fetching your answer..."):
            user_input(user_question)

    st.sidebar.info(
        "**Note:** This app uses DeepSeek R1 model for answering questions accurately."
    )


# Run the app only when this file is executed as a script.
if __name__ == "__main__":
    main()