Spaces:

lozanopastor
/

PDFChat

Running

App Files Files Community

PDFChat / app.py

lozanopastor

Update app.py

0a89103 verified 6 months ago

raw

history blame

4.93 kB

	import streamlit as st
	from PyPDF2 import PdfReader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import os
	from langchain_community.embeddings import HuggingFaceEmbeddings # Using Hugging Face embeddings
	from langchain.vectorstores import FAISS
	from langchain_groq import ChatGroq
	from langchain.chains.question_answering import load_qa_chain
	from langchain.prompts import PromptTemplate
	from dotenv import load_dotenv
	import re

	# Load environment variables (such as GROQ_API_KEY) from a .env file.
	# The key is read with os.getenv() where the Groq client is constructed,
	# so no standalone lookup is needed here.
	load_dotenv()

	def get_pdf_text(pdf_docs):
	    """Extract and concatenate the text of every page of the given PDFs.

	    Args:
	        pdf_docs: Iterable of PDF sources that ``PdfReader`` can open
	            (in this app, the files returned by the Streamlit uploader).

	    Returns:
	        One string holding the text of all pages, in document order and
	        then page order. Empty when ``pdf_docs`` is empty.
	    """
	    # Join once instead of growing a string with ``+=`` inside nested loops.
	    # ``or ""`` keeps a page that yields no text from breaking the join.
	    return "".join(
	        page.extract_text() or ""
	        for pdf in pdf_docs
	        for page in PdfReader(pdf).pages
	    )

	def get_text_chunks(text):
	    """Split the extracted text into overlapping chunks.

	    The splitter is configured with ``chunk_size=10000`` and
	    ``chunk_overlap=1000``.

	    Args:
	        text: The full text to split.

	    Returns:
	        The chunks produced by ``RecursiveCharacterTextSplitter.split_text``.
	    """
	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=10000,
	        chunk_overlap=1000,
	    )
	    return splitter.split_text(text)

	def get_vector_store(text_chunks):
	    """Embed the chunks, build a FAISS index and save it to ``faiss_index``.

	    Args:
	        text_chunks: The text chunks to embed and index.
	    """
	    embedder = HuggingFaceEmbeddings(
	        model_name="sentence-transformers/all-MiniLM-L6-v2"
	    )
	    # Build the index and persist it in one step; nothing is returned.
	    FAISS.from_texts(text_chunks, embedding=embedder).save_local("faiss_index")

	def get_conversational_chain():
	    """Build the question-answering chain backed by a Groq-hosted LLM.

	    The prompt tells the model to answer only from the supplied context and
	    to say so when the answer is not there.

	    Returns:
	        The chain created by ``load_qa_chain`` with ``chain_type="stuff"``
	        and the prompt below. The prompt takes ``context`` and ``question``.
	    """
	    # The context placeholder is followed by nothing else: a stray "?" after
	    # it would be appended to the document text sent to the model.
	    prompt_template = """
	Answer the question as detailed as possible from the provided context. If the answer is not in
	the provided context, just say, "answer is not available in the context." Do not provide incorrect answers.

	Context:
	{context}

	Question:
	{question}

	Answer:
	"""

	    model = ChatGroq(
	        temperature=0.3,
	        model_name="deepseek-r1-distill-llama-70b",  # DeepSeek R1 distill model served through Groq
	        groq_api_key=os.getenv("GROQ_API_KEY")
	    )
	    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
	    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
	    return chain

	def _split_thought_process(output_text):
	    """Separate the first ``<think>...</think>`` block from the model output.

	    Args:
	        output_text: Raw text returned by the model.

	    Returns:
	        A ``(thought_process, clean_response)`` tuple, both stripped of
	        surrounding whitespace. ``thought_process`` is ``""`` when the text
	        has no complete think block.
	    """
	    match = re.search(r"<think>(.*?)</think>", output_text, re.DOTALL)
	    if match is None:
	        return "", output_text.strip()
	    # Cut by match position. Rebuilding "<think>" + stripped thought + "</think>"
	    # fails to match whenever the block has whitespace inside the tags, which
	    # would leave the whole think block in the reply.
	    remainder = output_text[:match.start()] + output_text[match.end():]
	    return match.group(1).strip(), remainder.strip()


	def user_input(user_question):
	    """Answer ``user_question`` from the saved FAISS index and render it.

	    The model's ``<think>`` section, if any, is shown in an expander and the
	    remaining text is rendered as the reply.

	    Args:
	        user_question: The question text entered by the user.
	    """
	    if not os.path.exists("faiss_index"):
	        # No index has been saved yet, so there is nothing to search.
	        st.warning("Please upload and process your PDF files before asking a question.")
	        return

	    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

	    # NOTE(review): allow_dangerous_deserialization=True is only acceptable
	    # while "faiss_index" is written solely by this app's get_vector_store;
	    # confirm the directory cannot be supplied by an untrusted party.
	    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
	    docs = new_db.similarity_search(user_question)

	    chain = get_conversational_chain()
	    response = chain(
	        {"input_documents": docs, "question": user_question},
	        return_only_outputs=True
	    )
	    output_text = response['output_text']

	    # Debugging: print the original response
	    print("Original Response:", output_text)

	    thought_process, clean_response = _split_thought_process(output_text)

	    # Debugging: print the cleaned response
	    print("Cleaned Response:", clean_response)

	    # Display the model's thought process in the expander
	    with st.expander("Model Thought Process"):
	        st.write(thought_process)

	    st.markdown(f"### Reply:\n{clean_response}")

	def main():
	    """Run the Streamlit app.

	    The sidebar takes PDF uploads and, on "Submit & Process", extracts their
	    text, chunks it and saves a vector index. The main area takes a question
	    and passes it to ``user_input`` for answering.
	    """
	    st.set_page_config(page_title="Chat PDF", page_icon=":books:", layout="wide")
	    st.title("Chat with PDF using DeepSeek Ai")

	    st.sidebar.header("Upload & Process PDF Files")
	    st.sidebar.markdown(
	        "Using DeepSeek R1 model for advanced conversational capabilities.")

	    with st.sidebar:
	        pdf_docs = st.file_uploader(
	            "Upload your PDF files:",
	            accept_multiple_files=True,
	            type=["pdf"]
	        )
	        if st.button("Submit & Process"):
	            if not pdf_docs:
	                # Nothing was uploaded, so there is nothing to read or index.
	                st.warning("Please upload at least one PDF file before processing.")
	            else:
	                with st.spinner("Processing your files..."):
	                    raw_text = get_pdf_text(pdf_docs)
	                    text_chunks = get_text_chunks(raw_text)
	                    if text_chunks:
	                        get_vector_store(text_chunks)
	                        st.success("PDFs processed and indexed successfully!")
	                    else:
	                        # No text came out of the PDFs; skip building an
	                        # index from an empty chunk list.
	                        st.warning("No extractable text was found in the uploaded PDFs.")

	    st.markdown(
	        "### Ask Questions from Your PDF Files :mag:\n"
	        "Once you upload and process your PDFs, type your questions below."
	    )

	    user_question = st.text_input("Enter your question:", placeholder="What do you want to know?")

	    if user_question:
	        with st.spinner("Fetching your answer..."):
	            user_input(user_question)

	    st.sidebar.info(
	        "Note: This app uses DeepSeek R1 model for answering questions accurately."
	    )

	# Script entry point: start the app only when this file is run directly.
	if __name__ == "__main__":
	    main()