Spaces:

DebabrataHalder
/

chatWithMultiplePDF1

Sleeping

App Files Files Community

chatWithMultiplePDF1 / app.py

DebabrataHalder

Update app.py

7b38ee1 verified 9 months ago

raw

history blame

9.59 kB


	# import os
	# import logging
	# from dotenv import load_dotenv
	# import streamlit as st
	# from PyPDF2 import PdfReader
	# from langchain.text_splitter import CharacterTextSplitter
	# # from langchain.embeddings import HuggingFaceInstructEmbeddings
	# from langchain_cohere import CohereEmbeddings
	# from langchain.vectorstores import FAISS
	# from langchain.memory import ConversationBufferMemory
	# from langchain.chains import ConversationalRetrievalChain
	# # from langchain.llms import Ollama
	# from langchain_groq import ChatGroq

	# # Load environment variables
	# load_dotenv()

	# # Set up logging
	# logging.basicConfig(
	# level=logging.INFO,
	# format='%(asctime)s - %(levelname)s - %(message)s'
	# )

	# # Function to extract text from PDF files
	# def get_pdf_text(pdf_docs):
	# text = ""
	# for pdf in pdf_docs:
	# pdf_reader = PdfReader(pdf)
	# for page in pdf_reader.pages:
	# text += page.extract_text()
	# return text

	# # Function to split the extracted text into chunks
	# def get_text_chunks(text):
	# text_splitter = CharacterTextSplitter(
	# separator="\n",
	# chunk_size=1000,
	# chunk_overlap=200,
	# length_function=len
	# )
	# chunks = text_splitter.split_text(text)
	# return chunks

	# # Function to create a FAISS vectorstore
	# # def get_vectorstore(text_chunks):
	# # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
	# # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
	# # return vectorstore

	# def get_vectorstore(text_chunks):
	# cohere_api_key = os.getenv("COHERE_API_KEY")
	# embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
	# vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
	# return vectorstore

	# # Function to set up the conversational retrieval chain
	# def get_conversation_chain(vectorstore):
	# try:
	# # llm = Ollama(model="llama3.2:1b")
	# llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.5)
	# memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

	# conversation_chain = ConversationalRetrievalChain.from_llm(
	# llm=llm,
	# retriever=vectorstore.as_retriever(),
	# memory=memory
	# )

	# logging.info("Conversation chain created successfully.")
	# return conversation_chain
	# except Exception as e:
	# logging.error(f"Error creating conversation chain: {e}")
	# st.error("An error occurred while setting up the conversation chain.")

	# # Handle user input
	# def handle_userinput(user_question):
	# if st.session_state.conversation is not None:
	# response = st.session_state.conversation({'question': user_question})
	# st.session_state.chat_history = response['chat_history']

	# for i, message in enumerate(st.session_state.chat_history):
	# if i % 2 == 0:
	# st.write(f"User: {message.content}")
	# else:
	# st.write(f"Bot: {message.content}")
	# else:
	# st.warning("Please process the documents first.")

	# # Main function to run the Streamlit app
	# def main():
	# load_dotenv()
	# st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")

	# if "conversation" not in st.session_state:
	# st.session_state.conversation = None
	# if "chat_history" not in st.session_state:
	# st.session_state.chat_history = None

	# st.header("Chat with multiple PDFs :books:")
	# user_question = st.text_input("Ask a question about your documents:")
	# if user_question:
	# handle_userinput(user_question)

	# with st.sidebar:
	# st.subheader("Your documents")
	# pdf_docs = st.file_uploader(
	# "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
	# )
	# if st.button("Process"):
	# with st.spinner("Processing..."):
	# raw_text = get_pdf_text(pdf_docs)
	# text_chunks = get_text_chunks(raw_text)
	# vectorstore = get_vectorstore(text_chunks)
	# st.session_state.conversation = get_conversation_chain(vectorstore)

	# if __name__ == '__main__':
	# main()




































	import os
	import logging
	from dotenv import load_dotenv
	import streamlit as st
	from PyPDF2 import PdfReader
	from docx import Document # Import for handling Word files
	import io # Import for handling byte streams
	from langchain.text_splitter import CharacterTextSplitter
	from langchain_cohere import CohereEmbeddings
	from langchain.vectorstores import FAISS
	from langchain.memory import ConversationBufferMemory
	from langchain.chains import ConversationalRetrievalChain
	from langchain_groq import ChatGroq

	# Load environment variables
	load_dotenv()  # pull settings such as COHERE_API_KEY from a local .env file

	# Root logger: INFO and above, each record prefixed with a timestamp and level.
	logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

	# Function to extract text from PDF files
	def get_pdf_text(pdf_docs):
	    """Extract the text of every page of every PDF in ``pdf_docs``.

	    Args:
	        pdf_docs: Iterable of objects accepted by ``PdfReader`` (main() passes
	            the files returned by the Streamlit uploader).

	    Returns:
	        One string with each non-empty page's text followed by a newline,
	        or "" when nothing could be extracted.
	    """
	    page_texts = (
	        # Treat a missing extraction result as an empty page instead of crashing.
	        page.extract_text() or ""
	        for pdf in pdf_docs
	        for page in PdfReader(pdf).pages
	    )
	    # Terminate every page with "\n" so the last line of one page does not fuse
	    # with the first line of the next (the splitter breaks on "\n"). Pages with
	    # no text are skipped so an all-empty PDF still yields "".
	    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)

	# Function to extract text from Word files
	def get_word_text(word_docs):
	    """Return the paragraph text of every Word document in ``word_docs``.

	    Each uploaded file is read fully into memory and parsed with
	    ``Document``; every paragraph's text is emitted followed by a newline.
	    """
	    lines = []
	    for upload in word_docs:
	        parsed = Document(io.BytesIO(upload.read()))
	        lines.extend(f"{paragraph.text}\n" for paragraph in parsed.paragraphs)
	    return "".join(lines)

	# Function to extract text from TXT files
	def get_txt_text(txt_docs):
	    """Read and decode every plain-text upload in ``txt_docs``.

	    Args:
	        txt_docs: Iterable of binary file-like objects exposing ``read()``.

	    Returns:
	        The decoded contents of all files, each followed by a newline.
	    """
	    return "".join(
	        # "utf-8-sig" strips a leading BOM if present; errors="replace" keeps a
	        # file in another encoding from aborting the whole processing run with
	        # UnicodeDecodeError (undecodable bytes become U+FFFD).
	        txt.read().decode("utf-8-sig", errors="replace") + "\n"
	        for txt in txt_docs
	    )

	# Function to split the extracted text into chunks
	def get_text_chunks(text):
	    """Split ``text`` on newlines into overlapping chunks.

	    Chunks are capped at 1000 characters (length measured with ``len``) and
	    consecutive chunks share up to 200 characters of overlap.
	    """
	    splitter_options = {
	        "separator": "\n",
	        "chunk_size": 1000,
	        "chunk_overlap": 200,
	        "length_function": len,
	    }
	    return CharacterTextSplitter(**splitter_options).split_text(text)

	def get_vectorstore(text_chunks):
	    """Embed ``text_chunks`` with Cohere and index them in a FAISS store.

	    The Cohere API key is read from the COHERE_API_KEY environment variable.
	    """
	    embedder = CohereEmbeddings(
	        model="embed-english-v3.0",
	        cohere_api_key=os.environ.get("COHERE_API_KEY"),
	    )
	    return FAISS.from_texts(texts=text_chunks, embedding=embedder)

	# Function to set up the conversational retrieval chain
	def get_conversation_chain(vectorstore):
	    """Build a conversational retrieval chain backed by ``vectorstore``.

	    Args:
	        vectorstore: Object exposing ``as_retriever()`` (main() passes the
	            FAISS store built by get_vectorstore).

	    Returns:
	        The configured ConversationalRetrievalChain, or None when setup fails.
	        On failure the exception is logged with its traceback and an error
	        banner is shown in the Streamlit UI.
	    """
	    try:
	        # NOTE(review): confirm this model id is still served by Groq; hosted
	        # model ids are retired over time.
	        llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.5)
	        # Buffer memory keeps the whole dialogue under 'chat_history' as
	        # message objects, which handle_userinput renders via ``.content``.
	        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
	        conversation_chain = ConversationalRetrievalChain.from_llm(
	            llm=llm,
	            retriever=vectorstore.as_retriever(),
	            memory=memory,
	        )
	    except Exception as e:
	        # Top-level UI boundary: record the full traceback, tell the user, and
	        # hand back None explicitly so the caller stores "no conversation".
	        logging.exception("Error creating conversation chain: %s", e)
	        st.error("An error occurred while setting up the conversation chain.")
	        return None

	    logging.info("Conversation chain created successfully.")
	    return conversation_chain

	# Handle user input
	def handle_userinput(user_question):
	    """Send ``user_question`` to the stored chain and render the dialogue.

	    Shows a warning instead when no conversation chain has been built yet.
	    """
	    chain = st.session_state.conversation
	    if chain is None:
	        st.warning("Please process the documents first.")
	        return

	    answer = chain({'question': user_question})
	    st.session_state.chat_history = answer['chat_history']

	    # The history is rendered as alternating turns: even positions are shown
	    # as the user's messages, odd positions as the bot's.
	    for position, message in enumerate(st.session_state.chat_history):
	        if position % 2:
	            st.write(f"Bot: {message.content}")
	        else:
	            st.write(f"User: {message.content}")

	# Main function to run the Streamlit app
	def main():
	    """Run the Streamlit app: a question box plus a sidebar for uploads.

	    The sidebar accepts PDF, Word (.docx) and TXT files. Pressing "Process"
	    extracts their text, chunks and indexes it, and stores the resulting
	    conversation chain in ``st.session_state.conversation``.
	    """
	    load_dotenv()
	    st.set_page_config(page_title="Chat with multiple documents", page_icon=":books:")

	    # Seed the session keys once so later reads never hit a missing attribute.
	    if "conversation" not in st.session_state:
	        st.session_state.conversation = None
	    if "chat_history" not in st.session_state:
	        st.session_state.chat_history = None

	    st.header("Chat with multiple documents :books:")

	    user_question = st.text_input("Ask a question about your documents:")
	    if user_question:
	        handle_userinput(user_question)

	    with st.sidebar:
	        st.subheader("Your documents")

	        pdf_docs = st.file_uploader(
	            "Upload your PDFs here", accept_multiple_files=True, type=["pdf"]
	        )
	        word_docs = st.file_uploader(
	            "Upload your Word documents here", accept_multiple_files=True, type=["docx"]
	        )
	        txt_docs = st.file_uploader(
	            "Upload your TXT files here", accept_multiple_files=True, type=["txt"]
	        )

	        if st.button("Process"):
	            with st.spinner("Processing..."):
	                raw_text = ""
	                if pdf_docs:
	                    raw_text += get_pdf_text(pdf_docs)
	                if word_docs:
	                    raw_text += get_word_text(word_docs)
	                if txt_docs:
	                    raw_text += get_txt_text(txt_docs)

	                # Whitespace-only text (e.g. a .docx with only empty
	                # paragraphs yields "\n") is truthy but leaves nothing to
	                # index, so require actual content before building the store.
	                if raw_text.strip():
	                    text_chunks = get_text_chunks(raw_text)
	                    vectorstore = get_vectorstore(text_chunks)
	                    st.session_state.conversation = get_conversation_chain(vectorstore)
	                else:
	                    st.warning("No documents were uploaded or processed.")

	# Launch the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()