Spaces:

DebabrataHalder
/

chatWithMultiplePDF1

Sleeping

App Files Files Community

chatWithMultiplePDF1 / app.py

DebabrataHalder

Update app.py

1a40686 verified 4 months ago

raw

history blame

8.02 kB

	# import os
	# import logging
	# from dotenv import load_dotenv
	# import streamlit as st
	# from PyPDF2 import PdfReader
	# from langchain.text_splitter import CharacterTextSplitter
	# # from langchain.embeddings import HuggingFaceInstructEmbeddings
	# from langchain_cohere import CohereEmbeddings
	# from langchain.vectorstores import FAISS
	# from langchain.memory import ConversationBufferMemory
	# from langchain.chains import ConversationalRetrievalChain
	# # from langchain.llms import Ollama
	# from langchain_groq import ChatGroq

	# # Load environment variables
	# load_dotenv()

	# # Set up logging
	# logging.basicConfig(
	# level=logging.INFO,
	# format='%(asctime)s - %(levelname)s - %(message)s'
	# )

	# # Function to extract text from PDF files
	# def get_pdf_text(pdf_docs):
	# text = ""
	# for pdf in pdf_docs:
	# pdf_reader = PdfReader(pdf)
	# for page in pdf_reader.pages:
	# text += page.extract_text()
	# return text

	# # Function to split the extracted text into chunks
	# def get_text_chunks(text):
	# text_splitter = CharacterTextSplitter(
	# separator="\n",
	# chunk_size=1000,
	# chunk_overlap=200,
	# length_function=len
	# )
	# chunks = text_splitter.split_text(text)
	# return chunks

	# # Function to create a FAISS vectorstore
	# # def get_vectorstore(text_chunks):
	# # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
	# # vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
	# # return vectorstore

	# def get_vectorstore(text_chunks):
	# cohere_api_key = os.getenv("COHERE_API_KEY")
	# embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
	# vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
	# return vectorstore

	# # Function to set up the conversational retrieval chain
	# def get_conversation_chain(vectorstore):
	# try:
	# # llm = Ollama(model="llama3.2:1b")
	# llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.5)
	# memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

	# conversation_chain = ConversationalRetrievalChain.from_llm(
	# llm=llm,
	# retriever=vectorstore.as_retriever(),
	# memory=memory
	# )

	# logging.info("Conversation chain created successfully.")
	# return conversation_chain
	# except Exception as e:
	# logging.error(f"Error creating conversation chain: {e}")
	# st.error("An error occurred while setting up the conversation chain.")

	# # Handle user input
	# def handle_userinput(user_question):
	# if st.session_state.conversation is not None:
	# response = st.session_state.conversation({'question': user_question})
	# st.session_state.chat_history = response['chat_history']

	# for i, message in enumerate(st.session_state.chat_history):
	# if i % 2 == 0:
	# st.write(f"User: {message.content}")
	# else:
	# st.write(f"Bot: {message.content}")
	# else:
	# st.warning("Please process the documents first.")

	# # Main function to run the Streamlit app
	# def main():
	# load_dotenv()
	# st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")

	# if "conversation" not in st.session_state:
	# st.session_state.conversation = None
	# if "chat_history" not in st.session_state:
	# st.session_state.chat_history = None

	# st.header("Chat with multiple PDFs :books:")
	# user_question = st.text_input("Ask a question about your documents:")
	# if user_question:
	# handle_userinput(user_question)

	# with st.sidebar:
	# st.subheader("Your documents")
	# pdf_docs = st.file_uploader(
	# "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
	# )
	# if st.button("Process"):
	# with st.spinner("Processing..."):
	# raw_text = get_pdf_text(pdf_docs)
	# text_chunks = get_text_chunks(raw_text)
	# vectorstore = get_vectorstore(text_chunks)
	# st.session_state.conversation = get_conversation_chain(vectorstore)

	# if __name__ == '__main__':
	# main()







	import streamlit as st
	import os
	from dotenv import load_dotenv
	import PyPDF2
	import requests
	import cohere
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import FAISS
	from langchain_cohere import CohereEmbeddings

	# Read settings from a .env file, then pick up the two API keys the app uses.
	load_dotenv()
	GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
	COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

	# Cohere client used by expand_query() to generate query variations.
	co = cohere.Client(COHERE_API_KEY)

	# Page title (browser tab) and the heading shown at the top of the app.
	st.set_page_config(page_title="RAG Chatbot with Gemini & Cohere")
	st.title("🤖 Multi-Model RAG Chatbot")

	# Seed session state only when a key is absent, so existing values
	# (chat history, built index) are never overwritten here:
	#   messages     -> list of {"role", "content"} chat entries
	#   vector_store -> FAISS index of the uploaded PDF, or None
	for _state_key, _initial_value in (("messages", []), ("vector_store", None)):
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _initial_value

	# File upload and processing
	uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

	if uploaded_file:
	    # Identify the upload by name and size. Checking only "is there already
	    # an index?" meant a second, different PDF was never indexed and the
	    # chat kept answering from the first document.
	    upload_id = (uploaded_file.name, uploaded_file.size)
	    index_is_stale = (
	        st.session_state.get("vector_store") is None
	        or st.session_state.get("indexed_upload_id") != upload_id
	    )

	    if index_is_stale:
	        # Extract text page by page; a page with no extractable text
	        # (e.g. a scanned image) contributes nothing instead of failing.
	        pdf_reader = PyPDF2.PdfReader(uploaded_file)
	        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

	        # Split into overlapping chunks for embedding.
	        text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=1000,
	            chunk_overlap=200,
	        )
	        chunks = text_splitter.split_text(text)

	        if chunks:
	            # Create embeddings and vector store
	            embeddings = CohereEmbeddings(
	                cohere_api_key=COHERE_API_KEY,
	                model="embed-english-v3.0",
	                user_agent="rag-chatbot-v1",
	            )
	            st.session_state.vector_store = FAISS.from_texts(
	                texts=chunks,
	                embedding=embeddings,
	            )
	            st.session_state.indexed_upload_id = upload_id
	        else:
	            # Nothing to index: drop any index left over from a previous
	            # document and tell the user, rather than handing an empty
	            # list of texts to FAISS.
	            st.session_state.vector_store = None
	            st.session_state.indexed_upload_id = None
	            st.warning(
	                "No extractable text was found in this PDF, so it cannot be searched."
	            )

	# Re-draw every message stored in session state, one chat bubble per
	# entry, labelled with the entry's role.
	for past_entry in st.session_state.messages:
	    bubble = st.chat_message(past_entry["role"])
	    with bubble:
	        st.markdown(past_entry["content"])

	# Query expansion function
	def expand_query(query):
	    """Return ``query`` followed by model-generated variations of it.

	    The module-level Cohere client ``co`` is asked for query variations
	    formatted as a list. Only lines that look like list items ("1. ...",
	    "2) ...", "- ...", "* ...", "• ...") are kept; preamble text, blank
	    lines and duplicates are skipped.

	    Args:
	        query: The user's original question.

	    Returns:
	        A list of strings whose first element is always ``query``
	        itself, followed by the distinct variations in the order the
	        model produced them (possibly none).
	    """
	    import re

	    prompt = f"""Generate 3 query variations that help answer: {query}
	Format as numbered bullet points:"""

	    response = co.generate(
	        prompt=prompt,
	        max_tokens=100,
	        temperature=0.7
	    )
	    raw_text = response.generations[0].text or ""

	    # Leading list marker, then the variation text up to its last non-space
	    # character. Splitting on ". " and taking element [1] crashed on any
	    # line without that separator and cut off variations containing it.
	    list_item = re.compile(r"^\s*(?:\d+[.)]|[-*\u2022])\s+(.*\S)")

	    expanded_queries = [query]
	    for line in raw_text.splitlines():
	        item = list_item.match(line)
	        if item is None:
	            continue
	        variation = item.group(1)
	        if variation not in expanded_queries:
	            expanded_queries.append(variation)
	    return expanded_queries

	# Gemini API call
	def generate_with_gemini(context, query, timeout=60):
	    """Ask the Gemini ``generateContent`` endpoint to answer ``query``.

	    Args:
	        context: Retrieved document text placed in the prompt as
	            grounding material.
	        query: The user's question.
	        timeout: Seconds to wait for the HTTP request before giving up.

	    Returns:
	        The answer text: the text parts of the first candidate, joined.

	    Raises:
	        RuntimeError: If ``GEMINI_API_KEY`` is not set, or the response
	            contains no candidate or no answer text.
	        requests.RequestException: On network failure, timeout, or a
	            non-2xx HTTP status.
	    """
	    if not GEMINI_API_KEY:
	        raise RuntimeError("GEMINI_API_KEY is not set.")

	    # The key goes in a header rather than the query string, so it does not
	    # appear in the URL that requests includes in its error messages.
	    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"

	    system_prompt = f"""You're an expert assistant. Use this context to answer:
	{context}

	Apply Chain of Abstraction and Grounding (CAG):
	1. Identify key concepts
	2. Create abstract relationships
	3. Ground in specific examples
	4. Synthesize final answer"""

	    headers = {
	        "Content-Type": "application/json",
	        "x-goog-api-key": GEMINI_API_KEY,
	    }
	    data = {
	        "contents": [{
	            "parts": [{
	                "text": f"{system_prompt}\n\nQuestion: {query}"
	            }]
	        }]
	    }

	    # A timeout stops a stalled connection from hanging the app, and
	    # raise_for_status() surfaces HTTP errors instead of letting them
	    # become a KeyError on the response body below.
	    response = requests.post(url, json=data, headers=headers, timeout=timeout)
	    response.raise_for_status()
	    payload = response.json()

	    candidates = payload.get("candidates") or []
	    if not candidates:
	        raise RuntimeError(
	            "Gemini returned no candidates "
	            f"(promptFeedback: {payload.get('promptFeedback')})"
	        )

	    first_candidate = candidates[0]
	    parts = (first_candidate.get("content") or {}).get("parts") or []
	    answer = "".join(part.get("text", "") for part in parts)
	    if not answer:
	        raise RuntimeError(
	            "Gemini returned an empty answer "
	            f"(finishReason: {first_candidate.get('finishReason')})"
	        )
	    return answer

	# Chat input: record and echo the question, retrieve supporting chunks,
	# generate an answer, then record and show it.
	if prompt := st.chat_input("Ask about the document"):
	    st.session_state.messages.append({"role": "user", "content": prompt})

	    with st.chat_message("user"):
	        st.markdown(prompt)

	    vector_store = st.session_state.get("vector_store")
	    if vector_store is None:
	        # No document has been indexed yet; answer with a hint instead of
	        # failing on a None vector store.
	        response = "Please upload a PDF document before asking questions."
	    else:
	        # Query expansion
	        expanded_queries = expand_query(prompt)

	        # Retrieve documents. Different variations often return the same
	        # chunk, so keep each chunk once, in first-seen order, so the
	        # context is not padded with repeats.
	        docs = []
	        seen_chunks = set()
	        for query in expanded_queries:
	            for doc in vector_store.similarity_search(query, k=2):
	                if doc.page_content in seen_chunks:
	                    continue
	                seen_chunks.add(doc.page_content)
	                docs.append(doc)

	        # Generate response
	        context = "\n\n".join(doc.page_content for doc in docs)
	        response = generate_with_gemini(context, prompt)

	    with st.chat_message("assistant"):
	        st.markdown(response)

	    st.session_state.messages.append({"role": "assistant", "content": response})