"""Streamlit app for chatting with uploaded documents (PDF, DOCX, TXT, CSV).

Uploaded files are loaded with LangChain document loaders, split into
overlapping text chunks, embedded into a FAISS vector store and queried
through a ConversationalRetrievalChain that keeps the chat history in memory.
"""

import os
import tempfile

import streamlit as st
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader, Docx2txtLoader, PyPDFLoader, TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

# Load environment variables
load_dotenv()
# NOTE(review): api_key is read here but never passed to the OpenAI wrappers
# below; presumably they pick OPENAI_API_KEY up from the environment.
api_key = os.getenv("OPENAI_API_KEY")

# Custom Prompt Template
custom_template = """
[INST]
You are an Expert PDF and document assistant. Follow these instructions:
1. Greet the user and introduce yourself as a professional document assistant.
2. Answer user queries based on the document content. If a question is out of scope, politely end the conversation.

CHAT HISTORY: {chat_history}
QUESTION: {question}
ANSWER:
[INST]
"""
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)

# File extension -> loader class used to parse that kind of document.
# NOTE(review): legacy binary ".doc" is routed to Docx2txtLoader exactly as in
# the original code; confirm that loader can really parse such files.
_LOADERS_BY_SUFFIX = {
    ".pdf": PyPDFLoader,
    ".docx": Docx2txtLoader,
    ".doc": Docx2txtLoader,
    ".txt": TextLoader,
    ".csv": CSVLoader,
}


def _loader_for(file_name):
    """Return the loader class matching *file_name*'s extension, or None."""
    lowered = file_name.lower()  # match "REPORT.PDF" as well as "report.pdf"
    for suffix, loader_cls in _LOADERS_BY_SUFFIX.items():
        if lowered.endswith(suffix):
            return loader_cls
    return None


# Function to extract text from documents
def get_document_text(uploaded_files):
    """Load every supported uploaded file into a list of LangChain documents.

    Args:
        uploaded_files: Iterable of Streamlit uploaded-file objects; each must
            expose ``name`` and ``getvalue()``.

    Returns:
        A flat list with the documents produced by the loader chosen for each
        file. Files with an unsupported extension are skipped with a warning.
    """
    documents = []
    for uploaded_file in uploaded_files:
        loader_cls = _loader_for(uploaded_file.name)
        if loader_cls is None:
            st.warning(f"Skipping unsupported file type: {uploaded_file.name}")
            continue

        # The loaders take a filesystem path, so spill the upload to a
        # temporary file. delete=False lets the file be closed before the
        # loader reopens it.
        suffix = os.path.splitext(uploaded_file.name)[-1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            # getvalue() returns the full content regardless of the current
            # read position of the upload buffer.
            temp_file.write(uploaded_file.getvalue())
            temp_file_path = temp_file.name

        try:
            documents.extend(loader_cls(temp_file_path).load())
        finally:
            # Always remove the temporary copy, even if loading fails.
            os.remove(temp_file_path)
    return documents


# Split text into chunks
def get_chunks(documents):
    """Split the page content of *documents* into overlapping text chunks.

    Args:
        documents: Iterable of objects with a ``page_content`` string.

    Returns:
        A flat list of chunk strings (1000 characters max, 200 overlap,
        split on newlines).
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return [
        chunk
        for doc in documents
        for chunk in text_splitter.split_text(doc.page_content)
    ]


# Create vectorstore
def get_vectorstore(chunks):
    """Embed *chunks* with OpenAI embeddings and index them in FAISS."""
    embeddings = OpenAIEmbeddings()
    return FAISS.from_texts(texts=chunks, embedding=embeddings)


# Create a conversational chain
def get_conversationchain(vectorstore):
    """Build a retrieval chain over *vectorstore* with buffered chat memory."""
    llm = ChatOpenAI(temperature=0.4, model_name='gpt-4o-mini')
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        condense_question_prompt=CUSTOM_QUESTION_PROMPT,
        memory=memory,
    )


# Handle user questions and update chat history
def handle_question(question):
    """Send *question* to the conversation chain and render the chat history.

    Shows a warning and returns early when no documents have been processed.
    """
    if not st.session_state.conversation:
        st.warning("Please process your documents first.")
        return

    response = st.session_state.conversation({'question': question})
    st.session_state.chat_history = response['chat_history']

    # Messages are rendered as alternating user/bot turns, user first.
    for i, msg in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.markdown(f"**You:** {msg.content}")
        else:
            st.markdown(f"**Bot:** {msg.content}")


# Main Streamlit app
def main():
    """Render the page: upload sidebar, processing button and question box."""
    st.set_page_config(page_title="Chat with Documents", page_icon="📚")
    st.title("📚 Chat with Your Documents")
    st.sidebar.title("Upload Your Files")

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    # File uploader
    uploaded_files = st.sidebar.file_uploader(
        "Upload your files (PDF, DOCX, TXT, CSV):",
        accept_multiple_files=True,
    )

    # Process button
    if st.sidebar.button("Process Documents"):
        if uploaded_files:
            with st.spinner("Processing documents..."):
                # Extract text and create conversation chain
                raw_documents = get_document_text(uploaded_files)
                text_chunks = get_chunks(raw_documents)
                if text_chunks:
                    vectorstore = get_vectorstore(text_chunks)
                    st.session_state.conversation = get_conversationchain(vectorstore)
            if text_chunks:
                st.success("Documents processed successfully!")
            else:
                # Nothing to index (unsupported or empty files): report it
                # instead of attempting to build a store from zero texts.
                st.error("No text could be extracted from the uploaded documents.")
        else:
            st.warning("Please upload at least one document.")

    # User input
    question = st.text_input("Ask a question about the uploaded documents:")
    if question:
        handle_question(question)


if __name__ == '__main__':
    main()