import os
import tempfile

from dotenv import load_dotenv
import streamlit as st
from openai import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader, CSVLoader
# Load environment variables from .env
load_dotenv()

# Note: the LangChain components below (ChatOpenAI, OpenAIEmbeddings) read
# OPENAI_API_KEY from the environment on their own; this client is only
# needed for direct calls to the OpenAI API.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)
# Custom template to guide the LLM
custom_template = """
<s>[INST] Start the conversation by greeting the user and introducing yourself as an expert PDF-document analyzer and assistant,
stating your availability for assistance. Your next step will depend on the user's response.
If the user expresses a need for help with a PDF or other document, ask them to describe their question.
However, if the user asks questions outside the context of the knowledge base, immediately thank them,
say goodbye, and end the conversation. Base your responses on the user's needs, providing accurate and
concise information about the data within the knowledge base. Your interactions should be professional and
focused, ensuring the user's queries are addressed efficiently without deviating from the set flows.
CHAT HISTORY: {chat_history}
QUESTION: {question}
ANSWER: [/INST]
"""
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)
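
# Note: this template is passed below as the condense_question_prompt, which
# ConversationalRetrievalChain uses to rewrite a follow-up question plus the
# chat history into a standalone query before retrieval. It must expose the
# {chat_history} and {question} variables, as it does here.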
# Convert loaded documents into overlapping text chunks
def get_chunks(documents):
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
    chunks = [chunk for doc in documents for chunk in text_splitter.split_text(doc.page_content)]
    return chunks
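
# Quick sanity check (a sketch, assuming `raw_documents` came from the loaders
# below): with chunk_size=1000 and chunk_overlap=200, a ~2,000-character page
# typically yields two to three overlapping chunks:
#   chunks = get_chunks(raw_documents)
#   print(len(chunks), len(chunks[0]))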
# Create vectorstore using OpenAI embeddings and FAISS
def get_vectorstore(chunks):
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)
    return vectorstore
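
# Optional: the FAISS index can be persisted so documents are not re-embedded
# on every run (a sketch; "faiss_index" is an arbitrary folder name):
#   vectorstore.save_local("faiss_index")
#   vectorstore = FAISS.load_local("faiss_index", OpenAIEmbeddings())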
# Create conversation chain for LLM interaction
def get_conversationchain(vectorstore):
    llm = ChatOpenAI(temperature=0.4, model_name='gpt-4')
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        condense_question_prompt=CUSTOM_QUESTION_PROMPT,
        memory=memory
    )
    return conversation_chain
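
# Example usage (hypothetical, assuming a vectorstore already exists):
#   chain = get_conversationchain(vectorstore)
#   result = chain({'question': 'What is this document about?'})
#   print(result['answer'])
# The ConversationBufferMemory carries chat_history between calls, so
# follow-up questions are answered in context automatically.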
# Extract text from various document types: PDF, DOCX, TXT, and CSV
def get_document_text(uploaded_files):
    documents = []
    for uploaded_file in uploaded_files:
        # Save the uploaded file to a temporary file so the loaders can read it from disk
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[-1]) as temp_file:
            temp_file.write(uploaded_file.read())
            temp_file_path = temp_file.name
        # Check the file extension and load accordingly
        if uploaded_file.name.endswith(".pdf"):
            loader = PyPDFLoader(temp_file_path)
            documents.extend(loader.load())
        elif uploaded_file.name.endswith((".docx", ".doc")):
            loader = Docx2txtLoader(temp_file_path)
            documents.extend(loader.load())
        elif uploaded_file.name.endswith(".txt"):
            loader = TextLoader(temp_file_path)
            documents.extend(loader.load())
        elif uploaded_file.name.endswith(".csv"):
            loader = CSVLoader(temp_file_path)
            documents.extend(loader.load())
        # Remove the temporary file once its contents have been loaded
        os.remove(temp_file_path)
    print("Number of documents:", len(documents))
    return documents
# Run a user's query through the conversation chain and return the answer
def handle_question(conversation_chain, question):
    response = conversation_chain({'question': question})
    return response['answer']
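
# Note: calling the chain directly is the legacy LangChain API used throughout
# this file; on newer releases the equivalent call is
#   conversation_chain.invoke({'question': question})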
def main():
    st.set_page_config(page_title="Chat with multiple documents", page_icon=":books:")
    st.header("Chat with your documents :books:")

    if "conversation" not in st.session_state:
        st.session_state.conversation = None

    uploaded_files = st.file_uploader("Upload your files (PDF, DOCX, TXT, CSV):", accept_multiple_files=True)

    if st.button("Process"):
        if uploaded_files:
            with st.spinner("Processing documents..."):
                # Extract text from the uploaded documents
                raw_documents = get_document_text(uploaded_files)
                # Convert text into chunks
                text_chunks = get_chunks(raw_documents)
                # Create vectorstore
                vectorstore = get_vectorstore(text_chunks)
                # Create conversation chain
                st.session_state.conversation = get_conversationchain(vectorstore)
            st.success("Documents processed successfully!")
        else:
            st.warning("Please upload at least one document.")

    question = st.text_input("Ask a question about the uploaded documents:")
    if question and st.session_state.conversation:
        # Run the query and display the model's answer
        answer = handle_question(st.session_state.conversation, question)
        st.write(answer)
    elif question:
        st.warning("Please process your documents first.")
if __name__ == '__main__':
    main()
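
# To run locally (assuming this file is saved as app.py and OPENAI_API_KEY is
# set in a .env file next to it):
#   streamlit run app.py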