File size: 3,432 Bytes
0bff161
 
 
 
 
 
 
 
 
 
 
1218ed1
88baa91
57a73aa
 
0bff161
690b005
0bff161
 
 
 
57a73aa
 
 
 
 
 
0bff161
 
 
 
 
 
 
9277057
 
0bff161
 
64a2736
0bff161
 
 
 
64a2736
0b5eccf
0bff161
 
 
 
64a2736
 
 
0bff161
 
 
64a2736
 
 
 
0bff161
 
 
64a2736
0bff161
64a2736
 
0bff161
 
 
 
 
64a2736
0bff161
 
 
 
 
 
64a2736
 
0bff161
 
 
 
 
 
 
 
64a2736
0bff161
 
 
 
64a2736
0bff161
 
 
64a2736
0bff161
 
 
 
64a2736
0bff161
 
64a2736
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from dotenv import load_dotenv
import streamlit as st
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import os
import nltk
# Fetch the NLTK resources used at runtime (presumably required by the
# unstructured PDF partitioner behind UnstructuredPDFLoader -- confirm).
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Install Poppler and Tesseract in the runtime environment.
# Streamlit re-executes this script on every user interaction, so only shell
# out to apt-get when the binaries are missing from the locations that
# load_documents() points the PDF loader at.
if not (os.path.isfile("/usr/bin/pdftoppm") and os.path.isfile("/usr/bin/tesseract")):
    os.system("apt-get update && apt-get install -y poppler-utils tesseract-ocr")

# Pick up a local .env file when one exists; by default this does not override
# variables that are already set in the process environment.
load_dotenv()

# Groq API key, read from the 'Groq_api' environment variable (None if unset).
secret = os.getenv('Groq_api')

# Directory containing this script; uploaded PDFs are written here.
working_dir = os.path.dirname(os.path.abspath(__file__))

def load_documents(file_path):
    """Load the PDF at ``file_path`` and return what the loader produces.

    The file is read with ``UnstructuredPDFLoader``; the Poppler and
    Tesseract locations are passed explicitly so the loader uses the
    system binaries under ``/usr/bin``.

    Args:
        file_path: Path of the PDF file to load.

    Returns:
        The result of the loader's ``load()`` call.
    """
    # Explicit binary locations, forwarded to the loader as keyword arguments.
    binary_locations = {
        "poppler_path": "/usr/bin",
        "tesseract_path": "/usr/bin/tesseract",
    }
    return UnstructuredPDFLoader(file_path, **binary_locations).load()

def setup_vectorstore(documents):
    """Split ``documents`` into chunks and index them in a FAISS vector store.

    Args:
        documents: Documents to index, as returned by ``load_documents``.

    Returns:
        The FAISS vector store built from the embedded chunks.
    """
    embeddings = HuggingFaceEmbeddings()
    text_splitter = CharacterTextSplitter(
        # Split on real newlines. The previous value "/n" was a literal
        # slash + "n", which practically never occurs in text, so documents
        # were not being broken into ~1000-character chunks at all.
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200
    )
    doc_chunks = text_splitter.split_documents(documents)
    vectorstores = FAISS.from_documents(doc_chunks, embeddings)
    return vectorstores

def create_chain(vectorstores):
    """Build a conversational retrieval chain on top of a vector store.

    The chain combines a Groq chat model (authenticated with the
    module-level ``secret``), the store's default retriever, and a
    buffer memory that records the dialogue under ``"chat_history"`` and
    reads the model output from the ``"answer"`` key.

    Args:
        vectorstores: Vector store exposing ``as_retriever()``.

    Returns:
        The ``ConversationalRetrievalChain`` created by ``from_llm``.
    """
    # temperature=0 is passed to the model to request deterministic output.
    chat_model = ChatGroq(api_key=secret, model="llama-3.1-8b-instant", temperature=0)
    doc_retriever = vectorstores.as_retriever()
    history = ConversationBufferMemory(
        llm=chat_model,
        output_key="answer",
        memory_key="chat_history",
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=doc_retriever,
        memory=history,
        verbose=True,
    )

# Page chrome: browser-tab title, icon and a centered layout.
page_settings = {
    "page_title": "Chat with your documents",
    "page_icon": "📑",
    "layout": "centered",
}
st.set_page_config(**page_settings)

st.title("📝Chat With your docs 😎")

# Seed the per-session message list on the first run of the script.
if "chat_history" not in st.session_state:
    st.session_state["chat_history"] = []

uploaded_file = st.file_uploader(label="Upload your PDF")

if uploaded_file:
    # Identify the upload by name and size so that a *different* PDF triggers
    # a rebuild, while ordinary reruns (every chat message re-executes this
    # script) neither rewrite the file nor re-embed the document.
    upload_id = (uploaded_file.name, uploaded_file.size)

    if st.session_state.get("indexed_upload") != upload_id:
        # basename() keeps a client-supplied name from escaping working_dir.
        file_path = os.path.join(working_dir, os.path.basename(uploaded_file.name))
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        # Build the index and a fresh chain (with fresh memory) for this PDF.
        st.session_state.vectorstores = setup_vectorstore(load_documents(file_path))
        st.session_state.conversation_chain = create_chain(st.session_state.vectorstores)

        # Recorded last, so a failure above is retried on the next rerun.
        st.session_state.indexed_upload = upload_id

# Display chat history: replay every stored message on each rerun.
for message in st.session_state.chat_history:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# User input handling
user_input = st.chat_input("Ask any questions relevant to uploaded pdf")

if user_input:
    st.session_state.chat_history.append({"role": "user", "content": user_input})
    with st.chat_message("user"):
        st.markdown(user_input)

    with st.chat_message("assistant"):
        if "conversation_chain" in st.session_state:
            # invoke() is the supported entry point; calling the chain object
            # directly is deprecated in current LangChain releases.
            response = st.session_state.conversation_chain.invoke({"question": user_input})
            assistant_response = response["answer"]
        else:
            # The chain only exists once a PDF has been uploaded and indexed;
            # answer with a hint instead of failing with an AttributeError.
            assistant_response = "Please upload a PDF before asking questions."
        st.markdown(assistant_response)
        st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})