Create app.py
app.py
ADDED
@@ -0,0 +1,160 @@
import streamlit as st
import PyPDF2
import pandas as pd
import tempfile
import os
import logging

from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from huggingface_hub import hf_hub_download

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

# In-memory LLM caching, ref: https://python.langchain.com/docs/integrations/llms/llm_caching
from langchain.cache import InMemoryCache
import langchain
langchain.llm_cache = InMemoryCache()

# Work around the sqlite3 version issue with Chroma by swapping in pysqlite3
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')


@st.cache_resource
def load_model():
    prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know and can't help; don't try to make up an answer.
{context}
Question: {question}
Answer:"""
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    model_name_or_path = "TheBloke/Llama-2-7B-chat-GGML"
    model_basename = "llama-2-7b-chat.ggmlv3.q5_1.bin"  # the model is in GGML .bin format

    logging.info("downloading model from the Hugging Face Hub")
    model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)
    #model_path = '/content/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_M.bin'
    llm = LlamaCpp(model_path=model_path, n_ctx=4096)
    llm_chain = LLMChain(llm=llm, prompt=prompt)
    logging.info("loading model done")
    return llm_chain


def return_embeddings():
    logging.info("loading embeddings")
    embeddings = GPT4AllEmbeddings()
    logging.info("loading embeddings done")
    return embeddings


# Function to convert PDF to text
@st.cache_data
def pdf_to_text(file):
    pdf_reader = PyPDF2.PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text


# Function to convert CSV to text
@st.cache_data
def csv_to_text(file):
    df = pd.read_csv(file)
    return df.to_string(index=False)


# Function to read an uploaded TXT file (a file-like object, not a path)
@st.cache_data
def read_txt(file):
    return file.read().decode('utf-8')


def process_file(uploaded_file):
    logging.info("received the file")
    # Check file type and process accordingly
    if uploaded_file.type == 'application/pdf':
        text = pdf_to_text(uploaded_file)
    elif uploaded_file.type == 'text/csv':
        text = csv_to_text(uploaded_file)
    elif uploaded_file.type == 'text/plain':
        text = read_txt(uploaded_file)
    else:
        raise ValueError("Unsupported file format. Please upload a PDF, CSV, or TXT file.")

    # Write the extracted text to a temporary file so TextLoader can read it
    temp_file = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', delete=False)
    temp_file.write(text)
    temp_file.close()

    return temp_file.name


def main():
    #os.environ['LLAMA_BLAS'] = 'ON'
    #os.environ['LLAMA_BLAS_VENDOR'] = 'OpenBLAS'
    st.title("SorGPT, Chat with your files")

    # File upload
    uploaded_file = st.file_uploader("Upload a PDF, CSV, or TXT file", type=["pdf", "csv", "txt"])

    if uploaded_file is not None:
        # Process the file and get the path of the temporary text file
        logging.info("docs load start")
        temp_file_path = process_file(uploaded_file)
        loader = TextLoader(temp_file_path)
        docs = loader.load()
        logging.info(f"docs load end, docs is: {docs}")

        text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
        texts = text_splitter.split_documents(docs)
        logging.info(f"split into {len(texts)} chunks")
        embeddings = return_embeddings()
        db = Chroma.from_documents(texts, embeddings, persist_directory='db')

        question = st.text_input("Enter your question:")
        if st.button("Submit"):
            # Retrieve the single most similar chunk as context
            similar_doc = db.similarity_search(question, k=1)
            context = similar_doc[0].page_content
            logging.info("querying start")
            query_llm = load_model()
            response = query_llm.run({"context": context, "question": question})
            logging.info(f"querying end, response is: {response}")
            st.subheader("Answer:")
            st.write(response)

        # Clean up the temporary file after processing
        os.remove(temp_file_path)

    st.markdown(
        '<a href="https://www.linkedin.com/in/umarigan/"><img src="https://www.edigitalagency.com.au/wp-content/uploads/Linkedin-logo-icon-png.png" width="40" height="40"></a>',
        unsafe_allow_html=True)

    hide_streamlit_style = """
        <style>
        #MainMenu {visibility: hidden;}
        footer {visibility: hidden;}
        </style>
        """
    st.markdown(hide_streamlit_style, unsafe_allow_html=True)


if __name__ == "__main__":
    main()
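To try the app locally, assuming the dependencies it imports are installed (streamlit, PyPDF2, pandas, langchain, chromadb, pysqlite3-binary, gpt4all, llama-cpp-python, huggingface_hub), launch it with Streamlit's CLI:

    streamlit run app.py

On first use it downloads the quantized Llama 2 GGML model from the Hugging Face Hub, so the first question can take a while to answer.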