umarigan committed on
Commit
994b45d
·
1 Parent(s): 57d8436

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -0
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import PyPDF2
import pandas as pd
import tempfile
import os
import logging

# sqlite issue with chroma: Chroma needs a newer sqlite3 than some hosts
# (e.g. Streamlit Cloud) ship. Swap in pysqlite3 *before* anything imports
# sqlite3 so that every later `import sqlite3` resolves to the pysqlite3
# binary instead of the too-old system module. (The original did the swap
# after `import sqlite3` and after the langchain imports, which risks the
# stale module already being loaded.)
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
import sqlite3  # now resolves to pysqlite3

from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from huggingface_hub import hf_hub_download

# Configure logging with timestamps so the Streamlit server logs are readable.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

# In-memory caching of LLM responses
# ref: https://python.langchain.com/docs/integrations/llms/llm_caching
import langchain
from langchain.cache import InMemoryCache
langchain.llm_cache = InMemoryCache()
33
+
34
@st.cache_resource
def load_model():
    """Download the Llama-2-7B chat GGML weights from the HF Hub and wrap
    them in an LLMChain with a grounded-QA prompt.

    Decorated with @st.cache_resource so the (large) download and model
    load happen only once per server process.
    """
    repo_id = "TheBloke/Llama-2-7B-chat-GGML"
    weights_file = "llama-2-7b-chat.ggmlv3.q5_1.bin"  # the model is in bin format

    weights_path = hf_hub_download(repo_id=repo_id, filename=weights_file)
    logging.info("uploading model from hf pub")
    # weights_path = '/content/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_M.bin'

    # Prompt instructs the model to answer only from the retrieved context.
    qa_template = """Use the following pieces of context that you have access to answer the question at the end. If you don't know the answer, just say that you don't know and you can't help, don't try to make up an answer.
{context}
Question: {question}
Answer:"""
    qa_prompt = PromptTemplate(template=qa_template, input_variables=["context", "question"])

    local_llm = LlamaCpp(model_path=weights_path, n_ctx=4096)
    chain = LLMChain(llm=local_llm, prompt=qa_prompt)
    logging.info("uploading model done")
    return chain
51
+
52
+
53
def return_embeddings():
    """Instantiate and return the GPT4All embedding model."""
    logging.info("uploading embeddings")
    embeddings = GPT4AllEmbeddings()
    # BUG FIX: the completion log used to repeat "uploading embeddings"
    # verbatim, making the two phases indistinguishable in the logs.
    logging.info("uploading embeddings done")
    return embeddings
58
+
59
+
60
+
61
+
62
# Function to convert PDF to text
@st.cache_data
def pdf_to_text(file):
    """Extract and concatenate the text of every page of *file* (a PDF).

    *file* is anything PyPDF2.PdfReader accepts (path or file-like, e.g.
    a Streamlit UploadedFile).
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Iterate pages directly and join once, instead of an index loop with
    # quadratic `text +=`. `extract_text()` can return None for pages with
    # no extractable text (scanned images); treat those as empty.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
71
+
72
# Function to convert CSV to text
@st.cache_data
def csv_to_text(file):
    """Render the CSV *file* as a plain-text table without the index column."""
    frame = pd.read_csv(file)
    return frame.to_string(index=False)
78
+
79
@st.cache_data
def read_txt(file_path):
    """Return the text content of *file_path*.

    BUG FIX / generalization: the caller (process_file) passes a Streamlit
    UploadedFile object, not a path, so the original open(file_path, ...)
    would fail. Accept either a filesystem path or a file-like object;
    bytes payloads are decoded as UTF-8.
    """
    if hasattr(file_path, "read"):
        data = file_path.read()
        return data.decode("utf-8") if isinstance(data, bytes) else data
    # Fall back to the original path-based behavior.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text
85
+
86
+
87
def process_file(uploaded_file):
    """Convert an uploaded PDF/CSV/TXT file to plain text and persist it
    to a temporary UTF-8 text file.

    Returns the path of the temporary file; the caller is responsible
    for deleting it after use (delete=False below).

    Raises ValueError for unsupported MIME types.
    """
    logging.info("received the file")
    # Dispatch on the browser-reported MIME type of the upload.
    if uploaded_file.type == 'application/pdf':
        # Process PDF file
        text = pdf_to_text(uploaded_file)
    elif uploaded_file.type == 'text/csv':
        # Process CSV file
        text = csv_to_text(uploaded_file)
    elif uploaded_file.type == 'text/plain':
        # BUG FIX: read_txt() opened its argument as a filesystem path,
        # but Streamlit hands us a file-like UploadedFile -- decode its
        # bytes directly instead.
        raw = uploaded_file.read()
        text = raw.decode('utf-8') if isinstance(raw, bytes) else raw
    else:
        raise ValueError("Unsupported file format. Please upload a PDF, CSV, or TXT file.")

    # Persist to a temp file so TextLoader (which wants a path) can read it.
    # Context manager guarantees the handle is closed; explicit UTF-8 avoids
    # platform-default encoding surprises.
    with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding='utf-8') as temp_file:
        temp_file.write(text)

    return temp_file.name
109
+
110
+
111
def main():
    """Streamlit entry point: upload a document, index it in Chroma, and
    answer questions about it with a local Llama-2 model."""
    #os.environ['LLAMA_BLAS'] = 'ON'
    #os.environ['LLAMA_BLAS_VENDOR'] = 'OpenBLAS'
    st.title("SorGPT, Chat with your files")

    # File Upload
    uploaded_file = st.file_uploader("Upload a PDF, CSV, or TXT file", type=["pdf", "csv", "txt"])

    if uploaded_file is not None:
        # Process the file and get the path of the temporary text file
        logging.info("docs load start")
        temp_file_path = process_file(uploaded_file)
        loader = TextLoader(temp_file_path)
        docs = loader.load()
        logging.info(f"docs load end, docs is : {docs}")

        text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
        texts = text_splitter.split_documents(docs)
        # BUG FIX: this log line used to interpolate `docs` instead of the
        # freshly split `texts` it claims to report.
        logging.info(f"got the text, text is : {texts}")
        embeddings = return_embeddings()
        db = Chroma.from_documents(texts, embeddings, persist_directory='db')

        question = st.text_input("Enter your question:")
        if st.button("Submit"):
            # Ground the answer on the single most similar chunk (k=1).
            similar_doc = db.similarity_search(question, k=1)
            context = similar_doc[0].page_content
            logging.info("querying start")
            query_llm = load_model()
            response = query_llm.run({"context": context, "question": question})
            logging.info(f"querying end response is: {response}")
            st.subheader("Answer:")
            st.write(response)

        # Clean up the temporary file after processing
        os.remove(temp_file_path)

    # NOTE(review): original indentation was lost in the diff paste; the
    # footer badge and style are assumed to render unconditionally -- confirm.
    st.markdown(
        '<a href="https://www.linkedin.com/in/umarigan/"><img src="https://www.edigitalagency.com.au/wp-content/uploads/Linkedin-logo-icon-png.png" width="40" height="40" ></a>',
        unsafe_allow_html=True)

    # Hide Streamlit's default hamburger menu and footer chrome.
    hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
    st.markdown(hide_streamlit_style, unsafe_allow_html=True)


if __name__ == "__main__":
    main()