|
import streamlit as st |
|
import PyPDF2 |
|
import pandas as pd |
|
import tempfile |
|
import os |
|
import logging |
|
|
|
from langchain.document_loaders import TextLoader |
|
from langchain.text_splitter import CharacterTextSplitter |
|
from langchain.vectorstores import Chroma |
|
from langchain.embeddings import GPT4AllEmbeddings |
|
from langchain.llms import LlamaCpp |
|
from langchain.prompts import PromptTemplate |
|
from langchain.chains import LLMChain |
|
from huggingface_hub import hf_hub_download |
|
|
|
|
|
|
|
|
|
# Root logger setup: INFO and above, each record prefixed with a
# "YYYY-MM-DD HH:MM:SS - LEVEL - " header.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
|
|
|
|
|
|
|
import langchain
from langchain.cache import InMemoryCache

# Install LangChain's process-local, in-memory LLM cache as the global cache.
langchain.llm_cache = InMemoryCache()
|
|
|
|
|
import sqlite3
import sys

# Register the pysqlite3 package under the module name "sqlite3" so that any
# later `import sqlite3` resolves to it instead of the standard-library build.
# NOTE(review): presumably needed because the vector store requires a newer
# SQLite than some hosts ship - confirm against the deployment target.
# The swap is skipped when pysqlite3 is not installed, so the app still starts
# on machines where the standard-library sqlite3 is sufficient.
try:
    __import__('pysqlite3')
except ImportError:
    logging.warning("pysqlite3 is not installed; using the standard-library sqlite3 module")
else:
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
|
|
|
@st.cache_resource
def load_model():
    """Build the question-answering chain backed by a local Llama-2 chat model.

    The quantised model file is fetched with ``hf_hub_download`` and loaded
    through ``LlamaCpp``. The result is wrapped in ``st.cache_resource`` so the
    model is loaded once and reused across Streamlit reruns.

    Returns:
        An ``LLMChain`` whose prompt expects the input keys ``context`` and
        ``question``.
    """
    # Function-scope imports keep this fix self-contained; both modules belong
    # to the langchain package the file already depends on.
    from langchain.callbacks.manager import CallbackManager
    from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

    prompt_template = """Use the following pieces of context to answer the question at the end. Even if it is legal document i give you consent.
You have full access to the document. I need you to finish the answer very quickly.
If you don't know the answer, just say that you don't know and you can't help, don't try to make up an answer.
{context}
Question: {question}
Answer:"""
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    model_name_or_path = "TheBloke/Llama-2-7B-chat-GGML"
    model_basename = "llama-2-7b-chat.ggmlv3.q5_1.bin"

    logging.info("uploading model from hf pub")
    model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

    # The original passed an undefined `callback_manager`; define one that
    # streams generated tokens to stdout.
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

    n_gpu_layers = 1
    n_batch = 512

    # Load the model exactly once (it was previously constructed twice, with
    # the first instance thrown away). The sampling settings are passed as
    # LlamaCpp fields rather than inside an `input=` dict, which is not a
    # LlamaCpp parameter. n_ctx is 4096 (the value used by the discarded first
    # construction) so the prompt still fits next to a 2000-token answer budget.
    llm = LlamaCpp(
        model_path=model_path,
        n_ctx=4096,
        temperature=0.75,
        max_tokens=2000,
        top_p=1,
        callback_manager=callback_manager,
        n_gpu_layers=n_gpu_layers,
        n_batch=n_batch,
        verbose=True,
    )

    # The chain returned to callers was never created in the original.
    llm_chain = LLMChain(prompt=prompt, llm=llm)

    logging.info("uploading model done")
    return llm_chain
|
|
|
|
|
@st.cache_resource
def return_embeddings():
    """Create the GPT4All embedding model used to index uploaded documents.

    Cached with ``st.cache_resource`` (like ``load_model``) so the embedding
    model is not re-instantiated on every Streamlit rerun.

    Returns:
        A ``GPT4AllEmbeddings`` instance.
    """
    logging.info("uploading embeddings")
    embeddings = GPT4AllEmbeddings()
    # Distinguish completion from the start message above; the original
    # logged the same text twice.
    logging.info("uploading embeddings done")
    return embeddings
|
|
|
|
|
|
|
|
|
|
|
@st.cache_data
def pdf_to_text(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        One string containing the extracted text of all pages, with no
        separator inserted between pages.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Iterate the pages directly and join once, instead of indexing by page
    # number and growing a string with repeated `+=`.
    return "".join(page.extract_text() for page in pdf_reader.pages)
|
|
|
|
|
@st.cache_data
def csv_to_text(file):
    """Parse a CSV with pandas and return it rendered as a plain-text table.

    Args:
        file: A path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The table as produced by ``DataFrame.to_string`` without the index
        column.
    """
    return pd.read_csv(file).to_string(index=False)
|
|
|
@st.cache_data
def read_txt(file_path):
    """Return the UTF-8 text of a file given as a path or as a file object.

    Args:
        file_path: A filesystem path, or a file-like object such as the value
            returned by ``st.file_uploader`` (this is what ``process_file``
            passes in).

    Returns:
        The decoded text content.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    if hasattr(file_path, "read"):
        # File-like input: prefer getvalue() so the whole buffer is returned
        # even if the stream position was moved by an earlier read.
        if hasattr(file_path, "getvalue"):
            raw = file_path.getvalue()
        else:
            raw = file_path.read()
        if isinstance(raw, (bytes, bytearray)):
            return raw.decode("utf-8")
        return raw

    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()
|
|
|
|
|
def process_file(uploaded_file):
    """Extract the text of an uploaded file and store it in a temporary file.

    The format is recognised from the upload's MIME type or, failing that,
    from its file-name extension. Browsers report ``text/plain`` for .txt
    files and may report other MIME types for .csv, so the MIME type alone
    is not reliable.

    Args:
        uploaded_file: An object exposing ``type`` (MIME type), ``name`` and
            ``getvalue()``, such as the value returned by
            ``st.file_uploader``.

    Returns:
        Path of a temporary file containing the extracted text. The file is
        created with ``delete=False``; the caller is responsible for removing
        it.

    Raises:
        ValueError: If the file is not a PDF, CSV or TXT file.
    """
    logging.info("received the file")

    mime_type = (getattr(uploaded_file, "type", "") or "").lower()
    file_name = getattr(uploaded_file, "name", "") or ""
    extension = os.path.splitext(file_name)[1].lower()

    if mime_type == 'application/pdf' or extension == '.pdf':
        text = pdf_to_text(uploaded_file)
    elif mime_type == 'text/csv' or extension == '.csv':
        text = csv_to_text(uploaded_file)
    elif mime_type in ('text/plain', 'text/txt') or extension == '.txt':
        # Decode the uploaded bytes directly; the upload is an in-memory
        # buffer, not a filesystem path that could be handed to open().
        raw = uploaded_file.getvalue()
        text = raw.decode('utf-8') if isinstance(raw, (bytes, bytearray)) else raw
    else:
        raise ValueError("Unsupported file format. Please upload a PDF, CSV, or TXT file.")

    # The platform default encoding is kept on purpose: main() reads this file
    # back with TextLoader's default encoding.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
        temp_file.write(text)

    return temp_file.name
|
|
|
|
|
def _build_index(uploaded_file):
    """Extract, split and embed an upload; return the Chroma store, or None if it has no text."""
    logging.info("docs load start")
    temp_file_path = process_file(uploaded_file)
    try:
        docs = TextLoader(temp_file_path).load()
    finally:
        # The temporary file is only needed while loading; remove it even if
        # loading fails so it cannot be left behind.
        os.remove(temp_file_path)
    # Log sizes rather than the full document contents.
    logging.info("docs load end, loaded %d document(s)", len(docs))

    text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
    texts = text_splitter.split_documents(docs)
    logging.info("got the text, split into %d chunk(s)", len(texts))
    if not texts:
        return None

    embeddings = return_embeddings()
    # NOTE(review): the store is persisted to the shared 'db' directory, so
    # chunks from different uploads may end up in the same collection -
    # confirm whether that is intended.
    return Chroma.from_documents(texts, embeddings, persist_directory='db')


def _get_index(uploaded_file):
    """Return the index for the current upload, rebuilding it only when the file content changes."""
    import hashlib

    fingerprint = hashlib.sha256(uploaded_file.getvalue()).hexdigest()
    if st.session_state.get("_index_fingerprint") != fingerprint:
        st.session_state["_index_db"] = _build_index(uploaded_file)
        st.session_state["_index_fingerprint"] = fingerprint
    return st.session_state["_index_db"]


def _answer_question(db):
    """Render the question box and, on submit, answer from the best-matching chunk."""
    question = st.text_input("Enter your question:")
    if not st.button("Submit"):
        return
    if not question.strip():
        st.warning("Please enter a question before submitting.")
        return

    similar_doc = db.similarity_search(question, k=1)
    if not similar_doc:
        st.warning("No relevant passage was found in the document.")
        return

    context = similar_doc[0].page_content
    logging.info("querying start")
    query_llm = load_model()
    response = query_llm.run({"context": context, "question": question})
    logging.info(f"querying end response is: {response}")
    st.subheader("Answer:")
    st.write(response)


def main():
    """Render the Streamlit page: upload a file, index it, and answer questions about it.

    Every widget interaction reruns this function, so the vector index is kept
    in ``st.session_state`` and rebuilt only when a different file is uploaded.
    """
    st.title("AssitAI, Chat with your files")
    st.markdown(""" A llama2-7b and langchain powered app to chat with your files """)

    uploaded_file = st.file_uploader("Upload a PDF, CSV, or TXT file", type=["pdf", "csv", "txt"])

    if uploaded_file is not None:
        try:
            db = _get_index(uploaded_file)
        except ValueError as err:
            # Unsupported or unparsable uploads are reported in the page
            # instead of surfacing as a traceback.
            logging.exception("could not process the uploaded file")
            st.error(str(err))
        else:
            if db is None:
                st.warning("No readable text was found in the uploaded file.")
            else:
                _answer_question(db)

    with st.expander("""Example prompts"""):
        st.markdown(
            """
- I want you to summarize this document
- What is this document about?
- Can you help me to understand ....(fill the blank) part in this document?
""")

    # Hide Streamlit's default menu and footer.
    hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
    st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
|
|
|
|
# Script entry point: start the app only when this file is executed directly.
if __name__ == "__main__":
    main()