# JB: LangChain >= 0.2.0 no longer re-exports embeddings from `langchain`;
# import from langchain-community instead (`pip install -U langchain-community`).
from langchain_community.embeddings import FastEmbedEmbeddings
import os
import time

import streamlit as st
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader  # JB
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
# from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
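
# JB: rough pipeline of this app: load all PDFs from the working directory,
# split them into overlapping chunks, embed the chunks with FastEmbed into a
# FAISS index, then answer questions with a Groq-hosted Mixtral model over the
# retrieved chunks (a standard RAG setup).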
load_dotenv()

# JB: read the key from the environment (.env); never hard-code or print it.
groq_api_key = os.environ["GROQ_API_KEY"]
| if "vector" not in st.session_state: | |
| # st.session_state.embeddings = OllamaEmbeddings() # ORIGINAL | |
| st.session_state.embeddings = FastEmbedEmbeddings() # JB | |
    # st.session_state.loader = WebBaseLoader("https://paulgraham.com/greatwork.html")  # ORIGINAL
    # st.session_state.docs = st.session_state.loader.load()  # ORIGINAL
    #
    # JB: references for loading PDFs instead of a web page:
    # https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html
    # https://python.langchain.com/docs/integrations/document_loaders/merge_doc
    # loader_pdf = PyPDFLoader("../MachineLearning-Lecture01.pdf")
    #
    # https://stackoverflow.com/questions/60215731/pypdf-to-read-each-pdf-in-a-folder
    # https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFDirectoryLoader.html
    # https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf#pypdf-directory
    # PyPDFDirectoryLoader loads every PDF in a directory:
    # loader = PyPDFDirectoryLoader("example_data/")
    # docs = loader.load()
    #
    # SEE ALSO:
    # https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf#using-pypdf
    # Using MathPix (inspired by Daniel Gross's
    # https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21):
    # loader = MathpixPDFLoader("example_data/layout-parser-paper.pdf")
    # data = loader.load()
    # JB: PyPDFLoader takes a single file path, so a "*.pdf" glob is not valid there;
    # use PyPDFDirectoryLoader to pick up every PDF in the app directory instead
    # (PyPDFDirectoryLoader("") resolved to the working directory; "." makes that explicit).
    st.session_state.loader = PyPDFDirectoryLoader(".")
    st.session_state.docs = st.session_state.loader.load()  # JB: this .load() was missing; split_documents needs the loaded docs

    st.session_state.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    st.session_state.documents = st.session_state.text_splitter.split_documents(st.session_state.docs)
    st.session_state.vector = FAISS.from_documents(st.session_state.documents, st.session_state.embeddings)  # ORIGINAL
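
    # JB: optional sketch (not part of the original flow): persist the index so
    # the PDFs are not re-embedded on every cold start. FAISS in
    # langchain_community supports save_local/load_local; the folder name
    # "faiss_index" is just an example (recent versions may also require
    # allow_dangerous_deserialization=True on load).
    # st.session_state.vector.save_local("faiss_index")
    # st.session_state.vector = FAISS.load_local("faiss_index", st.session_state.embeddings)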
    # SEE (for an app using ChromaDB):
    # https://github.com/vndee/local-rag-example/blob/main/rag.py
    # https://raw.githubusercontent.com/vndee/local-rag-example/main/rag.py
    # Chroma.from_documents(documents=chunks, embedding=FastEmbedEmbeddings())
    # st.session_state.vector = Chroma.from_documents(st.session_state.documents, st.session_state.embeddings)  # JB
# st.title("Chat with Docs - Groq Edition :) ")
st.title("Literature Based Research (LBR) - A. Unzicker and J. Bours - Chat with Docs - Groq Edition (Very Fast!) - VERSION 3 - March 8 2024")
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="mixtral-8x7b-32768",
)
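
# JB: ChatGroq also accepts the usual chat-model knobs, e.g. temperature;
# temperature=0 would be a reasonable choice for grounded Q&A (an assumption,
# not in the original):
# llm = ChatGroq(groq_api_key=groq_api_key, model_name="mixtral-8x7b-32768", temperature=0)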
# JB: named prompt_template so the st.text_input below does not shadow it.
prompt_template = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context.
Think step by step before providing a detailed answer.
I will tip you $200 if the user finds the answer helpful.
<context>
{context}
</context>
Question: {input}""")
document_chain = create_stuff_documents_chain(llm, prompt_template)
retriever = st.session_state.vector.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)
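
# JB: create_retrieval_chain wires the two pieces together: the retriever
# fetches the most similar chunks from FAISS, and the stuff-documents chain
# pastes them into {context} of the prompt before calling the LLM.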
prompt = st.text_input("Input your prompt here")

# If the user hits enter
if prompt:
    # Then pass the prompt to the LLM
    start = time.process_time()
    response = retrieval_chain.invoke({"input": prompt})
    print(f"Response time: {time.process_time() - start}")
    st.write(response["answer"])
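
    # JB: besides "answer", the response dict also carries the original "input"
    # and "context" (the retrieved Documents), which the expander below displays.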
    # With a streamlit expander
    with st.expander("Document Similarity Search"):
        # Find the relevant chunks
        for i, doc in enumerate(response["context"]):
            # print(doc)
            # st.write(f"Source Document # {i+1} : {doc.metadata['source'].split('/')[-1]}")
            st.write(doc.page_content)
            st.write("--------------------------------")