Fixed a bug in metadata URL handling
- app_modules/qa_chain.py  +4 -2
- ingest.py  +41 -5
app_modules/qa_chain.py
CHANGED
@@ -140,8 +140,10 @@ class QAChain:
 
         if self.llm is None:
             if self.llm_model_type == "openai":
+                MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME") or "gpt-4"
+                print(f" using model: {MODEL_NAME}")
                 self.llm = ChatOpenAI(
-                    model_name=
+                    model_name=MODEL_NAME,
                     streaming=True,
                     callbacks=callbacks,
                     verbose=True,
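A note on the `os.environ.get("OPENAI_MODEL_NAME") or "gpt-4"` idiom introduced here: unlike passing a default as the second argument to `get`, the `or` form also falls back when the variable exists but is empty. A minimal, self-contained sketch (the variable name comes from the diff; the empty value is illustrative):

```python
import os

# Simulate a .env entry like `OPENAI_MODEL_NAME=` (set but empty).
os.environ["OPENAI_MODEL_NAME"] = ""

# `get` with a default only covers the *unset* case:
print(os.environ.get("OPENAI_MODEL_NAME", "gpt-4"))    # -> "" (empty string wins)

# `get(...) or default` also covers the *empty* case:
print(os.environ.get("OPENAI_MODEL_NAME") or "gpt-4")  # -> "gpt-4"
```

The same reasoning motivates the `len(base_url) > 0` check in the next hunk.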
@@ -536,7 +538,7 @@ class QAChain:
         result["answer"] = remove_extra_spaces(result["answer"])
 
         base_url = os.environ.get("PDF_FILE_BASE_URL")
-        if base_url is not None:
+        if base_url is not None and len(base_url) > 0:
             documents = result["source_documents"]
             for doc in documents:
                 source = doc.metadata["source"]
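The loop body after `source = doc.metadata["source"]` is cut off in this hunk, so the following is only a sketch of what the tightened guard plausibly protects; the URL rewrite is an assumption based on the variable names, not the repository's actual code:

```python
import os

# Stand-in for a chain result with one source document (hypothetical data).
class Doc:
    def __init__(self, source):
        self.metadata = {"source": source}

result = {"source_documents": [Doc("data/pdfs/report.pdf")]}

base_url = os.environ.get("PDF_FILE_BASE_URL")
# The extra length check matters because a .env entry like
# `PDF_FILE_BASE_URL=` leaves the variable set to "", which the old
# `is not None` test let through and turned into broken URLs.
if base_url is not None and len(base_url) > 0:
    for doc in result["source_documents"]:
        source = doc.metadata["source"]
        # Hypothetical rewrite: map the local file to its public copy.
        doc.metadata["url"] = base_url + source.split("/")[-1]
```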
ingest.py
CHANGED
@@ -13,9 +13,17 @@ from langchain.vectorstores.faiss import FAISS
 from app_modules.utils import *
 
 
-def load_documents(source_pdfs_path) -> List:
+def load_documents(source_pdfs_path, urls) -> List:
     loader = PyPDFDirectoryLoader(source_pdfs_path, silent_errors=True)
     documents = loader.load()
+    if urls is not None and len(urls) > 0:
+        for doc in documents:
+            source = doc.metadata["source"]
+            filename = source.split("/")[-1]
+            for url in urls:
+                if url.endswith(filename):
+                    doc.metadata["url"] = url
+                    break
     return documents
 
 
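The matching rule added to `load_documents` pairs each loaded page with the first URL whose tail matches the file's basename. The same logic in isolation, with hypothetical sample values:

```python
def match_url(source: str, urls: list[str]) -> str | None:
    # Compare only the basename, since loader sources are local paths.
    filename = source.split("/")[-1]
    for url in urls:
        if url.endswith(filename):
            return url
    return None

# Hypothetical inputs: a local loader path and a list of public URLs.
print(match_url("data/pdfs/report.pdf",
                ["https://example.com/docs/report.pdf"]))
# -> https://example.com/docs/report.pdf
```

One caveat of `endswith`: distinct files whose basenames share a suffix (e.g. `report.pdf` and `annual-report.pdf`) can collide; comparing the URL's exact basename would be stricter.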
@@ -55,6 +63,7 @@ hf_embeddings_model_name = (
 index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
 using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
 source_pdfs_path = os.environ.get("SOURCE_PDFS_PATH")
+source_urls = os.environ.get("SOURCE_URLS")
 chunk_size = os.environ.get("CHUNCK_SIZE")
 chunk_overlap = os.environ.get("CHUNK_OVERLAP")
 
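For reference, these are the environment variables the script now reads, with hypothetical local values (note that `CHUNCK_SIZE`, with the transposed letters, is the spelling the code actually looks up):

```python
import os

# Hypothetical local configuration; paths and sizes are examples only.
os.environ.setdefault("FAISS_INDEX_PATH", "data/faiss_index")
os.environ.setdefault("SOURCE_PDFS_PATH", "data/pdfs")
os.environ.setdefault("SOURCE_URLS", "data/pdf_urls.txt")  # one URL per line
os.environ.setdefault("CHUNCK_SIZE", "1024")
os.environ.setdefault("CHUNK_OVERLAP", "64")
```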
@@ -69,11 +78,29 @@ print(f"Completed in {end - start:.3f}s")
 start = timer()
 
 if not os.path.isdir(index_path):
-    print("The index persist directory is not present. Creating a new one.")
+    print(
+        f"The index persist directory {index_path} is not present. Creating a new one."
+    )
     os.mkdir(index_path)
 
-    print(f"Loading PDF files from {source_pdfs_path}")
-    sources = load_documents(source_pdfs_path)
+    if source_urls is not None:
+        # Open the file for reading
+        file = open(source_urls, "r")
+
+        # Read the contents of the file into a list of strings
+        lines = file.readlines()
+
+        # Close the file
+        file.close()
+
+        # Remove the newline characters from each string
+        source_urls = [line.strip() for line in lines]
+
+    print(
+        f"Loading {'' if source_urls is None else str(len(source_urls)) + ' '}PDF files from {source_pdfs_path}"
+    )
+    sources = load_documents(source_pdfs_path, source_urls)
+
     print(f"Splitting {len(sources)} PDF pages in to chunks ...")
 
     chunks = split_chunks(
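The open/readlines/close block above works, but a context manager or `pathlib` expresses it more idiomatically; a close equivalent, assuming one URL per line (this version also drops blank lines):

```python
from pathlib import Path

def read_source_urls(path: str) -> list[str]:
    # Read the whole file, strip whitespace, and skip empty lines.
    return [line.strip()
            for line in Path(path).read_text().splitlines()
            if line.strip()]
```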
@@ -83,12 +110,21 @@ if not os.path.isdir(index_path):
 
     index = generate_index(chunks, embeddings)
 else:
-    print("The index persist directory is present. Loading index ...")
+    print(f"The index persist directory {index_path} is present. Loading index ...")
     index = (
         FAISS.load_local(index_path, embeddings)
         if using_faiss
         else Chroma(embedding_function=embeddings, persist_directory=index_path)
     )
+    query = "hi"
+    print(f"Load relevant documents for standalone question: {query}")
+
+    start2 = timer()
+    docs = index.as_retriever().get_relevant_documents(query)
+    end = timer()
+
+    print(f"Completed in {end - start2:.3f}s")
+    print(docs)
 
 end = timer()
 
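The new else-branch tail is a smoke test: run one query through the retriever and time it. The same idea as a reusable function, assuming a loaded LangChain vector store (`FAISS` or `Chroma`) is passed in as `index`:

```python
from timeit import default_timer as timer

def smoke_test(index, query: str = "hi") -> None:
    # Retrieve documents relevant to the query and report the latency.
    start = timer()
    docs = index.as_retriever().get_relevant_documents(query)
    print(f"Retrieved {len(docs)} documents in {timer() - start:.3f}s")
    for doc in docs:
        # Prefer the new `url` metadata when present; fall back to `source`.
        print(doc.metadata.get("url", doc.metadata.get("source")))
```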