switch from Unstructured Loader to PyPDF as its results have page numbers
Browse files- .env.example +2 -0
- README.md +1 -1
- app_modules/qa_chain.py +12 -1
- data/chromadb_1024_512/chroma-collections.parquet +1 -1
- data/chromadb_1024_512/chroma-embeddings.parquet +2 -2
- data/chromadb_1024_512/index/{id_to_uuid_67de6665-0585-4559-85bd-e044c61f64df.pkl → id_to_uuid_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl} +2 -2
- data/chromadb_1024_512/index/{uuid_to_id_67de6665-0585-4559-85bd-e044c61f64df.pkl → index_44a39155-bdc7-450c-8532-01db0e4b66cc.bin} +2 -2
- data/chromadb_1024_512/index/{index_metadata_67de6665-0585-4559-85bd-e044c61f64df.pkl → index_metadata_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl} +1 -1
- data/chromadb_1024_512/index/{index_67de6665-0585-4559-85bd-e044c61f64df.bin → uuid_to_id_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl} +2 -2
- ingest.py +2 -4
.env.example
CHANGED
|
@@ -18,6 +18,8 @@ HF_PIPELINE_DEVICE_TYPE=
|
|
| 18 |
|
| 19 |
CHAT_HISTORY_ENABLED=true
|
| 20 |
|
|
|
|
|
|
|
| 21 |
# if unset, default to "hkunlp/instructor-xl"
|
| 22 |
HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
|
| 23 |
|
|
|
|
| 18 |
|
| 19 |
CHAT_HISTORY_ENABLED=true
|
| 20 |
|
| 21 |
+
PDF_FILE_BASE_URL=
|
| 22 |
+
|
| 23 |
# if unset, default to "hkunlp/instructor-xl"
|
| 24 |
HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
|
| 25 |
|
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title: Chat
|
| 3 |
emoji: π
|
| 4 |
colorFrom: indigo
|
| 5 |
colorTo: blue
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Chat with AI Books
|
| 3 |
emoji: π
|
| 4 |
colorFrom: indigo
|
| 5 |
colorTo: blue
|
app_modules/qa_chain.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import sys
|
|
|
|
| 3 |
from queue import Queue
|
| 4 |
from typing import Any, Optional
|
| 5 |
|
|
@@ -528,4 +529,14 @@ class QAChain:
|
|
| 528 |
self.streamer.reset(q)
|
| 529 |
|
| 530 |
qa = self.get_chain(tracing)
|
| 531 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import sys
|
| 3 |
+
import urllib
|
| 4 |
from queue import Queue
|
| 5 |
from typing import Any, Optional
|
| 6 |
|
|
|
|
| 529 |
self.streamer.reset(q)
|
| 530 |
|
| 531 |
qa = self.get_chain(tracing)
|
| 532 |
+
result = qa(inputs)
|
| 533 |
+
|
| 534 |
+
base_url = os.environ.get("PDF_FILE_BASE_URL")
|
| 535 |
+
if base_url is not None:
|
| 536 |
+
documents = result["source_documents"]
|
| 537 |
+
for doc in documents:
|
| 538 |
+
source = doc.metadata["source"]
|
| 539 |
+
title = source.split("/")[-1]
|
| 540 |
+
doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
|
| 541 |
+
|
| 542 |
+
return result
|
data/chromadb_1024_512/chroma-collections.parquet
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 557
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d0e4364f9a67d91e3185cc597297b8651ca02bdfddb8467767c8a71cbb89d4e
|
| 3 |
size 557
|
data/chromadb_1024_512/chroma-embeddings.parquet
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b050c60c5fd263355aabc3cc35e6308930cb4b8a1929e7209b6777da0782d59
|
| 3 |
+
size 7513430
|
data/chromadb_1024_512/index/{id_to_uuid_67de6665-0585-4559-85bd-e044c61f64df.pkl → id_to_uuid_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4eb4fe05362f4052e3af173b0915e9758bb7bc7f9f681850e765cbde35d8783f
|
| 3 |
+
size 47652
|
data/chromadb_1024_512/index/{uuid_to_id_67de6665-0585-4559-85bd-e044c61f64df.pkl → index_44a39155-bdc7-450c-8532-01db0e4b66cc.bin}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a26db7cd65749049856321b4aef559a0ffbef7f4286131c1bcd5f5dc4cc3849
|
| 3 |
+
size 4743996
|
data/chromadb_1024_512/index/{index_metadata_67de6665-0585-4559-85bd-e044c61f64df.pkl → index_metadata_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 105
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ae5e0c780f18efa625dc2d0ad2d60328b51d2842cac144446196e4032e7c2c43
|
| 3 |
size 105
|
data/chromadb_1024_512/index/{index_67de6665-0585-4559-85bd-e044c61f64df.bin → uuid_to_id_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49aab2d749c5650688e4b1b566d8773889ca59d92ea2083d04fd5882a626ecc0
|
| 3 |
+
size 55737
|
ingest.py
CHANGED
|
@@ -3,9 +3,7 @@ import os
|
|
| 3 |
from timeit import default_timer as timer
|
| 4 |
from typing import List
|
| 5 |
|
| 6 |
-
import
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
from langchain.document_loaders.directory import DirectoryLoader
|
| 9 |
from langchain.embeddings import HuggingFaceInstructEmbeddings
|
| 10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 11 |
from langchain.vectorstores.chroma import Chroma
|
|
@@ -14,7 +12,7 @@ from app_modules.utils import *
|
|
| 14 |
|
| 15 |
|
| 16 |
def load_documents(source_pdfs_path) -> List:
|
| 17 |
-
loader =
|
| 18 |
documents = loader.load()
|
| 19 |
return documents
|
| 20 |
|
|
|
|
| 3 |
from timeit import default_timer as timer
|
| 4 |
from typing import List
|
| 5 |
|
| 6 |
+
from langchain.document_loaders import PyPDFDirectoryLoader
|
|
|
|
|
|
|
| 7 |
from langchain.embeddings import HuggingFaceInstructEmbeddings
|
| 8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 9 |
from langchain.vectorstores.chroma import Chroma
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
def load_documents(source_pdfs_path) -> List:
|
| 15 |
+
loader = PyPDFDirectoryLoader(source_pdfs_path, silent_errors=True)
|
| 16 |
documents = loader.load()
|
| 17 |
return documents
|
| 18 |
|