Update concat_vector_store.py
concat_vector_store.py CHANGED (+16 -16)
@@ -3,14 +3,13 @@ from langchain.schema.document import Document
 from e5_embeddings import E5Embeddings
 from langchain_community.vectorstores import FAISS
 
-from document_processor_image import load_documents, split_documents #
+from document_processor_image import load_documents, split_documents # This function is required!
 
-#
-NEW_FOLDER = "
-#NEW_FOLDER = "์์"
+# Path configuration
+NEW_FOLDER = "new_documents"  # Folder containing the new documents
 VECTOR_STORE_PATH = "vector_db"
 
-# 1.
+# 1. Loading the embedding model
 def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
     return E5Embeddings(
         model_name=model_name,
@@ -18,29 +17,30 @@ def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device=
         encode_kwargs={'normalize_embeddings': True}
     )
 
-# 2.
+# 2. Load existing vector store
 def load_vector_store(embeddings, load_path="vector_db"):
     if not os.path.exists(load_path):
-        raise FileNotFoundError(f"
+        raise FileNotFoundError(f"Cannot find vector store: {load_path}")
     return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
 
-# 3.
+# 3. Embed and Add New Documents
 def add_new_documents_to_vector_store(new_folder, vectorstore, embeddings):
-    print(f"
+    print(f"Loading new documents: {new_folder}")
     new_docs = load_documents(new_folder)
-    new_chunks = split_documents(new_docs
+    new_chunks = split_documents(new_docs) #, chunk_size=800, chunk_overlap=100
+    # The chunk_size=800 and chunk_overlap=100 parameters are still missing, but without knowledge of the functions I cannot fill them in meaningfully
 
-    print(f"
-    print(f"
+    print(f"Number of new chunks: {len(new_chunks)}")
+    print(f"Vector count before addition: {vectorstore.index.ntotal}")
     vectorstore.add_documents(new_chunks)
-    print(f"
+    print(f"Vector count after addition: {vectorstore.index.ntotal}")
 
-    print("
+    print("New documents have been added to the vector store.")
 
-# 4.
+# 4. Main Execution
 if __name__ == "__main__":
     embeddings = get_embeddings()
     vectorstore = load_vector_store(embeddings, VECTOR_STORE_PATH)
     add_new_documents_to_vector_store(NEW_FOLDER, vectorstore, embeddings)
     vectorstore.save_local(VECTOR_STORE_PATH)
-    print(f"
+    print(f"Vector store save completed: {VECTOR_STORE_PATH}")
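The inline comment on the split_documents call notes that the chunk_size=800 and chunk_overlap=100 arguments had to be dropped because the signature of split_documents in document_processor_image.py is not visible here. The following is only a minimal sketch of how that helper could expose those parameters, assuming it wraps LangChain's RecursiveCharacterTextSplitter; the actual implementation in this Space may differ.

# Hypothetical version of split_documents with configurable chunking.
# Assumption: it wraps LangChain's RecursiveCharacterTextSplitter; the real
# helper in document_processor_image.py is not shown in this diff.
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_documents(docs, chunk_size=800, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,        # maximum characters per chunk
        chunk_overlap=chunk_overlap,  # characters shared between neighbouring chunks
    )
    return splitter.split_documents(docs)

With a signature like this, the commented-out call could simply be restored as split_documents(new_docs, chunk_size=800, chunk_overlap=100).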
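get_embeddings builds an E5Embeddings object from the local e5_embeddings module, which is also not part of this diff. Purely as orientation, a wrapper taking the constructor arguments used above might look like the sketch below, here assuming sentence-transformers under the hood; the real class in this Space will likely differ, for example in how it applies the E5 instruction and query prefixes.

# Purely illustrative stand-in for the e5_embeddings.E5Embeddings class used above.
from typing import List, Optional
from langchain_core.embeddings import Embeddings
from sentence_transformers import SentenceTransformer

class E5EmbeddingsSketch(Embeddings):
    def __init__(self, model_name: str, device: str = "cuda", encode_kwargs: Optional[dict] = None):
        self.model = SentenceTransformer(model_name, device=device)
        self.encode_kwargs = encode_kwargs or {}

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # Encode passages; normalize_embeddings=True arrives via encode_kwargs.
        return self.model.encode(texts, **self.encode_kwargs).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode([text], **self.encode_kwargs)[0].tolist()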
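After the script has run and the index has been written back to vector_db, a quick sanity check is to reload the store and issue a test query. This sketch reuses get_embeddings and VECTOR_STORE_PATH from concat_vector_store.py (the __main__ guard keeps the import side-effect free); the query string is only a placeholder.

# Reload the updated FAISS index and inspect it with a sample similarity search.
from langchain_community.vectorstores import FAISS
from concat_vector_store import get_embeddings, VECTOR_STORE_PATH

embeddings = get_embeddings()
vectorstore = FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
print(f"Total vectors in the index: {vectorstore.index.ntotal}")

# Placeholder query text; replace with something relevant to the new documents.
for doc in vectorstore.similarity_search("test query about the newly added documents", k=3):
    print(doc.metadata.get("source", "unknown"), "->", doc.page_content[:80])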