hugging2021 committed on
Commit
8c95c04
·
verified ·
1 Parent(s): 801b168

Update concat_vector_store.py

Browse files
Files changed (1) hide show
  1. concat_vector_store.py +16 -16
concat_vector_store.py CHANGED
@@ -3,14 +3,13 @@ from langchain.schema.document import Document
3
  from e5_embeddings import E5Embeddings
4
  from langchain_community.vectorstores import FAISS
5
 
6
- from document_processor_image import load_documents, split_documents # 반드시 이 함수가 필요
7
 
8
- # 경로 설정
9
- NEW_FOLDER = "25.05.28 RAG์šฉ 2์ฐจ ์—…๋ฌดํŽธ๋žŒ ์ทจํ•ฉ๋ณธ"
10
- #NEW_FOLDER = "임시"
11
  VECTOR_STORE_PATH = "vector_db"
12
 
13
- # 1. 임베딩 모델 로딩
14
  def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
15
  return E5Embeddings(
16
  model_name=model_name,
@@ -18,29 +17,30 @@ def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device=
18
  encode_kwargs={'normalize_embeddings': True}
19
  )
20
 
21
- # 2. 기존 벡터 스토어 로드
22
  def load_vector_store(embeddings, load_path="vector_db"):
23
  if not os.path.exists(load_path):
24
- raise FileNotFoundError(f"벡터 스토어를 찾을 수 없습니다: {load_path}")
25
  return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
26
 
27
- # 3. 문서 임베딩 및 추가
28
  def add_new_documents_to_vector_store(new_folder, vectorstore, embeddings):
29
- print(f"📂 새로운 문서 로드 중: {new_folder}")
30
  new_docs = load_documents(new_folder)
31
- new_chunks = split_documents(new_docs, chunk_size=800, chunk_overlap=100)
 
32
 
33
- print(f"📄 새로운 청크 수: {len(new_chunks)}")
34
- print(f"추가 전 벡터 수: {vectorstore.index.ntotal}")
35
  vectorstore.add_documents(new_chunks)
36
- print(f"추가 후 벡터 수: {vectorstore.index.ntotal}")
37
 
38
- print("✅ 새로운 문서가 벡터 스토어에 추가되었습니다.")
39
 
40
- # 4. 전체 실행
41
  if __name__ == "__main__":
42
  embeddings = get_embeddings()
43
  vectorstore = load_vector_store(embeddings, VECTOR_STORE_PATH)
44
  add_new_documents_to_vector_store(NEW_FOLDER, vectorstore, embeddings)
45
  vectorstore.save_local(VECTOR_STORE_PATH)
46
- print(f"💾 벡터 스토어 저장 완료: {VECTOR_STORE_PATH}")
 
3
  from e5_embeddings import E5Embeddings
4
  from langchain_community.vectorstores import FAISS
5
 
6
+ from document_processor_image import load_documents, split_documents # This function is required!
7
 
8
+ # Path configuration
9
+ NEW_FOLDER = "new_documents" # Folder containing the new documents
 
10
  VECTOR_STORE_PATH = "vector_db"
11
 
12
+ # 1. Loading the embedding model
13
  def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
14
  return E5Embeddings(
15
  model_name=model_name,
 
17
  encode_kwargs={'normalize_embeddings': True}
18
  )
19
 
20
+ # 2. Load existing vector store
21
  def load_vector_store(embeddings, load_path="vector_db"):
22
  if not os.path.exists(load_path):
23
+ raise FileNotFoundError(f"Cannot find vector store: {load_path}")
24
  return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
25
 
26
+ # 3. Embed and Add New Documents
27
  def add_new_documents_to_vector_store(new_folder, vectorstore, embeddings):
28
+ print(f"Loading new documents: {new_folder}")
29
  new_docs = load_documents(new_folder)
30
+ new_chunks = split_documents(new_docs) #, chunk_size=800, chunk_overlap=100
31
+ #Es fehlen noch die Parameter chunk_size=800, chunk_overlap=100, aber ohne Kenntnis der Funktionen, kann ich diese nicht sinnvoll befüllen
32
 
33
+ print(f"Number of new chunks: {len(new_chunks)}")
34
+ print(f"Vector count before addition: {vectorstore.index.ntotal}")
35
  vectorstore.add_documents(new_chunks)
36
+ print(f"Vector count after addition: {vectorstore.index.ntotal}")
37
 
38
+ print("New documents have been added to the vector store.")
39
 
40
+ # 4. Main Execution
41
  if __name__ == "__main__":
42
  embeddings = get_embeddings()
43
  vectorstore = load_vector_store(embeddings, VECTOR_STORE_PATH)
44
  add_new_documents_to_vector_store(NEW_FOLDER, vectorstore, embeddings)
45
  vectorstore.save_local(VECTOR_STORE_PATH)
46
+ print(f"Vector store save completed: {VECTOR_STORE_PATH}")