hugging2021 committed on
Commit
f10485f
·
verified ·
1 Parent(s): 2abe6e2

Update vector_store_test.py

Browse files
Files changed (1) hide show
  1. vector_store_test.py +34 -38
vector_store_test.py CHANGED
@@ -1,26 +1,23 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- """
5
- 벡터 스토어 모듈: 문서 임베딩 생성 및 벡터 스토어 구축
6
- 배치 처리 적용 + 청크 길이 확인 추가
7
- """
8
-
9
  import os
 
 
 
10
  import argparse
11
  import logging
12
- from tqdm import tqdm
 
 
 
13
  from langchain_community.vectorstores import FAISS
14
- from langchain.schema.document import Document
15
- from langchain_huggingface import HuggingFaceEmbeddings
16
- from e5_embeddings import E5Embeddings
17
 
18
- # 로깅 설정
19
  logging.getLogger().setLevel(logging.ERROR)
20
 
 
21
  def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
22
- print(f"[INFO] 임베딩 모델 디바이스: {device}")
23
- return E5Embeddings(
24
  model_name=model_name,
25
  model_kwargs={'device': device},
26
  encode_kwargs={'normalize_embeddings': True}
@@ -28,31 +25,32 @@ def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device=
28
 
29
  def build_vector_store_batch(documents, embeddings, save_path="vector_db", batch_size=4):
30
  if not documents:
31
- raise ValueError("문서가 없습니다. 문서가 올바르게 로드되었는지 확인하세요.")
32
 
33
  texts = [doc.page_content for doc in documents]
34
  metadatas = [doc.metadata for doc in documents]
35
 
36
- # 청크 길이 출력
37
  lengths = [len(t) for t in texts]
38
- print(f"💡 청크 수: {len(texts)}")
39
- print(f"💡 가장 긴 청크 길이: {max(lengths)} chars")
40
- print(f"💡 평균 청크 길이: {sum(lengths) // len(lengths)} chars")
41
 
42
- # 배치로 나누기
43
  batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
44
  metadata_batches = [metadatas[i:i + batch_size] for i in range(0, len(metadatas), batch_size)]
45
 
46
  print(f"Processing {len(batches)} batches with size {batch_size}")
47
  print(f"Initializing vector store with batch 1/{len(batches)}")
48
 
49
- # ✅ from_documents 사용
50
  first_docs = [
51
  Document(page_content=text, metadata=meta)
52
  for text, meta in zip(batches[0], metadata_batches[0])
53
  ]
54
  vectorstore = FAISS.from_documents(first_docs, embeddings)
55
 
 
56
  for i in tqdm(range(1, len(batches)), desc="Processing batches"):
57
  try:
58
  docs_batch = [
@@ -83,39 +81,37 @@ def build_vector_store_batch(documents, embeddings, save_path="vector_db", batch
83
 
84
  def load_vector_store(embeddings, load_path="vector_db"):
85
  if not os.path.exists(load_path):
86
- raise FileNotFoundError(f"벡터 스토어를 찾을 수 없습니다: {load_path}")
87
  return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
88
 
89
  if __name__ == "__main__":
90
- parser = argparse.ArgumentParser(description="벡터 스토어 구축")
91
- parser.add_argument("--folder", type=str, default="final_dataset", help="문서가 있는 폴더 경로")
92
- parser.add_argument("--save_path", type=str, default="vector_db", help="벡터 스토어 저장 경로")
93
- parser.add_argument("--batch_size", type=int, default=4, help="배치 크기")
94
- parser.add_argument("--model_name", type=str, default="intfloat/multilingual-e5-large-instruct", help="임베딩 모델 이름")
95
- # parser.add_argument("--device", type=str, default="cuda", help="사용할 디바이스 ('cuda' 또는 'cpu')")
96
- parser.add_argument("--device", type=str, default="cuda", help="사용할 디바이스 ('cuda' 또는 'cpu' 또는 'cuda:1')")
97
-
98
  args = parser.parse_args()
99
 
100
- # 문서 처리 모듈 import
101
  from document_processor_image_test import load_documents, split_documents
102
 
103
  documents = load_documents(args.folder)
104
  chunks = split_documents(documents, chunk_size=800, chunk_overlap=100)
105
 
106
- print(f"[DEBUG] 문서 로딩 및 청크 분할 완료, 임베딩 단계 진입 전")
107
- print(f"[INFO] 선택된 디바이스: {args.device}")
108
 
109
  try:
110
  embeddings = get_embeddings(
111
  model_name=args.model_name,
112
  device=args.device
113
  )
114
- print(f"[DEBUG] 임베딩 모델 생성 완료")
115
  except Exception as e:
116
- print(f"[ERROR] 임베딩 모델 생성 중 에러 발생: {e}")
117
  import traceback; traceback.print_exc()
118
  exit(1)
119
 
120
- build_vector_store_batch(chunks, embeddings, args.save_path, args.batch_size)
121
-
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import re
3
+ import glob
4
+ import time
5
  import argparse
6
  import logging
7
+ from collections import defaultdict
8
+
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain_core.documents import Document
11
  from langchain_community.vectorstores import FAISS
12
+ from langchain_community.embeddings import HuggingFaceEmbeddings
 
 
13
 
14
+ # Logging Configuration
15
  logging.getLogger().setLevel(logging.ERROR)
16
 
17
+ # Embedding model loading
18
  def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
19
+ print(f"[INFO] Embedding model device: {device}")
20
+ return HuggingFaceEmbeddings(
21
  model_name=model_name,
22
  model_kwargs={'device': device},
23
  encode_kwargs={'normalize_embeddings': True}
 
25
 
26
  def build_vector_store_batch(documents, embeddings, save_path="vector_db", batch_size=4):
27
  if not documents:
28
+ raise ValueError("No documents found. Check if documents were loaded correctly.")
29
 
30
  texts = [doc.page_content for doc in documents]
31
  metadatas = [doc.metadata for doc in documents]
32
 
33
+ # Print chunk lengths
34
  lengths = [len(t) for t in texts]
35
+ print(f"💡 Number of chunks: {len(texts)}")
36
+ print(f"💡 Longest chunk length: {max(lengths)} chars")
37
+ print(f"💡 Average chunk length: {sum(lengths) // len(lengths)} chars")
38
 
39
+ # Split into batches
40
  batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
41
  metadata_batches = [metadatas[i:i + batch_size] for i in range(0, len(metadatas), batch_size)]
42
 
43
  print(f"Processing {len(batches)} batches with size {batch_size}")
44
  print(f"Initializing vector store with batch 1/{len(batches)}")
45
 
46
+ # Use from_documents
47
  first_docs = [
48
  Document(page_content=text, metadata=meta)
49
  for text, meta in zip(batches[0], metadata_batches[0])
50
  ]
51
  vectorstore = FAISS.from_documents(first_docs, embeddings)
52
 
53
+ # Add remaining batches
54
  for i in tqdm(range(1, len(batches)), desc="Processing batches"):
55
  try:
56
  docs_batch = [
 
81
 
82
  def load_vector_store(embeddings, load_path="vector_db"):
83
  if not os.path.exists(load_path):
84
+ raise FileNotFoundError(f"Cannot find vector store: {load_path}")
85
  return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
86
 
87
  if __name__ == "__main__":
88
+ parser = argparse.ArgumentParser(description="Builds a vector store")
89
+ parser.add_argument("--folder", type=str, default="final_dataset", help="Path to the folder containing the documents")
90
+ parser.add_argument("--save_path", type=str, default="vector_db", help="Path to save the vector store")
91
+ parser.add_argument("--batch_size", type=int, default=4, help="Batch size")
92
+ parser.add_argument("--model_name", type=str, default="intfloat/multilingual-e5-large-instruct", help="Name of the embedding model")
93
+ parser.add_argument("--device", type=str, default="cuda", help="Device to use ('cuda' or 'cpu' or 'cuda:0')") #Ermöglicht cuda:0
94
+
 
95
  args = parser.parse_args()
96
 
97
+ # Import the document processing module
98
  from document_processor_image_test import load_documents, split_documents
99
 
100
  documents = load_documents(args.folder)
101
  chunks = split_documents(documents, chunk_size=800, chunk_overlap=100)
102
 
103
+ print(f"[DEBUG] Document loading and chunk splitting complete, entering embedding stage")
104
+ print(f"[INFO] Selected device: {args.device}")
105
 
106
  try:
107
  embeddings = get_embeddings(
108
  model_name=args.model_name,
109
  device=args.device
110
  )
111
+ print(f"[DEBUG] Embedding model created")
112
  except Exception as e:
113
+ print(f"[ERROR] Error creating embedding model: {e}")
114
  import traceback; traceback.print_exc()
115
  exit(1)
116
 
117
+ build_vector_store_batch(chunks, embeddings, args.save_path, args.batch_size)