hugging2021 committed
Commit 2abe6e2 · verified · 1 Parent(s): 65cfd8a

Update vector_store.py

Files changed (1):
  1. vector_store.py +57 -31
vector_store.py CHANGED
@@ -1,22 +1,49 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-벡터 스토어 모듈: 문서 임베딩 생성 및 벡터 스토어 구축
-배치 처리 적용으로 메모리 사용량 최적화 + 긴 청크 오류 방지
-"""
-
 import os
 import argparse
 import logging
-from tqdm import tqdm
-from langchain_community.vectorstores import FAISS
-from langchain.schema.document import Document
-from langchain_huggingface import HuggingFaceEmbeddings
+import time
+from collections import defaultdict
 
-# 로깅 설정 - 불필요한 경고 메시지 제거
-logging.getLogger().setLevel(logging.ERROR)
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
 
+# PyMuPDF library
+try:
+    import fitz  # PyMuPDF
+    PYMUPDF_AVAILABLE = True
+    print("✅ PyMuPDF library available")
+except ImportError:
+    PYMUPDF_AVAILABLE = False
+    print("⚠️ PyMuPDF library is not installed. Install with: pip install PyMuPDF")
+
+
+# --------------------------------
+# Log Output
+# --------------------------------
+
+def log(msg):
+    print(f"[{time.strftime('%H:%M:%S')}] {msg}")
+
+# --------------------------------
+# Text Cleaning Function
+# --------------------------------
+
+def clean_text(text):
+    return re.sub(r"[^\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F\w\s.,!?\"'()$:\-]", "", text)
+
+def apply_corrections(text):
+    corrections = {
+        'º©': 'info', 'Ì': 'of', '½': 'operation', 'Ã': '', '©': '',
+        '’': "'", '“': '"', 'â€': '"'
+    }
+    for k, v in corrections.items():
+        text = text.replace(k, v)
+    return text
+
+# --------------------------------
+# Load the embedding model
 def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
     return HuggingFaceEmbeddings(
         model_name=model_name,
@@ -26,26 +53,26 @@ def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device=
 
 def build_vector_store_batch(documents, embeddings, save_path="vector_db", batch_size=16):
     if not documents:
-        raise ValueError("문서가 없습니다. 문서가 올바르게 로드되었는지 확인하세요.")
+        raise ValueError("No documents found. Check if documents are loaded correctly.")
 
     texts = [doc.page_content for doc in documents]
     metadatas = [doc.metadata for doc in documents]
 
-    # 배치로 분할
+    # Split into batches
     batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
     metadata_batches = [metadatas[i:i + batch_size] for i in range(0, len(metadatas), batch_size)]
 
     print(f"Processing {len(batches)} batches with size {batch_size}")
     print(f"Initializing vector store with batch 1/{len(batches)}")
 
-    # from_texts 대신 from_documents 사용 (길이 문제 방지)
+    # Use from_documents instead of from_texts (to prevent length issues)
     first_docs = [
         Document(page_content=text, metadata=meta)
         for text, meta in zip(batches[0], metadata_batches[0])
     ]
     vectorstore = FAISS.from_documents(first_docs, embeddings)
 
-    # 나머지 배치 추가
+    # Add remaining batches
     for i in tqdm(range(1, len(batches)), desc="Processing batches"):
         try:
             docs_batch = [
@@ -76,29 +103,28 @@ def build_vector_store_batch(documents, embeddings, save_path="vector_db", batch
 
 def load_vector_store(embeddings, load_path="vector_db"):
     if not os.path.exists(load_path):
-        raise FileNotFoundError(f"벡터 스토어를 찾을 없습니다: {load_path}")
+        raise FileNotFoundError(f"Cannot find vector store: {load_path}")
     return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
 
-
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="벡터 스토어 구축")
-    parser.add_argument("--folder", type=str, default="dataset", help="문서가 있는 폴더 경로")
-    parser.add_argument("--save_path", type=str, default="vector_db", help="벡터 스토어 저장 경로")
-    parser.add_argument("--batch_size", type=int, default=16, help="배치 크기")
-    parser.add_argument("--model_name", type=str, default="intfloat/multilingual-e5-large-instruct", help="임베딩 모델 이름")
-    parser.add_argument("--device", type=str, default="cuda", help="사용할 디바이스 ('cuda' 또는 'cpu')")
+    parser = argparse.ArgumentParser(description="Builds a vector store")
+    parser.add_argument("--folder", type=str, default="dataset", help="Path to the folder containing the documents")
+    parser.add_argument("--save_path", type=str, default="vector_db", help="Path to save the vector store")
+    parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
+    parser.add_argument("--model_name", type=str, default="intfloat/multilingual-e5-large-instruct", help="Name of the embedding model")
+    parser.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"], help="Device to use ('cuda' or 'cpu')")
 
     args = parser.parse_args()
 
-    # 문서 처리 모듈 import
+    # Import the document processing module
    from document_processor import load_documents, split_documents
 
-    # 문서 로드 분할
+    # Load and split documents
     documents = load_documents(args.folder)
     chunks = split_documents(documents, chunk_size=800, chunk_overlap=100)
 
-    # 임베딩 모델 로드
+    # Load the embedding model
     embeddings = get_embeddings(model_name=args.model_name, device=args.device)
 
-    # 벡터 스토어 구축
-    build_vector_store_batch(chunks, embeddings, args.save_path, args.batch_size)
+    # Build the vector store
+    build_vector_store_batch(chunks, embeddings, args.save_path, args.batch_size)
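The commit's new clean_text() keeps only Hangul (syllables and jamo), word characters, whitespace, and basic punctuation, and apply_corrections() then rewrites a fixed table of mojibake sequences. Note that the visible hunks drop the old `from tqdm import tqdm` and never import `re`, although both names are still used; unless an unshown part of the file imports them, clean_text() and build_vector_store_batch() will raise NameError when called. A standalone check of the filtering behavior, importing re explicitly (the sample string is made up for illustration):

import re

def clean_text(text):
    # Remove anything outside the Hangul ranges, \w, \s, and .,!?"'()$:-
    return re.sub(r"[^\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F\w\s.,!?\"'()$:\-]", "", text)

print(clean_text("벡터 스토어 ★ v1.0 <베타>"))  # -> 벡터 스토어  v1.0 베타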
 
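The diff cuts off inside the batch loop, so the code that appends batches 2..N is not shown. Based on the visible shape (FAISS.from_documents on the first batch, then a tqdm loop building docs_batch), a plausible reconstruction using the standard LangChain add_documents() call is sketched below; the function name and the error handling are assumptions, not the file's actual code:

from tqdm import tqdm
from langchain_core.documents import Document

def add_remaining_batches(vectorstore, batches, metadata_batches):
    # Append every batch after the first to an already-initialized FAISS store.
    for i in tqdm(range(1, len(batches)), desc="Processing batches"):
        try:
            docs_batch = [
                Document(page_content=text, metadata=meta)
                for text, meta in zip(batches[i], metadata_batches[i])
            ]
            vectorstore.add_documents(docs_batch)  # assumed append step
        except Exception as e:
            print(f"Error in batch {i}: {e}")  # assumed: skip failed batches
            continue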
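End to end, the script is meant to be run once to build the index, which is then queried through load_vector_store(). A minimal usage sketch, assuming vector_store.py is importable, a document_processor module exists as imported above, and the store was already built into ./vector_db (the query string and k are illustrative):

# Build the index (writes ./vector_db):
#   python vector_store.py --folder dataset --save_path vector_db --device cpu

from vector_store import get_embeddings, load_vector_store

embeddings = get_embeddings(device="cpu")  # use device="cuda" on a GPU machine
vectorstore = load_vector_store(embeddings, load_path="vector_db")

# Standard LangChain FAISS similarity search over the stored chunks
for doc in vectorstore.similarity_search("배치 처리", k=3):
    print(doc.metadata.get("source"), "->", doc.page_content[:80])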