hugging2021 committed on
Commit
670c138
·
verified ·
1 Parent(s): 8c95c04

Update concat_vector_store_정리된.py

Browse files
Files changed (1) hide show
  1. concat_vector_store_정리된.py +16 -16
concat_vector_store_정리된.py CHANGED
@@ -5,11 +5,11 @@ from e5_embeddings import E5Embeddings
5
  from langchain_community.vectorstores import FAISS
6
  from document_processor import load_pdf_with_pymupdf, split_documents
7
 
8
- # 경로 설정
9
- FOLDER = "25.05.28 RAG용 2차 업무편람 취합본"
10
  VECTOR_STORE_PATH = "vector_db"
11
 
12
- # 1. 임베딩 모델 로드
13
  def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
14
  return E5Embeddings(
15
  model_name=model_name,
@@ -17,39 +17,39 @@ def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device=
17
  encode_kwargs={'normalize_embeddings': True}
18
  )
19
 
20
- # 2. 기존 벡터 스토어 로드
21
  def load_vector_store(embeddings, load_path=VECTOR_STORE_PATH):
22
  if not os.path.exists(load_path):
23
- raise FileNotFoundError(f"벡터 스토어를 찾을 없습니다: {load_path}")
24
  return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
25
 
26
- # 3. 정리된 PDF만 임베딩
27
  def embed_cleaned_pdfs(folder, vectorstore, embeddings):
28
- pattern = os.path.join(folder, "정리된*.pdf")
29
  pdf_files = glob.glob(pattern)
30
- print(f"🧾 대상 PDF 수: {len(pdf_files)}")
31
 
32
  new_documents = []
33
  for pdf_path in pdf_files:
34
- print(f"📄 처리 중: {pdf_path}")
35
  text = load_pdf_with_pymupdf(pdf_path)
36
  if text.strip():
37
  new_documents.append(Document(page_content=text, metadata={"source": pdf_path}))
38
 
39
- print(f"📚 문서 수: {len(new_documents)}")
40
 
41
  chunks = split_documents(new_documents, chunk_size=300, chunk_overlap=50)
42
- print(f"�� 청크 수: {len(chunks)}")
43
 
44
- print(f"추가 벡터 수: {vectorstore.index.ntotal}")
45
  vectorstore.add_documents(chunks)
46
- print(f"추가 벡터 수: {vectorstore.index.ntotal}")
47
 
48
  vectorstore.save_local(VECTOR_STORE_PATH)
49
- print(f" 저장 완료: {VECTOR_STORE_PATH}")
50
 
51
- # 실행
52
  if __name__ == "__main__":
53
  embeddings = get_embeddings()
54
  vectorstore = load_vector_store(embeddings)
55
- embed_cleaned_pdfs(FOLDER, vectorstore, embeddings)
 
5
  from langchain_community.vectorstores import FAISS
6
  from document_processor import load_pdf_with_pymupdf, split_documents
7
 
8
# Path configuration
FOLDER = "cleaned_pdfs"  # Folder containing the cleaned PDFs
VECTOR_STORE_PATH = "vector_db"  # Directory where the FAISS index is persisted and re-loaded from
11
 
12
+ # 1. Load the embedding model
13
  def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
14
  return E5Embeddings(
15
  model_name=model_name,
 
17
  encode_kwargs={'normalize_embeddings': True}
18
  )
19
 
20
# 2. Load existing vector store
def load_vector_store(embeddings, load_path=VECTOR_STORE_PATH):
    """Load a previously persisted FAISS index from *load_path*.

    Raises:
        FileNotFoundError: when *load_path* does not exist on disk.
    """
    store_missing = not os.path.exists(load_path)
    if store_missing:
        raise FileNotFoundError(f"Cannot find vector store: {load_path}")
    # allow_dangerous_deserialization is required because the index was
    # written by this same pipeline (trusted local pickle data).
    store = FAISS.load_local(
        load_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    return store
25
 
26
# 3. Embed only the cleaned PDFs
def embed_cleaned_pdfs(folder, vectorstore, embeddings, pattern="cleaned*.pdf"):
    """Embed every PDF in *folder* matching *pattern* and append to *vectorstore*.

    The updated store is saved back to VECTOR_STORE_PATH when done.

    Args:
        folder: Directory containing the cleaned PDF files.
        vectorstore: Existing FAISS store the new chunks are added to.
        embeddings: Kept for interface compatibility; the store already
            carries its embedding function, so it is not used here.
        pattern: Glob pattern (relative to *folder*) selecting which PDFs
            to embed. Defaults to the previous hard-coded "cleaned*.pdf".
    """
    pdf_files = glob.glob(os.path.join(folder, pattern))
    print(f"Number of target PDFs: {len(pdf_files)}")

    new_documents = []
    for pdf_path in pdf_files:
        print(f"Processing: {pdf_path}")
        text = load_pdf_with_pymupdf(pdf_path)
        # Skip PDFs that yielded no extractable text.
        if text.strip():
            new_documents.append(Document(page_content=text, metadata={"source": pdf_path}))

    print(f"Number of documents: {len(new_documents)}")

    chunks = split_documents(new_documents, chunk_size=300, chunk_overlap=50)
    print(f"Number of chunks: {len(chunks)}")

    print(f"Vector count before addition: {vectorstore.index.ntotal}")
    # Guard: adding an empty batch raises in some FAISS wrappers; with no
    # matching PDFs we still fall through and re-save the unchanged store.
    if chunks:
        vectorstore.add_documents(chunks)
    print(f"Vector count after addition: {vectorstore.index.ntotal}")

    vectorstore.save_local(VECTOR_STORE_PATH)
    print(f"Save completed: {VECTOR_STORE_PATH}")
50
 
51
# Execution
def main():
    """Wire the pipeline together: embeddings -> existing store -> embed new PDFs."""
    embedder = get_embeddings()
    store = load_vector_store(embedder)
    embed_cleaned_pdfs(FOLDER, store, embedder)


if __name__ == "__main__":
    main()