Spaces:
Runtime error
Runtime error
Update concat_vector_store_정리된.py
Browse files- concat_vector_store_정리된.py +16 -16
concat_vector_store_정리된.py
CHANGED
@@ -5,11 +5,11 @@ from e5_embeddings import E5Embeddings
|
|
5 |
from langchain_community.vectorstores import FAISS
|
6 |
from document_processor import load_pdf_with_pymupdf, split_documents
|
7 |
|
8 |
-
#
|
9 |
-
FOLDER = "
|
10 |
VECTOR_STORE_PATH = "vector_db"
|
11 |
|
12 |
-
# 1.
|
13 |
def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
|
14 |
return E5Embeddings(
|
15 |
model_name=model_name,
|
@@ -17,39 +17,39 @@ def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device=
|
|
17 |
encode_kwargs={'normalize_embeddings': True}
|
18 |
)
|
19 |
|
20 |
-
# 2.
|
21 |
def load_vector_store(embeddings, load_path=VECTOR_STORE_PATH):
|
22 |
if not os.path.exists(load_path):
|
23 |
-
raise FileNotFoundError(f"
|
24 |
return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
|
25 |
|
26 |
-
# 3.
|
27 |
def embed_cleaned_pdfs(folder, vectorstore, embeddings):
|
28 |
-
pattern = os.path.join(folder, "
|
29 |
pdf_files = glob.glob(pattern)
|
30 |
-
print(f"
|
31 |
|
32 |
new_documents = []
|
33 |
for pdf_path in pdf_files:
|
34 |
-
print(f"
|
35 |
text = load_pdf_with_pymupdf(pdf_path)
|
36 |
if text.strip():
|
37 |
new_documents.append(Document(page_content=text, metadata={"source": pdf_path}))
|
38 |
|
39 |
-
print(f"
|
40 |
|
41 |
chunks = split_documents(new_documents, chunk_size=300, chunk_overlap=50)
|
42 |
-
print(f"
|
43 |
|
44 |
-
print(f"
|
45 |
vectorstore.add_documents(chunks)
|
46 |
-
print(f"
|
47 |
|
48 |
vectorstore.save_local(VECTOR_STORE_PATH)
|
49 |
-
print(f"
|
50 |
|
51 |
-
#
|
52 |
if __name__ == "__main__":
|
53 |
embeddings = get_embeddings()
|
54 |
vectorstore = load_vector_store(embeddings)
|
55 |
-
embed_cleaned_pdfs(FOLDER, vectorstore, embeddings)
|
|
|
5 |
from langchain_community.vectorstores import FAISS
|
6 |
from document_processor import load_pdf_with_pymupdf, split_documents
|
7 |
|
8 |
# Path configuration
FOLDER = "cleaned_pdfs"  # Folder containing the cleaned PDFs (input; scanned for cleaned*.pdf)
VECTOR_STORE_PATH = "vector_db"  # Directory the FAISS index is loaded from and saved back to
|
11 |
|
12 |
+
# 1. Load the embedding model
|
13 |
def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
|
14 |
return E5Embeddings(
|
15 |
model_name=model_name,
|
|
|
17 |
encode_kwargs={'normalize_embeddings': True}
|
18 |
)
|
19 |
|
20 |
# 2. Load existing vector store
def load_vector_store(embeddings, load_path=VECTOR_STORE_PATH):
    """Load a previously saved FAISS index from *load_path*.

    Parameters
    ----------
    embeddings : E5Embeddings
        Embedding function the deserialized store will use for queries.
    load_path : str
        Directory holding the saved index (defaults to VECTOR_STORE_PATH).

    Raises
    ------
    FileNotFoundError
        If *load_path* does not exist on disk.
    """
    if os.path.exists(load_path):
        # allow_dangerous_deserialization is required by langchain to unpickle
        # a locally saved index — only safe because the path is our own output.
        return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
    raise FileNotFoundError(f"Cannot find vector store: {load_path}")
|
25 |
|
26 |
# 3. Embed only the cleaned PDFs
def embed_cleaned_pdfs(folder, vectorstore, embeddings):
    """Embed every ``cleaned*.pdf`` in *folder* and append the chunks to *vectorstore*.

    Parameters
    ----------
    folder : str
        Directory searched (non-recursively) for files matching "cleaned*.pdf".
    vectorstore : FAISS
        Already-loaded FAISS store; updated in place, then saved to VECTOR_STORE_PATH.
    embeddings : E5Embeddings
        Unused here (the store already carries its embedding function); kept so the
        existing call signature stays unchanged.
    """
    pattern = os.path.join(folder, "cleaned*.pdf")
    pdf_files = glob.glob(pattern)
    print(f"Number of target PDFs: {len(pdf_files)}")

    new_documents = []
    for pdf_path in pdf_files:
        print(f"Processing: {pdf_path}")
        text = load_pdf_with_pymupdf(pdf_path)
        # Skip PDFs whose extracted text is empty or whitespace-only.
        if text.strip():
            new_documents.append(Document(page_content=text, metadata={"source": pdf_path}))

    print(f"Number of documents: {len(new_documents)}")

    # Guard: with nothing to add, chunking/adding an empty batch is at best a
    # wasted re-save and FAISS add_documents([]) can raise — bail out early.
    if not new_documents:
        print("No new documents to embed; vector store left unchanged.")
        return

    chunks = split_documents(new_documents, chunk_size=300, chunk_overlap=50)
    print(f"Number of chunks: {len(chunks)}")

    print(f"Vector count before addition: {vectorstore.index.ntotal}")
    vectorstore.add_documents(chunks)
    print(f"Vector count after addition: {vectorstore.index.ntotal}")

    vectorstore.save_local(VECTOR_STORE_PATH)
    print(f"Save completed: {VECTOR_STORE_PATH}")
|
50 |
|
51 |
# Execution: load the model and the existing store, then embed the cleaned PDFs.
if __name__ == "__main__":
    emb = get_embeddings()
    store = load_vector_store(emb)
    embed_cleaned_pdfs(FOLDER, store, emb)
|