Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """ | |
| Vector store module: generate document embeddings and build the vector store. | |
| Batch processing applied + chunk-length check added. | |
| """ | |
| import os | |
| import argparse | |
| import logging | |
| from tqdm import tqdm | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.schema.document import Document | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from e5_embeddings import E5Embeddings | |
| # Logging configuration: set the root logger level to ERROR. | |
| logging.getLogger().setLevel(logging.ERROR) | |
def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
    """Create the E5 embedding model wrapper.

    Prints the chosen device, then constructs ``E5Embeddings`` with the
    device forwarded through ``model_kwargs`` and
    ``normalize_embeddings=True`` forwarded through ``encode_kwargs``.

    Args:
        model_name: Name of the embedding model to load.
        device: Device string handed to the model (e.g. ``"cuda"``).

    Returns:
        The constructed ``E5Embeddings`` instance.
    """
    print(f"[INFO] ์๋ฒ ๋ฉ ๋ชจ๋ธ ๋๋ฐ์ด์ค: {device}")

    model_options = {'device': device}
    encode_options = {'normalize_embeddings': True}
    return E5Embeddings(
        model_name=model_name,
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )
def _ensure_parent_dir(path):
    """Create the directory that will contain *path* if it does not exist."""
    # A bare name such as "vector_db" has no directory part; use "." then.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)


def _save_vector_store(vectorstore, path):
    """Save *vectorstore* to *path*, creating the parent directory first."""
    _ensure_parent_dir(path)
    vectorstore.save_local(path)


def build_vector_store_batch(documents, embeddings, save_path="vector_db", batch_size=4):
    """Embed *documents* batch by batch into a FAISS store and save it.

    The first batch creates the store with ``FAISS.from_documents``; every
    later batch is appended with ``add_documents``. After every 10th batch
    the store is also saved to ``{save_path}_temp``. If a later batch
    fails, the partial store is saved to ``{save_path}_error_at_batch_{i}``
    and the exception is re-raised.

    Args:
        documents: Sequence of objects exposing ``page_content`` and
            ``metadata`` attributes.
        embeddings: Embedding object passed to ``FAISS.from_documents``.
        save_path: Path the finished store is saved to.
        batch_size: Number of documents per batch; must be at least 1.

    Returns:
        The FAISS vector store containing all batches.

    Raises:
        ValueError: If *documents* is empty or *batch_size* is less than 1.
    """
    if not documents:
        raise ValueError("๋ฌธ์๊ฐ ์์ต๋๋ค. ๋ฌธ์๊ฐ ์ฌ๋ฐ๋ฅด๊ฒ ๋ก๋๋์๋์ง ํ์ธํ์ธ์.")
    # Fail early with a clear message: a zero step makes range() raise an
    # unrelated-looking error and a negative step yields no batches at all.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    texts = [doc.page_content for doc in documents]
    metadatas = [doc.metadata for doc in documents]

    # Report chunk length statistics.
    lengths = [len(t) for t in texts]
    print(f"๐ก ์ฒญํฌ ์: {len(texts)}")
    print(f"๐ก ๊ฐ์ฅ ๊ธด ์ฒญํฌ ๊ธธ์ด: {max(lengths)} chars")
    print(f"๐ก ํ๊ท ์ฒญํฌ ๊ธธ์ด: {sum(lengths) // len(lengths)} chars")

    # Split texts and metadata into parallel batches.
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    metadata_batches = [metadatas[i:i + batch_size] for i in range(0, len(metadatas), batch_size)]
    print(f"Processing {len(batches)} batches with size {batch_size}")
    print(f"Initializing vector store with batch 1/{len(batches)}")

    # The first batch initializes the store through from_documents.
    first_docs = [
        Document(page_content=text, metadata=meta)
        for text, meta in zip(batches[0], metadata_batches[0])
    ]
    vectorstore = FAISS.from_documents(first_docs, embeddings)

    for i in tqdm(range(1, len(batches)), desc="Processing batches"):
        try:
            docs_batch = [
                Document(page_content=text, metadata=meta)
                for text, meta in zip(batches[i], metadata_batches[i])
            ]
            vectorstore.add_documents(docs_batch)

            # Periodic checkpoint so a long run can be inspected midway.
            if i % 10 == 0:
                temp_save_path = f"{save_path}_temp"
                _save_vector_store(vectorstore, temp_save_path)
                print(f"Temporary vector store saved to {temp_save_path} after batch {i}")
        except Exception as e:
            # Keep what was indexed so far, then let the caller see the error.
            print(f"Error processing batch {i}: {e}")
            error_save_path = f"{save_path}_error_at_batch_{i}"
            _save_vector_store(vectorstore, error_save_path)
            print(f"Partial vector store saved to {error_save_path}")
            raise

    _save_vector_store(vectorstore, save_path)
    print(f"Vector store saved to {save_path}")
    return vectorstore
def load_vector_store(embeddings, load_path="vector_db"):
    """Load a FAISS vector store from *load_path*.

    Args:
        embeddings: Embedding object passed to ``FAISS.load_local``.
        load_path: Location of the saved store.

    Returns:
        The store returned by ``FAISS.load_local``.

    Raises:
        FileNotFoundError: If *load_path* does not exist.
    """
    if os.path.exists(load_path):
        # NOTE(review): allow_dangerous_deserialization=True opts in to
        # unsafe deserialization (presumably pickle; confirm in the
        # LangChain docs), so only load stores from a trusted source.
        return FAISS.load_local(
            load_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )
    raise FileNotFoundError(f"๋ฒกํฐ ์คํ ์ด๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค: {load_path}")
if __name__ == "__main__":
    # Script entry point: parse options, load and split the documents,
    # create the embedding model, then build and save the vector store.
    parser = argparse.ArgumentParser(description="๋ฒกํฐ ์คํ ์ด ๊ตฌ์ถ")
    parser.add_argument("--folder", type=str, default="final_dataset", help="๋ฌธ์๊ฐ ์๋ ํด๋ ๊ฒฝ๋ก")
    parser.add_argument("--save_path", type=str, default="vector_db", help="๋ฒกํฐ ์คํ ์ด ์ ์ฅ ๊ฒฝ๋ก")
    parser.add_argument("--batch_size", type=int, default=4, help="๋ฐฐ์น ํฌ๊ธฐ")
    parser.add_argument("--model_name", type=str, default="intfloat/multilingual-e5-large-instruct", help="์๋ฒ ๋ฉ ๋ชจ๋ธ ์ด๋ฆ")
    parser.add_argument("--device", type=str, default="cuda", help="์ฌ์ฉํ ๋๋ฐ์ด์ค ('cuda' ๋๋ 'cpu' ๋๋ 'cuda:1')")
    args = parser.parse_args()

    # Document-processing module; imported only when run as a script.
    from document_processor_image_test import load_documents, split_documents

    documents = load_documents(args.folder)
    chunks = split_documents(documents, chunk_size=800, chunk_overlap=100)
    print(f"[DEBUG] ๋ฌธ์ ๋ก๋ฉ ๋ฐ ์ฒญํฌ ๋ถํ ์๋ฃ, ์๋ฒ ๋ฉ ๋จ๊ณ ์ง์ ์ ")
    print(f"[INFO] ์ ํ๋ ๋๋ฐ์ด์ค: {args.device}")

    try:
        embeddings = get_embeddings(
            model_name=args.model_name,
            device=args.device
        )
        print(f"[DEBUG] ์๋ฒ ๋ฉ ๋ชจ๋ธ ์์ฑ ์๋ฃ")
    except Exception as e:
        print(f"[ERROR] ์๋ฒ ๋ฉ ๋ชจ๋ธ ์์ฑ ์ค ์๋ฌ ๋ฐ์: {e}")
        import traceback
        traceback.print_exc()
        # The builtin exit() is injected by the site module for interactive
        # use and is not guaranteed to exist; raise SystemExit directly.
        raise SystemExit(1)

    build_vector_store_batch(chunks, embeddings, args.save_path, args.batch_size)