jeongsoo's picture
init
6575706
# --- Embedding-related helper functions ---
def save_embeddings(base_retriever, file_path):
    """Pickle a retriever, gzip-compress it, and write it to ``file_path``.

    The stored payload is a dict with two keys: ``'timestamp'`` (ISO-8601
    string of the local save time) and ``'retriever'`` (the object itself).

    Args:
        base_retriever: Picklable object to cache.
        file_path: Destination path of the compressed cache file.

    Returns:
        True when the file was written, False when any step failed (the
        error is logged, never raised).
    """
    # Write to a sibling temp file first so an interrupted write can never
    # leave a truncated cache at ``file_path``.
    tmp_path = f"{file_path}.tmp"
    try:
        # A bare filename has an empty dirname, and os.makedirs('') raises,
        # so only create the directory when there is one.
        target_dir = os.path.dirname(file_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # The timestamp lets the loader discard caches that are too old.
        save_data = {
            'timestamp': datetime.now().isoformat(),
            'retriever': base_retriever
        }

        # Compress to reduce the size on disk.
        with gzip.open(tmp_path, 'wb') as f:
            pickle.dump(save_data, f)

        # Atomically move the finished file into place.
        os.replace(tmp_path, file_path)

        logger.info(f"μž„λ² λ”© 데이터λ₯Ό {file_path}에 μ••μΆ•ν•˜μ—¬ μ €μž₯ν–ˆμŠ΅λ‹ˆλ‹€.")
        return True
    except Exception as e:
        logger.error(f"μž„λ² λ”© μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {e}")
        # Best-effort cleanup of a partially written temp file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return False
def load_embeddings(file_path, max_age_days=30):
    """Load a cached retriever from a gzip-compressed pickle file.

    The file is expected to hold a dict with a ``'timestamp'`` key
    (ISO-8601 string) and a ``'retriever'`` key, as written by
    ``save_embeddings``.

    Args:
        file_path: Path of the compressed cache file.
        max_age_days: Maximum accepted cache age in days. A cache older
            than this is ignored.

    Returns:
        The object stored under ``'retriever'``, or None when the file does
        not exist, is older than ``max_age_days``, or cannot be read (the
        error is logged, never raised).
    """
    # Imported locally: only ``datetime`` is known to be in module scope.
    from datetime import timedelta

    try:
        if not os.path.exists(file_path):
            logger.info(f"μ €μž₯된 μž„λ² λ”© 파일({file_path})이 μ—†μŠ΅λ‹ˆλ‹€.")
            return None

        # SECURITY: pickle.load can execute arbitrary code contained in the
        # file. Only point this at cache files the application wrote itself.
        with gzip.open(file_path, 'rb') as f:
            data = pickle.load(f)

        # Reject caches that are too old.
        saved_time = datetime.fromisoformat(data['timestamp'])
        elapsed = datetime.now() - saved_time
        age = elapsed.days  # whole days, used for the log message only
        # Compare the full timedelta: ``.days`` truncates, which would let a
        # cache almost one day past the limit slip through.
        if elapsed > timedelta(days=max_age_days):
            logger.info(f"μ €μž₯된 μž„λ² λ”©μ΄ {age}일둜 λ„ˆλ¬΄ μ˜€λž˜λ˜μ—ˆμŠ΅λ‹ˆλ‹€. μƒˆλ‘œ μƒμ„±ν•©λ‹ˆλ‹€.")
            return None

        logger.info(f"{file_path}μ—μ„œ μž„λ² λ”© 데이터λ₯Ό λ‘œλ“œν–ˆμŠ΅λ‹ˆλ‹€. (생성일: {saved_time})")
        return data['retriever']
    except Exception as e:
        # Any read, decompress, unpickle, or schema problem means "no cache".
        logger.error(f"μž„λ² λ”© λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {e}")
        return None
def _keyword_overlap_rerank(query, results):
    """Re-score ``results`` in place and sort them by the new score.

    For dict results with a string ``"text"`` value, the new score is
    ``0.7 * score + 0.3 * overlap``, where ``overlap`` is the number of
    distinct query terms found (as substrings) in the lower-cased text,
    divided by the text's word count plus one, times 10. Other dict results
    keep their ``"score"`` (default 0). Non-dict results are left untouched
    and sort as 0.

    Returns:
        The same ``results`` list, sorted by ``"rerank_score"`` descending.
    """
    query_terms = set(query.lower().split())
    for result in results:
        if not isinstance(result, dict):
            continue
        base_score = result.get("score", 0)
        text = result.get("text")
        if isinstance(text, str):
            lowered = text.lower()
            term_freq = sum(1 for term in query_terms if term in lowered)
            normalized_score = term_freq / (len(lowered.split()) + 1) * 10
            result["rerank_score"] = base_score * 0.7 + normalized_score * 0.3
        else:
            # Missing or non-string text: fall back to the base score
            # instead of failing on ``.lower()``.
            result["rerank_score"] = base_score
    results.sort(key=lambda x: x.get("rerank_score", 0) if isinstance(x, dict) else 0, reverse=True)
    return results


def init_retriever():
    """Initialize or load the module-level retrievers and return the active one.

    Steps:
      1. Try the compressed cache ``cached_embeddings.gz`` under
         ``app.config['INDEX_PATH']``.
      2. On a cache miss, load the saved index from ``INDEX_PATH`` when
         ``documents.json`` exists there, otherwise create an empty
         ``VectorRetriever``. If the retriever has no documents, ingest the
         files under ``app.config['DATA_FOLDER']``, then save the index and
         refresh the cache.
      3. Wrap the base retriever in a ``ReRanker``.

    Side effects:
        Rebinds the globals ``base_retriever`` and ``retriever``.

    Returns:
        The ``ReRanker`` instance, or ``base_retriever`` itself when
        constructing the ``ReRanker`` raises.
    """
    global base_retriever, retriever

    # Bound up front so every branch below can rely on both paths.
    index_path = app.config['INDEX_PATH']
    cache_path = os.path.join(index_path, "cached_embeddings.gz")

    # Try the cached embeddings first. load_embeddings signals "no cache"
    # with None, so test for that rather than truthiness.
    cached_retriever = load_embeddings(cache_path)
    if cached_retriever is not None:
        logger.info("μΊμ‹œλœ μž„λ² λ”© 데이터λ₯Ό μ„±κ³΅μ μœΌλ‘œ λ‘œλ“œν–ˆμŠ΅λ‹ˆλ‹€.")
        base_retriever = cached_retriever
    else:
        # NOTE(review): the source indentation was lost; this assumes both the
        # index load and the data-folder ingestion belong to the cache-miss
        # branch - confirm against the original file.

        # Load the existing vector index, or start with an empty retriever.
        if os.path.exists(os.path.join(index_path, "documents.json")):
            try:
                logger.info(f"κΈ°μ‘΄ 벑터 인덱슀λ₯Ό '{index_path}'μ—μ„œ λ‘œλ“œν•©λ‹ˆλ‹€...")
                base_retriever = VectorRetriever.load(index_path)
                logger.info(f"{len(base_retriever.documents) if hasattr(base_retriever, 'documents') else 0}개 λ¬Έμ„œκ°€ λ‘œλ“œλ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
            except Exception as e:
                logger.error(f"인덱슀 λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {e}. μƒˆ 검색기λ₯Ό μ΄ˆκΈ°ν™”ν•©λ‹ˆλ‹€.")
                base_retriever = VectorRetriever()
        else:
            logger.info("κΈ°μ‘΄ 인덱슀λ₯Ό 찾을 수 μ—†μ–΄ μƒˆ 검색기λ₯Ό μ΄ˆκΈ°ν™”ν•©λ‹ˆλ‹€...")
            base_retriever = VectorRetriever()

        # Ingest documents from the data folder when the retriever is empty.
        data_path = app.config['DATA_FOLDER']
        if (not hasattr(base_retriever, 'documents') or not base_retriever.documents) and os.path.exists(data_path):
            logger.info(f"{data_path}μ—μ„œ λ¬Έμ„œλ₯Ό λ‘œλ“œν•©λ‹ˆλ‹€...")
            try:
                docs = DocumentProcessor.load_documents_from_directory(
                    data_path,
                    extensions=[".txt", ".md", ".csv"],
                    recursive=True
                )
                if docs and hasattr(base_retriever, 'add_documents'):
                    logger.info(f"{len(docs)}개 λ¬Έμ„œλ₯Ό 검색기에 μΆ”κ°€ν•©λ‹ˆλ‹€...")
                    base_retriever.add_documents(docs)

                    if hasattr(base_retriever, 'save'):
                        logger.info(f"검색기 μƒνƒœλ₯Ό '{index_path}'에 μ €μž₯ν•©λ‹ˆλ‹€...")
                        try:
                            base_retriever.save(index_path)
                            logger.info("인덱슀 μ €μž₯ μ™„λ£Œ")

                            # Cache the freshly built retriever. Report
                            # success only when the cache was really written;
                            # save_embeddings logs its own failure.
                            if hasattr(base_retriever, 'documents') and base_retriever.documents:
                                if save_embeddings(base_retriever, cache_path):
                                    logger.info(f"검색기λ₯Ό μΊμ‹œ 파일 {cache_path}에 μ €μž₯ μ™„λ£Œ")
                        except Exception as e:
                            logger.error(f"인덱슀 μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {e}")
            except Exception as e:
                logger.error(f"DATA_FOLDERμ—μ„œ λ¬Έμ„œ λ‘œλ“œ 쀑 였λ₯˜: {e}")

    # Wrap the base retriever with the re-ranking retriever.
    logger.info("μž¬μˆœμœ„ν™” 검색기λ₯Ό μ΄ˆκΈ°ν™”ν•©λ‹ˆλ‹€...")
    try:
        retriever = ReRanker(
            base_retriever=base_retriever,
            rerank_fn=_keyword_overlap_rerank,
            rerank_field="text"
        )
        logger.info("μž¬μˆœμœ„ν™” 검색기 μ΄ˆκΈ°ν™” μ™„λ£Œ")
    except Exception as e:
        logger.error(f"μž¬μˆœμœ„ν™” 검색기 μ΄ˆκΈ°ν™” μ‹€νŒ¨: {e}")
        retriever = base_retriever  # fall back to the plain retriever

    return retriever
def _ensure_placeholder_retrievers():
    """Replace unset (None) module-level retrievers with MockComponent objects.

    Globals that are already set are left untouched, so calling this more
    than once is harmless. A placeholder ``retriever`` without a ``search``
    attribute gets one that returns an empty list.
    """
    global base_retriever, retriever

    if base_retriever is None:
        base_retriever = MockComponent()
        if hasattr(base_retriever, 'documents'):
            base_retriever.documents = []

    if retriever is None:
        retriever = MockComponent()
        if not hasattr(retriever, 'search'):
            retriever.search = lambda query, **kwargs: []


def background_init():
    """Initialize the retrievers; intended as a background-thread target.

    ``app_ready`` is set to True immediately, before any loading happens.
    Placeholder retrievers are installed first. Then the cached embeddings
    under ``app.config['INDEX_PATH']`` are tried: on a hit they are wrapped
    in a ``ReRanker`` with a pass-through scoring function, on a miss
    ``init_retriever()`` performs the full initialization.

    Exceptions raised during initialization are logged with a traceback and
    not propagated; the placeholders are (re)installed for any global that
    is still None.

    Side effects:
        Rebinds the globals ``app_ready``, ``base_retriever`` and
        ``retriever``.
    """
    global app_ready, retriever, base_retriever

    # Mark the app as usable right away.
    app_ready = True
    logger.info("앱을 μ¦‰μ‹œ μ‚¬μš© κ°€λŠ₯ μƒνƒœλ‘œ μ„€μ • (app_ready=True)")

    try:
        # Safety net: make sure both globals hold an object before loading.
        # NOTE(review): the source indentation was lost; this assumes the
        # ``documents``/``search`` patching applied only to freshly created
        # placeholders - confirm against the original file.
        _ensure_placeholder_retrievers()

        # Try the cached embeddings. load_embeddings signals "no cache" with
        # None, so test for that rather than truthiness.
        cache_path = os.path.join(app.config['INDEX_PATH'], "cached_embeddings.gz")
        cached_retriever = load_embeddings(cache_path)

        if cached_retriever is not None:
            # Cache hit: use it directly.
            base_retriever = cached_retriever

            # Minimal rerank function: copy the base score and sort by it.
            def simple_rerank(query, results):
                if results:
                    for result in results:
                        if isinstance(result, dict):
                            result["rerank_score"] = result.get("score", 0)
                    results.sort(key=lambda x: x.get("rerank_score", 0) if isinstance(x, dict) else 0, reverse=True)
                return results

            retriever = ReRanker(
                base_retriever=base_retriever,
                rerank_fn=simple_rerank,
                rerank_field="text"
            )
            logger.info("μΊμ‹œλœ μž„λ² λ”©μœΌλ‘œ 검색기 μ΄ˆκΈ°ν™” μ™„λ£Œ (λΉ λ₯Έ μ‹œμž‘)")
        else:
            # Cache miss: run the full initialization.
            logger.info("μΊμ‹œλœ μž„λ² λ”©μ΄ μ—†μ–΄ 전체 μ΄ˆκΈ°ν™” μ‹œμž‘")
            retriever = init_retriever()
            logger.info("전체 μ΄ˆκΈ°ν™” μ™„λ£Œ")

        logger.info("μ•± μ΄ˆκΈ°ν™” μ™„λ£Œ (λͺ¨λ“  μ»΄ν¬λ„ŒνŠΈ 쀀비됨)")

    except Exception as e:
        logger.error(f"μ•± λ°±κ·ΈλΌμš΄λ“œ μ΄ˆκΈ°ν™” 쀑 μ‹¬κ°ν•œ 였λ₯˜ λ°œμƒ: {e}", exc_info=True)
        # Initialization failed: make sure usable objects exist anyway.
        _ensure_placeholder_retrievers()
        logger.warning("μ΄ˆκΈ°ν™” 쀑 였λ₯˜κ°€ μžˆμ§€λ§Œ 앱은 계속 μ‚¬μš© κ°€λŠ₯ν•©λ‹ˆλ‹€.")
# Run background_init on a daemon thread, which does not keep the
# interpreter alive at exit.
init_thread = threading.Thread(target=background_init, daemon=True)
init_thread.start()