Spaces:
No application file
No application file
""" | |
RAG κ²μ μ±λ΄ μΉ μ ν리μΌμ΄μ - API λΌμ°νΈ μ μ | |
""" | |
import os | |
import json | |
import logging | |
import tempfile | |
import requests | |
from flask import request, jsonify, render_template, send_from_directory, session, redirect, url_for | |
from datetime import datetime | |
from werkzeug.utils import secure_filename | |
# λ‘κ±° κ°μ Έμ€κΈ° | |
logger = logging.getLogger(__name__) | |
def register_routes(app, login_required, llm_interface, retriever, stt_client, DocumentProcessor, base_retriever, app_ready, ADMIN_USERNAME, ADMIN_PASSWORD, DEVICE_SERVER_URL): | |
"""Flask μ ν리μΌμ΄μ μ κΈ°λ³Έ λΌμ°νΈ λ±λ‘""" | |
# ν¬νΌ ν¨μ | |
def allowed_audio_file(filename):
    """Return True when *filename* ends in a permitted audio extension."""
    permitted = {'mp3', 'wav', 'ogg', 'm4a'}
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in permitted
def allowed_doc_file(filename):
    """Return True when *filename* ends in a permitted document extension."""
    permitted = {'txt', 'md', 'pdf', 'docx', 'csv'}
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in permitted
def login():
    """Handle the admin login page.

    GET: render the form, or redirect an already-authenticated user to index.
    POST: validate the submitted credentials against ADMIN_USERNAME /
    ADMIN_PASSWORD and establish the session on success.

    Fix vs. original: credentials were compared with plain ``==``, which
    leaks timing information; ``hmac.compare_digest`` makes the comparison
    constant-time, and the ``None``-password case is guarded explicitly.
    """
    import hmac  # function-scope import so the file-level import block is untouched

    def _matches(candidate, expected):
        # Constant-time comparison; treat a missing expected value as "never matches".
        if expected is None:
            return False
        return hmac.compare_digest(candidate.encode('utf-8'), str(expected).encode('utf-8'))

    error = None
    next_url = request.args.get('next')
    logger.info(f"-------------- λ‘κ·ΈμΈ νμ΄μ§ μ μ (Next: {next_url}) --------------")
    logger.info(f"Method: {request.method}")
    if request.method == 'POST':
        logger.info("λ‘κ·ΈμΈ μλ λ°μ")
        username = request.form.get('username', '')
        password = request.form.get('password', '')
        logger.info(f"μ λ ₯λ μ¬μ©μλͺ : {username}")
        logger.info(f"λΉλ°λ²νΈ μ λ ₯ μ¬λΆ: {len(password) > 0}")
        valid_username = ADMIN_USERNAME
        valid_password = ADMIN_PASSWORD
        logger.info(f"κ²μ¦μ© μ¬μ©μλͺ : {valid_username}")
        logger.info(f"κ²μ¦μ© λΉλ°λ²νΈ μ‘΄μ¬ μ¬λΆ: {valid_password is not None and len(valid_password) > 0}")
        username_ok = _matches(username, valid_username)
        password_ok = _matches(password, valid_password)
        if username_ok and password_ok:
            logger.info(f"λ‘κ·ΈμΈ μ±κ³΅: {username}")
            logger.debug(f"μΈμ μ€μ μ : {session}")
            # Persist the login in the (permanent) session.
            session.permanent = True
            session['logged_in'] = True
            session['username'] = username
            session.modified = True
            logger.info(f"μΈμ μ€μ ν: {session}")
            logger.info("μΈμ μ€μ μλ£, 리λλ μ μλ")
            redirect_to = next_url or url_for('index')
            logger.info(f"리λλ μ λμ: {redirect_to}")
            return redirect(redirect_to)
        else:
            logger.warning("λ‘κ·ΈμΈ μ€ν¨: μμ΄λ λλ λΉλ°λ²νΈ λΆμΌμΉ")
            if not username_ok:
                logger.warning("μ¬μ©μλͺ λΆμΌμΉ")
            if not password_ok:
                logger.warning("λΉλ°λ²νΈ λΆμΌμΉ")
            error = 'μμ΄λ λλ λΉλ°λ²νΈκ° μ¬λ°λ₯΄μ§ μμ΅λλ€.'
    else:
        logger.info("λ‘κ·ΈμΈ νμ΄μ§ GET μμ²")
        if 'logged_in' in session:
            logger.info("μ΄λ―Έ λ‘κ·ΈμΈλ μ¬μ©μ, λ©μΈ νμ΄μ§λ‘ 리λλ μ ")
            return redirect(url_for('index'))
    logger.info("---------- λ‘κ·ΈμΈ νμ΄μ§ λ λλ§ ----------")
    return render_template('login.html', error=error, next=next_url)
def logout():
    """Clear the login keys from the session and redirect to the login page."""
    logger.info("-------------- λ‘κ·Έμμ μμ² --------------")
    logger.info(f"λ‘κ·Έμμ μ μΈμ μν: {session}")
    if 'logged_in' not in session:
        # Nothing to tear down — just record the odd request.
        logger.warning("λ‘κ·ΈμΈλμ§ μμ μνμμ λ‘κ·Έμμ μλ")
    else:
        username = session.get('username', 'unknown')
        logger.info(f"μ¬μ©μ {username} λ‘κ·Έμμ μ²λ¦¬ μμ")
        session.pop('logged_in', None)
        session.pop('username', None)
        session.modified = True
        logger.info(f"μΈμ μ 보 μμ μλ£. νμ¬ μΈμ : {session}")
    logger.info("λ‘κ·ΈμΈ νμ΄μ§λ‘ 리λλ μ ")
    return redirect(url_for('login'))
def index(): | |
"""λ©μΈ νμ΄μ§""" | |
nonlocal app_ready | |
# μ± μ€λΉ μν νμΈ - 30μ΄ μ΄μ μ§λ¬μΌλ©΄ κ°μ λ‘ ready μνλ‘ λ³κ²½ | |
current_time = datetime.now() | |
start_time = datetime.fromtimestamp(os.path.getmtime(__file__)) | |
time_diff = (current_time - start_time).total_seconds() | |
if not app_ready and time_diff > 30: | |
logger.warning(f"μ±μ΄ 30μ΄ μ΄μ μ΄κΈ°ν μ€ μνμ λλ€. κ°μ λ‘ ready μνλ‘ λ³κ²½ν©λλ€.") | |
app_ready = True | |
if not app_ready: | |
logger.info("μ±μ΄ μμ§ μ€λΉλμ§ μμ λ‘λ© νμ΄μ§ νμ") | |
return render_template('loading.html'), 503 # μλΉμ€ μ€λΉ μλ¨ μν μ½λ | |
logger.info("λ©μΈ νμ΄μ§ μμ²") | |
return render_template('index.html') | |
def app_status():
    """Report whether application initialization has completed."""
    readiness = 'Ready' if app_ready else 'Not Ready'
    logger.info(f"μ± μν νμΈ μμ²: {readiness}")
    return jsonify({"ready": app_ready})
def llm_api():
    """GET: list the supported LLMs and the active one; POST: switch LLMs."""
    if not app_ready:
        return jsonify({"error": "μ±μ΄ μμ§ μ΄κΈ°ν μ€μ λλ€. μ μ ν λ€μ μλν΄μ£ΌμΈμ."}), 503
    if request.method == 'GET':
        logger.info("LLM λͺ©λ‘ μμ²")
        try:
            if hasattr(llm_interface, 'get_current_llm_details'):
                current_details = llm_interface.get_current_llm_details()
            else:
                current_details = {"id": "unknown", "name": "Unknown"}
            supported = getattr(llm_interface, 'SUPPORTED_LLMS', {})
            active_id = current_details.get("id")
            supported_list = []
            for display_name, model_id in supported.items():
                supported_list.append({
                    "name": display_name,
                    "id": model_id,
                    "current": model_id == active_id,
                })
            return jsonify({
                "supported_llms": supported_list,
                "current_llm": current_details,
            })
        except Exception as e:
            logger.error(f"LLM μ 보 μ‘°ν μ€λ₯: {e}")
            return jsonify({"error": "LLM μ 보 μ‘°ν μ€ μ€λ₯ λ°μ"}), 500
    elif request.method == 'POST':
        payload = request.get_json()
        if not payload or 'llm_id' not in payload:
            return jsonify({"error": "LLM IDκ° μ 곡λμ§ μμμ΅λλ€."}), 400
        llm_id = payload['llm_id']
        logger.info(f"LLM λ³κ²½ μμ²: {llm_id}")
        try:
            if not hasattr(llm_interface, 'set_llm') or not hasattr(llm_interface, 'llm_clients'):
                raise NotImplementedError("LLM μΈν°νμ΄μ€μ νμν λ©μλ/μμ± μμ")
            if llm_id not in llm_interface.llm_clients:
                return jsonify({"error": f"μ§μλμ§ μλ LLM ID: {llm_id}"}), 400
            if llm_interface.set_llm(llm_id):
                new_details = llm_interface.get_current_llm_details()
                logger.info(f"LLMμ΄ '{new_details.get('name', llm_id)}'λ‘ λ³κ²½λμμ΅λλ€.")
                return jsonify({
                    "success": True,
                    "message": f"LLMμ΄ '{new_details.get('name', llm_id)}'λ‘ λ³κ²½λμμ΅λλ€.",
                    "current_llm": new_details,
                })
            logger.error(f"LLM λ³κ²½ μ€ν¨ (ID: {llm_id})")
            return jsonify({"error": "LLM λ³κ²½ μ€ λ΄λΆ μ€λ₯ λ°μ"}), 500
        except Exception as e:
            logger.error(f"LLM λ³κ²½ μ²λ¦¬ μ€ μ€λ₯: {e}", exc_info=True)
            return jsonify({"error": f"LLM λ³κ²½ μ€ μ€λ₯ λ°μ: {str(e)}"}), 500
def chat():
    """Text chat endpoint: RAG search over the query, then an LLM answer."""
    if not app_ready or retriever is None:
        return jsonify({"error": "μ±/κ²μκΈ°κ° μμ§ μ΄κΈ°ν μ€μ λλ€. μ μ ν λ€μ μλν΄μ£ΌμΈμ."}), 503
    try:
        payload = request.get_json()
        if not payload or 'query' not in payload:
            return jsonify({"error": "μΏΌλ¦¬κ° μ 곡λμ§ μμμ΅λλ€."}), 400
        query = payload['query']
        logger.info(f"ν μ€νΈ 쿼리 μμ : {query[:100]}...")
        # Two-stage retrieval over the knowledge base.
        if not hasattr(retriever, 'search'):
            raise NotImplementedError("Retrieverμ search λ©μλκ° μμ΅λλ€.")
        search_results = retriever.search(query, top_k=5, first_stage_k=6)
        # Fold the hits into a prompt context.
        if not hasattr(DocumentProcessor, 'prepare_rag_context'):
            raise NotImplementedError("DocumentProcessorμ prepare_rag_context λ©μλκ° μμ΅λλ€.")
        context = DocumentProcessor.prepare_rag_context(search_results, field="text")
        if not context:
            logger.warning("κ²μ κ²°κ³Όκ° μμ΄ μ»¨ν μ€νΈλ₯Ό μμ±νμ§ λͺ»ν¨.")
        llm_id = payload.get('llm_id', None)
        if not hasattr(llm_interface, 'rag_generate'):
            raise NotImplementedError("LLMInterfaceμ rag_generate λ©μλκ° μμ΅λλ€.")
        if context:
            answer = llm_interface.rag_generate(query, context, llm_id=llm_id)
            logger.info(f"LLM μλ΅ μμ± μλ£ (κΈΈμ΄: {len(answer)})")
        else:
            answer = "μ£μ‘ν©λλ€. κ΄λ ¨ μ 보λ₯Ό μ°Ύμ μ μμ΅λλ€."
            logger.info("컨ν μ€νΈ μμ΄ κΈ°λ³Έ μλ΅ μμ±")
        # Collect per-hit source attributions (with the CSV row-id heuristic).
        sources = []
        for hit in search_results or []:
            if not isinstance(hit, dict):
                logger.warning(f"μμμΉ λͺ»ν κ²μ κ²°κ³Ό νμ: {type(hit)}")
                continue
            if "source" not in hit:
                continue
            entry = {
                "source": hit.get("source", "Unknown"),
                "score": hit.get("rerank_score", hit.get("score", 0)),
            }
            if "text" in hit and hit.get("filetype") == "csv":
                try:
                    text_lines = hit["text"].strip().split('\n')
                    if text_lines:
                        first_line = text_lines[0].strip()
                        if ',' in first_line:
                            first_column = first_line.split(',')[0].strip()
                            entry["id"] = first_column
                            logger.debug(f"CSV μμ€ ID μΆμΆ: {first_column} from {entry['source']}")
                except Exception as e:
                    logger.warning(f"CSV μμ€ ID μΆμΆ μ€ν¨ ({hit.get('source')}): {e}")
            sources.append(entry)
        if hasattr(llm_interface, 'get_current_llm_details'):
            llm_details = llm_interface.get_current_llm_details()
        else:
            llm_details = {}
        return jsonify({
            "answer": answer,
            "sources": sources,
            "llm": llm_details,
        })
    except Exception as e:
        logger.error(f"μ±ν μ²λ¦¬ μ€ μ€λ₯ λ°μ: {e}", exc_info=True)
        return jsonify({"error": f"μ²λ¦¬ μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}"}), 500
def voice_chat():
    """Voice chat endpoint: STT on the uploaded audio, then the same RAG
    search + LLM answer flow as the text chat endpoint.

    Fixes vs. original:
    - The upload was written to a ``NamedTemporaryFile(delete=True)`` and
      immediately read back into bytes.  Reopening an open NamedTemporaryFile
      by name fails on Windows, and only the bytes were ever used — the
      upload is now read directly from the request.
    - A second, unreachable ``retriever is None`` check after STT was
      removed (the handler already returns early when retriever is None).
    """
    if not app_ready:
        logger.warning("μ± μ΄κΈ°νκ° μλ£λμ§ μμμ§λ§ μμ± API μμ² μ²λ¦¬ μλ")
        # Deliberately no early return: partial service is still attempted.
    if retriever is None:
        logger.error("retrieverκ° μμ§ μ΄κΈ°νλμ§ μμμ΅λλ€")
        return jsonify({
            "transcription": "(μμ±μ ν μ€νΈλ‘ λ³ννμ§λ§ κ²μ μμ§μ΄ μμ§ μ€λΉλμ§ μμμ΅λλ€)",
            "answer": "μ£μ‘ν©λλ€. κ²μ μμ§μ΄ μμ§ μ΄κΈ°ν μ€μ λλ€. μ μ ν λ€μ μλν΄μ£ΌμΈμ.",
            "sources": []
        })
    if stt_client is None:
        return jsonify({
            "transcription": "(μμ± μΈμ κΈ°λ₯μ΄ μ€λΉ μ€μ λλ€)",
            "answer": "μ£μ‘ν©λλ€. νμ¬ μμ± μΈμ μλΉμ€κ° μ΄κΈ°ν μ€μ λλ€. μ μ ν λ€μ μλν΄μ£ΌμΈμ.",
            "sources": []
        })
    logger.info("μμ± μ± μμ² μμ ")
    if 'audio' not in request.files:
        logger.error("μ€λμ€ νμΌμ΄ μ 곡λμ§ μμ")
        return jsonify({"error": "μ€λμ€ νμΌμ΄ μ 곡λμ§ μμμ΅λλ€."}), 400
    audio_file = request.files['audio']
    logger.info(f"μμ λ μ€λμ€ νμΌ: {audio_file.filename} ({audio_file.content_type})")
    try:
        if not hasattr(stt_client, 'transcribe_audio'):
            raise NotImplementedError("STT ν΄λΌμ΄μΈνΈμ transcribe_audio λ©μλκ° μμ΅λλ€.")
        # Read the upload straight into memory; transcribe_audio accepts bytes.
        audio_bytes = audio_file.read()
        stt_result = stt_client.transcribe_audio(audio_bytes, language="ko")
        if not isinstance(stt_result, dict) or not stt_result.get("success"):
            error_msg = stt_result.get("error", "μ μ μλ STT μ€λ₯") if isinstance(stt_result, dict) else "STT κ²°κ³Ό νμ μ€λ₯"
            logger.error(f"μμ±μΈμ μ€ν¨: {error_msg}")
            return jsonify({
                "error": "μμ±μΈμ μ€ν¨",
                "details": error_msg
            }), 500
        transcription = stt_result.get("text", "")
        if not transcription:
            logger.warning("μμ±μΈμ κ²°κ³Όκ° λΉμ΄μμ΅λλ€.")
            return jsonify({"error": "μμ±μμ ν μ€νΈλ₯Ό μΈμνμ§ λͺ»νμ΅λλ€.", "transcription": ""}), 400
        logger.info(f"μμ±μΈμ μ±κ³΅: {transcription[:50]}...")
        # --- From here the flow mirrors /api/chat ---
        search_results = retriever.search(transcription, top_k=5, first_stage_k=6)
        context = DocumentProcessor.prepare_rag_context(search_results, field="text")
        if not context:
            logger.warning("μμ± μΏΌλ¦¬μ λν κ²μ κ²°κ³Ό μμ.")
        llm_id = request.form.get('llm_id', None)  # voice requests may carry llm_id as form data
        if not context:
            answer = "μ£μ‘ν©λλ€. κ΄λ ¨ μ 보λ₯Ό μ°Ύμ μ μμ΅λλ€."
            logger.info("컨ν μ€νΈ μμ΄ κΈ°λ³Έ μλ΅ μμ±")
        else:
            answer = llm_interface.rag_generate(transcription, context, llm_id=llm_id)
            logger.info(f"LLM μλ΅ μμ± μλ£ (κΈΈμ΄: {len(answer)})")
        # Per-hit source attributions (with the CSV row-id heuristic).
        enhanced_sources = []
        if search_results:
            for doc in search_results:
                if not isinstance(doc, dict):
                    continue
                if "source" in doc:
                    source_info = {
                        "source": doc.get("source", "Unknown"),
                        "score": doc.get("rerank_score", doc.get("score", 0))
                    }
                    if "text" in doc and doc.get("filetype") == "csv":
                        try:
                            text_lines = doc["text"].strip().split('\n')
                            if text_lines:
                                first_line = text_lines[0].strip()
                                if ',' in first_line:
                                    source_info["id"] = first_line.split(',')[0].strip()
                        except Exception as e:
                            logger.warning(f"[μμ±μ±] CSV μμ€ ID μΆμΆ μ€ν¨ ({doc.get('source')}): {e}")
                    enhanced_sources.append(source_info)
        response_data = {
            "transcription": transcription,
            "answer": answer,
            "sources": enhanced_sources,
            "llm": llm_interface.get_current_llm_details() if hasattr(llm_interface, 'get_current_llm_details') else {}
        }
        return jsonify(response_data)
    except Exception as e:
        logger.error(f"μμ± μ± μ²λ¦¬ μ€ μ€λ₯ λ°μ: {e}", exc_info=True)
        return jsonify({
            "error": "μμ± μ²λ¦¬ μ€ λ΄λΆ μ€λ₯ λ°μ",
            "details": str(e)
        }), 500
def upload_document():
    """Upload a knowledge-base document, chunk it, and add it to the index.

    Fixes vs. original:
    - ``ALLOWED_DOC_EXTENSIONS`` was referenced here but only defined inside
      allowed_doc_file(), so the rejection path raised NameError; the set is
      now defined locally (keep in sync with allowed_doc_file()).
    - PDF/DOCX files were opened in text mode first; the decode failure
      returned 400 before their (not-yet-implemented) branch could run.
      Binary formats now skip the text read entirely.
    - Filenames lost from log/response strings are restored.
    """
    ALLOWED_DOC_EXTENSIONS = {'txt', 'md', 'pdf', 'docx', 'csv'}
    if not app_ready or base_retriever is None:
        return jsonify({"error": "μ±/κΈ°λ³Έ κ²μκΈ°κ° μμ§ μ΄κΈ°ν μ€μ λλ€."}), 503
    if 'document' not in request.files:
        return jsonify({"error": "λ¬Έμ νμΌμ΄ μ 곡λμ§ μμμ΅λλ€."}), 400
    doc_file = request.files['document']
    if doc_file.filename == '':
        return jsonify({"error": "μ νλ νμΌμ΄ μμ΅λλ€."}), 400
    if not allowed_doc_file(doc_file.filename):
        logger.error(f"νμ©λμ§ μλ νμΌ νμ: {doc_file.filename}")
        return jsonify({"error": f"νμ©λμ§ μλ νμΌ νμμ λλ€. νμ©: {', '.join(ALLOWED_DOC_EXTENSIONS)}"}), 400
    try:
        filename = secure_filename(doc_file.filename)
        filepath = os.path.join(app.config['DATA_FOLDER'], filename)
        doc_file.save(filepath)
        logger.info(f"λ¬Έμ μ μ₯ μλ£: {filepath}")
        # allowed_doc_file() guarantees a dot-extension is present.
        file_ext = filename.rsplit('.', 1)[1].lower()
        content = ""
        if file_ext in ('pdf', 'docx'):
            # Binary formats need dedicated extractors (pypdf / python-docx);
            # reading them as text would fail with a decode error.
            logger.warning(f".{file_ext} νμΌ μ²λ¦¬λ νμ¬ κ΅¬νλμ§ μμμ΅λλ€. ν μ€νΈ μΆμΆ λ‘μ§ μΆκ° νμ.")
        else:
            # Text read with UTF-8 first, CP949 fallback for legacy Korean files.
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                logger.info(f"UTF-8 λμ½λ© μ€ν¨, CP949λ‘ μλ: {filename}")
                try:
                    with open(filepath, 'r', encoding='cp949') as f:
                        content = f.read()
                except Exception as e_cp949:
                    logger.error(f"CP949 λμ½λ© μ€ν¨ ({filename}): {e_cp949}")
                    return jsonify({"error": "νμΌ μΈμ½λ©μ μ½μ μ μμ΅λλ€ (UTF-8, CP949 μλ μ€ν¨)."}), 400
            except Exception as e_read:
                logger.error(f"νμΌ μ½κΈ° μ€λ₯ ({filename}): {e_read}")
                return jsonify({"error": f"νμΌ μ½κΈ° μ€ μ€λ₯ λ°μ: {str(e_read)}"}), 500
        metadata = {
            "source": filename, "filename": filename,
            "filetype": file_ext,
            "filepath": filepath
        }
        docs = []
        if not hasattr(DocumentProcessor, 'csv_to_documents') or not hasattr(DocumentProcessor, 'text_to_documents'):
            raise NotImplementedError("DocumentProcessorμ νμν λ©μλ μμ")
        if file_ext == 'csv':
            logger.info(f"CSV νμΌ μ²λ¦¬ μμ: {filename}")
            docs = DocumentProcessor.csv_to_documents(content, metadata)  # assumed row-level chunking
        elif content:  # plain-text formats with actual content
            logger.info(f"μΌλ° ν μ€νΈ λ¬Έμ μ²λ¦¬ μμ: {filename}")
            docs = DocumentProcessor.text_to_documents(
                content, metadata=metadata,
                chunk_size=512, chunk_overlap=50
            )
        if docs:
            if not hasattr(base_retriever, 'add_documents') or not hasattr(base_retriever, 'save'):
                raise NotImplementedError("κΈ°λ³Έ κ²μκΈ°μ add_documents λλ save λ©μλ μμ")
            logger.info(f"{len(docs)}κ° λ¬Έμ μ²ν¬λ₯Ό κ²μκΈ°μ μΆκ°ν©λλ€...")
            base_retriever.add_documents(docs)
            # NOTE(review): the index is persisted on every upload — may be
            # inefficient for bulk uploads, and any rewrapped retriever may
            # also need refreshing; confirm against the retriever's design.
            logger.info(f"κ²μκΈ° μνλ₯Ό μ μ₯ν©λλ€...")
            index_path = app.config['INDEX_PATH']
            try:
                base_retriever.save(index_path)
                logger.info("μΈλ±μ€ μ μ₯ μλ£")
                return jsonify({
                    "success": True,
                    "message": f"νμΌ '{filename}' μ λ‘λ λ° μ²λ¦¬ μλ£ ({len(docs)}κ° μ²ν¬ μΆκ°)."
                })
            except Exception as e_save:
                logger.error(f"μΈλ±μ€ μ μ₯ μ€ μ€λ₯ λ°μ: {e_save}")
                return jsonify({"error": f"μΈλ±μ€ μ μ₯ μ€ μ€λ₯: {str(e_save)}"}), 500
        logger.warning(f"νμΌ '{filename}'μμ μ²λ¦¬ν λ΄μ©μ΄ μκ±°λ μ§μλμ§ μλ νμμ λλ€.")
        # The file itself was saved; report a warning rather than an error.
        return jsonify({
            "warning": True,
            "message": f"νμΌ '{filename}'μ΄ μ μ₯λμμ§λ§ μ²λ¦¬ν λ΄μ©μ΄ μμ΅λλ€."
        })
    except Exception as e:
        logger.error(f"νμΌ μ λ‘λ λλ μ²λ¦¬ μ€ μ€λ₯ λ°μ: {e}", exc_info=True)
        return jsonify({"error": f"νμΌ μ λ‘λ μ€ μ€λ₯: {str(e)}"}), 500
def list_documents():
    """Return a per-source summary of the chunks held by the base retriever."""
    if not app_ready or base_retriever is None:
        return jsonify({"error": "μ±/κΈ°λ³Έ κ²μκΈ°κ° μμ§ μ΄κΈ°ν μ€μ λλ€."}), 503
    try:
        summary = {}
        chunk_count = 0
        all_chunks = getattr(base_retriever, 'documents', None)
        if all_chunks:
            logger.info(f"μ΄ {len(all_chunks)}κ° λ¬Έμ μ²ν¬μμ μμ€ λͺ©λ‘ μμ± μ€...")
            for chunk in all_chunks:
                # Chunks are expected to be plain dicts; skip anything else.
                if not isinstance(chunk, dict):
                    continue
                meta = chunk.get("metadata")
                if not isinstance(meta, dict):
                    meta = None
                src = chunk.get("source", "unknown")
                if src == "unknown" and meta is not None:
                    # Langchain-style documents keep source under metadata.
                    src = meta.get("source", "unknown")
                if src == "unknown":
                    continue
                if src in summary:
                    summary[src]["chunks"] += 1
                else:
                    fname = chunk.get("filename", src)
                    ftype = chunk.get("filetype", "unknown")
                    if meta is not None:
                        fname = meta.get("filename", fname)
                        ftype = meta.get("filetype", ftype)
                    summary[src] = {"filename": fname, "chunks": 1, "filetype": ftype}
                chunk_count += 1
        else:
            logger.info("κ²μκΈ°μ λ¬Έμκ° μκ±°λ documents μμ±μ μ°Ύμ μ μμ΅λλ€.")
        # Flatten and order by chunk count, largest first.
        documents = [{"source": src, **info} for src, info in summary.items()]
        documents.sort(key=lambda item: item["chunks"], reverse=True)
        logger.info(f"λ¬Έμ λͺ©λ‘ μ‘°ν μλ£: {len(documents)}κ° μμ€ νμΌ, {chunk_count}κ° μ²ν¬")
        return jsonify({
            "documents": documents,
            "total_documents": len(documents),
            "total_chunks": chunk_count
        })
    except Exception as e:
        logger.error(f"λ¬Έμ λͺ©λ‘ μ‘°ν μ€ μ€λ₯ λ°μ: {e}", exc_info=True)
        return jsonify({"error": f"λ¬Έμ λͺ©λ‘ μ‘°ν μ€ μ€λ₯: {str(e)}"}), 500