Spaces:
Sleeping
Sleeping
fix
Browse files- app/app.py +155 -209
app/app.py
CHANGED
@@ -7,15 +7,11 @@ import json
|
|
7 |
import logging
|
8 |
import tempfile
|
9 |
import threading
|
10 |
-
|
11 |
from flask import Flask, request, jsonify, render_template, send_from_directory, session, redirect, url_for
|
12 |
from werkzeug.utils import secure_filename
|
13 |
from dotenv import load_dotenv
|
14 |
-
from functools import
|
15 |
-
import pickle
|
16 |
-
import os
|
17 |
-
import gzip
|
18 |
-
from datetime import datetime
|
19 |
|
20 |
# 로거 설정
|
21 |
logging.basicConfig(
|
@@ -147,133 +143,81 @@ def allowed_doc_file(filename):
|
|
147 |
# --- 헬퍼 함수 끝 ---
|
148 |
|
149 |
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
Args:
|
154 |
-
base_retriever: ์ ์ฅํ ๊ฒ์๊ธฐ ๊ฐ์ฒด
|
155 |
-
file_path: ์ ์ฅํ ํ์ผ ๊ฒฝ๋ก
|
156 |
-
|
157 |
-
Returns:
|
158 |
-
bool: ์ ์ฅ ์ฑ๊ณต ์ฌ๋ถ
|
159 |
-
"""
|
160 |
-
try:
|
161 |
-
# ์ ์ฅ ๋๋ ํ ๋ฆฌ๊ฐ ์์ผ๋ฉด ์์ฑ
|
162 |
-
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
163 |
-
|
164 |
-
# ํ์์คํฌํ ์ถ๊ฐ
|
165 |
-
save_data = {
|
166 |
-
'timestamp': datetime.now().isoformat(),
|
167 |
-
'retriever': base_retriever
|
168 |
-
}
|
169 |
-
|
170 |
-
# ์์ถํ์ฌ ์ ์ฅ (์ฉ๋ ์ค์ด๊ธฐ)
|
171 |
-
with gzip.open(file_path, 'wb') as f:
|
172 |
-
pickle.dump(save_data, f)
|
173 |
-
|
174 |
-
logger.info(f"์๋ฒ ๋ฉ ๋ฐ์ดํฐ๋ฅผ {file_path}์ ์์ถํ์ฌ ์ ์ฅํ์ต๋๋ค.")
|
175 |
-
return True
|
176 |
-
except Exception as e:
|
177 |
-
logger.error(f"์๋ฒ ๋ฉ ์ ์ฅ ์ค ์ค๋ฅ ๋ฐ์: {e}")
|
178 |
-
return False
|
179 |
-
|
180 |
-
def load_embeddings(file_path, max_age_days=30):
|
181 |
-
"""์ ์ฅ๋ ์๋ฒ ๋ฉ ๋ฐ์ดํฐ๋ฅผ ํ์ผ์์ ๋ก๋
|
182 |
-
|
183 |
-
Args:
|
184 |
-
file_path: ๋ก๋ํ ํ์ผ ๊ฒฝ๋ก
|
185 |
-
max_age_days: ์ต๋ ํ์ฉ ๊ฒฝ๊ณผ ์ผ์ (๊ธฐ๋ณธ๊ฐ: 30์ผ)
|
186 |
-
|
187 |
-
Returns:
|
188 |
-
object or None: ๋ก๋๋ ๊ฒ์๊ธฐ ๊ฐ์ฒด ๋๋ ์คํจ ์ None
|
189 |
-
"""
|
190 |
-
try:
|
191 |
-
if not os.path.exists(file_path):
|
192 |
-
logger.info(f"์ ์ฅ๋ ์๋ฒ ๋ฉ ํ์ผ({file_path})์ด ์์ต๋๋ค.")
|
193 |
-
return None
|
194 |
-
|
195 |
-
# ์์ถ ํ์ผ ๋ก๋
|
196 |
-
with gzip.open(file_path, 'rb') as f:
|
197 |
-
data = pickle.load(f)
|
198 |
-
|
199 |
-
# ํ์์คํฌํ ํ์ธ (๋๋ฌด ์ค๋๋ ๋ฐ์ดํฐ๋ ์ฌ์ฉํ์ง ์์)
|
200 |
-
saved_time = datetime.fromisoformat(data['timestamp'])
|
201 |
-
age = (datetime.now() - saved_time).days
|
202 |
-
|
203 |
-
if age > max_age_days:
|
204 |
-
logger.info(f"์ ์ฅ๋ ์๋ฒ ๋ฉ์ด {age}์ผ๋ก ๋๋ฌด ์ค๋๋์์ต๋๋ค. ์๋ก ์์ฑํฉ๋๋ค.")
|
205 |
-
return None
|
206 |
-
|
207 |
-
logger.info(f"{file_path}์์ ์๋ฒ ๋ฉ ๋ฐ์ดํฐ๋ฅผ ๋ก๋ํ์ต๋๋ค. (์์ฑ์ผ: {saved_time})")
|
208 |
-
return data['retriever']
|
209 |
-
except Exception as e:
|
210 |
-
logger.error(f"์๋ฒ ๋ฉ ๋ก๋ ์ค ์ค๋ฅ ๋ฐ์: {e}")
|
211 |
-
return None
|
212 |
-
|
213 |
def init_retriever():
|
214 |
"""๊ฒ์๊ธฐ ๊ฐ์ฒด ์ด๊ธฐํ ๋๋ ๋ก๋"""
|
215 |
global base_retriever, retriever
|
216 |
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
try:
|
233 |
-
logger.info(f"๊ธฐ์กด ๋ฒกํฐ ์ธ๋ฑ์ค๋ฅผ '{index_path}'์์ ๋ก๋ํฉ๋๋ค...")
|
234 |
-
base_retriever = VectorRetriever.load(index_path)
|
235 |
-
logger.info(f"{len(base_retriever.documents) if hasattr(base_retriever, 'documents') else 0}๊ฐ ๋ฌธ์๊ฐ ๋ก๋๋์์ต๋๋ค.")
|
236 |
-
except Exception as e:
|
237 |
-
logger.error(f"์ธ๋ฑ์ค ๋ก๋ ์ค ์ค๋ฅ ๋ฐ์: {e}. ์ ๊ฒ์๊ธฐ๋ฅผ ์ด๊ธฐํํฉ๋๋ค.")
|
238 |
base_retriever = VectorRetriever()
|
239 |
-
|
240 |
-
|
|
|
|
|
|
|
|
|
|
|
241 |
base_retriever = VectorRetriever()
|
|
|
|
|
|
|
|
|
242 |
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
if (not hasattr(base_retriever, 'documents') or not base_retriever.documents) and os.path.exists(data_path):
|
247 |
-
logger.info(f"{data_path}์์ ๋ฌธ์๋ฅผ ๋ก๋ํฉ๋๋ค...")
|
248 |
-
try:
|
249 |
-
docs = DocumentProcessor.load_documents_from_directory(
|
250 |
-
data_path,
|
251 |
-
extensions=[".txt", ".md", ".csv"], # .pdf, .docx ๋ฑ์ ๋ณ๋ ์ฒ๋ฆฌ ํ์
|
252 |
-
recursive=True
|
253 |
-
)
|
254 |
-
if docs and hasattr(base_retriever, 'add_documents'):
|
255 |
-
logger.info(f"{len(docs)}๊ฐ ๋ฌธ์๋ฅผ ๊ฒ์๊ธฐ์ ์ถ๊ฐํฉ๋๋ค...")
|
256 |
-
base_retriever.add_documents(docs)
|
257 |
-
|
258 |
-
if hasattr(base_retriever, 'save'):
|
259 |
-
logger.info(f"๊ฒ์๊ธฐ ์ํ๋ฅผ '{index_path}'์ ์ ์ฅํฉ๋๋ค...")
|
260 |
-
try:
|
261 |
-
base_retriever.save(index_path)
|
262 |
-
logger.info("์ธ๋ฑ์ค ์ ์ฅ ์๋ฃ")
|
263 |
-
|
264 |
-
# ์๋ก ์์ฑ๋ ๊ฒ์๊ธฐ ์บ์ฑ
|
265 |
-
if hasattr(base_retriever, 'documents') and base_retriever.documents:
|
266 |
-
save_embeddings(base_retriever, cache_path)
|
267 |
-
logger.info(f"๊ฒ์๊ธฐ๋ฅผ ์บ์ ํ์ผ {cache_path}์ ์ ์ฅ ์๋ฃ")
|
268 |
-
except Exception as e:
|
269 |
-
logger.error(f"์ธ๋ฑ์ค ์ ์ฅ ์ค ์ค๋ฅ ๋ฐ์: {e}")
|
270 |
-
except Exception as e:
|
271 |
-
logger.error(f"DATA_FOLDER์์ ๋ฌธ์ ๋ก๋ ์ค ์ค๋ฅ: {e}")
|
272 |
|
273 |
-
#
|
274 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
try:
|
276 |
-
#
|
|
|
277 |
def custom_rerank_fn(query, results):
|
278 |
query_terms = set(query.lower().split())
|
279 |
for result in results:
|
@@ -283,94 +227,105 @@ def init_retriever():
|
|
283 |
normalized_score = term_freq / (len(text.split()) + 1) * 10
|
284 |
result["rerank_score"] = result.get("score", 0) * 0.7 + normalized_score * 0.3
|
285 |
elif isinstance(result, dict):
|
286 |
-
|
287 |
-
# ๊ฒฐ๊ณผ ํ์์ด ๋ค๋ฅผ ๊ฒฝ์ฐ ์ฒ๋ฆฌ ํ์
|
288 |
results.sort(key=lambda x: x.get("rerank_score", 0) if isinstance(x, dict) else 0, reverse=True)
|
289 |
return results
|
|
|
290 |
|
291 |
# ReRanker 클래스 사용
|
292 |
retriever = ReRanker(
|
293 |
base_retriever=base_retriever,
|
294 |
-
rerank_fn=custom_rerank_fn, #
|
295 |
-
rerank_field="text"
|
296 |
)
|
297 |
-
logger.info("์ฌ์์ํ ๊ฒ์๊ธฐ ์ด๊ธฐํ
|
298 |
-
except Exception as
|
299 |
-
logger.error(f"์ฌ์์ํ ๊ฒ์๊ธฐ ์ด๊ธฐํ ์คํจ: {
|
300 |
-
|
|
|
301 |
|
|
|
302 |
return retriever
|
303 |
|
304 |
def background_init():
|
305 |
"""๋ฐฑ๊ทธ๋ผ์ด๋์์ ๊ฒ์๊ธฐ ์ด๊ธฐํ ์ํ"""
|
306 |
-
global app_ready, retriever, base_retriever
|
307 |
-
|
308 |
-
|
309 |
-
app_ready = True
|
310 |
-
logger.info("์ฑ์ ์ฆ์ ์ฌ์ฉ ๊ฐ๋ฅ ์ํ๋ก ์ค์ (app_ready=True)")
|
311 |
-
|
312 |
try:
|
313 |
-
|
314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
base_retriever = MockComponent()
|
316 |
-
if hasattr(base_retriever, 'documents'):
|
317 |
-
base_retriever.documents = []
|
318 |
-
|
319 |
-
# ์์ retriever ์ค์ (๋น ๋ฅธ ์์์ ์ํด)
|
320 |
-
if retriever is None:
|
321 |
retriever = MockComponent()
|
322 |
-
if not hasattr(retriever, 'search'):
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
# ์บ์๋ ์๋ฒ ๋ฉ ๋ก๋ ์๋ (๋น ๋ฅธ ์์์ ์ํด)
|
329 |
-
cached_retriever = load_embeddings(cache_path)
|
330 |
-
|
331 |
-
if cached_retriever:
|
332 |
-
# ์บ์๋ ๋ฐ์ดํฐ๊ฐ ์์ผ๋ฉด ๋ฐ๋ก ์ฌ์ฉ
|
333 |
-
base_retriever = cached_retriever
|
334 |
-
|
335 |
-
# ๊ฐ๋จํ ์ฌ์์ํ ํจ์
|
336 |
-
def simple_rerank(query, results):
|
337 |
-
# ๊ฒฐ๊ณผ ์ ์ ์ ์งํ๋ฉด์ ์ ๋ ฌ
|
338 |
-
if results:
|
339 |
-
for result in results:
|
340 |
-
if isinstance(result, dict):
|
341 |
-
result["rerank_score"] = result.get("score", 0)
|
342 |
-
results.sort(key=lambda x: x.get("rerank_score", 0) if isinstance(x, dict) else 0, reverse=True)
|
343 |
-
return results
|
344 |
-
|
345 |
-
# ์ฌ์์ํ ๊ฒ์๊ธฐ ์ด๊ธฐํ
|
346 |
-
retriever = ReRanker(
|
347 |
-
base_retriever=base_retriever,
|
348 |
-
rerank_fn=simple_rerank,
|
349 |
-
rerank_field="text"
|
350 |
-
)
|
351 |
-
|
352 |
-
logger.info("์บ์๋ ์๋ฒ ๋ฉ์ผ๋ก ๊ฒ์๊ธฐ ์ด๊ธฐํ ์๋ฃ (๋น ๋ฅธ ์์)")
|
353 |
-
else:
|
354 |
-
# ์บ์๋ ๋ฐ์ดํฐ๊ฐ ์์ผ๋ฉด ์ ์ฒด ์ด๊ธฐํ ์งํ
|
355 |
-
logger.info("์บ์๋ ์๋ฒ ๋ฉ์ด ์์ด ์ ์ฒด ์ด๊ธฐํ ์์")
|
356 |
-
# ์๊ฐ์ด ์ค๋ ๊ฑธ๋ฆด ์ ์๋ ์์
์ด๋ฏ๋ก ๋ณ๋ ์ค๋ ๋๋ก ์คํํ ์๋ ์์
|
357 |
-
retriever = init_retriever()
|
358 |
-
logger.info("์ ์ฒด ์ด๊ธฐํ ์๋ฃ")
|
359 |
-
|
360 |
-
logger.info("์ฑ ์ด๊ธฐํ ์๋ฃ (๋ชจ๋ ์ปดํฌ๋ํธ ์ค๋น๋จ)")
|
361 |
except Exception as e:
|
362 |
logger.error(f"์ฑ ๋ฐฑ๊ทธ๋ผ์ด๋ ์ด๊ธฐํ ์ค ์ฌ๊ฐํ ์ค๋ฅ ๋ฐ์: {e}", exc_info=True)
|
363 |
-
#
|
364 |
-
if base_retriever is None:
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
|
375 |
# 백그라운드 스레드 시작 부분은 그대로 유지
|
376 |
init_thread = threading.Thread(target=background_init)
|
@@ -780,7 +735,7 @@ def voice_chat():
|
|
780 |
"details": str(e)
|
781 |
}), 500
|
782 |
|
783 |
-
|
784 |
@app.route('/api/upload', methods=['POST'])
|
785 |
@login_required
|
786 |
def upload_document():
|
@@ -823,6 +778,7 @@ def upload_document():
|
|
823 |
logger.error(f"ํ์ผ ์ฝ๊ธฐ ์ค๋ฅ ({filename}): {e_read}")
|
824 |
return jsonify({"error": f"ํ์ผ ์ฝ๊ธฐ ์ค ์ค๋ฅ ๋ฐ์: {str(e_read)}"}), 500
|
825 |
|
|
|
826 |
# 메타데이터 및 문서 분할/처리
|
827 |
metadata = {
|
828 |
"source": filename, "filename": filename,
|
@@ -863,30 +819,20 @@ def upload_document():
|
|
863 |
logger.info(f"{len(docs)}๊ฐ ๋ฌธ์ ์ฒญํฌ๋ฅผ ๊ฒ์๊ธฐ์ ์ถ๊ฐํฉ๋๋ค...")
|
864 |
base_retriever.add_documents(docs)
|
865 |
|
866 |
-
# ์ธ๋ฑ์ค ์ ์ฅ
|
867 |
logger.info(f"๊ฒ์๊ธฐ ์ํ๋ฅผ ์ ์ฅํฉ๋๋ค...")
|
868 |
index_path = app.config['INDEX_PATH']
|
869 |
try:
|
870 |
-
# ๊ธฐ์กด ์ธ๋ฑ์ค ์ ์ฅ
|
871 |
base_retriever.save(index_path)
|
872 |
logger.info("์ธ๋ฑ์ค ์ ์ฅ ์๋ฃ")
|
873 |
-
|
874 |
-
#
|
875 |
-
cache_path = os.path.join(app.config['INDEX_PATH'], "cached_embeddings.gz")
|
876 |
-
if save_embeddings(base_retriever, cache_path):
|
877 |
-
logger.info("์๋ฒ ๋ฉ ์บ์ ์
๋ฐ์ดํธ ์๋ฃ")
|
878 |
-
|
879 |
-
# ์ฌ์์ํ ๊ฒ์๊ธฐ๋ ์
๋ฐ์ดํธ ํ์ ์ (๊ธฐ๋ณธ ๊ฒ์๊ธฐ๊ฐ ๋ณ๊ฒฝ๋์์ผ๋ฏ๋ก)
|
880 |
-
if hasattr(retriever, 'base_retriever'):
|
881 |
-
retriever.base_retriever = base_retriever
|
882 |
-
logger.info("์ฌ์์ํ ๊ฒ์๊ธฐ์ ๊ธฐ๋ณธ ๊ฒ์๊ธฐ ์
๋ฐ์ดํธ ์๋ฃ")
|
883 |
-
|
884 |
return jsonify({
|
885 |
"success": True,
|
886 |
"message": f"ํ์ผ '{filename}' ์
๋ก๋ ๋ฐ ์ฒ๋ฆฌ ์๋ฃ ({len(docs)}๊ฐ ์ฒญํฌ ์ถ๊ฐ)."
|
887 |
})
|
888 |
except Exception as e_save:
|
889 |
-
logger.error(f"์ธ๋ฑ์ค ์ ์ฅ
|
890 |
return jsonify({"error": f"์ธ๋ฑ์ค ์ ์ฅ ์ค ์ค๋ฅ: {str(e_save)}"}), 500
|
891 |
else:
|
892 |
logger.warning(f"ํ์ผ '{filename}'์์ ์ฒ๋ฆฌํ ๋ด์ฉ์ด ์๊ฑฐ๋ ์ง์๋์ง ์๋ ํ์์
๋๋ค.")
|
|
|
7 |
import logging
|
8 |
import tempfile
|
9 |
import threading
|
10 |
+
import datetime
|
11 |
from flask import Flask, request, jsonify, render_template, send_from_directory, session, redirect, url_for
|
12 |
from werkzeug.utils import secure_filename
|
13 |
from dotenv import load_dotenv
|
14 |
+
from functools import wraps
|
|
|
|
|
|
|
|
|
15 |
|
16 |
# 로거 설정
|
17 |
logging.basicConfig(
|
|
|
143 |
# --- 헬퍼 함수 끝 ---
|
144 |
|
145 |
|
146 |
+
# init_retriever ํจ์ ๋ด๋ถ์ ๋ก๊น
์ถ๊ฐ ์์
|
147 |
+
# --- 검색기 초기화 관련 함수 ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
def init_retriever():
|
149 |
"""๊ฒ์๊ธฐ ๊ฐ์ฒด ์ด๊ธฐํ ๋๋ ๋ก๋"""
|
150 |
global base_retriever, retriever
|
151 |
|
152 |
+
index_path = app.config['INDEX_PATH']
|
153 |
+
data_path = app.config['DATA_FOLDER'] # data_path ์ ์ ํ์ธ
|
154 |
+
logger.info("--- init_retriever ์์ ---")
|
155 |
+
|
156 |
+
# 1. 기본 검색기 로드 또는 초기화
|
157 |
+
# ... (VectorRetriever ๋ก๋ ๋๋ ์ด๊ธฐํ ๋ก์ง์ ์ด์ ๊ณผ ๋์ผํ๊ฒ ์ ์ง) ...
|
158 |
+
# VectorRetriever ์ด๊ธฐํ/๋ก๋ ์คํจ ์ base_retriever = None ๋ฐ return None ์ฒ๋ฆฌ ํฌํจ
|
159 |
+
if os.path.exists(os.path.join(index_path, "documents.json")):
|
160 |
+
try:
|
161 |
+
logger.info(f"์ธ๋ฑ์ค ๋ก๋ ์๋: {index_path}")
|
162 |
+
base_retriever = VectorRetriever.load(index_path)
|
163 |
+
logger.info(f"์ธ๋ฑ์ค ๋ก๋ ์ฑ๊ณต. ๋ฌธ์ {len(getattr(base_retriever, 'documents', []))}๊ฐ")
|
164 |
+
except Exception as e:
|
165 |
+
logger.error(f"์ธ๋ฑ์ค ๋ก๋ ์คํจ: {e}", exc_info=True)
|
166 |
+
logger.info("์ VectorRetriever ์ด๊ธฐํ ์๋...")
|
167 |
try:
|
|
|
|
|
|
|
|
|
|
|
168 |
base_retriever = VectorRetriever()
|
169 |
+
logger.info("์ VectorRetriever ์ด๊ธฐํ ์ฑ๊ณต.")
|
170 |
+
except Exception as e_init:
|
171 |
+
logger.error(f"์ VectorRetriever ์ด๊ธฐํ ์คํจ: {e_init}", exc_info=True)
|
172 |
+
base_retriever = None
|
173 |
+
else:
|
174 |
+
logger.info("์ธ๋ฑ์ค ํ์ผ ์์. ์ VectorRetriever ์ด๊ธฐํ ์๋...")
|
175 |
+
try:
|
176 |
base_retriever = VectorRetriever()
|
177 |
+
logger.info("์ VectorRetriever ์ด๊ธฐํ ์ฑ๊ณต.")
|
178 |
+
except Exception as e_init:
|
179 |
+
logger.error(f"์ VectorRetriever ์ด๊ธฐํ ์คํจ: {e_init}", exc_info=True)
|
180 |
+
base_retriever = None
|
181 |
|
182 |
+
if base_retriever is None:
|
183 |
+
logger.error("base_retriever ์ด๊ธฐํ/๋ก๋์ ์คํจํ์ฌ init_retriever ์ค๋จ.")
|
184 |
+
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
|
186 |
+
# 2. 데이터 폴더 문서 로드 (기본 검색기가 비어있을 때)
|
187 |
+
needs_loading = (not hasattr(base_retriever, 'documents') or not getattr(base_retriever, 'documents', None)) # None ์ฒดํฌ ์ถ๊ฐ
|
188 |
+
if needs_loading and os.path.exists(data_path):
|
189 |
+
logger.info(f"๊ธฐ๋ณธ ๊ฒ์๊ธฐ๊ฐ ๋น์ด์์ด {data_path}์์ ๋ฌธ์ ๋ก๋ ์๋...")
|
190 |
+
try:
|
191 |
+
# ================== 수정된 부분 1 시작 ==================
|
192 |
+
# DocumentProcessor.load_documents_from_directory ํธ์ถ ์ ์ฌ๋ฐ๋ฅธ ์ธ์ ์ ๋ฌ
|
193 |
+
docs = DocumentProcessor.load_documents_from_directory(
|
194 |
+
directory=data_path, # <-- ๊ฒฝ๋ก ๋ณ์ ์ฌ์ฉ
|
195 |
+
extensions=[".txt", ".md", ".csv"], # <-- ํ์ํ ํ์ฅ์ ์ ๋ฌ
|
196 |
+
recursive=True # <-- ์ฌ๊ท ํ์ ์ฌ๋ถ ์ ๋ฌ
|
197 |
+
)
|
198 |
+
# ================== 수정된 부분 1 끝 ====================
|
199 |
+
logger.info(f"{len(docs)}๊ฐ ๋ฌธ์ ๋ก๋ ์ฑ๊ณต.")
|
200 |
+
if docs and hasattr(base_retriever, 'add_documents'):
|
201 |
+
logger.info("๊ฒ์๊ธฐ์ ๋ฌธ์ ์ถ๊ฐ ์๋...")
|
202 |
+
base_retriever.add_documents(docs)
|
203 |
+
logger.info("๋ฌธ์ ์ถ๊ฐ ์๋ฃ.")
|
204 |
+
|
205 |
+
if hasattr(base_retriever, 'save'):
|
206 |
+
logger.info(f"๊ฒ์๊ธฐ ์ํ ์ ์ฅ ์๋: {index_path}")
|
207 |
+
try:
|
208 |
+
base_retriever.save(index_path)
|
209 |
+
logger.info("์ธ๋ฑ์ค ์ ์ฅ ์๋ฃ.")
|
210 |
+
except Exception as e_save:
|
211 |
+
logger.error(f"์ธ๋ฑ์ค ์ ์ฅ ์คํจ: {e_save}", exc_info=True)
|
212 |
+
except Exception as e_load_add:
|
213 |
+
# load_documents_from_directory 자체에서 오류가 날 수도 있음 (권한 등)
|
214 |
+
logger.error(f"DATA_FOLDER ๋ฌธ์ ๋ก๋/์ถ๊ฐ ์ค ์ค๋ฅ: {e_load_add}", exc_info=True)
|
215 |
+
|
216 |
+
# 3. 재순위화 검색기 초기화
|
217 |
+
logger.info("์ฌ์์ํ ๊ฒ์๊ธฐ ์ด๊ธฐํ ์๋...")
|
218 |
try:
|
219 |
+
# ================== 수정된 부분 2 시작 ==================
|
220 |
+
# custom_rerank_fn 함수를 ReRanker 초기화 전에 정의
|
221 |
def custom_rerank_fn(query, results):
|
222 |
query_terms = set(query.lower().split())
|
223 |
for result in results:
|
|
|
227 |
normalized_score = term_freq / (len(text.split()) + 1) * 10
|
228 |
result["rerank_score"] = result.get("score", 0) * 0.7 + normalized_score * 0.3
|
229 |
elif isinstance(result, dict):
|
230 |
+
result["rerank_score"] = result.get("score", 0)
|
|
|
231 |
results.sort(key=lambda x: x.get("rerank_score", 0) if isinstance(x, dict) else 0, reverse=True)
|
232 |
return results
|
233 |
+
# ================== 수정된 부분 2 끝 ====================
|
234 |
|
235 |
# ReRanker 클래스 사용
|
236 |
retriever = ReRanker(
|
237 |
base_retriever=base_retriever,
|
238 |
+
rerank_fn=custom_rerank_fn, # ์ด์ ํจ์๊ฐ ์ ์๋์์ผ๋ฏ๋ก ์ฌ์ฉ ๊ฐ๋ฅ
|
239 |
+
rerank_field="text"
|
240 |
)
|
241 |
+
logger.info("์ฌ์์ํ ๊ฒ์๊ธฐ ์ด๊ธฐํ ์๋ฃ.")
|
242 |
+
except Exception as e_rerank:
|
243 |
+
logger.error(f"์ฌ์์ํ ๊ฒ์๊ธฐ ์ด๊ธฐํ ์คํจ: {e_rerank}", exc_info=True)
|
244 |
+
logger.warning("์ฌ์์ํ ์คํจ, ๊ธฐ๋ณธ ๊ฒ์๊ธฐ๋ฅผ retriever๋ก ์ฌ์ฉํฉ๋๋ค.")
|
245 |
+
retriever = base_retriever # fallback
|
246 |
|
247 |
+
logger.info("--- init_retriever ์ข
๋ฃ ---")
|
248 |
return retriever
|
249 |
|
250 |
def background_init():
|
251 |
"""๋ฐฑ๊ทธ๋ผ์ด๋์์ ๊ฒ์๊ธฐ ์ด๊ธฐํ ์ํ"""
|
252 |
+
global app_ready, retriever, base_retriever, llm_interface, stt_client
|
253 |
+
|
254 |
+
temp_app_ready = False # ์์ ์ํ ํ๋๊ทธ
|
|
|
|
|
|
|
255 |
try:
|
256 |
+
logger.info("๋ฐฑ๊ทธ๋ผ์ด๋ ์ด๊ธฐํ ์์...")
|
257 |
+
|
258 |
+
# 1. LLM, STT 인터페이스 초기화 (필요 시)
|
259 |
+
if llm_interface is None or isinstance(llm_interface, MockComponent):
|
260 |
+
if 'LLMInterface' in globals() and LLMInterface != MockComponent:
|
261 |
+
llm_interface = LLMInterface(default_llm="openai")
|
262 |
+
logger.info("LLM ์ธํฐํ์ด์ค ์ด๊ธฐํ ์๋ฃ.")
|
263 |
+
else:
|
264 |
+
logger.warning("LLMInterface ํด๋์ค ์์. Mock ์ฌ์ฉ.")
|
265 |
+
llm_interface = MockComponent() # Mock ๊ฐ์ฒด ๋ณด์ฅ
|
266 |
+
if stt_client is None or isinstance(stt_client, MockComponent):
|
267 |
+
if 'VitoSTT' in globals() and VitoSTT != MockComponent:
|
268 |
+
stt_client = VitoSTT()
|
269 |
+
logger.info("STT ํด๋ผ์ด์ธํธ ์ด๊ธฐํ ์๋ฃ.")
|
270 |
+
else:
|
271 |
+
logger.warning("VitoSTT ํด๋์ค ์์. Mock ์ฌ์ฉ.")
|
272 |
+
stt_client = MockComponent() # Mock ๊ฐ์ฒด ๋ณด์ฅ
|
273 |
+
|
274 |
+
|
275 |
+
# 2. 검색기 초기화
|
276 |
+
if 'VectorRetriever' in globals() and VectorRetriever != MockComponent:
|
277 |
+
logger.info("์ค์ ๊ฒ์๊ธฐ ์ด๊ธฐํ ์๋...")
|
278 |
+
# init_retriever๊ฐ base_retriever์ retriever๋ฅผ ๋ชจ๋ ์ค์ ํ๋ค๊ณ ๊ฐ์
|
279 |
+
retriever = init_retriever()
|
280 |
+
# init_retriever ๋ด๋ถ์์ base_retriever๊ฐ ์ค์ ๋์ง ์์๋ค๋ฉด ์ฌ๊ธฐ์ ์ค์
|
281 |
+
if hasattr(retriever, 'base_retriever') and base_retriever is None:
|
282 |
+
base_retriever = retriever.base_retriever
|
283 |
+
elif base_retriever is None:
|
284 |
+
# retriever๊ฐ base_retriever๋ฅผ ํฌํจํ์ง ์๋ ๊ฒฝ์ฐ ๋๋ ReRanker๊ฐ ์๋ ๊ฒฝ์ฐ
|
285 |
+
# init_retriever์์ base_retriever๋ฅผ ์ง์ ์ค์ ํ๋๋ก ํ๊ฑฐ๋, ์ฌ๊ธฐ์ ๋ณ๋ ๋ก์ง ํ์
|
286 |
+
# ์์: base_retriever = VectorRetriever.load(...) ๋๋ VectorRetriever()
|
287 |
+
logger.warning("init_retriever ํ base_retriever๊ฐ ์ค์ ๋์ง ์์. ํ์ธ ํ์.")
|
288 |
+
# ์์๋ก retriever ์์ฒด๋ฅผ base_retriever๋ก ์ค์ (๋์ผ ๊ฐ์ฒด์ผ ๊ฒฝ์ฐ)
|
289 |
+
if isinstance(retriever, VectorRetriever):
|
290 |
+
base_retriever = retriever
|
291 |
+
|
292 |
+
# ์ฑ๊ณต์ ์ผ๋ก ์ด๊ธฐํ ๋์๋์ง ํ์ธ (None์ด ์๋์ง)
|
293 |
+
if retriever is not None and base_retriever is not None:
|
294 |
+
logger.info("๊ฒ์๊ธฐ (Retriever, Base Retriever) ์ด๊ธฐํ ์ฑ๊ณต")
|
295 |
+
temp_app_ready = True # ์ด๊ธฐํ ์ฑ๊ณต ์์๋ง True ์ค์
|
296 |
+
else:
|
297 |
+
logger.error("๊ฒ์๊ธฐ ์ด๊ธฐํ ํ์๋ retriever ๋๋ base_retriever๊ฐ None์
๋๋ค.")
|
298 |
+
# 실패 시 Mock 객체 할당 (최소한의 동작 보장)
|
299 |
+
if base_retriever is None: base_retriever = MockComponent()
|
300 |
+
if retriever is None: retriever = MockComponent()
|
301 |
+
if not hasattr(retriever, 'search'): retriever.search = lambda query, **kwargs: []
|
302 |
+
if not hasattr(base_retriever, 'documents'): base_retriever.documents = []
|
303 |
+
# temp_app_ready = False ๋๋ True (์ ์ฑ
์ ๋ฐ๋ผ ๊ฒฐ์ )
|
304 |
+
temp_app_ready = True # ์ผ๋จ ์ฑ์ ์คํ๋๋๋ก ์ค์
|
305 |
+
|
306 |
+
else:
|
307 |
+
logger.warning("VectorRetriever ํด๋์ค ์์. Mock ๊ฒ์๊ธฐ ์ฌ์ฉ.")
|
308 |
base_retriever = MockComponent()
|
|
|
|
|
|
|
|
|
|
|
309 |
retriever = MockComponent()
|
310 |
+
if not hasattr(retriever, 'search'): retriever.search = lambda query, **kwargs: []
|
311 |
+
if not hasattr(base_retriever, 'documents'): base_retriever.documents = []
|
312 |
+
temp_app_ready = True # Mock์ด๋ผ๋ ์ค๋น๋ ๋ ๊ฒ์ผ๋ก ๊ฐ์ฃผ
|
313 |
+
|
314 |
+
logger.info(f"๋ฐฑ๊ทธ๋ผ์ด๋ ์ด๊ธฐํ ์๋ฃ. ์ต์ข
์ํ: {'Ready' if temp_app_ready else 'Not Ready (Error during init)'}")
|
315 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
316 |
except Exception as e:
|
317 |
logger.error(f"์ฑ ๋ฐฑ๊ทธ๋ผ์ด๋ ์ด๊ธฐํ ์ค ์ฌ๊ฐํ ์ค๋ฅ ๋ฐ์: {e}", exc_info=True)
|
318 |
+
# 오류 발생 시에도 Mock 객체 할당 시도
|
319 |
+
if base_retriever is None: base_retriever = MockComponent()
|
320 |
+
if retriever is None: retriever = MockComponent()
|
321 |
+
if not hasattr(retriever, 'search'): retriever.search = lambda query, **kwargs: []
|
322 |
+
if not hasattr(base_retriever, 'documents'): base_retriever.documents = []
|
323 |
+
temp_app_ready = True # ์ค๋ฅ ๋ฐ์ํด๋ ์ฑ์ ์๋ตํ๋๋ก ์ค์ (์ ์ฑ
์ ๋ฐ๋ผ False ๊ฐ๋ฅ)
|
324 |
+
logger.warning("์ด๊ธฐํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ง๋ง Mock ๊ฐ์ฒด๋ก ๋์ฒด ํ ์ฑ ์ฌ์ฉ ๊ฐ๋ฅ ์ํ๋ก ์ค์ .")
|
325 |
+
|
326 |
+
finally:
|
327 |
+
# ์ต์ข
์ ์ผ๋ก app_ready ์ํ ์
๋ฐ์ดํธ
|
328 |
+
app_ready = temp_app_ready
|
329 |
|
330 |
# 백그라운드 스레드 시작 부분은 그대로 유지
|
331 |
init_thread = threading.Thread(target=background_init)
|
|
|
735 |
"details": str(e)
|
736 |
}), 500
|
737 |
|
738 |
+
|
739 |
@app.route('/api/upload', methods=['POST'])
|
740 |
@login_required
|
741 |
def upload_document():
|
|
|
778 |
logger.error(f"ํ์ผ ์ฝ๊ธฐ ์ค๋ฅ ({filename}): {e_read}")
|
779 |
return jsonify({"error": f"ํ์ผ ์ฝ๊ธฐ ์ค ์ค๋ฅ ๋ฐ์: {str(e_read)}"}), 500
|
780 |
|
781 |
+
|
782 |
# 메타데이터 및 문서 분할/처리
|
783 |
metadata = {
|
784 |
"source": filename, "filename": filename,
|
|
|
819 |
logger.info(f"{len(docs)}๊ฐ ๋ฌธ์ ์ฒญํฌ๋ฅผ ๊ฒ์๊ธฐ์ ์ถ๊ฐํฉ๋๋ค...")
|
820 |
base_retriever.add_documents(docs)
|
821 |
|
822 |
+
# ์ธ๋ฑ์ค ์ ์ฅ (์
๋ก๋๋ง๋ค ์ ์ฅ - ๋นํจ์จ์ ์ผ ์ ์์)
|
823 |
logger.info(f"๊ฒ์๊ธฐ ์ํ๋ฅผ ์ ์ฅํฉ๋๋ค...")
|
824 |
index_path = app.config['INDEX_PATH']
|
825 |
try:
|
|
|
826 |
base_retriever.save(index_path)
|
827 |
logger.info("์ธ๋ฑ์ค ์ ์ฅ ์๋ฃ")
|
828 |
+
# ์ฌ์์ํ ๊ฒ์๊ธฐ๋ ์
๋ฐ์ดํธ ํ์ ์ ๋ก์ง ์ถ๊ฐ
|
829 |
+
# ์: retriever.update_base_retriever(base_retriever)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
830 |
return jsonify({
|
831 |
"success": True,
|
832 |
"message": f"ํ์ผ '{filename}' ์
๋ก๋ ๋ฐ ์ฒ๋ฆฌ ์๋ฃ ({len(docs)}๊ฐ ์ฒญํฌ ์ถ๊ฐ)."
|
833 |
})
|
834 |
except Exception as e_save:
|
835 |
+
logger.error(f"์ธ๋ฑ์ค ์ ์ฅ ์ค ์ค๋ฅ ๋ฐ์: {e_save}")
|
836 |
return jsonify({"error": f"์ธ๋ฑ์ค ์ ์ฅ ์ค ์ค๋ฅ: {str(e_save)}"}), 500
|
837 |
else:
|
838 |
logger.warning(f"ํ์ผ '{filename}'์์ ์ฒ๋ฆฌํ ๋ด์ฉ์ด ์๊ฑฐ๋ ์ง์๋์ง ์๋ ํ์์
๋๋ค.")
|