jeongsoo commited on
Commit
4a98f26
·
1 Parent(s): c7ddcb9

Add application file

Browse files
Files changed (12) hide show
  1. .gitignore +34 -0
  2. app.py +670 -0
  3. config.py +60 -0
  4. dir +154 -0
  5. monitoring.py +136 -0
  6. optimized_document_processor.py +346 -0
  7. rag_chain.py +151 -0
  8. requirements.txt +14 -0
  9. reranker.py +58 -0
  10. simple_rag_chain.py +66 -0
  11. vector_store.py +235 -0
  12. voice_rag_app.py +670 -0
.gitignore ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ν™˜κ²½ λ³€μˆ˜
2
+ .env
3
+
4
+ # μΊμ‹œ 및 μž„μ‹œ 파일
5
+ __pycache__/
6
+ *.py[cod]
7
+ *.so
8
+ .Python
9
+ env/
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ .eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ *.egg-info/
22
+ .installed.cfg
23
+ *.egg
24
+
25
+ # 폴더
26
+ documents/
27
+ faiss_index/
28
+ cached_data/
29
+ preprocessed_index/
30
+ **/__pycache__/
31
+
32
+ # ν”„λ‘œμ νŠΈ νŠΉν™” 파일
33
+ parts_extraction_cache.json
34
+ .venv/
app.py ADDED
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ μŒμ„±μΈμ‹(STT) κΈ°λŠ₯이 κ΅¬ν˜„λœ RAG 챗봇 μ•±
3
+ """
4
+ import os
5
+ import time
6
+ import hashlib
7
+ import pickle
8
+ import json
9
+ import tempfile
10
+ from typing import List, Dict, Tuple, Any
11
+
12
+ from langchain.schema import Document
13
+
14
+ from config import (
15
+ PDF_DIRECTORY, CHUNK_SIZE, CHUNK_OVERLAP, LLM_MODEL,
16
+ STT_LANGUAGE, IS_HUGGINGFACE
17
+ )
18
+ from optimized_document_processor import OptimizedDocumentProcessor
19
+ from vector_store import VectorStore
20
+ from clova_stt import ClovaSTT
21
+
22
+ # μ•ˆμ „ν•œ μž„ν¬νŠΈ
23
+ try:
24
+ from rag_chain import RAGChain
25
+ RAG_CHAIN_AVAILABLE = True
26
+ except ImportError:
27
+ print("RAG 체인 λͺ¨λ“ˆμ„ λ‘œλ“œν•  수 μ—†μŠ΅λ‹ˆλ‹€.")
28
+ RAG_CHAIN_AVAILABLE = False
29
+
30
+
31
+ class AutoRAGChatApp:
32
+ """
33
+ documents ν΄λ”μ˜ PDF νŒŒμΌμ„ μžλ™μœΌλ‘œ μ²˜λ¦¬ν•˜κ³  μŒμ„±μΈμ‹ κΈ°λŠ₯을 μ œκ³΅ν•˜λŠ” RAG 챗봇
34
+ """
35
+
36
+ def __init__(self):
37
+ """
38
+ RAG 챗봇 μ• ν”Œλ¦¬μΌ€μ΄μ…˜ μ΄ˆκΈ°ν™”
39
+ """
40
+ # 데이터 디렉토리 μ •μ˜
41
+ self.pdf_directory = PDF_DIRECTORY
42
+ self.cache_directory = "cached_data"
43
+ self.index_file = os.path.join(self.cache_directory, "file_index.json")
44
+ self.chunks_dir = os.path.join(self.cache_directory, "chunks")
45
+ self.vector_index_dir = os.path.join(self.cache_directory, "vector_index")
46
+
47
+ # 디렉토리 생성
48
+ os.makedirs(self.pdf_directory, exist_ok=True)
49
+ os.makedirs(self.cache_directory, exist_ok=True)
50
+ os.makedirs(self.chunks_dir, exist_ok=True)
51
+ os.makedirs(self.vector_index_dir, exist_ok=True)
52
+
53
+ print(f"PDF λ¬Έμ„œ 디렉토리: '{self.pdf_directory}'")
54
+ print(f"μΊμ‹œ 디렉토리: '{self.cache_directory}'")
55
+
56
+ # μ»΄ν¬λ„ŒνŠΈ μ΄ˆκΈ°ν™”
57
+ self.document_processor = OptimizedDocumentProcessor(
58
+ chunk_size=CHUNK_SIZE,
59
+ chunk_overlap=CHUNK_OVERLAP
60
+ )
61
+
62
+ # 벑터 μ €μž₯μ†Œ μ΄ˆκΈ°ν™”
63
+ self.vector_store = VectorStore(use_milvus=False)
64
+
65
+ # λ¬Έμ„œ 인덱슀 λ‘œλ“œ
66
+ self.file_index = self._load_file_index()
67
+
68
+ # κΈ°λ³Έ λ³€μˆ˜ μ΄ˆκΈ°ν™”
69
+ self.documents = []
70
+ self.processed_files = []
71
+ self.is_initialized = False
72
+
73
+ # ν΄λ‘œλ°” STT ν΄λΌμ΄μ–ΈνŠΈ μ΄ˆκΈ°ν™”
74
+ self.stt_client = ClovaSTT()
75
+ print("μŒμ„±μΈμ‹(STT) κΈ°λŠ₯이 μ΄ˆκΈ°ν™”λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
76
+
77
+ # μ‹œμž‘ μ‹œ μžλ™μœΌλ‘œ λ¬Έμ„œ λ‘œλ“œ 및 처리
78
+ print("λ¬Έμ„œ μžλ™ λ‘œλ“œ 및 처리 μ‹œμž‘...")
79
+ self.auto_process_documents()
80
+
81
+ def _process_pdf_file(self, file_path: str) -> List[Document]:
82
+ """
83
+ PDF 파일 처리 - docling μ‹€νŒ¨ μ‹œ PyPDFLoader μ‚¬μš©
84
+
85
+ Args:
86
+ file_path: μ²˜λ¦¬ν•  PDF 파일 경둜
87
+
88
+ Returns:
89
+ 처리된 λ¬Έμ„œ 청크 리슀트
90
+ """
91
+ try:
92
+ print(f"docling으둜 처리 μ‹œλ„: {file_path}")
93
+
94
+ # docling μ‚¬μš© μ‹œλ„
95
+ try:
96
+ # 10초 νƒ€μž„μ•„μ›ƒ μ„€μ • (μ˜΅μ…˜)
97
+ import signal
98
+
99
+ def timeout_handler(signum, frame):
100
+ raise TimeoutError("docling 처리 μ‹œκ°„ 초과")
101
+
102
+ # λ¦¬λˆ…μŠ€/λ§₯μ—μ„œλ§Œ μž‘λ™ (μœˆλ„μš°μ—μ„œλŠ” λ¬΄μ‹œλ¨)
103
+ try:
104
+ signal.signal(signal.SIGALRM, timeout_handler)
105
+ signal.alarm(60) # 60초 νƒ€μž„μ•„μ›ƒ
106
+ except:
107
+ pass
108
+
109
+ # docling으둜 처리 μ‹œλ„
110
+ chunks = self.document_processor.process_pdf(file_path, use_docling=True)
111
+
112
+ # νƒ€μž„μ•„μ›ƒ μ·¨μ†Œ
113
+ try:
114
+ signal.alarm(0)
115
+ except:
116
+ pass
117
+
118
+ return chunks
119
+
120
+ except Exception as e:
121
+ # docling 였λ₯˜ 확인
122
+ error_str = str(e)
123
+ if "Invalid code point" in error_str or "RuntimeError" in error_str:
124
+ print(f"docling 처리 였λ₯˜ (μ½”λ“œ 포인트 문제): {error_str}")
125
+ print("PyPDFLoader둜 λŒ€μ²΄ν•©λ‹ˆλ‹€.")
126
+ else:
127
+ print(f"docling 처리 였λ₯˜: {error_str}")
128
+ print("PyPDFLoader둜 λŒ€μ²΄ν•©λ‹ˆλ‹€.")
129
+
130
+ # PyPDFLoader둜 λŒ€μ²΄
131
+ try:
132
+ return self.document_processor.process_pdf(file_path, use_docling=False)
133
+ except Exception as inner_e:
134
+ print(f"PyPDFLoader 처리 였λ₯˜: {inner_e}")
135
+ raise # 두 방법 λͺ¨λ‘ μ‹€νŒ¨ν•˜λ©΄ μ˜ˆμ™Έ λ°œμƒ
136
+
137
+ except Exception as e:
138
+ print(f"PDF 처리 쀑 μ‹¬κ°ν•œ 였λ₯˜: {e}")
139
+ # 빈 청크라도 λ°˜ν™˜ν•˜μ—¬ 전체 μ²˜λ¦¬κ°€ μ€‘λ‹¨λ˜μ§€ μ•Šλ„λ‘ 함
140
+ return []
141
+
142
+ def _load_file_index(self) -> Dict[str, Dict[str, Any]]:
143
+ """
144
+ 파일 인덱슀 λ‘œλ“œ
145
+
146
+ Returns:
147
+ 파일 경둜 -> 메타데이터 λ§€ν•‘
148
+ """
149
+ if os.path.exists(self.index_file):
150
+ try:
151
+ with open(self.index_file, 'r', encoding='utf-8') as f:
152
+ return json.load(f)
153
+ except Exception as e:
154
+ print(f"인덱슀 파일 λ‘œλ“œ μ‹€νŒ¨: {e}")
155
+ return {}
156
+ return {}
157
+
158
+ def _save_file_index(self) -> None:
159
+ """
160
+ 파일 인덱슀 μ €μž₯
161
+ """
162
+ with open(self.index_file, 'w', encoding='utf-8') as f:
163
+ json.dump(self.file_index, f, ensure_ascii=False, indent=2)
164
+
165
+ def _calculate_file_hash(self, file_path: str) -> str:
166
+ """
167
+ 파일 ν•΄μ‹œ 계산
168
+
169
+ Args:
170
+ file_path: 파일 경둜
171
+
172
+ Returns:
173
+ MD5 ν•΄μ‹œκ°’
174
+ """
175
+ hasher = hashlib.md5()
176
+ with open(file_path, 'rb') as f:
177
+ buf = f.read(65536)
178
+ while len(buf) > 0:
179
+ hasher.update(buf)
180
+ buf = f.read(65536)
181
+ return hasher.hexdigest()
182
+
183
+ def _is_file_processed(self, file_path: str) -> bool:
184
+ """
185
+ 파일이 이미 μ²˜λ¦¬λ˜μ—ˆκ³  λ³€κ²½λ˜μ§€ μ•Šμ•˜λŠ”μ§€ 확인
186
+
187
+ Args:
188
+ file_path: 파일 경둜
189
+
190
+ Returns:
191
+ 처리 μ—¬λΆ€
192
+ """
193
+ if file_path not in self.file_index:
194
+ return False
195
+
196
+ # ν˜„μž¬ ν•΄μ‹œκ°’ 계산
197
+ current_hash = self._calculate_file_hash(file_path)
198
+
199
+ # μ €μž₯된 ν•΄μ‹œκ°’κ³Ό 비ꡐ
200
+ if self.file_index[file_path]['hash'] != current_hash:
201
+ print(f"파일 λ³€κ²½ 감지: {file_path}")
202
+ return False
203
+
204
+ # 청크 파일 쑴재 확인
205
+ chunks_path = self.file_index[file_path]['chunks_path']
206
+ if not os.path.exists(chunks_path):
207
+ return False
208
+
209
+ return True
210
+
211
+ def _get_chunks_path(self, file_hash: str) -> str:
212
+ """
213
+ 청크 파일 경둜 생성
214
+
215
+ Args:
216
+ file_hash: 파일 ν•΄μ‹œκ°’
217
+
218
+ Returns:
219
+ 청크 파일 경둜
220
+ """
221
+ return os.path.join(self.chunks_dir, f"{file_hash}.pkl")
222
+
223
+ def _save_chunks(self, file_path: str, chunks: List[Document]) -> None:
224
+ """
225
+ 청크 데이터 μ €μž₯
226
+
227
+ Args:
228
+ file_path: 원본 파일 경둜
229
+ chunks: λ¬Έμ„œ 청크 리슀트
230
+ """
231
+ # ν•΄μ‹œ 계산
232
+ file_hash = self._calculate_file_hash(file_path)
233
+
234
+ # 청크 파일 경둜
235
+ chunks_path = self._get_chunks_path(file_hash)
236
+
237
+ # 청크 데이터 μ €μž₯
238
+ with open(chunks_path, 'wb') as f:
239
+ pickle.dump(chunks, f)
240
+
241
+ # 인덱슀 μ—…λ°μ΄νŠΈ
242
+ self.file_index[file_path] = {
243
+ 'hash': file_hash,
244
+ 'chunks_path': chunks_path,
245
+ 'last_processed': time.time(),
246
+ 'chunks_count': len(chunks)
247
+ }
248
+
249
+ # 인덱슀 μ €μž₯
250
+ self._save_file_index()
251
+
252
+ print(f"청크 μ €μž₯ μ™„λ£Œ: {file_path} ({len(chunks)}개 청크)")
253
+
254
+ def _load_chunks(self, file_path: str) -> List[Document]:
255
+ """
256
+ μ €μž₯된 청크 데이터 λ‘œλ“œ
257
+
258
+ Args:
259
+ file_path: 파일 경둜
260
+
261
+ Returns:
262
+ λ¬Έμ„œ 청크 리슀트
263
+ """
264
+ chunks_path = self.file_index[file_path]['chunks_path']
265
+ with open(chunks_path, 'rb') as f:
266
+ chunks = pickle.load(f)
267
+
268
+ print(f"청크 λ‘œλ“œ μ™„λ£Œ: {file_path} ({len(chunks)}개 청크)")
269
+ return chunks
270
+
271
    def auto_process_documents(self) -> str:
        """
        Scan the PDF directory and (re)build the retrieval pipeline.

        Unchanged files are served from the chunk cache; new or modified
        files are reprocessed. The vector index is then loaded, updated,
        or created, persisted to disk, and the RAG chain is initialized.

        Returns:
            Human-readable status message describing the run (also used
            directly as the Gradio status-box text).
        """
        try:
            start_time = time.time()

            # Collect every *.pdf in the documents directory.
            pdf_files = []
            for filename in os.listdir(self.pdf_directory):
                if filename.lower().endswith('.pdf'):
                    pdf_files.append(os.path.join(self.pdf_directory, filename))

            if not pdf_files:
                return f"'{self.pdf_directory}' 폴더에 PDF 파일이 μ—†μŠ΅λ‹ˆλ‹€."

            print(f"발견된 PDF 파일: {len(pdf_files)}개")

            # Bookkeeping for the final status report.
            new_files = []
            updated_files = []
            cached_files = []
            failed_files = []
            all_chunks = []

            for file_path in pdf_files:
                if self._is_file_processed(file_path):
                    # Unchanged file: reuse cached chunks.
                    chunks = self._load_chunks(file_path)
                    all_chunks.extend(chunks)
                    cached_files.append(file_path)
                    self.processed_files.append(os.path.basename(file_path))
                else:
                    # New or modified file: run full extraction.
                    print(f"처리 쀑: {file_path}")

                    try:
                        chunks = self._process_pdf_file(file_path)

                        if chunks:  # only cache files that yielded chunks
                            self._save_chunks(file_path, chunks)

                            all_chunks.extend(chunks)
                            # Presence in the index distinguishes an update
                            # from a first-time file.
                            if file_path in self.file_index:
                                updated_files.append(file_path)
                            else:
                                new_files.append(file_path)

                            self.processed_files.append(os.path.basename(file_path))
                        else:
                            print(f"'{file_path}' 처리 μ‹€νŒ¨: μΆ”μΆœλœ 청크 μ—†μŒ")
                            failed_files.append(file_path)
                    except Exception as e:
                        # One bad PDF must not abort the whole batch.
                        print(f"'{file_path}' 처리 쀑 였λ₯˜: {e}")
                        failed_files.append(file_path)

            # Keep the full chunk set for indexing.
            self.documents = all_chunks

            processing_time = time.time() - start_time
            print(f"λ¬Έμ„œ 처리 μ™„λ£Œ: {len(all_chunks)}개 청크, {processing_time:.2f}초")

            # Reuse a persisted vector index when one exists on disk.
            if os.path.exists(self.vector_index_dir) and any(os.listdir(self.vector_index_dir)):
                try:
                    print("μ €μž₯된 벑터 인덱슀 λ‘œλ“œ 쀑...")
                    vector_store_loaded = self.vector_store.load_local(self.vector_index_dir)

                    # load_local's return value is not trusted; the inner
                    # vector_store attribute is the actual success signal.
                    if self.vector_store.vector_store is not None:
                        # Fold any new/changed documents into the index.
                        if new_files or updated_files:
                            print("벑터 인덱슀 μ—…λ°μ΄νŠΈ 쀑...")
                            self.vector_store.add_documents(self.documents)

                        print("벑터 인덱슀 λ‘œλ“œ μ™„λ£Œ")
                    else:
                        print("벑터 인덱슀λ₯Ό λ‘œλ“œν–ˆμœΌλ‚˜ μœ νš¨ν•˜μ§€ μ•ŠμŒ, μƒˆλ‘œ μƒμ„±ν•©λ‹ˆλ‹€.")
                        self.vector_store.create_or_load(self.documents)

                except Exception as e:
                    print(f"벑터 인덱슀 λ‘œλ“œ μ‹€νŒ¨, μƒˆλ‘œ μƒμ„±ν•©λ‹ˆλ‹€: {e}")
                    # Print the full traceback for diagnosis.
                    import traceback
                    traceback.print_exc()

                    # Fall back to building a fresh index.
                    self.vector_store.create_or_load(self.documents)
            else:
                # No persisted index: build a fresh one.
                print("μƒˆ 벑터 인덱슀 생성 쀑...")
                self.vector_store.create_or_load(self.documents)

            # Persist the (possibly updated) vector index.
            if self.vector_store and self.vector_store.vector_store is not None:
                try:
                    print(f"벑터 인덱슀 μ €μž₯ 쀑: {self.vector_index_dir}")
                    save_result = self.vector_store.save_local(self.vector_index_dir)
                    print(f"벑터 인덱슀 μ €μž₯ μ™„λ£Œ: {self.vector_index_dir}")
                except Exception as e:
                    print(f"벑터 인덱슀 μ €μž₯ μ‹€νŒ¨: {e}")
                    import traceback
                    traceback.print_exc()
            else:
                print("벑터 μΈλ±μŠ€κ°€ μ΄ˆκΈ°ν™”λ˜μ§€ μ•Šμ•„ μ €μž₯ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.")

            # Initialize the RAG chain (only possible when its optional
            # dependencies imported successfully at module load).
            if RAG_CHAIN_AVAILABLE:
                self.rag_chain = RAGChain(self.vector_store)
                self.is_initialized = True

                total_time = time.time() - start_time

                status_message = (
                    f"λ¬Έμ„œ 처리 μ™„λ£Œ!\n"
                    f"- 처리된 파일: {len(self.processed_files)}개\n"
                    f"- μΊμ‹œλœ 파일: {len(cached_files)}개\n"
                    f"- μƒˆ 파일: {len(new_files)}개\n"
                    f"- μ—…λ°μ΄νŠΈλœ 파일: {len(updated_files)}개\n"
                    f"- μ‹€νŒ¨ν•œ 파일: {len(failed_files)}개\n"
                    f"- 총 청크 수: {len(self.documents)}개\n"
                    f"- 처리 μ‹œκ°„: {total_time:.2f}초\n"
                    f"이제 μ§ˆλ¬Έν•  μ€€λΉ„κ°€ λ˜μ—ˆμŠ΅λ‹ˆλ‹€!"
                )

                print(status_message)
                return status_message
            else:
                return "RAG 체인을 μ΄ˆκΈ°ν™”ν•  수 μ—†μŠ΅λ‹ˆλ‹€. ν•„μš”ν•œ λΌμ΄λΈŒλŸ¬λ¦¬κ°€ μ„€μΉ˜λ˜μ–΄ μžˆλŠ”μ§€ ν™•μΈν•˜μ„Έμš”."

        except Exception as e:
            error_message = f"λ¬Έμ„œ 처리 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
            print(error_message)
            import traceback
            traceback.print_exc()
            return error_message
414
+
415
+ def reset_cache(self) -> str:
416
+ """
417
+ μΊμ‹œ μ΄ˆκΈ°ν™”
418
+
419
+ Returns:
420
+ κ²°κ³Ό λ©”μ‹œμ§€
421
+ """
422
+ try:
423
+ # 청크 파일 μ‚­μ œ
424
+ for filename in os.listdir(self.chunks_dir):
425
+ file_path = os.path.join(self.chunks_dir, filename)
426
+ if os.path.isfile(file_path):
427
+ os.remove(file_path)
428
+
429
+ # 인덱슀 μ΄ˆκΈ°ν™”
430
+ self.file_index = {}
431
+ self._save_file_index()
432
+
433
+ # 벑터 인덱슀 μ‚­μ œ
434
+ for filename in os.listdir(self.vector_index_dir):
435
+ file_path = os.path.join(self.vector_index_dir, filename)
436
+ if os.path.isfile(file_path):
437
+ os.remove(file_path)
438
+
439
+ self.documents = []
440
+ self.processed_files = []
441
+ self.is_initialized = False
442
+
443
+ return "μΊμ‹œκ°€ μ΄ˆκΈ°ν™”λ˜μ—ˆμŠ΅λ‹ˆλ‹€. λ‹€μŒ μ‹€ν–‰ μ‹œ λͺ¨λ“  λ¬Έμ„œκ°€ λ‹€μ‹œ μ²˜λ¦¬λ©λ‹ˆλ‹€."
444
+ except Exception as e:
445
+ return f"μΊμ‹œ μ΄ˆκΈ°ν™” 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
446
+
447
+ def process_query(self, query: str, chat_history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
448
+ """
449
+ μ‚¬μš©μž 쿼리 처리
450
+
451
+ Args:
452
+ query: μ‚¬μš©μž 질문
453
+ chat_history: λŒ€ν™” 기둝
454
+
455
+ Returns:
456
+ 응닡 및 μ—…λ°μ΄νŠΈλœ λŒ€ν™” 기둝
457
+ """
458
+ if not query: # λΉ„μ–΄μžˆλŠ” 쿼리 처리
459
+ return "", chat_history
460
+
461
+ if not self.is_initialized:
462
+ response = "λ¬Έμ„œ λ‘œλ“œκ°€ μ΄ˆκΈ°ν™”λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μžλ™ λ‘œλ“œλ₯Ό μ‹œλ„ν•©λ‹ˆλ‹€."
463
+ chat_history.append((query, response))
464
+
465
+ # μžλ™ λ‘œλ“œ μ‹œλ„
466
+ try:
467
+ self.auto_process_documents()
468
+ if not self.is_initialized:
469
+ response = "λ¬Έμ„œλ₯Ό λ‘œλ“œν•  수 μ—†μŠ΅λ‹ˆλ‹€. 'documents' 폴더에 PDF 파일이 μžˆλŠ”μ§€ ν™•μΈν•˜μ„Έμš”."
470
+ chat_history.append((query, response))
471
+ return "", chat_history
472
+ except Exception as e:
473
+ response = f"λ¬Έμ„œ λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
474
+ chat_history.append((query, response))
475
+ return "", chat_history
476
+
477
+ try:
478
+ # RAG 체인 μ‹€ν–‰ 및 응닡 생성
479
+ start_time = time.time()
480
+ response = self.rag_chain.run(query)
481
+ end_time = time.time()
482
+
483
+ query_time = end_time - start_time
484
+ print(f"쿼리 처리 μ‹œκ°„: {query_time:.2f}초")
485
+
486
+ chat_history.append((query, response))
487
+ return "", chat_history
488
+ except Exception as e:
489
+ error_msg = f"였λ₯˜ λ°œμƒ: {str(e)}"
490
+ chat_history.append((query, error_msg))
491
+ return "", chat_history
492
+
493
+ def process_voice_query(self, audio, chat_history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
494
+ """
495
+ μŒμ„± 쿼리 처리
496
+
497
+ Args:
498
+ audio: λ…ΉμŒλœ μ˜€λ””μ˜€ 데이터
499
+ chat_history: λŒ€ν™” 기둝
500
+
501
+ Returns:
502
+ 응닡 및 μ—…λ°μ΄νŠΈλœ λŒ€ν™” 기둝
503
+ """
504
+ if audio is None:
505
+ return "", chat_history
506
+
507
+ try:
508
+ # μž„μ‹œ νŒŒμΌμ— μ˜€λ””μ˜€ μ €μž₯
509
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
510
+ temp_path = temp_file.name
511
+ temp_file.write(audio)
512
+
513
+ print(f"[STT] μž„μ‹œ μ˜€λ””μ˜€ 파일 생성: {temp_path}")
514
+
515
+ # config.pyμ—μ„œ μ„€μ •ν•œ μ–Έμ–΄ μ½”λ“œλ‘œ STT μ‹€ν–‰
516
+ result = self.stt_client.recognize_file(temp_path, language=STT_LANGUAGE)
517
+
518
+ # μž„μ‹œ 파일 μ‚­μ œ
519
+ try:
520
+ os.unlink(temp_path)
521
+ print("[STT] μž„μ‹œ μ˜€λ””μ˜€ 파일 μ‚­μ œλ¨")
522
+ except Exception as e:
523
+ print(f"[STT] μž„μ‹œ 파일 μ‚­μ œ μ‹€νŒ¨: {e}")
524
+
525
+ # STT 결과 처리
526
+ if "error" in result:
527
+ error_msg = f"μŒμ„±μΈμ‹ 였λ₯˜: {result.get('error')}"
528
+ print(f"[STT] {error_msg}")
529
+ chat_history.append(("μŒμ„± λ©”μ‹œμ§€", error_msg))
530
+ return "", chat_history
531
+
532
+ # μΈμ‹λœ ν…μŠ€νŠΈ μΆ”μΆœ
533
+ recognized_text = result.get("text", "")
534
+ if not recognized_text:
535
+ error_msg = "μŒμ„±μ„ 인식할 수 μ—†μŠ΅λ‹ˆλ‹€. λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”."
536
+ print("[STT] μΈμ‹λœ ν…μŠ€νŠΈ μ—†μŒ")
537
+ chat_history.append(("μŒμ„± λ©”μ‹œμ§€", error_msg))
538
+ return "", chat_history
539
+
540
+ print(f"[STT] μΈμ‹λœ ν…μŠ€νŠΈ: {recognized_text}")
541
+
542
+ # μΈμ‹λœ ν…μŠ€νŠΈλ‘œ 쿼리 처리 (μŒμ„± λ©”μ‹œμ§€ 접두어 μΆ”κ°€)
543
+ return self.process_query(f"🎀 {recognized_text}", chat_history)
544
+
545
+ except Exception as e:
546
+ error_msg = f"μŒμ„± 처리 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
547
+ print(f"[STT] {error_msg}")
548
+ chat_history.append(("μŒμ„± λ©”μ‹œμ§€", error_msg))
549
+ return "", chat_history
550
+
551
+ def launch_app(self) -> None:
552
+ """
553
+ μŒμ„±μΈμ‹ κΈ°λŠ₯이 μΆ”κ°€λœ Gradio μ•± μ‹€ν–‰
554
+ """
555
+ import gradio as gr
556
+
557
+ with gr.Blocks(title="μŒμ„±μΈμ‹ κΈ°λŠ₯이 μΆ”κ°€λœ PDF λ¬Έμ„œ 기반 RAG 챗봇") as app:
558
+ gr.Markdown("# μŒμ„±μΈμ‹ κΈ°λŠ₯이 μΆ”κ°€λœ PDF λ¬Έμ„œ 기반 RAG 챗봇")
559
+ gr.Markdown(f"* μ‚¬μš© 쀑인 LLM λͺ¨λΈ: **{LLM_MODEL}**")
560
+ gr.Markdown(f"* PDF λ¬Έμ„œ 폴더: **{self.pdf_directory}**")
561
+ gr.Markdown("* 넀이버 ν΄λ‘œλ°” μŒμ„±μΈμ‹ API 톡합")
562
+
563
+ with gr.Row():
564
+ with gr.Column(scale=1):
565
+ # λ¬Έμ„œ μƒνƒœ μ„Ήμ…˜
566
+ status_box = gr.Textbox(
567
+ label="λ¬Έμ„œ 처리 μƒνƒœ",
568
+ value=f"처리된 λ¬Έμ„œ ({len(self.processed_files)}개): {', '.join(self.processed_files)}",
569
+ lines=5,
570
+ interactive=False
571
+ )
572
+
573
+ # μΊμ‹œ 관리 λ²„νŠΌ
574
+ refresh_button = gr.Button("λ¬Έμ„œ μƒˆλ‘œ 읽기", variant="primary")
575
+ reset_button = gr.Button("μΊμ‹œ μ΄ˆκΈ°ν™”", variant="stop")
576
+
577
+ # 처리된 파일 정보
578
+ with gr.Accordion("μΊμ‹œ μ„ΈλΆ€ 정보", open=False):
579
+ file_info = ""
580
+ for file_path, info in self.file_index.items():
581
+ file_info += f"- {os.path.basename(file_path)}: {info['chunks_count']}개 청크\n"
582
+
583
+ cache_info = gr.Textbox(
584
+ label="μΊμ‹œλœ 파일 정보",
585
+ value=file_info or "μΊμ‹œλœ 파일이 μ—†μŠ΅λ‹ˆλ‹€.",
586
+ lines=5,
587
+ interactive=False
588
+ )
589
+
590
+ with gr.Column(scale=2):
591
+ # μ±„νŒ… μΈν„°νŽ˜μ΄μŠ€
592
+ chatbot = gr.Chatbot(
593
+ label="λŒ€ν™” λ‚΄μš©",
594
+ bubble_full_width=False,
595
+ height=500,
596
+ show_copy_button=True
597
+ )
598
+
599
+ with gr.Tabs() as input_tabs:
600
+ # ν…μŠ€νŠΈ μž…λ ₯ νƒ­
601
+ with gr.Tab("ν…μŠ€νŠΈ μž…λ ₯"):
602
+ # ν…μŠ€νŠΈ μž…λ ₯κ³Ό 전솑 λ²„νŠΌμ„ μˆ˜ν‰μœΌλ‘œ 배치
603
+ with gr.Row():
604
+ query_box = gr.Textbox(
605
+ label="질문",
606
+ placeholder="처리된 λ¬Έμ„œ λ‚΄μš©μ— λŒ€ν•΄ μ§ˆλ¬Έν•˜μ„Έμš”...",
607
+ lines=2,
608
+ scale=4
609
+ )
610
+ submit_btn = gr.Button("전솑", variant="primary", scale=1)
611
+
612
+ # μŒμ„± μž…λ ₯ νƒ­
613
+ with gr.Tab("μŒμ„± μž…λ ₯"):
614
+ audio_input = gr.Audio(
615
+ label="마이크 μž…λ ₯",
616
+ sources=["microphone"],
617
+ type="bytes",
618
+ format="wav"
619
+ )
620
+ voice_submit_btn = gr.Button("μŒμ„± 질문 전솑", variant="primary")
621
+
622
+ clear_chat_button = gr.Button("λŒ€ν™” μ΄ˆκΈ°ν™”")
623
+
624
+ # 이벀트 ν•Έλ“€λŸ¬ μ„€μ •
625
+ refresh_button.click(
626
+ fn=self.auto_process_documents,
627
+ inputs=[],
628
+ outputs=[status_box]
629
+ )
630
+
631
+ reset_button.click(
632
+ fn=lambda: (self.reset_cache(), self.auto_process_documents()),
633
+ inputs=[],
634
+ outputs=[status_box]
635
+ )
636
+
637
+ # ν…μŠ€νŠΈ 전솑 λ²„νŠΌ 클릭 이벀트
638
+ submit_btn.click(
639
+ fn=self.process_query,
640
+ inputs=[query_box, chatbot],
641
+ outputs=[query_box, chatbot]
642
+ )
643
+
644
+ # μ—”ν„°ν‚€ μž…λ ₯ 이벀트
645
+ query_box.submit(
646
+ fn=self.process_query,
647
+ inputs=[query_box, chatbot],
648
+ outputs=[query_box, chatbot]
649
+ )
650
+
651
+ # μŒμ„± 전솑 λ²„νŠΌ 클릭 이벀트
652
+ voice_submit_btn.click(
653
+ fn=self.process_voice_query,
654
+ inputs=[audio_input, chatbot],
655
+ outputs=[audio_input, chatbot]
656
+ )
657
+
658
+ # λŒ€ν™” μ΄ˆκΈ°ν™” λ²„νŠΌ
659
+ clear_chat_button.click(
660
+ fn=lambda: [],
661
+ outputs=[chatbot]
662
+ )
663
+
664
+ # μ•± μ‹€ν–‰
665
+ app.launch(share=False)
666
+
667
+
668
if __name__ == "__main__":
    # Constructing the app triggers document processing; then start the UI.
    app = AutoRAGChatApp()
    app.launch_app()
config.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 벑터 μŠ€ν† μ–΄, μž„λ² λ”© λͺ¨λΈ, LLM λ“± ꡬ성 μš”μ†Œ μ„€μ •
3
+ """
4
+ import os
5
+ from dotenv import load_dotenv
6
+
7
+ # .env 파일이 있으면 λ‘œλ“œ (둜컬 ν™˜κ²½μš©)
8
+ load_dotenv(verbose=True)
9
+
10
+ # ν™˜κ²½ 감지
11
+ IS_HUGGINGFACE = os.getenv('SPACE_ID') is not None
12
+
13
+ # API ν‚€ 및 ν™˜κ²½ μ„€μ •
14
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "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")
15
+ LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY", "pk-lf-cd6248e2-59ad-496d-a4cb-487bb3ecfcd5")
16
+ LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY", "sk-lf-61460a1d-e637-4c22-b5e9-9250ac2579ba")
17
+ LANGFUSE_HOST = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
18
+
19
+ # 넀이버 ν΄λ‘œλ°” STT API μ„€μ •
20
+ NAVER_CLIENT_ID = os.getenv("NAVER_CLIENT_ID", "xae4kga9s5")
21
+ NAVER_CLIENT_SECRET = os.getenv("NAVER_CLIENT_SECRET", "aoSmmr3xMrdVopxGduFX5YfGZRJpu2MDUiUvlvQx")
22
+
23
+ # 넀이버 ν΄λ‘œλ°” API ν‚€ 확인
24
+ if NAVER_CLIENT_ID and NAVER_CLIENT_SECRET:
25
+ print("넀이버 ν΄λ‘œλ°” STT API ν‚€κ°€ μ„€μ •λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
26
+ else:
27
+ print("κ²½κ³ : 넀이버 ν΄λ‘œλ°” STT API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
28
+ print("STT κΈ°λŠ₯을 μ‚¬μš©ν•˜λ €λ©΄ NAVER_CLIENT_ID와 NAVER_CLIENT_SECRET ν™˜κ²½ λ³€μˆ˜λ₯Ό μ„€μ •ν•˜μ„Έμš”.")
29
+
30
+ # Milvus 벑터 DB μ„€μ •
31
+ MILVUS_HOST = os.getenv("MILVUS_HOST", "localhost")
32
+ MILVUS_PORT = os.getenv("MILVUS_PORT", "19530")
33
+ MILVUS_COLLECTION = "pdf_documents"
34
+
35
+ # μž„λ² λ”© λͺ¨λΈ μ„€μ •
36
+ EMBEDDING_MODEL = "Alibaba-NLP/gte-multilingual-base" # λ‹€κ΅­μ–΄ 지원 λͺ¨λΈ
37
+ RERANKER_MODEL = "Alibaba-NLP/gte-multilingual-reranker-base" # λ‹€κ΅­μ–΄ 지원 리랭컀
38
+
39
+ # LLM λͺ¨λΈ μ„€μ • (ν™˜κ²½μ— 따라 μžλ™ 선택)
40
+ if IS_HUGGINGFACE:
41
+ # HuggingFace ν™˜κ²½μ—μ„œλŠ” OpenAI μ‚¬μš©
42
+ USE_OPENAI = True
43
+ LLM_MODEL = "gpt-3.5-turbo" # λ˜λŠ” λ‹€λ₯Έ μ μ ˆν•œ λͺ¨λΈ
44
+ print("HuggingFace Spaces ν™˜κ²½ 감지: OpenAI λͺ¨λΈ μ‚¬μš©")
45
+ else:
46
+ # 둜컬 ν™˜κ²½μ—μ„œλŠ” Ollama μ‚¬μš©
47
+ USE_OPENAI = os.getenv("USE_OPENAI", "False").lower() == "true"
48
+ LLM_MODEL = os.getenv("LLM_MODEL", "gemma3:latest" if not USE_OPENAI else "gpt-3.5-turbo")
49
+ OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
50
+ print(f"둜컬 ν™˜κ²½: {'OpenAI' if USE_OPENAI else 'Ollama'} λͺ¨λΈ μ‚¬μš©")
51
+
52
+ # μ•± μ„€μ •
53
+ CHUNK_SIZE = 1000
54
+ CHUNK_OVERLAP = 200
55
+ TOP_K_RETRIEVAL = 5 # 벑터 검색 κ²°κ³Ό 수
56
+ TOP_K_RERANK = 3 # λ¦¬λž­ν‚Ή ν›„ 선택할 κ²°κ³Ό 수
57
+ PDF_DIRECTORY = "documents" # PDF λ¬Έμ„œκ°€ μ €μž₯된 디렉토리
58
+
59
+ # μŒμ„±μΈμ‹ μ„€μ •
60
+ STT_LANGUAGE = "Kor" # κΈ°λ³Έ μ–Έμ–΄ μ„€μ • (Kor, Eng, Jpn, Chn λ“±)
dir ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ λ™μ˜μ–΄ 처리 λͺ¨λ“ˆ
3
+ """
4
+ import os
5
+ import sys
6
+ import re
7
+ from typing import Dict, List, Optional, Set
8
+
9
# Built-in synonym dictionary, used when no MP_synonyms.py file is found.
# Maps a spoken/written variant to its standardized part name; identity
# entries keep already-standard names stable.
DEFAULT_SYNONYMS = {
    "μ—‘μΈ„λ ˆμ΄ν„°": "앑츄에이터",
    "앑츄에이터": "앑츄에이터",
    "λͺ¨ν„°": "앑츄에이터",
    "컨박": "μ»¨νŠΈλ‘€λ°•μŠ€"
}
16
+
17
+
18
class SynonymsHandler:
    """
    Resolves part-name synonyms to their standardized names.
    """
22
+
23
+ def __init__(self, synonyms_file: Optional[str] = None):
24
+ """
25
+ λ™μ˜μ–΄ ν•Έλ“€λŸ¬ μ΄ˆκΈ°ν™”
26
+
27
+ Args:
28
+ synonyms_file: λ™μ˜μ–΄ 파일 경둜 (선택적)
29
+ """
30
+ self.synonyms = {}
31
+ self.loaded = False
32
+
33
+ # 1. κΈ°λ³Έ 제곡된 파일 경둜 확인
34
+ if synonyms_file and os.path.exists(synonyms_file):
35
+ self._load_from_file(synonyms_file)
36
+
37
+ # 2. 일반적인 μœ„μΉ˜ 확인 (.venv/SYNONYMS/MP_synonyms.py)
38
+ elif os.path.exists(".venv/SYNONYMS/MP_synonyms.py"):
39
+ self._load_from_file(".venv/SYNONYMS/MP_synonyms.py")
40
+
41
+ # 3. ν˜„μž¬ 디렉토리 확인
42
+ elif os.path.exists("MP_synonyms.py"):
43
+ self._load_from_file("MP_synonyms.py")
44
+
45
+ # 4. κΈ°λ³Έ λ™μ˜μ–΄ μ‚¬μš©
46
+ else:
47
+ print("λ™μ˜μ–΄ νŒŒμΌμ„ 찾을 수 μ—†μ–΄ κΈ°λ³Έ λ™μ˜μ–΄ 사전을 μ‚¬μš©ν•©λ‹ˆλ‹€.")
48
+ self.synonyms = DEFAULT_SYNONYMS
49
+ self.loaded = True
50
+
51
+ def _load_from_file(self, file_path: str) -> None:
52
+ """
53
+ νŒŒμΌμ—μ„œ λ™μ˜μ–΄ 사전 λ‘œλ“œ
54
+
55
+ Args:
56
+ file_path: λ™μ˜μ–΄ 파일 경둜
57
+ """
58
+ try:
59
+ # 파일 λ‚΄μš© 읽기
60
+ with open(file_path, 'r', encoding='utf-8') as f:
61
+ content = f.read()
62
+
63
+ # SYNONYMS λ”•μ…”λ„ˆλ¦¬ μΆ”μΆœ
64
+ synonyms_match = re.search(r'SYNONYMS\s*=\s*\{(.*?)\}', content, re.DOTALL)
65
+ if synonyms_match:
66
+ # μ‹€ν–‰ν•˜μ§€ μ•Šκ³  λ³€ν™˜ν•˜λŠ” 방법
67
+ synonyms_str = "{" + synonyms_match.group(1) + "}"
68
+
69
+ # μ •κ·œμ‹μ„ μ‚¬μš©ν•˜μ—¬ λ”•μ…”λ„ˆλ¦¬ ν˜•νƒœλ‘œ νŒŒμ‹±
70
+ pattern = r'"([^"]*)"\s*:\s*"([^"]*)"'
71
+ matches = re.findall(pattern, synonyms_str)
72
+
73
+ self.synonyms = {key: value for key, value in matches}
74
+ self.loaded = True
75
+ print(f"λ™μ˜μ–΄ 사전 λ‘œλ“œ μ™„λ£Œ: {file_path}, {len(self.synonyms)}개 ν•­λͺ©")
76
+ else:
77
+ print(f"νŒŒμΌμ—μ„œ SYNONYMS λ”•μ…”λ„ˆλ¦¬λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {file_path}")
78
+ self.synonyms = DEFAULT_SYNONYMS
79
+ self.loaded = True
80
+
81
+ except Exception as e:
82
+ print(f"λ™μ˜μ–΄ 사전 λ‘œλ“œ 쀑 였λ₯˜: {e}")
83
+ self.synonyms = DEFAULT_SYNONYMS
84
+ self.loaded = True
85
+
86
+ def find_in_text(self, text: str) -> List[str]:
87
+ """
88
+ ν…μŠ€νŠΈμ—μ„œ λ™μ˜μ–΄ μ°ΎκΈ°
89
+
90
+ Args:
91
+ text: 검색할 ν…μŠ€νŠΈ
92
+
93
+ Returns:
94
+ 찾은 ν‘œμ€€ν™”λœ λΆ€ν’ˆλͺ… 리슀트
95
+ """
96
+ if not text or not self.loaded:
97
+ return []
98
+
99
+ # 곡백 제거 및 μ†Œλ¬Έμž λ³€ν™˜
100
+ text = text.lower()
101
+
102
+ found_parts = set()
103
+
104
+ # λ™μ˜μ–΄ ν‚€μ›Œλ“œκ°€ ν…μŠ€νŠΈμ— ν¬ν•¨λ˜μ–΄ μžˆλŠ”μ§€ 확인
105
+ for keyword, standard_name in self.synonyms.items():
106
+ if keyword.lower() in text:
107
+ found_parts.add(standard_name)
108
+
109
+ return list(found_parts)
110
+
111
+ def standardize(self, part_name: str) -> str:
112
+ """
113
+ λΆ€ν’ˆλͺ…을 ν‘œμ€€ν™”
114
+
115
+ Args:
116
+ part_name: ν‘œμ€€ν™”ν•  λΆ€ν’ˆλͺ…
117
+
118
+ Returns:
119
+ ν‘œμ€€ν™”λœ λΆ€ν’ˆλͺ…
120
+ """
121
+ if not part_name or not self.loaded:
122
+ return part_name
123
+
124
+ # μ†Œλ¬Έμž λ³€ν™˜ν•˜μ—¬ 비ꡐ
125
+ part_lower = part_name.lower().strip()
126
+
127
+ # λ™μ˜μ–΄ μ‚¬μ „μ—μ„œ 검색
128
+ for keyword, standard_name in self.synonyms.items():
129
+ if part_lower == keyword.lower():
130
+ return standard_name
131
+
132
+ # λ§€μΉ­λ˜μ§€ μ•ŠμœΌλ©΄ μ›λž˜ 이름 λ°˜ν™˜
133
+ return part_name
134
+
135
+ def standardize_parts_list(self, parts: List[str]) -> List[str]:
136
+ """
137
+ λΆ€ν’ˆλͺ… 리슀트λ₯Ό ν‘œμ€€ν™”
138
+
139
+ Args:
140
+ parts: ν‘œμ€€ν™”ν•  λΆ€ν’ˆλͺ… 리슀트
141
+
142
+ Returns:
143
+ ν‘œμ€€ν™”λœ λΆ€ν’ˆλͺ… 리슀트
144
+ """
145
+ if not parts or not self.loaded:
146
+ return parts
147
+
148
+ standardized = set()
149
+
150
+ for part in parts:
151
+ if part:
152
+ standardized.add(self.standardize(part))
153
+
154
+ return list(standardized)
monitoring.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Langfuseλ₯Ό ν™œμš©ν•œ λͺ¨λ‹ˆν„°λ§ κ΅¬ν˜„ (선택적)
3
+ """
4
+ from typing import Dict, Any, Optional
5
+ import time
6
+ import os
7
+ from dotenv import load_dotenv
8
+
9
# Load environment variables from a local .env file, if present.
load_dotenv()

# Langfuse credentials/endpoint; empty keys leave monitoring disabled
# (see LangfuseMonitoring.__init__ below).
LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY", "")
LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY", "")
LANGFUSE_HOST = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
16
+
17
class LangfuseMonitoring:
    """Optional wrapper around the Langfuse tracing client; every method
    becomes a no-op when the package or API keys are unavailable."""
+ def __init__(self):
19
+ """
20
+ Langfuse λͺ¨λ‹ˆν„°λ§ μ΄ˆκΈ°ν™” (선택적 κΈ°λŠ₯)
21
+ """
22
+ self.enabled = False
23
+ print("λͺ¨λ‹ˆν„°λ§ κΈ°λŠ₯을 μ΄ˆκΈ°ν™”ν•©λ‹ˆλ‹€...")
24
+
25
+ # Langfuseκ°€ μ„€μΉ˜λ˜μ–΄ μžˆλŠ”μ§€ 확인
26
+ try:
27
+ from langfuse import Langfuse
28
+
29
+ if LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY:
30
+ try:
31
+ self.langfuse = Langfuse(
32
+ public_key=LANGFUSE_PUBLIC_KEY,
33
+ secret_key=LANGFUSE_SECRET_KEY,
34
+ host=LANGFUSE_HOST,
35
+ )
36
+ self.enabled = True
37
+ print("Langfuse λͺ¨λ‹ˆν„°λ§μ΄ ν™œμ„±ν™”λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
38
+ except Exception as e:
39
+ print(f"Langfuse μ΄ˆκΈ°ν™” μ‹€νŒ¨: {e}")
40
+ else:
41
+ print("Langfuse API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. λͺ¨λ‹ˆν„°λ§μ€ λΉ„ν™œμ„±ν™”λ©λ‹ˆλ‹€.")
42
+ except ImportError:
43
+ print("langfuse νŒ¨ν‚€μ§€κ°€ μ„€μΉ˜λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. λͺ¨λ‹ˆν„°λ§ κΈ°λŠ₯이 λΉ„ν™œμ„±ν™”λ©λ‹ˆλ‹€.")
44
+ print("pip install langfuse λͺ…λ ΉμœΌλ‘œ μ„€μΉ˜ν•  수 μžˆμŠ΅λ‹ˆλ‹€.")
45
+
46
+ def start_trace(self, name: str, user_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None) -> Any:
47
+ """
48
+ μƒˆ 트레이슀 μ‹œμž‘
49
+
50
+ Args:
51
+ name: 트레이슀 이름
52
+ user_id: μ‚¬μš©μž ID (선택적)
53
+ metadata: μΆ”κ°€ 메타데이터 (선택적)
54
+
55
+ Returns:
56
+ 트레이슀 객체 λ˜λŠ” None
57
+ """
58
+ if not self.enabled:
59
+ return None
60
+
61
+ try:
62
+ return self.langfuse.trace(
63
+ name=name,
64
+ user_id=user_id,
65
+ metadata=metadata or {},
66
+ )
67
+ except Exception as e:
68
+ print(f"트레이슀 생성 μ‹€νŒ¨: {e}")
69
+ return None
70
+
71
+ def log_generation(self, trace: Any, name: str, prompt: str, response: str, metadata: Optional[Dict[str, Any]] = None) -> None:
72
+ """
73
+ LLM 생성 λ‘œκΉ…
74
+
75
+ Args:
76
+ trace: 트레이슀 객체
77
+ name: 생성 이름
78
+ prompt: μž…λ ₯ ν”„λ‘¬ν”„νŠΈ
79
+ response: λͺ¨λΈ 응닡
80
+ metadata: μΆ”κ°€ 메타데이터 (선택적)
81
+ """
82
+ if not self.enabled or trace is None:
83
+ return
84
+
85
+ try:
86
+ trace.generation(
87
+ name=name,
88
+ model="user-defined-model",
89
+ prompt=prompt,
90
+ completion=response,
91
+ metadata=metadata or {},
92
+ )
93
+ except Exception as e:
94
+ print(f"생성 λ‘œκΉ… μ‹€νŒ¨: {e}")
95
+
96
+ def log_span(self, trace: Any, name: str, input_data: Any, output_data: Any, start_time: float, end_time: float) -> None:
97
+ """
98
+ 처리 ꡬ간 λ‘œκΉ…
99
+
100
+ Args:
101
+ trace: 트레이슀 객체
102
+ name: ꡬ간 이름
103
+ input_data: μž…λ ₯ 데이터
104
+ output_data: 좜λ ₯ 데이터
105
+ start_time: μ‹œμž‘ μ‹œκ°„
106
+ end_time: μ’…λ£Œ μ‹œκ°„
107
+ """
108
+ if not self.enabled or trace is None:
109
+ return
110
+
111
+ try:
112
+ trace.span(
113
+ name=name,
114
+ start_time=start_time,
115
+ end_time=end_time,
116
+ input=input_data,
117
+ output=output_data,
118
+ metadata={"duration_ms": (end_time - start_time) * 1000},
119
+ )
120
+ except Exception as e:
121
+ print(f"ꡬ간 λ‘œκΉ… μ‹€νŒ¨: {e}")
122
+
123
+ def end_trace(self, trace: Any) -> None:
124
+ """
125
+ 트레이슀 μ’…λ£Œ
126
+
127
+ Args:
128
+ trace: μ’…λ£Œν•  트레이슀 객체
129
+ """
130
+ if not self.enabled or trace is None:
131
+ return
132
+
133
+ try:
134
+ trace.update(status="success")
135
+ except Exception as e:
136
+ print(f"트레이슀 μ’…λ£Œ μ‹€νŒ¨: {e}")
optimized_document_processor.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CPU에 μ΅œμ ν™”λœ λ¬Έμ„œ 처리 λͺ¨λ“ˆ - 병렬 처리 적용
3
+ """
4
+ import os
5
+ import time
6
+ from typing import List, Dict, Any, Optional
7
+ from langchain.schema import Document
8
+ from concurrent.futures import ThreadPoolExecutor
9
+
10
+ # λ©€ν‹°ν”„λ‘œμ„Έμ‹± κ°€μ Έμ˜€κΈ°
11
+ import multiprocessing
12
+
13
+ try:
14
+ CPU_COUNT = multiprocessing.cpu_count()
15
+ except:
16
+ CPU_COUNT = 4
17
+
18
+ print(f"CPU μ½”μ–΄ 수: {CPU_COUNT}")
19
+
20
+ # docling 라이브러리 쑴재 μ—¬λΆ€ 확인
21
+ try:
22
+ from docling.datamodel.base_models import InputFormat
23
+ from docling.document_converter import DocumentConverter, PdfFormatOption
24
+ from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
25
+ from docling.chunking import HybridChunker
26
+
27
+ DOCLING_AVAILABLE = True
28
+ print("docling 라이브러리 μ‚¬μš© κ°€λŠ₯")
29
+ except ImportError:
30
+ print("docling 라이브러리λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. PyPDFLoader만 μ‚¬μš©ν•©λ‹ˆλ‹€.")
31
+ DOCLING_AVAILABLE = False
32
+
33
+ # LangChain λ¬Έμ„œ λ‘œλ”
34
+ from langchain_community.document_loaders import PyPDFLoader
35
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
36
+
37
+
38
class OptimizedDocumentProcessor:
    """
    CPU-optimized, parallel document processing class.

    Converts PDF files into LangChain ``Document`` chunks. Preferred path is
    the docling pipeline with its HybridChunker; when docling is unavailable
    or fails, falls back to PyPDFLoader + RecursiveCharacterTextSplitter.
    """

    def __init__(self,
                 chunk_size: int = 1000,
                 chunk_overlap: int = 200,
                 tokenizer: str = "Alibaba-NLP/gte-multilingual-base",  # corrected model path
                 max_workers: int = CPU_COUNT):
        """
        Initialize the document processor.

        Args:
            chunk_size: text chunk size (characters for the fallback splitter)
            chunk_overlap: overlap between adjacent chunks
            tokenizer: tokenizer name used by docling's HybridChunker
            max_workers: maximum number of parallel workers
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.tokenizer = tokenizer
        # Clamp to [1, CPU_COUNT] so we never oversubscribe the cores.
        self.max_workers = max(1, min(max_workers, CPU_COUNT))

        print(f"병렬 처리 μž‘μ—…μž 수: {self.max_workers}")

        # LangChain text splitter (used on the PyPDFLoader fallback path).
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""],
        )

        # docling components — only initialized when the library imported.
        if DOCLING_AVAILABLE:
            # Pipeline options: enable accurate table-structure extraction.
            self.pipeline_options = PdfPipelineOptions(do_table_structure=True)
            self.pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

            # PDF -> structured document converter.
            self.doc_converter = DocumentConverter(
                format_options={
                    InputFormat.PDF: PdfFormatOption(pipeline_options=self.pipeline_options)
                }
            )

            # HybridChunker; trust_remote_code is required by this tokenizer.
            self.hybrid_chunker = HybridChunker(
                tokenizer=tokenizer,
                chunk_size=chunk_size,
                overlap=chunk_overlap,
                tokenizer_kwargs={"trust_remote_code": True}  # allow remote code execution
            )

            print(f"docling μ΄ˆκΈ°ν™” μ™„λ£Œ: HybridChunker(청크 크기={chunk_size}, μ˜€λ²„λž©={chunk_overlap})")

    def process_with_docling(self, pdf_path: str) -> Dict[str, Any]:
        """
        Process a PDF file with docling.

        Args:
            pdf_path: path to the PDF file

        Returns:
            Dict with "content" (markdown export), "metadata", and
            "raw_document" (the docling document object).

        Raises:
            ImportError: if docling is not installed.
        """
        if not DOCLING_AVAILABLE:
            raise ImportError("docling λΌμ΄λΈŒλŸ¬λ¦¬κ°€ μ„€μΉ˜λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")

        try:
            start_time = time.time()

            # Convert the document.
            conv_res = self.doc_converter.convert(pdf_path)
            doc = conv_res.document

            # Record conversion time for diagnostics/metadata.
            conversion_time = time.time() - start_time
            print(f"PDF λ³€ν™˜ μ‹œκ°„: {conversion_time:.2f}초")

            # Basic metadata carried along with each chunk later.
            metadata = {
                "source": pdf_path,
                "title": os.path.basename(pdf_path),
                "processing_time": conversion_time
            }

            return {
                "content": doc.export_to_markdown(),
                "metadata": metadata,
                "raw_document": doc,
            }

        except Exception as e:
            print(f"docling으둜 λ¬Έμ„œ 처리 쀑 였λ₯˜ λ°œμƒ: {e}")
            raise

    def chunk_with_hybrid_chunker(self, doc: Any) -> List[Dict[str, Any]]:
        """
        Split a docling document into chunks using HybridChunker.

        Args:
            doc: docling document object

        Returns:
            List of chunk objects (shape defined by docling).
        """
        start_time = time.time()

        # Perform chunking; the chunker yields lazily, so materialize it.
        chunk_iter = self.hybrid_chunker.chunk(doc)
        chunks = list(chunk_iter)

        chunking_time = time.time() - start_time
        print(f"μ²­ν‚Ή μ‹œκ°„: {chunking_time:.2f}초 (청크 수: {len(chunks)})")

        return chunks

    def create_langchain_documents_from_chunks(self,
                                               chunks: List[Dict[str, Any]],
                                               metadata: Dict[str, Any]) -> List[Document]:
        """
        Convert docling chunks into LangChain Document objects.

        Args:
            chunks: chunk list produced by docling's HybridChunker
            metadata: per-file metadata copied onto every chunk

        Returns:
            List of LangChain Document objects.
        """
        documents = []

        for i, chunk in enumerate(chunks):
            # Per-chunk metadata: file metadata plus the chunk index.
            chunk_metadata = metadata.copy()
            chunk_metadata["chunk_id"] = i

            # Chunk text attribute name varies across docling versions;
            # probe .text, then .content, then fall back to str().
            if hasattr(chunk, "text"):
                content = chunk.text
            elif hasattr(chunk, "content"):
                content = chunk.content
            else:
                content = str(chunk)

            document = Document(
                page_content=content,
                metadata=chunk_metadata
            )
            documents.append(document)

        return documents

    def process_with_langchain(self, pdf_path: str) -> List[Document]:
        """
        Load a PDF with LangChain's PyPDFLoader (fallback path).

        Args:
            pdf_path: path to the PDF file

        Returns:
            List of LangChain Document objects (one per page).
        """
        start_time = time.time()

        try:
            loader = PyPDFLoader(pdf_path)
            documents = loader.load()

            processing_time = time.time() - start_time
            print(f"PyPDFLoader 처리 μ‹œκ°„: {processing_time:.2f}초")

            return documents
        except Exception as e:
            print(f"PyPDFLoader둜 λ¬Έμ„œ 처리 쀑 였λ₯˜ λ°œμƒ: {e}")
            raise

    def process_pdf(self, pdf_path: str, use_docling: bool = True) -> List[Document]:
        """
        Process one PDF file into Document chunks.

        Tries docling first (when requested and available); on any failure
        falls back to PyPDFLoader + the recursive text splitter.

        Args:
            pdf_path: path to the PDF file
            use_docling: whether to attempt the docling path

        Returns:
            List of chunked Document objects.
        """
        total_start_time = time.time()

        # docling is only usable if requested AND importable.
        can_use_docling = use_docling and DOCLING_AVAILABLE

        if can_use_docling:
            try:
                # 1. Process the PDF with docling.
                docling_result = self.process_with_docling(pdf_path)
                doc = docling_result["raw_document"]
                metadata = docling_result["metadata"]

                # 2. Chunk with HybridChunker.
                chunks = self.chunk_with_hybrid_chunker(doc)

                # 3. Convert chunks to LangChain Documents.
                documents = self.create_langchain_documents_from_chunks(chunks, metadata)

                total_time = time.time() - total_start_time
                print(f"docling 처리 μ™„λ£Œ: '{pdf_path}', {len(documents)} 청크, 총 {total_time:.2f}초")

                return documents
            except Exception as e:
                # Any docling failure degrades gracefully to the fallback.
                print(f"docling 처리 μ‹€νŒ¨, PyPDFLoader둜 λŒ€μ²΄: {e}")
                can_use_docling = False

        if not can_use_docling:
            # Fallback: PyPDFLoader pages split by the character splitter.
            documents = self.process_with_langchain(pdf_path)
            chunks = self.text_splitter.split_documents(documents)

            total_time = time.time() - total_start_time
            print(f"PyPDFLoader 처리 μ™„λ£Œ: '{pdf_path}', {len(chunks)} 청크, 총 {total_time:.2f}초")

            return chunks

    def process_directory_parallel(self, directory: str, use_docling: bool = True) -> List[Document]:
        """
        Process every PDF in a directory in parallel (multithreaded).

        Args:
            directory: directory containing PDF files
            use_docling: whether to attempt the docling path per file

        Returns:
            Combined list of chunks from all successfully processed files.
        """
        all_documents = []
        pdf_files = []

        # Collect PDF paths. NOTE(review): matches lowercase ".pdf" only;
        # files named *.PDF are skipped — confirm whether that is intended.
        for file in os.listdir(directory):
            if file.endswith(".pdf"):
                pdf_path = os.path.join(directory, file)
                pdf_files.append(pdf_path)

        if not pdf_files:
            print(f"'{directory}' 디렉토리에 PDF 파일이 μ—†μŠ΅λ‹ˆλ‹€.")
            return []

        print(f"총 {len(pdf_files)}개 PDF 파일 병렬 처리 μ‹œμž‘ (μ΅œλŒ€ {self.max_workers} μž‘μ—…μž)")
        start_time = time.time()

        # Fan out one process_pdf call per file across the thread pool.
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_pdf = {executor.submit(self.process_pdf, pdf_path, use_docling): pdf_path
                             for pdf_path in pdf_files}

            # Collect results in submission order; a per-file failure is
            # logged but does not abort the remaining files.
            for future in future_to_pdf:
                pdf_path = future_to_pdf[future]
                try:
                    chunks = future.result()
                    all_documents.extend(chunks)
                    print(f"'{os.path.basename(pdf_path)}' 처리 μ™„λ£Œ: {len(chunks)} 청크")
                except Exception as e:
                    print(f"'{pdf_path}' 처리 쀑 였λ₯˜ λ°œμƒ: {e}")

        total_time = time.time() - start_time
        print(f"병렬 처리 μ™„λ£Œ: 총 {len(all_documents)} 청크, 처리 μ‹œκ°„: {total_time:.2f}초")

        return all_documents

    def process_directory(self, directory: str, use_docling: bool = True, parallel: bool = True) -> List[Document]:
        """
        Process every PDF in a directory, parallel or sequential.

        Args:
            directory: directory containing PDF files
            use_docling: whether to attempt the docling path per file
            parallel: use the thread pool when True, else process one by one

        Returns:
            Combined list of chunks from all successfully processed files.
        """
        # Parallel path delegates entirely to process_directory_parallel.
        if parallel:
            return self.process_directory_parallel(directory, use_docling)

        # Sequential path.
        all_documents = []
        start_time = time.time()

        for file in os.listdir(directory):
            if file.endswith(".pdf"):
                pdf_path = os.path.join(directory, file)
                print(f"처리 쀑: {pdf_path}")

                try:
                    chunks = self.process_pdf(pdf_path, use_docling=use_docling)
                    all_documents.extend(chunks)
                except Exception as e:
                    print(f"'{pdf_path}' 처리 쀑 였λ₯˜ λ°œμƒ: {e}")

        total_time = time.time() - start_time
        print(f"순차 처리 μ™„λ£Œ: 총 {len(all_documents)} 청크, 처리 μ‹œκ°„: {total_time:.2f}초")

        return all_documents
rag_chain.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangChain을 ν™œμš©ν•œ RAG 체인 κ΅¬ν˜„
3
+ """
4
+ from typing import List, Dict, Any
5
+ from langchain.schema import Document
6
+ from langchain.prompts import PromptTemplate
7
+ from langchain_core.output_parsers import StrOutputParser
8
+ from langchain_core.runnables import RunnablePassthrough
9
+ from langchain_community.chat_models import ChatOllama
10
+ from langchain_openai import ChatOpenAI
11
+
12
+ from config import (
13
+ OLLAMA_HOST, LLM_MODEL, USE_OPENAI,
14
+ OPENAI_API_KEY, TOP_K_RETRIEVAL, TOP_K_RERANK
15
+ )
16
+ from vector_store import VectorStore
17
+ from reranker import Reranker
18
+
19
+
20
class RAGChain:
    def __init__(self, vector_store: VectorStore, use_reranker: bool = True):
        """
        Initialize the RAG chain; the LLM backend is chosen per environment.

        Args:
            vector_store: vector store instance used for retrieval
            use_reranker: whether to apply a cross-encoder reranker

        Raises:
            Exception: re-raised after logging if LLM or chain setup fails.
        """
        try:
            print("RAGChain μ΄ˆκΈ°ν™” μ‹œμž‘...")
            self.vector_store = vector_store
            self.use_reranker = use_reranker
            print(f"리랭컀 μ‚¬μš© μ—¬λΆ€: {use_reranker}")

            if use_reranker:
                try:
                    self.reranker = Reranker()
                    print("리랭컀 μ΄ˆκΈ°ν™” 성곡")
                except Exception as e:
                    # Reranker is optional: degrade to plain vector search.
                    print(f"리랭컀 μ΄ˆκΈ°ν™” μ‹€νŒ¨: {str(e)}")
                    self.reranker = None
                    self.use_reranker = False
            else:
                self.reranker = None

            # BUG FIX: the original referenced IS_HUGGINGFACE without importing
            # it anywhere in this module (the `from config import ...` list does
            # not include it), raising NameError at runtime. Import it from
            # config when it exists; default to False otherwise.
            try:
                from config import IS_HUGGINGFACE
            except ImportError:
                IS_HUGGINGFACE = False

            # Choose the LLM backend depending on the environment.
            if USE_OPENAI or IS_HUGGINGFACE:
                print(f"OpenAI λͺ¨λΈ μ΄ˆκΈ°ν™”: {LLM_MODEL}")
                print(f"API ν‚€ 쑴재 μ—¬λΆ€: {'있음' if OPENAI_API_KEY else 'μ—†μŒ'}")
                try:
                    self.llm = ChatOpenAI(
                        model_name=LLM_MODEL,
                        temperature=0.2,
                        api_key=OPENAI_API_KEY,
                    )
                    print("OpenAI λͺ¨λΈ μ΄ˆκΈ°ν™” 성곡")
                except Exception as e:
                    print(f"OpenAI λͺ¨λΈ μ΄ˆκΈ°ν™” μ‹€νŒ¨: {str(e)}")
                    raise
            else:
                try:
                    print(f"Ollama λͺ¨λΈ μ΄ˆκΈ°ν™”: {LLM_MODEL}")
                    self.llm = ChatOllama(
                        model=LLM_MODEL,
                        temperature=0.2,
                        base_url=OLLAMA_HOST,
                    )
                    print("Ollama λͺ¨λΈ μ΄ˆκΈ°ν™” 성곡")
                except Exception as e:
                    print(f"Ollama λͺ¨λΈ μ΄ˆκΈ°ν™” μ‹€νŒ¨: {str(e)}")
                    raise

            # Build the prompt and the retrieval -> LLM chain.
            print("RAG 체인 μ„€μ • μ‹œμž‘...")
            self.setup_chain()
            print("RAG 체인 μ„€μ • μ™„λ£Œ")
        except Exception as e:
            print(f"RAGChain μ΄ˆκΈ°ν™” 쀑 상세 였λ₯˜: {str(e)}")
            import traceback
            traceback.print_exc()
            raise

    def setup_chain(self) -> None:
        """
        Configure the prompt template and compose the RAG chain:
        {context, question} -> prompt -> LLM -> string output.
        """
        # Prompt template (Korean: this is model-facing runtime text).
        template = """
        λ‹€μŒ 정보λ₯Ό 기반으둜 μ§ˆλ¬Έμ— μ •ν™•ν•˜κ²Œ λ‹΅λ³€ν•΄μ£Όμ„Έμš”.

        질문: {question}

        μ°Έκ³  정보:
        {context}

        μ°Έκ³  정보에 닡이 μ—†λŠ” 경우 "제곡된 λ¬Έμ„œμ—μ„œ ν•΄λ‹Ή 정보λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."라고 λ‹΅λ³€ν•˜μ„Έμš”.
        닡변은 μ •ν™•ν•˜κ³  κ°„κ²°ν•˜κ²Œ μ œκ³΅ν•˜λ˜, μ°Έκ³  μ •λ³΄μ—μ„œ κ·Όκ±°λ₯Ό μ°Ύμ•„ μ„€λͺ…ν•΄μ£Όμ„Έμš”.
        μ°Έκ³  μ •λ³΄μ˜ μΆœμ²˜λ„ ν•¨κ»˜ μ•Œλ €μ£Όμ„Έμš”.
        """

        self.prompt = PromptTemplate.from_template(template)

        # Compose the runnable chain.
        self.chain = (
            {"context": self._retrieve, "question": RunnablePassthrough()}
            | self.prompt
            | self.llm
            | StrOutputParser()
        )

    def _retrieve(self, query: str) -> str:
        """
        Retrieve documents for a query and build the context string.

        Args:
            query: user question

        Returns:
            Context string listing each retrieved passage with its source.
        """
        # Vector similarity search first.
        docs = self.vector_store.similarity_search(query, k=TOP_K_RETRIEVAL)

        # Optionally rerank with the cross-encoder.
        if self.use_reranker and docs:
            docs = self.reranker.rerank(query, docs, top_k=TOP_K_RERANK)

        # Format each hit with its source (and page, when present).
        context_parts = []
        for i, doc in enumerate(docs, 1):
            source = doc.metadata.get("source", "μ•Œ 수 μ—†λŠ” 좜처")
            page = doc.metadata.get("page", "")
            source_info = f"{source}"
            if page:
                source_info += f" (νŽ˜μ΄μ§€: {page})"

            context_parts.append(f"[참고자료 {i}] - 좜처: {source_info}\n{doc.page_content}\n")

        return "\n".join(context_parts)

    def run(self, query: str) -> str:
        """
        Execute the full RAG pipeline for a user query.

        Args:
            query: user question

        Returns:
            Model answer string.
        """
        return self.chain.invoke(query)
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain>=0.1.0
2
+ langchain-community>=0.0.10
3
+ langchain-huggingface>=0.0.1
4
+ sentence-transformers>=2.2.2
5
+ faiss-cpu>=1.7.4
6
+ pypdf>=3.15.1
7
+ gradio>=4.0.0
8
+ python-dotenv>=1.0.0
9
+ torch>=2.0.0
10
+ transformers>=4.34.0
11
+ langchain-openai>=0.0.2
12
+ openai>=1.0.0
13
+ docling>=0.1.3
14
+ requests>=2.28.0
reranker.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 원격 μ½”λ“œ μ‹€ν–‰ μ˜΅μ…˜μ΄ μΆ”κ°€λœ 리랭컀 λͺ¨λ“ˆ
3
+ """
4
+ from typing import List, Dict, Tuple
5
+ import numpy as np
6
+ from sentence_transformers import CrossEncoder
7
+ from langchain.schema import Document
8
+ from config import RERANKER_MODEL
9
+
10
class Reranker:
    def __init__(self, model_name: str = RERANKER_MODEL):
        """
        Initialize the Cross-Encoder reranker.

        Args:
            model_name: Cross-Encoder model identifier to load
        """
        print(f"리랭컀 λͺ¨λΈ λ‘œλ“œ 쀑: {model_name}")

        # trust_remote_code is mandatory for this model family.
        self.model = CrossEncoder(
            model_name,
            trust_remote_code=True
        )

        print(f"리랭컀 λͺ¨λΈ λ‘œλ“œ μ™„λ£Œ: {model_name}")

    def rerank(self, query: str, documents: List[Document], top_k: int = 3) -> List[Document]:
        """
        Re-order retrieval results by cross-encoder relevance.

        Args:
            query: search query
            documents: documents returned by vector search
            top_k: number of top documents to keep

        Returns:
            The top_k documents, highest cross-encoder score first.
        """
        if not documents:
            return []

        # Build (query, passage) pairs for the cross-encoder.
        pairs = [(query, doc.page_content) for doc in documents]

        print(f"λ¦¬λž­ν‚Ή μˆ˜ν–‰ 쀑: {len(documents)}개 λ¬Έμ„œ")
        scores = self.model.predict(pairs)

        # Stable sort by descending score (ties keep retrieval order).
        ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)

        print(f"λ¦¬λž­ν‚Ή μ™„λ£Œ: μƒμœ„ {top_k}개 λ¬Έμ„œ 선택")

        return [doc for doc, _ in ranked[:top_k]]
simple_rag_chain.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ κ°„λ‹¨ν•œ RAG 체인 κ΅¬ν˜„ (λ””λ²„κΉ…μš©)
3
+ """
4
+ import os
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain.prompts import PromptTemplate
7
+ from langchain_core.output_parsers import StrOutputParser
8
+ from langchain_core.runnables import RunnablePassthrough
9
+
10
+
11
class SimpleRAGChain:
    def __init__(self, vector_store):
        """Initialize a minimal RAG chain (debugging aid)."""
        print("κ°„λ‹¨ν•œ RAG 체인 μ΄ˆκΈ°ν™” 쀑...")
        self.vector_store = vector_store

        # Resolve the OpenAI API key from the environment.
        openai_api_key = os.environ.get("OPENAI_API_KEY", "")
        print(f"API ν‚€ 섀정됨: {bool(openai_api_key)}")

        # Chat model used to answer questions.
        self.llm = ChatOpenAI(
            model_name="gpt-3.5-turbo",
            temperature=0.2,
            api_key=openai_api_key,
        )

        # Prompt template (Korean: model-facing runtime text).
        template = """
        λ‹€μŒ 정보λ₯Ό 기반으둜 μ§ˆλ¬Έμ— μ •ν™•ν•˜κ²Œ λ‹΅λ³€ν•΄μ£Όμ„Έμš”.

        질문: {question}

        μ°Έκ³  정보:
        {context}

        μ°Έκ³  정보에 닡이 μ—†λŠ” 경우 "제곡된 λ¬Έμ„œμ—μ„œ ν•΄λ‹Ή 정보λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."라고 λ‹΅λ³€ν•˜μ„Έμš”.
        """
        self.prompt = PromptTemplate.from_template(template)

        # Compose retrieval -> prompt -> LLM -> plain string.
        retrieval_map = {"context": self._retrieve, "question": RunnablePassthrough()}
        self.chain = retrieval_map | self.prompt | self.llm | StrOutputParser()
        print("κ°„λ‹¨ν•œ RAG 체인 μ΄ˆκΈ°ν™” μ™„λ£Œ")

    def _retrieve(self, query):
        """Fetch the top-3 matches and join their contents for the prompt."""
        try:
            hits = self.vector_store.similarity_search(query, k=3)
            contents = [doc.page_content for doc in hits]
            return "\n\n".join(contents)
        except Exception as exc:
            print(f"검색 쀑 였λ₯˜: {exc}")
            return "λ¬Έμ„œ 검색 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€."

    def run(self, query):
        """Run the chain for a query; return an error message on failure."""
        try:
            return self.chain.invoke(query)
        except Exception as exc:
            print(f"μ‹€ν–‰ 쀑 였λ₯˜: {exc}")
            return f"였λ₯˜ λ°œμƒ: {str(exc)}"
vector_store.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ κ°œμ„ λœ 벑터 μŠ€ν† μ–΄ λͺ¨λ“ˆ - Milvus μ„€μ • μ΅œμ ν™”
3
+ """
4
+ from typing import List, Dict, Any, Optional
5
+ import uuid
6
+ from langchain.schema import Document
7
+
8
+ # 벑터 μŠ€ν† μ–΄ μž„ν¬νŠΈ
9
+ try:
10
+ # μ΅œμ‹  버전 μž„ν¬νŠΈ
11
+ from langchain_milvus import Milvus
12
+ from langchain_community.vectorstores import FAISS
13
+ from langchain_huggingface import HuggingFaceEmbeddings
14
+ MODERN_IMPORTS = True
15
+ print("μ΅œμ‹  langchain νŒ¨ν‚€μ§€ μž„ν¬νŠΈ 성곡")
16
+ except ImportError:
17
+ # 이전 버전 μž„ν¬νŠΈ
18
+ from langchain_community.vectorstores import Milvus, FAISS
19
+ from langchain_community.embeddings import HuggingFaceEmbeddings
20
+ MODERN_IMPORTS = False
21
+ print("λ ˆκ±°μ‹œ langchain_community νŒ¨ν‚€μ§€ μ‚¬μš©")
22
+
23
+ from config import MILVUS_HOST, MILVUS_PORT, MILVUS_COLLECTION, EMBEDDING_MODEL
24
+
25
class VectorStore:
    def __init__(self, use_milvus: bool = True):
        """
        Initialize the vector store wrapper.

        Args:
            use_milvus: use Milvus when True, otherwise local FAISS
        """
        self.use_milvus = use_milvus

        # Embedding model configuration (CPU, normalized embeddings).
        print(f"μž„λ² λ”© λͺ¨λΈ λ‘œλ“œ 쀑: {EMBEDDING_MODEL}")
        model_kwargs = {
            "device": "cpu",
            "trust_remote_code": True  # required by this embedding model
        }
        encode_kwargs = {"normalize_embeddings": True}

        self.embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs
        )
        # Lazily created by create_or_load()/load_local().
        self.vector_store = None

        print(f"μž„λ² λ”© λͺ¨λΈ μ΄ˆκΈ°ν™” μ™„λ£Œ: {EMBEDDING_MODEL}")

    def init_milvus(self) -> Milvus:
        """
        Connect to an existing Milvus collection.

        Returns:
            Milvus vector store instance.
        """
        connection_args = {
            "host": MILVUS_HOST,
            "port": MILVUS_PORT,
        }

        # FLAT index + cosine metric: exact search, suits normalized vectors.
        index_params = {
            "index_type": "FLAT",
            "metric_type": "COSINE",
            "params": {}  # FLAT takes no extra parameters
        }

        return Milvus(
            embedding_function=self.embeddings,
            collection_name=MILVUS_COLLECTION,
            connection_args=connection_args,
            index_params=index_params
        )

    def init_faiss(self) -> FAISS:
        """
        Initialize an empty FAISS vector store (local fallback).

        Returns:
            FAISS vector store instance.
        """
        # NOTE(review): FAISS.from_documents([]) raises in several langchain
        # versions (no vectors to build an index from) — confirm the
        # empty-store path is ever reached with this langchain version.
        return FAISS.from_documents([], self.embeddings)

    def create_or_load(self, documents: Optional[List[Document]] = None) -> Any:
        """
        Create or load the underlying vector store.

        Args:
            documents: documents to index (None loads/creates an empty store)

        Returns:
            The active vector store instance (Milvus or FAISS).
        """
        if self.use_milvus:
            if documents:
                # Rebuild the Milvus collection from the given documents.
                try:
                    connection_args = {
                        "host": MILVUS_HOST,
                        "port": MILVUS_PORT,
                    }

                    index_params = {
                        "index_type": "FLAT",     # exact search
                        "metric_type": "COSINE",  # cosine similarity
                        "params": {}
                    }

                    print(f"Milvus μ»¬λ ‰μ…˜ 생성: {MILVUS_COLLECTION} (κΈ°μ‘΄ μ»¬λ ‰μ…˜ μ‚­μ œ)")

                    self.vector_store = Milvus.from_documents(
                        documents=documents,
                        embedding=self.embeddings,
                        collection_name=MILVUS_COLLECTION,
                        connection_args=connection_args,
                        index_params=index_params,
                        drop_old=True  # drop and rebuild the collection
                    )

                    print(f"Milvus μ»¬λ ‰μ…˜ 생성 μ™„λ£Œ: {len(documents)}개 λ¬Έμ„œ 인덱싱됨")

                except Exception as e:
                    # Fall back to local FAISS if Milvus is unreachable.
                    print(f"Milvus μ»¬λ ‰μ…˜ 생성 μ‹€νŒ¨: {e}")
                    print("λŒ€μ²΄ λ°©μ•ˆμœΌλ‘œ FAISS μ‚¬μš©")
                    self.use_milvus = False
                    self.vector_store = FAISS.from_documents(documents, self.embeddings)
            else:
                # Load the existing Milvus collection.
                try:
                    self.vector_store = self.init_milvus()
                except Exception as e:
                    print(f"Milvus μ»¬λ ‰μ…˜ λ‘œλ“œ μ‹€νŒ¨: {e}")
                    print("λŒ€μ²΄ λ°©μ•ˆμœΌλ‘œ FAISS μ‚¬μš©")
                    self.use_milvus = False
                    self.vector_store = self.init_faiss()
        else:
            # FAISS path.
            if documents:
                print(f"FAISS 인덱슀 생성: {len(documents)}개 λ¬Έμ„œ")
                self.vector_store = FAISS.from_documents(documents, self.embeddings)
                print("FAISS 인덱슀 생성 μ™„λ£Œ")
            else:
                self.vector_store = self.init_faiss()
                print("빈 FAISS 인덱슀 μ΄ˆκΈ°ν™” μ™„λ£Œ")

        return self.vector_store

    def add_documents(self, documents: List[Document]) -> None:
        """
        Add documents to the vector store, creating it if needed.

        Args:
            documents: documents to add
        """
        if self.vector_store is None:
            self.create_or_load(documents)
        else:
            # FIX: the original had identical Milvus/FAISS branches here;
            # both stores expose the same add_documents API.
            self.vector_store.add_documents(documents)

    def similarity_search(self, query: str, k: int = 5) -> List[Document]:
        """
        Run a vector similarity search.

        Args:
            query: search query
            k: number of results to return

        Returns:
            The k most similar documents.

        Raises:
            ValueError: if the store has not been initialized yet.
        """
        if self.vector_store is None:
            raise ValueError("벑터 μŠ€ν† μ–΄κ°€ μ΄ˆκΈ°ν™”λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")

        print(f"검색 쿼리: '{query}', μƒμœ„ {k}개 κ²°κ³Ό μš”μ²­")
        results = self.vector_store.similarity_search(query, k=k)
        print(f"검색 μ™„λ£Œ: {len(results)}개 κ²°κ³Ό 찾음")

        return results

    def save_local(self, path: str = "faiss_index") -> None:
        """
        Save the FAISS index to disk (no-op when using Milvus).

        Args:
            path: directory to save the index into
        """
        if not self.use_milvus and self.vector_store is not None:
            self.vector_store.save_local(path)
            print(f"FAISS 인덱슀 둜컬 μ €μž₯ μ™„λ£Œ: {path}")

    def load_local(self, path: str = "faiss_index") -> None:
        """
        Load a FAISS index from disk (no-op when using Milvus).

        Falls back to a fresh empty index when loading fails.

        Args:
            path: directory to load the index from
        """
        if not self.use_milvus:
            try:
                print(f"FAISS 인덱슀 λ‘œλ“œ 쀑: {path}")

                # allow_dangerous_deserialization: FAISS indexes are pickled;
                # only load indexes this application wrote itself.
                self.vector_store = FAISS.load_local(
                    path,
                    self.embeddings,
                    allow_dangerous_deserialization=True
                )
                print(f"FAISS 인덱슀 λ‘œλ“œ μ™„λ£Œ: {path}")
            except Exception as e:
                print(f"FAISS 인덱슀 λ‘œλ“œ μ‹€νŒ¨: {e}")

                # Print full details for diagnosis.
                import traceback
                traceback.print_exc()

                # Recover with a brand-new empty index.
                self.vector_store = self.init_faiss()
                print("μƒˆ FAISS 인덱슀 μ΄ˆκΈ°ν™”λ¨")
voice_rag_app.py ADDED
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ μŒμ„±μΈμ‹ κΈ°λŠ₯이 μΆ”κ°€λœ RAG 챗봇 μ•±
3
+ """
4
+ import os
5
+ import time
6
+ import tempfile
7
+ from typing import List, Dict, Tuple, Any, Optional
8
+ import hashlib
9
+ import pickle
10
+ import json
11
+
12
+ # κΈ°μ‘΄ μž„ν¬νŠΈ
13
+ from config import PDF_DIRECTORY, CHUNK_SIZE, CHUNK_OVERLAP, LLM_MODEL
14
+ from optimized_document_processor import OptimizedDocumentProcessor
15
+ from vector_store import VectorStore
16
+ from langchain.schema import Document
17
+
18
+ # ν΄λ‘œλ°” STT λͺ¨λ“ˆ μž„ν¬νŠΈ
19
+ from clova_stt import ClovaSTT
20
+
21
+ # μ•ˆμ „ν•œ μž„ν¬νŠΈ
22
+ try:
23
+ from rag_chain import RAGChain
24
+
25
+ RAG_CHAIN_AVAILABLE = True
26
+ except ImportError:
27
+ print("RAG 체인 λͺ¨λ“ˆμ„ λ‘œλ“œν•  수 μ—†μŠ΅λ‹ˆλ‹€.")
28
+ RAG_CHAIN_AVAILABLE = False
29
+
30
+
31
+ class VoiceRAGChatApp:
32
+ """
33
+ μŒμ„±μΈμ‹ κΈ°λŠ₯이 μΆ”κ°€λœ RAG 챗봇 μ• ν”Œλ¦¬μΌ€μ΄μ…˜
34
+ """
35
+
36
    def __init__(self):
        """
        Initialize the voice-enabled RAG chatbot application.

        Sets up cache directories, the document processor, a local FAISS
        vector store, the Clova STT client, and immediately triggers
        document loading via auto_process_documents().
        """
        # Data directory layout: PDFs in, cached chunks/index out.
        self.pdf_directory = PDF_DIRECTORY
        self.cache_directory = "cached_data"
        self.index_file = os.path.join(self.cache_directory, "file_index.json")
        self.chunks_dir = os.path.join(self.cache_directory, "chunks")
        self.vector_index_dir = os.path.join(self.cache_directory, "vector_index")

        # Create directories if they do not exist yet.
        os.makedirs(self.pdf_directory, exist_ok=True)
        os.makedirs(self.cache_directory, exist_ok=True)
        os.makedirs(self.chunks_dir, exist_ok=True)
        os.makedirs(self.vector_index_dir, exist_ok=True)

        print(f"PDF λ¬Έμ„œ 디렉토리: '{self.pdf_directory}'")
        print(f"μΊμ‹œ 디렉토리: '{self.cache_directory}'")

        # Document processor (chunking parameters from config).
        self.document_processor = OptimizedDocumentProcessor(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )

        # Vector store: local FAISS (Milvus disabled).
        self.vector_store = VectorStore(use_milvus=False)

        # Load the persisted file index (file path -> processing metadata).
        self.file_index = self._load_file_index()

        # Base state.
        self.documents = []
        self.processed_files = []
        self.is_initialized = False

        # Clova speech-to-text client.
        self.stt_client = ClovaSTT()
        print("μŒμ„±μΈμ‹(STT) κΈ°λŠ₯이 μ΄ˆκΈ°ν™”λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")

        # Automatically load and process documents at startup.
        print("λ¬Έμ„œ μžλ™ λ‘œλ“œ 및 처리 μ‹œμž‘...")
        self.auto_process_documents()
80
+
81
+ def _load_file_index(self) -> Dict[str, Dict[str, Any]]:
82
+ """
83
+ 파일 인덱슀 λ‘œλ“œ
84
+
85
+ Returns:
86
+ 파일 경둜 -> 메타데이터 λ§€ν•‘
87
+ """
88
+ if os.path.exists(self.index_file):
89
+ try:
90
+ with open(self.index_file, 'r', encoding='utf-8') as f:
91
+ return json.load(f)
92
+ except Exception as e:
93
+ print(f"인덱슀 파일 λ‘œλ“œ μ‹€νŒ¨: {e}")
94
+ return {}
95
+ return {}
96
+
97
+ def _save_file_index(self) -> None:
98
+ """
99
+ 파일 인덱슀 μ €μž₯
100
+ """
101
+ with open(self.index_file, 'w', encoding='utf-8') as f:
102
+ json.dump(self.file_index, f, ensure_ascii=False, indent=2)
103
+
104
+ def _calculate_file_hash(self, file_path: str) -> str:
105
+ """
106
+ 파일 ν•΄μ‹œ 계산
107
+
108
+ Args:
109
+ file_path: 파일 경둜
110
+
111
+ Returns:
112
+ MD5 ν•΄μ‹œκ°’
113
+ """
114
+ hasher = hashlib.md5()
115
+ with open(file_path, 'rb') as f:
116
+ buf = f.read(65536)
117
+ while len(buf) > 0:
118
+ hasher.update(buf)
119
+ buf = f.read(65536)
120
+ return hasher.hexdigest()
121
+
122
+ def _is_file_processed(self, file_path: str) -> bool:
123
+ """
124
+ 파일이 이미 μ²˜λ¦¬λ˜μ—ˆκ³  λ³€κ²½λ˜μ§€ μ•Šμ•˜λŠ”μ§€ 확인
125
+
126
+ Args:
127
+ file_path: 파일 경둜
128
+
129
+ Returns:
130
+ 처리 μ—¬λΆ€
131
+ """
132
+ if file_path not in self.file_index:
133
+ return False
134
+
135
+ # ν˜„μž¬ ν•΄μ‹œκ°’ 계산
136
+ current_hash = self._calculate_file_hash(file_path)
137
+
138
+ # μ €μž₯된 ν•΄μ‹œκ°’κ³Ό 비ꡐ
139
+ if self.file_index[file_path]['hash'] != current_hash:
140
+ print(f"파일 λ³€κ²½ 감지: {file_path}")
141
+ return False
142
+
143
+ # 청크 파일 쑴재 확인
144
+ chunks_path = self.file_index[file_path]['chunks_path']
145
+ if not os.path.exists(chunks_path):
146
+ return False
147
+
148
+ return True
149
+
150
+ def _get_chunks_path(self, file_hash: str) -> str:
151
+ """
152
+ 청크 파일 경둜 생성
153
+
154
+ Args:
155
+ file_hash: 파일 ν•΄μ‹œκ°’
156
+
157
+ Returns:
158
+ 청크 파일 경둜
159
+ """
160
+ return os.path.join(self.chunks_dir, f"{file_hash}.pkl")
161
+
162
+ def _save_chunks(self, file_path: str, chunks: List[Document]) -> None:
163
+ """
164
+ 청크 데이터 μ €μž₯
165
+
166
+ Args:
167
+ file_path: 원본 파일 경둜
168
+ chunks: λ¬Έμ„œ 청크 리슀트
169
+ """
170
+ # ν•΄μ‹œ 계산
171
+ file_hash = self._calculate_file_hash(file_path)
172
+
173
+ # 청크 파일 경둜
174
+ chunks_path = self._get_chunks_path(file_hash)
175
+
176
+ # 청크 데이터 μ €μž₯
177
+ with open(chunks_path, 'wb') as f:
178
+ pickle.dump(chunks, f)
179
+
180
+ # 인덱슀 μ—…λ°μ΄νŠΈ
181
+ self.file_index[file_path] = {
182
+ 'hash': file_hash,
183
+ 'chunks_path': chunks_path,
184
+ 'last_processed': time.time(),
185
+ 'chunks_count': len(chunks)
186
+ }
187
+
188
+ # 인덱슀 μ €μž₯
189
+ self._save_file_index()
190
+
191
+ print(f"청크 μ €μž₯ μ™„λ£Œ: {file_path} ({len(chunks)}개 청크)")
192
+
193
+ def _load_chunks(self, file_path: str) -> List[Document]:
194
+ """
195
+ μ €μž₯된 청크 데이터 λ‘œλ“œ
196
+
197
+ Args:
198
+ file_path: 파일 경둜
199
+
200
+ Returns:
201
+ λ¬Έμ„œ 청크 리슀트
202
+ """
203
+ chunks_path = self.file_index[file_path]['chunks_path']
204
+ with open(chunks_path, 'rb') as f:
205
+ chunks = pickle.load(f)
206
+
207
+ print(f"청크 λ‘œλ“œ μ™„λ£Œ: {file_path} ({len(chunks)}개 청크)")
208
+ return chunks
209
+
210
+ def _process_pdf_file(self, file_path: str) -> List[Document]:
211
+ """
212
+ PDF 파일 처리 - docling μ‹€νŒ¨ μ‹œ PyPDFLoader μ‚¬μš©
213
+
214
+ Args:
215
+ file_path: μ²˜λ¦¬ν•  PDF 파일 경둜
216
+
217
+ Returns:
218
+ 처리된 λ¬Έμ„œ 청크 리슀트
219
+ """
220
+ try:
221
+ print(f"docling으둜 처리 μ‹œλ„: {file_path}")
222
+
223
+ # docling μ‚¬μš© μ‹œλ„
224
+ try:
225
+ # 10초 νƒ€μž„μ•„μ›ƒ μ„€μ • (μ˜΅μ…˜)
226
+ import signal
227
+
228
+ def timeout_handler(signum, frame):
229
+ raise TimeoutError("docling 처리 μ‹œκ°„ 초과")
230
+
231
+ # λ¦¬λˆ…μŠ€/λ§₯μ—μ„œλ§Œ μž‘λ™ (μœˆλ„μš°μ—μ„œλŠ” λ¬΄μ‹œλ¨)
232
+ try:
233
+ signal.signal(signal.SIGALRM, timeout_handler)
234
+ signal.alarm(60) # 60초 νƒ€μž„μ•„μ›ƒ
235
+ except:
236
+ pass
237
+
238
+ # docling으둜 처리 μ‹œλ„
239
+ chunks = self.document_processor.process_pdf(file_path, use_docling=True)
240
+
241
+ # νƒ€μž„μ•„μ›ƒ μ·¨μ†Œ
242
+ try:
243
+ signal.alarm(0)
244
+ except:
245
+ pass
246
+
247
+ return chunks
248
+
249
+ except Exception as e:
250
+ # docling 였λ₯˜ 확인
251
+ error_str = str(e)
252
+ if "Invalid code point" in error_str or "RuntimeError" in error_str:
253
+ print(f"docling 처리 였λ₯˜ (μ½”λ“œ 포인트 문제): {error_str}")
254
+ print("PyPDFLoader둜 λŒ€μ²΄ν•©λ‹ˆλ‹€.")
255
+ else:
256
+ print(f"docling 처리 였λ₯˜: {error_str}")
257
+ print("PyPDFLoader둜 λŒ€μ²΄ν•©λ‹ˆλ‹€.")
258
+
259
+ # PyPDFLoader둜 λŒ€μ²΄
260
+ try:
261
+ return self.document_processor.process_pdf(file_path, use_docling=False)
262
+ except Exception as inner_e:
263
+ print(f"PyPDFLoader 처리 였λ₯˜: {inner_e}")
264
+ raise # 두 방법 λͺ¨λ‘ μ‹€νŒ¨ν•˜λ©΄ μ˜ˆμ™Έ λ°œμƒ
265
+
266
+ except Exception as e:
267
+ print(f"PDF 처리 쀑 μ‹¬κ°ν•œ 였λ₯˜: {e}")
268
+ # 빈 청크라도 λ°˜ν™˜ν•˜μ—¬ 전체 μ²˜λ¦¬κ°€ μ€‘λ‹¨λ˜μ§€ μ•Šλ„λ‘ 함
269
+ return []
270
+
271
    def auto_process_documents(self) -> str:
        """
        Automatically process every PDF file in the documents folder.

        Loads cached chunks for unchanged files, (re)processes new or
        modified ones, then loads/updates/creates the vector index, saves
        it, and initializes the RAG chain.

        Returns:
            A status message describing the processing result.
        """
        try:
            start_time = time.time()

            # Collect the list of PDF files in the documents directory.
            pdf_files = []
            for filename in os.listdir(self.pdf_directory):
                if filename.lower().endswith('.pdf'):
                    pdf_files.append(os.path.join(self.pdf_directory, filename))

            if not pdf_files:
                return f"'{self.pdf_directory}' 폴더에 PDF 파일이 μ—†μŠ΅λ‹ˆλ‹€."

            print(f"발견된 PDF 파일: {len(pdf_files)}개")

            # Process every PDF file, tracking outcomes per category.
            new_files = []
            updated_files = []
            cached_files = []
            failed_files = []
            all_chunks = []

            for file_path in pdf_files:
                if self._is_file_processed(file_path):
                    # Unchanged file: load its chunks from the cache.
                    chunks = self._load_chunks(file_path)
                    all_chunks.extend(chunks)
                    cached_files.append(file_path)
                    self.processed_files.append(os.path.basename(file_path))
                else:
                    # New or modified file: process it from scratch.
                    print(f"처리 쀑: {file_path}")

                    try:
                        # Use the docling-with-fallback PDF processing method.
                        chunks = self._process_pdf_file(file_path)

                        if chunks:  # only cache when chunks were extracted
                            # Persist the chunks and update the file index.
                            self._save_chunks(file_path, chunks)

                            all_chunks.extend(chunks)
                            if file_path in self.file_index:
                                updated_files.append(file_path)
                            else:
                                new_files.append(file_path)

                            self.processed_files.append(os.path.basename(file_path))
                        else:
                            print(f"'{file_path}' 처리 μ‹€νŒ¨: μΆ”μΆœλœ 청크 μ—†μŒ")
                            failed_files.append(file_path)
                    except Exception as e:
                        print(f"'{file_path}' 처리 쀑 였λ₯˜: {e}")
                        failed_files.append(file_path)

            # Keep the full chunk list on the instance.
            self.documents = all_chunks

            processing_time = time.time() - start_time
            print(f"λ¬Έμ„œ 처리 μ™„λ£Œ: {len(all_chunks)}개 청크, {processing_time:.2f}초")

            # Check whether a saved vector index already exists on disk.
            if os.path.exists(self.vector_index_dir) and any(os.listdir(self.vector_index_dir)):
                # Load the existing vector index.
                try:
                    print("μ €μž₯된 벑터 인덱슀 λ‘œλ“œ 쀑...")
                    vector_store_loaded = self.vector_store.load_local(self.vector_index_dir)

                    # Confirm the index actually loaded.
                    if self.vector_store.vector_store is not None:
                        # Update the index when there are new/changed documents.
                        if new_files or updated_files:
                            print("벑터 인덱슀 μ—…λ°μ΄νŠΈ 쀑...")
                            self.vector_store.add_documents(self.documents)

                        print("벑터 인덱슀 λ‘œλ“œ μ™„λ£Œ")
                    else:
                        print("벑터 인덱슀λ₯Ό λ‘œλ“œν–ˆμœΌλ‚˜ μœ νš¨ν•˜μ§€ μ•ŠμŒ, μƒˆλ‘œ μƒμ„±ν•©λ‹ˆλ‹€.")
                        self.vector_store.create_or_load(self.documents)

                except Exception as e:
                    print(f"벑터 인덱슀 λ‘œλ“œ μ‹€νŒ¨, μƒˆλ‘œ μƒμ„±ν•©λ‹ˆλ‹€: {e}")
                    # Print the full traceback for diagnosis.
                    import traceback
                    traceback.print_exc()

                    # Build a fresh vector index instead.
                    self.vector_store.create_or_load(self.documents)
            else:
                # No saved index: build a fresh one.
                print("μƒˆ 벑터 인덱슀 생성 쀑...")
                self.vector_store.create_or_load(self.documents)

            # Save the vector index for the next run.
            if self.vector_store and self.vector_store.vector_store is not None:
                try:
                    print(f"벑터 인덱슀 μ €μž₯ 쀑: {self.vector_index_dir}")
                    save_result = self.vector_store.save_local(self.vector_index_dir)
                    print(f"벑터 인덱슀 μ €μž₯ μ™„λ£Œ: {self.vector_index_dir}")
                except Exception as e:
                    print(f"벑터 인덱슀 μ €μž₯ μ‹€νŒ¨: {e}")
                    # Print the full traceback for diagnosis.
                    import traceback
                    traceback.print_exc()
            else:
                print("벑터 μΈλ±μŠ€κ°€ μ΄ˆκΈ°ν™”λ˜μ§€ μ•Šμ•„ μ €μž₯ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.")

            # Initialize the RAG chain on top of the vector store.
            if RAG_CHAIN_AVAILABLE:
                self.rag_chain = RAGChain(self.vector_store)
                self.is_initialized = True

                total_time = time.time() - start_time

                status_message = (
                    f"λ¬Έμ„œ 처리 μ™„λ£Œ!\n"
                    f"- 처리된 파일: {len(self.processed_files)}개\n"
                    f"- μΊμ‹œλœ 파일: {len(cached_files)}개\n"
                    f"- μƒˆ 파일: {len(new_files)}개\n"
                    f"- μ—…λ°μ΄νŠΈλœ 파일: {len(updated_files)}개\n"
                    f"- μ‹€νŒ¨ν•œ 파일: {len(failed_files)}개\n"
                    f"- 총 청크 수: {len(self.documents)}개\n"
                    f"- 처리 μ‹œκ°„: {total_time:.2f}초\n"
                    f"이제 μ§ˆλ¬Έν•  μ€€λΉ„κ°€ λ˜μ—ˆμŠ΅λ‹ˆλ‹€!"
                )

                print(status_message)
                return status_message
            else:
                return "RAG 체인을 μ΄ˆκΈ°ν™”ν•  수 μ—†μŠ΅λ‹ˆλ‹€. ν•„μš”ν•œ λΌμ΄λΈŒλŸ¬λ¦¬κ°€ μ„€μΉ˜λ˜μ–΄ μžˆλŠ”μ§€ ν™•μΈν•˜μ„Έμš”."

        except Exception as e:
            error_message = f"λ¬Έμ„œ 처리 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
            print(error_message)
            import traceback
            traceback.print_exc()
            return error_message
+ def reset_cache(self) -> str:
416
+ """
417
+ μΊμ‹œ μ΄ˆκΈ°ν™”
418
+
419
+ Returns:
420
+ κ²°κ³Ό λ©”μ‹œμ§€
421
+ """
422
+ try:
423
+ # μ²­οΏ½οΏ½οΏ½ 파일 μ‚­μ œ
424
+ for filename in os.listdir(self.chunks_dir):
425
+ file_path = os.path.join(self.chunks_dir, filename)
426
+ if os.path.isfile(file_path):
427
+ os.remove(file_path)
428
+
429
+ # 인덱슀 μ΄ˆκΈ°ν™”
430
+ self.file_index = {}
431
+ self._save_file_index()
432
+
433
+ # 벑터 인덱슀 μ‚­μ œ
434
+ for filename in os.listdir(self.vector_index_dir):
435
+ file_path = os.path.join(self.vector_index_dir, filename)
436
+ if os.path.isfile(file_path):
437
+ os.remove(file_path)
438
+
439
+ self.documents = []
440
+ self.processed_files = []
441
+ self.is_initialized = False
442
+
443
+ return "μΊμ‹œκ°€ μ΄ˆκΈ°ν™”λ˜μ—ˆμŠ΅λ‹ˆλ‹€. λ‹€μŒ μ‹€ν–‰ μ‹œ λͺ¨λ“  λ¬Έμ„œκ°€ λ‹€μ‹œ μ²˜λ¦¬λ©λ‹ˆλ‹€."
444
+ except Exception as e:
445
+ return f"μΊμ‹œ μ΄ˆκΈ°ν™” 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
446
+
447
+ def process_query(self, query: str, chat_history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
448
+ """
449
+ μ‚¬μš©μž 쿼리 처리
450
+
451
+ Args:
452
+ query: μ‚¬μš©μž 질문
453
+ chat_history: λŒ€ν™” 기둝
454
+
455
+ Returns:
456
+ 응닡 및 μ—…λ°μ΄νŠΈλœ λŒ€ν™” 기둝
457
+ """
458
+ if not query: # λΉ„μ–΄μžˆλŠ” 쿼리 처리
459
+ return "", chat_history
460
+
461
+ if not self.is_initialized:
462
+ response = "λ¬Έμ„œ λ‘œλ“œκ°€ μ΄ˆκΈ°ν™”λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μžλ™ λ‘œλ“œλ₯Ό μ‹œλ„ν•©λ‹ˆλ‹€."
463
+ chat_history.append((query, response))
464
+
465
+ # μžλ™ λ‘œλ“œ μ‹œλ„
466
+ try:
467
+ self.auto_process_documents()
468
+ if not self.is_initialized:
469
+ response = "λ¬Έμ„œλ₯Ό λ‘œλ“œν•  수 μ—†μŠ΅λ‹ˆλ‹€. 'documents' 폴더에 PDF 파일이 μžˆλŠ”μ§€ ν™•μΈν•˜μ„Έμš”."
470
+ chat_history.append((query, response))
471
+ return "", chat_history
472
+ except Exception as e:
473
+ response = f"λ¬Έμ„œ λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
474
+ chat_history.append((query, response))
475
+ return "", chat_history
476
+
477
+ try:
478
+ # RAG 체인 μ‹€ν–‰ 및 응닡 생성
479
+ start_time = time.time()
480
+ response = self.rag_chain.run(query)
481
+ end_time = time.time()
482
+
483
+ query_time = end_time - start_time
484
+ print(f"쿼리 처리 μ‹œκ°„: {query_time:.2f}초")
485
+
486
+ chat_history.append((query, response))
487
+ return "", chat_history
488
+ except Exception as e:
489
+ error_msg = f"였λ₯˜ λ°œμƒ: {str(e)}"
490
+ chat_history.append((query, error_msg))
491
+ return "", chat_history
492
+
493
+ def process_voice_query(self, audio, chat_history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
494
+ """
495
+ μŒμ„± 쿼리 처리
496
+
497
+ Args:
498
+ audio: λ…ΉμŒλœ μ˜€λ””μ˜€ 데이터
499
+ chat_history: λŒ€ν™” 기둝
500
+
501
+ Returns:
502
+ 응닡 및 μ—…λ°μ΄νŠΈλœ λŒ€ν™” 기둝
503
+ """
504
+ if audio is None:
505
+ return "", chat_history
506
+
507
+ try:
508
+ # μž„μ‹œ νŒŒμΌμ— μ˜€λ””μ˜€ μ €μž₯
509
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
510
+ temp_path = temp_file.name
511
+ temp_file.write(audio)
512
+
513
+ print(f"[STT] μž„μ‹œ μ˜€λ””μ˜€ 파일 생성: {temp_path}")
514
+
515
+ # STT μ‹€ν–‰
516
+ result = self.stt_client.recognize_file(temp_path)
517
+
518
+ # μž„μ‹œ 파일 μ‚­μ œ
519
+ try:
520
+ os.unlink(temp_path)
521
+ print("[STT] μž„μ‹œ μ˜€λ””μ˜€ 파일 μ‚­μ œλ¨")
522
+ except Exception as e:
523
+ print(f"[STT] μž„μ‹œ 파일 μ‚­μ œ μ‹€νŒ¨: {e}")
524
+
525
+ # STT 결과 처리
526
+ if "error" in result:
527
+ error_msg = f"μŒμ„±μΈμ‹ 였λ₯˜: {result.get('error')}"
528
+ print(f"[STT] {error_msg}")
529
+ chat_history.append(("μŒμ„± λ©”μ‹œμ§€", error_msg))
530
+ return "", chat_history
531
+
532
+ # μΈμ‹λœ ν…μŠ€νŠΈ μΆ”μΆœ
533
+ recognized_text = result.get("text", "")
534
+ if not recognized_text:
535
+ error_msg = "μŒμ„±μ„ 인식할 수 μ—†μŠ΅λ‹ˆλ‹€. λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”."
536
+ print("[STT] μΈμ‹λœ ν…μŠ€νŠΈ μ—†μŒ")
537
+ chat_history.append(("μŒμ„± λ©”μ‹œμ§€", error_msg))
538
+ return "", chat_history
539
+
540
+ print(f"[STT] μΈμ‹λœ ν…μŠ€νŠΈ: {recognized_text}")
541
+
542
+ # μΈμ‹λœ ν…μŠ€νŠΈλ‘œ 쿼리 처리 (μŒμ„± λ©”μ‹œμ§€ 접두어 μΆ”κ°€)
543
+ return self.process_query(f"🎀 {recognized_text}", chat_history)
544
+
545
+ except Exception as e:
546
+ error_msg = f"μŒμ„± 처리 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
547
+ print(f"[STT] {error_msg}")
548
+ chat_history.append(("μŒμ„± λ©”μ‹œμ§€", error_msg))
549
+ return "", chat_history
550
+
551
    def launch_app(self) -> None:
        """
        Launch the Gradio app with speech-recognition support.

        Builds the chat UI (status/cache panel, text and voice input tabs),
        wires the event handlers to the processing methods, and starts the
        local server.
        """
        import gradio as gr

        with gr.Blocks(title="μŒμ„±μΈμ‹ κΈ°λŠ₯이 μΆ”κ°€λœ PDF λ¬Έμ„œ 기반 RAG 챗봇") as app:
            gr.Markdown("# μŒμ„±μΈμ‹ κΈ°λŠ₯이 μΆ”κ°€λœ PDF λ¬Έμ„œ 기반 RAG 챗봇")
            gr.Markdown(f"* μ‚¬μš© 쀑인 LLM λͺ¨λΈ: **{LLM_MODEL}**")
            gr.Markdown(f"* PDF λ¬Έμ„œ 폴더: **{self.pdf_directory}**")
            gr.Markdown("* 넀이버 ν΄λ‘œλ°” μŒμ„±μΈμ‹ API 톡합")

            with gr.Row():
                with gr.Column(scale=1):
                    # Document status section.
                    status_box = gr.Textbox(
                        label="λ¬Έμ„œ 처리 μƒνƒœ",
                        value=f"처리된 λ¬Έμ„œ ({len(self.processed_files)}개): {', '.join(self.processed_files)}",
                        lines=5,
                        interactive=False
                    )

                    # Cache management buttons.
                    refresh_button = gr.Button("λ¬Έμ„œ μƒˆλ‘œ 읽기", variant="primary")
                    reset_button = gr.Button("μΊμ‹œ μ΄ˆκΈ°ν™”", variant="stop")

                    # Details about the processed/cached files.
                    with gr.Accordion("μΊμ‹œ μ„ΈλΆ€ 정보", open=False):
                        file_info = ""
                        for file_path, info in self.file_index.items():
                            file_info += f"- {os.path.basename(file_path)}: {info['chunks_count']}개 청크\n"

                        cache_info = gr.Textbox(
                            label="μΊμ‹œλœ 파일 정보",
                            value=file_info or "μΊμ‹œλœ 파일이 μ—†μŠ΅λ‹ˆλ‹€.",
                            lines=5,
                            interactive=False
                        )

                with gr.Column(scale=2):
                    # Chat interface.
                    chatbot = gr.Chatbot(
                        label="λŒ€ν™” λ‚΄μš©",
                        bubble_full_width=False,
                        height=500,
                        show_copy_button=True
                    )

                    with gr.Tabs() as input_tabs:
                        # Text input tab.
                        with gr.Tab("ν…μŠ€νŠΈ μž…λ ₯"):
                            # Lay out the text box and send button side by side.
                            with gr.Row():
                                query_box = gr.Textbox(
                                    label="질문",
                                    placeholder="처리된 λ¬Έμ„œ λ‚΄μš©μ— λŒ€ν•΄ μ§ˆλ¬Έν•˜μ„Έμš”...",
                                    lines=2,
                                    scale=4
                                )
                                submit_btn = gr.Button("전솑", variant="primary", scale=1)

                        # Voice input tab.
                        with gr.Tab("μŒμ„± μž…λ ₯"):
                            audio_input = gr.Audio(
                                label="마이크 μž…λ ₯",
                                sources=["microphone"],
                                type="bytes",
                                format="wav"
                            )
                            voice_submit_btn = gr.Button("μŒμ„± 질문 전솑", variant="primary")

                    clear_chat_button = gr.Button("λŒ€ν™” μ΄ˆκΈ°ν™”")

            # Event handler wiring.
            refresh_button.click(
                fn=self.auto_process_documents,
                inputs=[],
                outputs=[status_box]
            )

            reset_button.click(
                fn=lambda: (self.reset_cache(), self.auto_process_documents()),
                inputs=[],
                outputs=[status_box]
            )

            # Text send button click.
            submit_btn.click(
                fn=self.process_query,
                inputs=[query_box, chatbot],
                outputs=[query_box, chatbot]
            )

            # Enter-key submit in the text box.
            query_box.submit(
                fn=self.process_query,
                inputs=[query_box, chatbot],
                outputs=[query_box, chatbot]
            )

            # Voice send button click.
            voice_submit_btn.click(
                fn=self.process_voice_query,
                inputs=[audio_input, chatbot],
                outputs=[audio_input, chatbot]
            )

            # Clear-conversation button.
            clear_chat_button.click(
                fn=lambda: [],
                outputs=[chatbot]
            )

        # Start the app (local only, no public share link).
        app.launch(share=False)
if __name__ == "__main__":
    # Build the app (this auto-processes documents) and launch the UI.
    chat_app = VoiceRAGChatApp()
    chat_app.launch_app()