Add application file

- .gitignore +34 -0
- app.py +670 -0
- config.py +60 -0
- dir +154 -0
- monitoring.py +136 -0
- optimized_document_processor.py +346 -0
- rag_chain.py +151 -0
- requirements.txt +14 -0
- reranker.py +58 -0
- simple_rag_chain.py +66 -0
- vector_store.py +235 -0
- voice_rag_app.py +670 -0
.gitignore
ADDED
@@ -0,0 +1,34 @@
# Environment variables
.env

# Cache and temporary files
__pycache__/
*.py[cod]
*.so
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Folders
documents/
faiss_index/
cached_data/
preprocessed_index/
**/__pycache__/

# Project-specific files
parts_extraction_cache.json
.venv/
app.py
ADDED
@@ -0,0 +1,670 @@
"""
RAG chatbot app with speech-to-text (STT) support
"""
import os
import time
import hashlib
import pickle
import json
import tempfile
from typing import List, Dict, Tuple, Any

from langchain.schema import Document

from config import (
    PDF_DIRECTORY, CHUNK_SIZE, CHUNK_OVERLAP, LLM_MODEL,
    STT_LANGUAGE, IS_HUGGINGFACE
)
from optimized_document_processor import OptimizedDocumentProcessor
from vector_store import VectorStore
from clova_stt import ClovaSTT

# Import the RAG chain defensively
try:
    from rag_chain import RAGChain
    RAG_CHAIN_AVAILABLE = True
except ImportError:
    print("Could not load the RAG chain module.")
    RAG_CHAIN_AVAILABLE = False


class AutoRAGChatApp:
    """
    RAG chatbot that automatically processes the PDF files in the documents
    folder and provides speech recognition
    """

    def __init__(self):
        """
        Initialize the RAG chatbot application
        """
        # Data directories
        self.pdf_directory = PDF_DIRECTORY
        self.cache_directory = "cached_data"
        self.index_file = os.path.join(self.cache_directory, "file_index.json")
        self.chunks_dir = os.path.join(self.cache_directory, "chunks")
        self.vector_index_dir = os.path.join(self.cache_directory, "vector_index")

        # Create directories
        os.makedirs(self.pdf_directory, exist_ok=True)
        os.makedirs(self.cache_directory, exist_ok=True)
        os.makedirs(self.chunks_dir, exist_ok=True)
        os.makedirs(self.vector_index_dir, exist_ok=True)

        print(f"PDF document directory: '{self.pdf_directory}'")
        print(f"Cache directory: '{self.cache_directory}'")

        # Initialize components
        self.document_processor = OptimizedDocumentProcessor(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )

        # Initialize the vector store
        self.vector_store = VectorStore(use_milvus=False)

        # Load the file index
        self.file_index = self._load_file_index()

        # Basic state
        self.documents = []
        self.processed_files = []
        self.is_initialized = False

        # Initialize the CLOVA STT client
        self.stt_client = ClovaSTT()
        print("Speech-to-text (STT) initialized.")

        # Automatically load and process documents at startup
        print("Starting automatic document loading and processing...")
        self.auto_process_documents()

    def _process_pdf_file(self, file_path: str) -> List[Document]:
        """
        Process a PDF file, falling back to PyPDFLoader if docling fails

        Args:
            file_path: path of the PDF file to process

        Returns:
            List of processed document chunks
        """
        try:
            print(f"Trying docling: {file_path}")

            # Try docling first
            try:
                # Optional timeout via SIGALRM
                import signal

                def timeout_handler(signum, frame):
                    raise TimeoutError("docling processing timed out")

                # Works only on Linux/macOS (ignored on Windows)
                try:
                    signal.signal(signal.SIGALRM, timeout_handler)
                    signal.alarm(60)  # 60-second timeout
                except Exception:
                    pass

                # Process with docling
                chunks = self.document_processor.process_pdf(file_path, use_docling=True)

                # Cancel the timeout
                try:
                    signal.alarm(0)
                except Exception:
                    pass

                return chunks

            except Exception as e:
                # Inspect the docling error
                error_str = str(e)
                if "Invalid code point" in error_str or "RuntimeError" in error_str:
                    print(f"docling error (invalid code point): {error_str}")
                    print("Falling back to PyPDFLoader.")
                else:
                    print(f"docling error: {error_str}")
                    print("Falling back to PyPDFLoader.")

                # Fall back to PyPDFLoader
                try:
                    return self.document_processor.process_pdf(file_path, use_docling=False)
                except Exception as inner_e:
                    print(f"PyPDFLoader error: {inner_e}")
                    raise  # Re-raise if both methods fail

        except Exception as e:
            print(f"Fatal error while processing PDF: {e}")
            # Return an empty list so overall processing is not interrupted
            return []

    def _load_file_index(self) -> Dict[str, Dict[str, Any]]:
        """
        Load the file index

        Returns:
            Mapping of file path -> metadata
        """
        if os.path.exists(self.index_file):
            try:
                with open(self.index_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except Exception as e:
                print(f"Failed to load index file: {e}")
                return {}
        return {}

    def _save_file_index(self) -> None:
        """
        Save the file index
        """
        with open(self.index_file, 'w', encoding='utf-8') as f:
            json.dump(self.file_index, f, ensure_ascii=False, indent=2)

    def _calculate_file_hash(self, file_path: str) -> str:
        """
        Compute a file hash

        Args:
            file_path: file path

        Returns:
            MD5 hash value
        """
        hasher = hashlib.md5()
        with open(file_path, 'rb') as f:
            buf = f.read(65536)
            while len(buf) > 0:
                hasher.update(buf)
                buf = f.read(65536)
        return hasher.hexdigest()

    def _is_file_processed(self, file_path: str) -> bool:
        """
        Check whether the file was already processed and has not changed

        Args:
            file_path: file path

        Returns:
            Whether the file counts as processed
        """
        if file_path not in self.file_index:
            return False

        # Compute the current hash
        current_hash = self._calculate_file_hash(file_path)

        # Compare with the stored hash
        if self.file_index[file_path]['hash'] != current_hash:
            print(f"File change detected: {file_path}")
            return False

        # Check that the chunk file exists
        chunks_path = self.file_index[file_path]['chunks_path']
        if not os.path.exists(chunks_path):
            return False

        return True

    def _get_chunks_path(self, file_hash: str) -> str:
        """
        Build the chunk-file path

        Args:
            file_hash: file hash value

        Returns:
            Chunk file path
        """
        return os.path.join(self.chunks_dir, f"{file_hash}.pkl")

    def _save_chunks(self, file_path: str, chunks: List[Document]) -> None:
        """
        Persist chunk data

        Args:
            file_path: original file path
            chunks: list of document chunks
        """
        # Compute the hash
        file_hash = self._calculate_file_hash(file_path)

        # Chunk file path
        chunks_path = self._get_chunks_path(file_hash)

        # Save the chunk data
        with open(chunks_path, 'wb') as f:
            pickle.dump(chunks, f)

        # Update the index
        self.file_index[file_path] = {
            'hash': file_hash,
            'chunks_path': chunks_path,
            'last_processed': time.time(),
            'chunks_count': len(chunks)
        }

        # Save the index
        self._save_file_index()

        print(f"Chunks saved: {file_path} ({len(chunks)} chunks)")

    def _load_chunks(self, file_path: str) -> List[Document]:
        """
        Load persisted chunk data

        Args:
            file_path: file path

        Returns:
            List of document chunks
        """
        chunks_path = self.file_index[file_path]['chunks_path']
        with open(chunks_path, 'rb') as f:
            chunks = pickle.load(f)

        print(f"Chunks loaded: {file_path} ({len(chunks)} chunks)")
        return chunks

    def auto_process_documents(self) -> str:
        """
        Automatically process the PDF files in the documents folder

        Returns:
            Status message
        """
        try:
            start_time = time.time()

            # Collect PDF files
            pdf_files = []
            for filename in os.listdir(self.pdf_directory):
                if filename.lower().endswith('.pdf'):
                    pdf_files.append(os.path.join(self.pdf_directory, filename))

            if not pdf_files:
                return f"No PDF files found in '{self.pdf_directory}'."

            print(f"Found {len(pdf_files)} PDF file(s)")

            # Process the PDFs in the folder
            new_files = []
            updated_files = []
            cached_files = []
            failed_files = []
            all_chunks = []

            for file_path in pdf_files:
                if self._is_file_processed(file_path):
                    # Load chunks from the cache
                    chunks = self._load_chunks(file_path)
                    all_chunks.extend(chunks)
                    cached_files.append(file_path)
                    self.processed_files.append(os.path.basename(file_path))
                else:
                    # Process a new or changed file
                    print(f"Processing: {file_path}")

                    try:
                        # Use the improved PDF processing method
                        chunks = self._process_pdf_file(file_path)

                        if chunks:  # Save only when chunks were produced
                            # Save the chunks
                            self._save_chunks(file_path, chunks)

                            all_chunks.extend(chunks)
                            if file_path in self.file_index:
                                updated_files.append(file_path)
                            else:
                                new_files.append(file_path)

                            self.processed_files.append(os.path.basename(file_path))
                        else:
                            print(f"Failed to process '{file_path}': no chunks extracted")
                            failed_files.append(file_path)
                    except Exception as e:
                        print(f"Error while processing '{file_path}': {e}")
                        failed_files.append(file_path)

            # Keep all chunks
            self.documents = all_chunks

            processing_time = time.time() - start_time
            print(f"Document processing complete: {len(all_chunks)} chunks, {processing_time:.2f}s")

            # Check the vector-index save path
            if os.path.exists(self.vector_index_dir) and any(os.listdir(self.vector_index_dir)):
                # Load the existing vector index
                try:
                    print("Loading saved vector index...")
                    vector_store_loaded = self.vector_store.load_local(self.vector_index_dir)

                    # Verify the index loaded
                    if self.vector_store.vector_store is not None:
                        # Update the index if there are new or changed documents
                        if new_files or updated_files:
                            print("Updating vector index...")
                            self.vector_store.add_documents(self.documents)

                        print("Vector index loaded")
                    else:
                        print("Vector index loaded but invalid; rebuilding.")
                        self.vector_store.create_or_load(self.documents)

                except Exception as e:
                    print(f"Failed to load vector index; rebuilding: {e}")
                    # Print the full traceback
                    import traceback
                    traceback.print_exc()

                    # Create a new vector index
                    self.vector_store.create_or_load(self.documents)
            else:
                # Create a new vector index
                print("Creating a new vector index...")
                self.vector_store.create_or_load(self.documents)

            # Save the vector index
            if self.vector_store and self.vector_store.vector_store is not None:
                try:
                    print(f"Saving vector index: {self.vector_index_dir}")
                    save_result = self.vector_store.save_local(self.vector_index_dir)
                    print(f"Vector index saved: {self.vector_index_dir}")
                except Exception as e:
                    print(f"Failed to save vector index: {e}")
                    # Print the full traceback
                    import traceback
                    traceback.print_exc()
            else:
                print("Vector index was not initialized; skipping save.")

            # Initialize the RAG chain
            if RAG_CHAIN_AVAILABLE:
                self.rag_chain = RAGChain(self.vector_store)
                self.is_initialized = True

                total_time = time.time() - start_time

                status_message = (
                    f"Document processing complete!\n"
                    f"- Processed files: {len(self.processed_files)}\n"
                    f"- Cached files: {len(cached_files)}\n"
                    f"- New files: {len(new_files)}\n"
                    f"- Updated files: {len(updated_files)}\n"
                    f"- Failed files: {len(failed_files)}\n"
                    f"- Total chunks: {len(self.documents)}\n"
                    f"- Processing time: {total_time:.2f}s\n"
                    f"Ready to answer questions!"
                )

                print(status_message)
                return status_message
            else:
                return "Could not initialize the RAG chain. Check that the required libraries are installed."

        except Exception as e:
            error_message = f"Error during document processing: {str(e)}"
            print(error_message)
            import traceback
            traceback.print_exc()
            return error_message

    def reset_cache(self) -> str:
        """
        Reset the cache

        Returns:
            Status message
        """
        try:
            # Delete chunk files
            for filename in os.listdir(self.chunks_dir):
                file_path = os.path.join(self.chunks_dir, filename)
                if os.path.isfile(file_path):
                    os.remove(file_path)

            # Reset the index
            self.file_index = {}
            self._save_file_index()

            # Delete the vector index
            for filename in os.listdir(self.vector_index_dir):
                file_path = os.path.join(self.vector_index_dir, filename)
                if os.path.isfile(file_path):
                    os.remove(file_path)

            self.documents = []
            self.processed_files = []
            self.is_initialized = False

            return "Cache cleared. All documents will be reprocessed on the next run."
        except Exception as e:
            return f"Error while clearing the cache: {str(e)}"

    def process_query(self, query: str, chat_history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
        """
        Handle a user query

        Args:
            query: user question
            chat_history: chat history

        Returns:
            Response and updated chat history
        """
        if not query:  # Ignore empty queries
            return "", chat_history

        if not self.is_initialized:
            response = "Document loading is not initialized. Attempting automatic loading."
            chat_history.append((query, response))

            # Try automatic loading
            try:
                self.auto_process_documents()
                if not self.is_initialized:
                    response = "Could not load documents. Check that PDF files exist in the 'documents' folder."
                    chat_history.append((query, response))
                    return "", chat_history
            except Exception as e:
                response = f"Error while loading documents: {str(e)}"
                chat_history.append((query, response))
                return "", chat_history

        try:
            # Run the RAG chain and generate a response
            start_time = time.time()
            response = self.rag_chain.run(query)
            end_time = time.time()

            query_time = end_time - start_time
            print(f"Query processed in {query_time:.2f}s")

            chat_history.append((query, response))
            return "", chat_history
        except Exception as e:
            error_msg = f"Error: {str(e)}"
            chat_history.append((query, error_msg))
            return "", chat_history

    def process_voice_query(self, audio, chat_history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
        """
        Handle a voice query

        Args:
            audio: recorded audio data
            chat_history: chat history

        Returns:
            Response and updated chat history
        """
        if audio is None:
            return "", chat_history

        try:
            # Write the audio to a temporary file
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                temp_path = temp_file.name
                temp_file.write(audio)

            print(f"[STT] Temporary audio file created: {temp_path}")

            # Run STT with the language configured in config.py
            result = self.stt_client.recognize_file(temp_path, language=STT_LANGUAGE)

            # Delete the temporary file
            try:
                os.unlink(temp_path)
                print("[STT] Temporary audio file deleted")
            except Exception as e:
                print(f"[STT] Failed to delete temporary file: {e}")

            # Handle the STT result
            if "error" in result:
                error_msg = f"Speech recognition error: {result.get('error')}"
                print(f"[STT] {error_msg}")
                chat_history.append(("Voice message", error_msg))
                return "", chat_history

            # Extract the recognized text
            recognized_text = result.get("text", "")
            if not recognized_text:
                error_msg = "Could not recognize the speech. Please try again."
                print("[STT] No text recognized")
                chat_history.append(("Voice message", error_msg))
                return "", chat_history

            print(f"[STT] Recognized text: {recognized_text}")

            # Process the recognized text as a query (with a microphone prefix)
            return self.process_query(f"🎤 {recognized_text}", chat_history)

        except Exception as e:
            error_msg = f"Error during voice processing: {str(e)}"
            print(f"[STT] {error_msg}")
            chat_history.append(("Voice message", error_msg))
            return "", chat_history

    def launch_app(self) -> None:
        """
        Launch the Gradio app with speech recognition
        """
        import gradio as gr

        with gr.Blocks(title="PDF document RAG chatbot with speech recognition") as app:
            gr.Markdown("# PDF Document RAG Chatbot with Speech Recognition")
            gr.Markdown(f"* LLM model in use: **{LLM_MODEL}**")
            gr.Markdown(f"* PDF document folder: **{self.pdf_directory}**")
            gr.Markdown("* Naver CLOVA speech recognition API integrated")

            with gr.Row():
                with gr.Column(scale=1):
                    # Document status section
                    status_box = gr.Textbox(
                        label="Document status",
                        value=f"Processed documents ({len(self.processed_files)}): {', '.join(self.processed_files)}",
                        lines=5,
                        interactive=False
                    )

                    # Cache management buttons
                    refresh_button = gr.Button("Reload documents", variant="primary")
                    reset_button = gr.Button("Reset cache", variant="stop")

                    # Processed file info
                    with gr.Accordion("Cache details", open=False):
                        file_info = ""
                        for file_path, info in self.file_index.items():
                            file_info += f"- {os.path.basename(file_path)}: {info['chunks_count']} chunks\n"

                        cache_info = gr.Textbox(
                            label="Cached file info",
                            value=file_info or "No cached files.",
                            lines=5,
                            interactive=False
                        )

                with gr.Column(scale=2):
                    # Chat interface
                    chatbot = gr.Chatbot(
                        label="Conversation",
                        bubble_full_width=False,
                        height=500,
                        show_copy_button=True
                    )

                    with gr.Tabs() as input_tabs:
                        # Text input tab
                        with gr.Tab("Text input"):
                            # Place the text box and send button side by side
                            with gr.Row():
                                query_box = gr.Textbox(
                                    label="Question",
                                    placeholder="Ask about the processed documents...",
                                    lines=2,
                                    scale=4
                                )
                                submit_btn = gr.Button("Send", variant="primary", scale=1)

                        # Voice input tab
                        with gr.Tab("Voice input"):
                            audio_input = gr.Audio(
                                label="Microphone input",
                                sources=["microphone"],
                                type="bytes",
                                format="wav"
                            )
                            voice_submit_btn = gr.Button("Send voice question", variant="primary")

                    clear_chat_button = gr.Button("Clear conversation")

            # Wire up event handlers
            refresh_button.click(
                fn=self.auto_process_documents,
                inputs=[],
                outputs=[status_box]
            )

            reset_button.click(
                # Reset the cache, then reprocess; return only the new status text
                fn=lambda: (self.reset_cache(), self.auto_process_documents())[1],
                inputs=[],
                outputs=[status_box]
            )

            # Text send button click
            submit_btn.click(
                fn=self.process_query,
                inputs=[query_box, chatbot],
                outputs=[query_box, chatbot]
            )

            # Enter-key submit
            query_box.submit(
                fn=self.process_query,
                inputs=[query_box, chatbot],
                outputs=[query_box, chatbot]
            )

            # Voice send button click
            voice_submit_btn.click(
                fn=self.process_voice_query,
                inputs=[audio_input, chatbot],
                outputs=[audio_input, chatbot]
            )

            # Clear conversation button
            clear_chat_button.click(
                fn=lambda: [],
                outputs=[chatbot]
            )

        # Launch the app
        app.launch(share=False)


if __name__ == "__main__":
    app = AutoRAGChatApp()
    app.launch_app()
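For reference, a minimal sketch of exercising the class above without the Gradio UI. It assumes the repository modules are importable and at least one PDF sits in `documents/`; the question text is illustrative only:

from app import AutoRAGChatApp

rag_app = AutoRAGChatApp()        # processes ./documents during construction
history = []
_, history = rag_app.process_query("What does the manual say about maintenance?", history)
print(history[-1][1])             # the generated answer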
config.py
ADDED
@@ -0,0 +1,60 @@
"""
Settings for the vector store, embedding model, LLM, and other components
"""
import os
from dotenv import load_dotenv

# Load the .env file if present (for local environments)
load_dotenv(verbose=True)

# Environment detection
IS_HUGGINGFACE = os.getenv('SPACE_ID') is not None

# API keys and environment settings
# (secrets must come from the environment; never commit literal keys)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY", "")
LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY", "")
LANGFUSE_HOST = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")

# Naver CLOVA STT API settings
NAVER_CLIENT_ID = os.getenv("NAVER_CLIENT_ID", "")
NAVER_CLIENT_SECRET = os.getenv("NAVER_CLIENT_SECRET", "")

# Check the Naver CLOVA API keys
if NAVER_CLIENT_ID and NAVER_CLIENT_SECRET:
    print("Naver CLOVA STT API keys are configured.")
else:
    print("Warning: Naver CLOVA STT API keys are not set.")
    print("Set the NAVER_CLIENT_ID and NAVER_CLIENT_SECRET environment variables to use STT.")

# Milvus vector DB settings
MILVUS_HOST = os.getenv("MILVUS_HOST", "localhost")
MILVUS_PORT = os.getenv("MILVUS_PORT", "19530")
MILVUS_COLLECTION = "pdf_documents"

# Embedding model settings
EMBEDDING_MODEL = "Alibaba-NLP/gte-multilingual-base"  # multilingual model
RERANKER_MODEL = "Alibaba-NLP/gte-multilingual-reranker-base"  # multilingual reranker

# Ollama host, defined unconditionally so `from config import OLLAMA_HOST` works in every environment
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")

# LLM model settings (auto-selected by environment)
if IS_HUGGINGFACE:
    # Use OpenAI on HuggingFace Spaces
    USE_OPENAI = True
    LLM_MODEL = "gpt-3.5-turbo"  # or another suitable model
    print("HuggingFace Spaces environment detected: using an OpenAI model")
else:
    # Use Ollama locally
    USE_OPENAI = os.getenv("USE_OPENAI", "False").lower() == "true"
    LLM_MODEL = os.getenv("LLM_MODEL", "gemma3:latest" if not USE_OPENAI else "gpt-3.5-turbo")
    print(f"Local environment: using an {'OpenAI' if USE_OPENAI else 'Ollama'} model")

# App settings
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
TOP_K_RETRIEVAL = 5  # number of vector-search results
TOP_K_RERANK = 3     # results kept after reranking
PDF_DIRECTORY = "documents"  # directory holding the PDF documents

# Speech recognition settings
STT_LANGUAGE = "Kor"  # default language (Kor, Eng, Jpn, Chn, ...)
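For local testing, a minimal sketch of steering the selection logic above through environment variables before the module is imported. All values here are hypothetical placeholders, not real credentials:

import os

# Hypothetical values for illustration; set real ones in your own environment.
os.environ["USE_OPENAI"] = "true"
os.environ["OPENAI_API_KEY"] = "sk-your-key-here"
os.environ["LLM_MODEL"] = "gpt-3.5-turbo"

import config  # reads the environment at import time and prints the selected backend
print(config.USE_OPENAI, config.LLM_MODEL)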
dir
ADDED
@@ -0,0 +1,154 @@
"""
Synonym handling module
"""
import os
import sys
import re
from typing import Dict, List, Optional, Set

# Default synonym dictionary (used when no MP_synonyms.py file is found)
DEFAULT_SYNONYMS = {
    "엑추레이터": "액추에이터",
    "액추에이터": "액추에이터",
    "모터": "액추에이터",
    "컨박": "컨트롤박스"
}


class SynonymsHandler:
    """
    Class that resolves synonyms for part names
    """

    def __init__(self, synonyms_file: Optional[str] = None):
        """
        Initialize the synonym handler

        Args:
            synonyms_file: path to a synonyms file (optional)
        """
        self.synonyms = {}
        self.loaded = False

        # 1. Check the explicitly provided file path
        if synonyms_file and os.path.exists(synonyms_file):
            self._load_from_file(synonyms_file)

        # 2. Check a common location (.venv/SYNONYMS/MP_synonyms.py)
        elif os.path.exists(".venv/SYNONYMS/MP_synonyms.py"):
            self._load_from_file(".venv/SYNONYMS/MP_synonyms.py")

        # 3. Check the current directory
        elif os.path.exists("MP_synonyms.py"):
            self._load_from_file("MP_synonyms.py")

        # 4. Fall back to the defaults
        else:
            print("Synonyms file not found; using the default synonym dictionary.")
            self.synonyms = DEFAULT_SYNONYMS
            self.loaded = True

    def _load_from_file(self, file_path: str) -> None:
        """
        Load the synonym dictionary from a file

        Args:
            file_path: path of the synonyms file
        """
        try:
            # Read the file
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Extract the SYNONYMS dictionary
            synonyms_match = re.search(r'SYNONYMS\s*=\s*\{(.*?)\}', content, re.DOTALL)
            if synonyms_match:
                # Parse without executing the file
                synonyms_str = "{" + synonyms_match.group(1) + "}"

                # Extract key/value pairs with a regex
                pattern = r'"([^"]*)"\s*:\s*"([^"]*)"'
                matches = re.findall(pattern, synonyms_str)

                self.synonyms = {key: value for key, value in matches}
                self.loaded = True
                print(f"Synonym dictionary loaded: {file_path}, {len(self.synonyms)} entries")
            else:
                print(f"Could not find a SYNONYMS dictionary in the file: {file_path}")
                self.synonyms = DEFAULT_SYNONYMS
                self.loaded = True

        except Exception as e:
            print(f"Error while loading the synonym dictionary: {e}")
            self.synonyms = DEFAULT_SYNONYMS
            self.loaded = True

    def find_in_text(self, text: str) -> List[str]:
        """
        Find synonyms in a text

        Args:
            text: text to search

        Returns:
            List of standardized part names found
        """
        if not text or not self.loaded:
            return []

        # Lowercase for case-insensitive matching
        text = text.lower()

        found_parts = set()

        # Check whether any synonym keyword appears in the text
        for keyword, standard_name in self.synonyms.items():
            if keyword.lower() in text:
                found_parts.add(standard_name)

        return list(found_parts)

    def standardize(self, part_name: str) -> str:
        """
        Standardize a part name

        Args:
            part_name: part name to standardize

        Returns:
            Standardized part name
        """
        if not part_name or not self.loaded:
            return part_name

        # Compare case-insensitively
        part_lower = part_name.lower().strip()

        # Look up in the synonym dictionary
        for keyword, standard_name in self.synonyms.items():
            if part_lower == keyword.lower():
                return standard_name

        # Return the original name if nothing matched
        return part_name

    def standardize_parts_list(self, parts: List[str]) -> List[str]:
        """
        Standardize a list of part names

        Args:
            parts: list of part names to standardize

        Returns:
            List of standardized part names
        """
        if not parts or not self.loaded:
            return parts

        standardized = set()

        for part in parts:
            if part:
                standardized.add(self.standardize(part))

        return list(standardized)
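For context, a minimal sketch of using the handler above. Note this file is committed under the bare name `dir`, so it cannot be imported as a module by that name; the class is shown in scope directly, and the expected outputs depend on the dictionary entries:

handler = SynonymsHandler()                      # falls back to DEFAULT_SYNONYMS here
print(handler.standardize("모터"))                # -> "액추에이터" with the defaults above
print(handler.find_in_text("모터 교체 방법"))      # -> ["액추에이터"]
print(handler.standardize_parts_list(["모터", "컨박"]))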
monitoring.py
ADDED
@@ -0,0 +1,136 @@
"""
Monitoring via Langfuse (optional)
"""
from typing import Dict, Any, Optional
import time
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Read settings
LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY", "")
LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY", "")
LANGFUSE_HOST = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")

class LangfuseMonitoring:
    def __init__(self):
        """
        Initialize Langfuse monitoring (optional feature)
        """
        self.enabled = False
        print("Initializing monitoring...")

        # Check whether Langfuse is installed
        try:
            from langfuse import Langfuse

            if LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY:
                try:
                    self.langfuse = Langfuse(
                        public_key=LANGFUSE_PUBLIC_KEY,
                        secret_key=LANGFUSE_SECRET_KEY,
                        host=LANGFUSE_HOST,
                    )
                    self.enabled = True
                    print("Langfuse monitoring enabled.")
                except Exception as e:
                    print(f"Langfuse initialization failed: {e}")
            else:
                print("Langfuse API keys are not set. Monitoring is disabled.")
        except ImportError:
            print("The langfuse package is not installed. Monitoring is disabled.")
            print("Install it with: pip install langfuse")

    def start_trace(self, name: str, user_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None) -> Any:
        """
        Start a new trace

        Args:
            name: trace name
            user_id: user ID (optional)
            metadata: extra metadata (optional)

        Returns:
            Trace object or None
        """
        if not self.enabled:
            return None

        try:
            return self.langfuse.trace(
                name=name,
                user_id=user_id,
                metadata=metadata or {},
            )
        except Exception as e:
            print(f"Failed to create trace: {e}")
            return None

    def log_generation(self, trace: Any, name: str, prompt: str, response: str, metadata: Optional[Dict[str, Any]] = None) -> None:
        """
        Log an LLM generation

        Args:
            trace: trace object
            name: generation name
            prompt: input prompt
            response: model response
            metadata: extra metadata (optional)
        """
        if not self.enabled or trace is None:
            return

        try:
            trace.generation(
                name=name,
                model="user-defined-model",
                prompt=prompt,
                completion=response,
                metadata=metadata or {},
            )
        except Exception as e:
            print(f"Failed to log generation: {e}")

    def log_span(self, trace: Any, name: str, input_data: Any, output_data: Any, start_time: float, end_time: float) -> None:
        """
        Log a processing span

        Args:
            trace: trace object
            name: span name
            input_data: input data
            output_data: output data
            start_time: start time
            end_time: end time
        """
        if not self.enabled or trace is None:
            return

        try:
            trace.span(
                name=name,
                start_time=start_time,
                end_time=end_time,
                input=input_data,
                output=output_data,
                metadata={"duration_ms": (end_time - start_time) * 1000},
            )
        except Exception as e:
            print(f"Failed to log span: {e}")

    def end_trace(self, trace: Any) -> None:
        """
        End a trace

        Args:
            trace: trace object to end
        """
        if not self.enabled or trace is None:
            return

        try:
            trace.update(status="success")
        except Exception as e:
            print(f"Failed to end trace: {e}")
optimized_document_processor.py
ADDED
@@ -0,0 +1,346 @@
"""
CPU-optimized document processing module with parallel processing
"""
import os
import time
from typing import List, Dict, Any, Optional
from langchain.schema import Document
from concurrent.futures import ThreadPoolExecutor

# Multiprocessing (for the core count)
import multiprocessing

try:
    CPU_COUNT = multiprocessing.cpu_count()
except Exception:
    CPU_COUNT = 4

print(f"CPU cores: {CPU_COUNT}")

# Check whether the docling library is available
try:
    from docling.datamodel.base_models import InputFormat
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
    from docling.chunking import HybridChunker

    DOCLING_AVAILABLE = True
    print("docling library available")
except ImportError:
    print("docling library not found; using PyPDFLoader only.")
    DOCLING_AVAILABLE = False

# LangChain document loaders
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


class OptimizedDocumentProcessor:
    """
    CPU-optimized document processing class with parallel execution
    """

    def __init__(self,
                 chunk_size: int = 1000,
                 chunk_overlap: int = 200,
                 tokenizer: str = "Alibaba-NLP/gte-multilingual-base",  # corrected model path
                 max_workers: int = CPU_COUNT):
        """
        Initialize the document processor

        Args:
            chunk_size: text chunk size
            chunk_overlap: overlap between chunks
            tokenizer: tokenizer used by the HybridChunker
            max_workers: maximum number of parallel workers
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.tokenizer = tokenizer
        self.max_workers = max(1, min(max_workers, CPU_COUNT))  # never exceed the CPU core count

        print(f"Parallel workers: {self.max_workers}")

        # LangChain text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""],
        )

        # Initialize the docling components
        if DOCLING_AVAILABLE:
            # Pipeline options
            self.pipeline_options = PdfPipelineOptions(do_table_structure=True)
            self.pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

            # Document converter
            self.doc_converter = DocumentConverter(
                format_options={
                    InputFormat.PDF: PdfFormatOption(pipeline_options=self.pipeline_options)
                }
            )

            # HybridChunker (with trust_remote_code=True)
            self.hybrid_chunker = HybridChunker(
                tokenizer=tokenizer,
                chunk_size=chunk_size,
                overlap=chunk_overlap,
                tokenizer_kwargs={"trust_remote_code": True}  # allow remote code execution
            )

            print(f"docling initialized: HybridChunker(chunk_size={chunk_size}, overlap={chunk_overlap})")

    def process_with_docling(self, pdf_path: str) -> Dict[str, Any]:
        """
        Process a PDF document with docling

        Args:
            pdf_path: PDF file path

        Returns:
            Processed document data
        """
        if not DOCLING_AVAILABLE:
            raise ImportError("The docling library is not installed.")

        try:
            start_time = time.time()

            # Convert the document
            conv_res = self.doc_converter.convert(pdf_path)
            doc = conv_res.document

            # Measure performance
            conversion_time = time.time() - start_time
            print(f"PDF conversion time: {conversion_time:.2f}s")

            # Extract metadata
            metadata = {
                "source": pdf_path,
                "title": os.path.basename(pdf_path),
                "processing_time": conversion_time
            }

            return {
                "content": doc.export_to_markdown(),
                "metadata": metadata,
                "raw_document": doc,
            }

        except Exception as e:
            print(f"Error while processing the document with docling: {e}")
            raise

    def chunk_with_hybrid_chunker(self, doc: Any) -> List[Dict[str, Any]]:
        """
        Split a document into chunks with the HybridChunker

        Args:
            doc: docling document object

        Returns:
            List of chunks
        """
        start_time = time.time()

        # Run the chunking
        chunk_iter = self.hybrid_chunker.chunk(doc)
        chunks = list(chunk_iter)

        chunking_time = time.time() - start_time
        print(f"Chunking time: {chunking_time:.2f}s (chunks: {len(chunks)})")

        return chunks

    def create_langchain_documents_from_chunks(self,
                                               chunks: List[Dict[str, Any]],
                                               metadata: Dict[str, Any]) -> List[Document]:
        """
        Convert docling chunks into LangChain Document objects

        Args:
            chunks: chunks produced by the docling HybridChunker
            metadata: document metadata

        Returns:
            List of LangChain Document objects
        """
        documents = []

        for i, chunk in enumerate(chunks):
            # Per-chunk metadata
            chunk_metadata = metadata.copy()
            chunk_metadata["chunk_id"] = i

            # Extract the chunk content
            if hasattr(chunk, "text"):
                content = chunk.text
            elif hasattr(chunk, "content"):
                content = chunk.content
            else:
                content = str(chunk)

            document = Document(
                page_content=content,
                metadata=chunk_metadata
            )
            documents.append(document)

        return documents

    def process_with_langchain(self, pdf_path: str) -> List[Document]:
        """
        Load a PDF document with LangChain's PyPDFLoader

        Args:
            pdf_path: PDF file path

        Returns:
            List of LangChain Document objects
        """
        start_time = time.time()

        try:
            loader = PyPDFLoader(pdf_path)
            documents = loader.load()

            processing_time = time.time() - start_time
            print(f"PyPDFLoader time: {processing_time:.2f}s")

            return documents
        except Exception as e:
            print(f"Error while processing the document with PyPDFLoader: {e}")
            raise

    def process_pdf(self, pdf_path: str, use_docling: bool = True) -> List[Document]:
        """
        Process a PDF file

        Args:
            pdf_path: PDF file path
            use_docling: whether to use docling

        Returns:
            List of chunks for the processed document
        """
        total_start_time = time.time()

        # Check whether docling can be used
        can_use_docling = use_docling and DOCLING_AVAILABLE

        if can_use_docling:
            try:
                # 1. Process the PDF with docling
                docling_result = self.process_with_docling(pdf_path)
                doc = docling_result["raw_document"]
                metadata = docling_result["metadata"]

                # 2. Create chunks with the HybridChunker
                chunks = self.chunk_with_hybrid_chunker(doc)

                # 3. Convert the chunks into LangChain Documents
                documents = self.create_langchain_documents_from_chunks(chunks, metadata)

                total_time = time.time() - total_start_time
                print(f"docling done: '{pdf_path}', {len(documents)} chunks, {total_time:.2f}s total")

                return documents
            except Exception as e:
                print(f"docling failed, falling back to PyPDFLoader: {e}")
                can_use_docling = False

        if not can_use_docling:
            # Process with PyPDFLoader (fallback path)
            documents = self.process_with_langchain(pdf_path)
            chunks = self.text_splitter.split_documents(documents)

            total_time = time.time() - total_start_time
            print(f"PyPDFLoader done: '{pdf_path}', {len(chunks)} chunks, {total_time:.2f}s total")

            return chunks

    def process_directory_parallel(self, directory: str, use_docling: bool = True) -> List[Document]:
        """
        Process every PDF file in a directory in parallel (multithreaded)

        Args:
            directory: path of the PDF directory
            use_docling: whether to use docling

        Returns:
            List of chunks for all processed documents
        """
        all_documents = []
        pdf_files = []

        # Collect the PDF files
        for file in os.listdir(directory):
            if file.endswith(".pdf"):
                pdf_path = os.path.join(directory, file)
                pdf_files.append(pdf_path)

        if not pdf_files:
            print(f"No PDF files in the '{directory}' directory.")
            return []

        print(f"Starting parallel processing of {len(pdf_files)} PDF file(s) (up to {self.max_workers} workers)")
        start_time = time.time()

        # Run in parallel
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit process_pdf for each PDF file
            future_to_pdf = {executor.submit(self.process_pdf, pdf_path, use_docling): pdf_path
                             for pdf_path in pdf_files}

            # Collect the results
            for future in future_to_pdf:
                pdf_path = future_to_pdf[future]
                try:
                    # Fetch the result
                    chunks = future.result()
                    all_documents.extend(chunks)
                    print(f"'{os.path.basename(pdf_path)}' done: {len(chunks)} chunks")
                except Exception as e:
                    print(f"Error while processing '{pdf_path}': {e}")

        total_time = time.time() - start_time
        print(f"Parallel processing done: {len(all_documents)} chunks total, {total_time:.2f}s")

        return all_documents

    def process_directory(self, directory: str, use_docling: bool = True, parallel: bool = True) -> List[Document]:
        """
        Process every PDF file in a directory

        Args:
            directory: path of the PDF directory
            use_docling: whether to use docling
            parallel: whether to process in parallel

        Returns:
            List of chunks for all processed documents
        """
        # Parallel path
        if parallel:
            return self.process_directory_parallel(directory, use_docling)

        # Sequential path
        all_documents = []
        start_time = time.time()

        for file in os.listdir(directory):
            if file.endswith(".pdf"):
                pdf_path = os.path.join(directory, file)
                print(f"Processing: {pdf_path}")

                try:
                    chunks = self.process_pdf(pdf_path, use_docling=use_docling)
                    all_documents.extend(chunks)
                except Exception as e:
                    print(f"Error while processing '{pdf_path}': {e}")

        total_time = time.time() - start_time
        print(f"Sequential processing done: {len(all_documents)} chunks total, {total_time:.2f}s")

        return all_documents
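A minimal sketch of driving the processor above directly, assuming a `documents/` folder with at least one PDF and the dependencies from requirements.txt installed:

from optimized_document_processor import OptimizedDocumentProcessor

processor = OptimizedDocumentProcessor(chunk_size=1000, chunk_overlap=200)
chunks = processor.process_directory("documents", use_docling=True, parallel=True)
print(f"{len(chunks)} chunks ready for indexing")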
rag_chain.py
ADDED
@@ -0,0 +1,151 @@
1 |
+
"""
|
2 |
+
LangChainμ νμ©ν RAG μ²΄μΈ κ΅¬ν
|
3 |
+
"""
|
4 |
+
from typing import List, Dict, Any
|
5 |
+
from langchain.schema import Document
|
6 |
+
from langchain.prompts import PromptTemplate
|
7 |
+
from langchain_core.output_parsers import StrOutputParser
|
8 |
+
from langchain_core.runnables import RunnablePassthrough
|
9 |
+
from langchain_community.chat_models import ChatOllama
|
10 |
+
from langchain_openai import ChatOpenAI
|
11 |
+
|
12 |
+
from config import (
|
13 |
+
OLLAMA_HOST, LLM_MODEL, USE_OPENAI,
|
14 |
+
OPENAI_API_KEY, TOP_K_RETRIEVAL, TOP_K_RERANK
|
15 |
+
)
|
16 |
+
from vector_store import VectorStore
|
17 |
+
from reranker import Reranker
|
18 |
+
|
19 |
+
|
20 |
+
class RAGChain:
|
21 |
+
def __init__(self, vector_store: VectorStore, use_reranker: bool = True):
|
22 |
+
"""
|
23 |
+
RAG μ²΄μΈ μ΄κΈ°ν (νκ²½μ λ°λ₯Έ LLM μ ν)
|
24 |
+
|
25 |
+
Args:
|
26 |
+
vector_store: λ²‘ν° μ€ν μ΄ μΈμ€ν΄μ€
|
27 |
+
use_reranker: 리λ컀 μ¬μ© μ¬λΆ
|
28 |
+
"""
|
29 |
+
try:
|
30 |
+
print("RAGChain μ΄κΈ°ν μμ...")
|
31 |
+
+            self.vector_store = vector_store
+            self.use_reranker = use_reranker
+            print(f"Using reranker: {use_reranker}")
+
+            if use_reranker:
+                try:
+                    self.reranker = Reranker()
+                    print("Reranker initialized successfully")
+                except Exception as e:
+                    print(f"Reranker initialization failed: {str(e)}")
+                    self.reranker = None
+                    self.use_reranker = False
+            else:
+                self.reranker = None
+
+            # Configure the LLM for the current environment
+            if USE_OPENAI or IS_HUGGINGFACE:
+                print(f"Initializing OpenAI model: {LLM_MODEL}")
+                print(f"API key present: {'yes' if OPENAI_API_KEY else 'no'}")
+                try:
+                    self.llm = ChatOpenAI(
+                        model_name=LLM_MODEL,
+                        temperature=0.2,
+                        api_key=OPENAI_API_KEY,
+                    )
+                    print("OpenAI model initialized successfully")
+                except Exception as e:
+                    print(f"OpenAI model initialization failed: {str(e)}")
+                    raise
+            else:
+                try:
+                    print(f"Initializing Ollama model: {LLM_MODEL}")
+                    self.llm = ChatOllama(
+                        model=LLM_MODEL,
+                        temperature=0.2,
+                        base_url=OLLAMA_HOST,
+                    )
+                    print("Ollama model initialized successfully")
+                except Exception as e:
+                    print(f"Ollama model initialization failed: {str(e)}")
+                    raise
+
+            # Build the RAG chain and prompt
+            print("Setting up RAG chain...")
+            self.setup_chain()
+            print("RAG chain setup complete")
+        except Exception as e:
+            print(f"Unexpected error while initializing RAGChain: {str(e)}")
+            import traceback
+            traceback.print_exc()
+            raise
+
+    def setup_chain(self) -> None:
+        """
+        Set up the RAG chain and the prompt.
+        """
+        # Define the prompt template
+        template = """
+        Answer the question accurately based on the information below.
+
+        Question: {question}
+
+        Reference information:
+        {context}
+
+        If the reference information does not contain the answer, reply "I could not find that information in the provided documents."
+        Keep the answer accurate and concise, and point out the supporting evidence in the reference information.
+        Include the sources of the reference information in the answer.
+        """
+
+        self.prompt = PromptTemplate.from_template(template)
+
+        # Define the RAG chain
+        self.chain = (
+            {"context": self._retrieve, "question": RunnablePassthrough()}
+            | self.prompt
+            | self.llm
+            | StrOutputParser()
+        )
+
+    def _retrieve(self, query: str) -> str:
+        """
+        Retrieve documents relevant to the query and build the context.
+
+        Args:
+            query: User question
+
+        Returns:
+            Context string assembled from the search results
+        """
+        # Run the vector search
+        docs = self.vector_store.similarity_search(query, k=TOP_K_RETRIEVAL)
+
+        # Apply the reranker (optional)
+        if self.use_reranker and docs:
+            docs = self.reranker.rerank(query, docs, top_k=TOP_K_RERANK)
+
+        # Build the context from the search results
+        context_parts = []
+        for i, doc in enumerate(docs, 1):
+            source = doc.metadata.get("source", "unknown source")
+            page = doc.metadata.get("page", "")
+            source_info = f"{source}"
+            if page:
+                source_info += f" (page: {page})"
+
+            context_parts.append(f"[Reference {i}] - source: {source_info}\n{doc.page_content}\n")
+
+        return "\n".join(context_parts)
+
+    def run(self, query: str) -> str:
+        """
+        Run the RAG pipeline for a user query.
+
+        Args:
+            query: User question
+
+        Returns:
+            Model response string
+        """
+        return self.chain.invoke(query)
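A minimal usage sketch for the chain above (illustration only, not part of the commit: it assumes a FAISS index was saved earlier under "faiss_index" and that the constructor signature is RAGChain(vector_store, use_reranker=...), as the code above implies):

    from vector_store import VectorStore
    from rag_chain import RAGChain

    store = VectorStore(use_milvus=False)   # local FAISS, no Milvus server needed
    store.load_local("faiss_index")         # hypothetical: an index built beforehand
    chain = RAGChain(store, use_reranker=False)
    print(chain.run("What topics do the documents cover?"))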
requirements.txt
ADDED
@@ -0,0 +1,14 @@
+langchain>=0.1.0
+langchain-community>=0.0.10
+langchain-huggingface>=0.0.1
+sentence-transformers>=2.2.2
+faiss-cpu>=1.7.4
+pypdf>=3.15.1
+gradio>=4.0.0
+python-dotenv>=1.0.0
+torch>=2.0.0
+transformers>=4.34.0
+langchain-openai>=0.0.2
+openai>=1.0.0
+docling>=0.1.3
+requests>=2.28.0
reranker.py
ADDED
@@ -0,0 +1,58 @@
+"""
+Reranker module with remote code execution enabled
+"""
+from typing import List, Dict, Tuple
+import numpy as np
+from sentence_transformers import CrossEncoder
+from langchain.schema import Document
+from config import RERANKER_MODEL
+
+class Reranker:
+    def __init__(self, model_name: str = RERANKER_MODEL):
+        """
+        Initialize the Cross-Encoder reranker.
+
+        Args:
+            model_name: Name of the Cross-Encoder model to use
+        """
+        print(f"Loading reranker model: {model_name}")
+
+        # Allow remote code execution (required by some models)
+        self.model = CrossEncoder(
+            model_name,
+            trust_remote_code=True  # allow remote code execution (required)
+        )
+
+        print(f"Reranker model loaded: {model_name}")
+
+    def rerank(self, query: str, documents: List[Document], top_k: int = 3) -> List[Document]:
+        """
+        Re-rank search results.
+
+        Args:
+            query: Search query
+            documents: Documents returned by the vector search
+            top_k: Number of top results to return
+
+        Returns:
+            Top-ranked documents after re-ranking
+        """
+        if not documents:
+            return []
+
+        # Build (query, document) input pairs for the Cross-Encoder
+        document_texts = [doc.page_content for doc in documents]
+        query_doc_pairs = [(query, doc) for doc in document_texts]
+
+        # Compute relevance scores
+        print(f"Re-ranking {len(documents)} documents")
+        scores = self.model.predict(query_doc_pairs)
+
+        # Sort the documents by score
+        doc_score_pairs = list(zip(documents, scores))
+        doc_score_pairs.sort(key=lambda x: x[1], reverse=True)
+
+        print(f"Re-ranking complete: selected top {top_k} documents")
+
+        # Return the top-k results
+        return [doc for doc, score in doc_score_pairs[:top_k]]
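A standalone sketch of the reranker above (assumes RERANKER_MODEL in config.py names a valid cross-encoder checkpoint; the candidate documents are made up for illustration):

    from langchain.schema import Document
    from reranker import Reranker

    reranker = Reranker()  # loads RERANKER_MODEL from config.py
    candidates = [
        Document(page_content="FAISS indexes dense embedding vectors."),
        Document(page_content="Gradio builds web user interfaces."),
    ]
    top = reranker.rerank("How do I index embeddings?", candidates, top_k=1)
    print(top[0].page_content)  # expected: the FAISS sentence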
simple_rag_chain.py
ADDED
@@ -0,0 +1,66 @@
+"""
+Simple RAG chain implementation (for debugging)
+"""
+import os
+from langchain_openai import ChatOpenAI
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+
+
+class SimpleRAGChain:
+    def __init__(self, vector_store):
+        """Initialize the simple RAG chain."""
+        print("Initializing simple RAG chain...")
+        self.vector_store = vector_store
+
+        # Check for the OpenAI API key
+        openai_api_key = os.environ.get("OPENAI_API_KEY", "")
+        print(f"API key set: {bool(openai_api_key)}")
+
+        # Initialize the OpenAI model
+        self.llm = ChatOpenAI(
+            model_name="gpt-3.5-turbo",
+            temperature=0.2,
+            api_key=openai_api_key,
+        )
+
+        # Prompt template
+        template = """
+        Answer the question accurately based on the information below.
+
+        Question: {question}
+
+        Reference information:
+        {context}
+
+        If the reference information does not contain the answer, reply "I could not find that information in the provided documents."
+        """
+
+        self.prompt = PromptTemplate.from_template(template)
+
+        # Build the chain
+        self.chain = (
+            {"context": self._retrieve, "question": RunnablePassthrough()}
+            | self.prompt
+            | self.llm
+            | StrOutputParser()
+        )
+        print("Simple RAG chain initialized")
+
+    def _retrieve(self, query):
+        """Retrieve documents."""
+        try:
+            docs = self.vector_store.similarity_search(query, k=3)
+            return "\n\n".join(doc.page_content for doc in docs)
+        except Exception as e:
+            print(f"Error during retrieval: {e}")
+            return "An error occurred while searching the documents."
+
+    def run(self, query):
+        """Process a query."""
+        try:
+            return self.chain.invoke(query)
+        except Exception as e:
+            print(f"Error during execution: {e}")
+            return f"Error: {str(e)}"
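A debugging smoke test for the simplified chain above (a sketch, not part of the commit: "sk-..." is a placeholder and the sample document is invented):

    import os
    from langchain.schema import Document
    from vector_store import VectorStore
    from simple_rag_chain import SimpleRAGChain

    os.environ.setdefault("OPENAI_API_KEY", "sk-...")  # placeholder; set a real key
    store = VectorStore(use_milvus=False)
    store.create_or_load([Document(page_content="Install with `pip install -r requirements.txt`.")])
    print(SimpleRAGChain(store).run("How do I install the dependencies?"))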
vector_store.py
ADDED
@@ -0,0 +1,235 @@
+"""
+Improved vector store module - tuned Milvus settings
+"""
+from typing import List, Dict, Any, Optional
+import uuid
+from langchain.schema import Document
+
+# Vector store imports
+try:
+    # Imports for recent versions
+    from langchain_milvus import Milvus
+    from langchain_community.vectorstores import FAISS
+    from langchain_huggingface import HuggingFaceEmbeddings
+    MODERN_IMPORTS = True
+    print("Imported the latest langchain packages")
+except ImportError:
+    # Imports for older versions
+    from langchain_community.vectorstores import Milvus, FAISS
+    from langchain_community.embeddings import HuggingFaceEmbeddings
+    MODERN_IMPORTS = False
+    print("Falling back to the older langchain_community packages")
+
+from config import MILVUS_HOST, MILVUS_PORT, MILVUS_COLLECTION, EMBEDDING_MODEL
+
+class VectorStore:
+    def __init__(self, use_milvus: bool = True):
+        """
+        Initialize the vector store.
+
+        Args:
+            use_milvus: Whether to use Milvus (falls back to FAISS when False)
+        """
+        self.use_milvus = use_milvus
+
+        # Configure the embedding model
+        print(f"Loading embedding model: {EMBEDDING_MODEL}")
+        model_kwargs = {
+            "device": "cpu",
+            "trust_remote_code": True  # allow remote code execution (required by some models)
+        }
+        encode_kwargs = {"normalize_embeddings": True}
+
+        self.embeddings = HuggingFaceEmbeddings(
+            model_name=EMBEDDING_MODEL,
+            model_kwargs=model_kwargs,
+            encode_kwargs=encode_kwargs
+        )
+        self.vector_store = None
+
+        print(f"Embedding model initialized: {EMBEDDING_MODEL}")
+
+    def init_milvus(self) -> Milvus:
+        """
+        Initialize the Milvus vector store.
+
+        Returns:
+            Milvus vector store instance
+        """
+        connection_args = {
+            "host": MILVUS_HOST,
+            "port": MILVUS_PORT,
+        }
+
+        # Search index parameters (FLAT index with cosine similarity)
+        index_params = {
+            "index_type": "FLAT",     # FLAT index, accuracy first
+            "metric_type": "COSINE",  # cosine similarity (suits normalized vectors)
+            "params": {}              # FLAT needs no extra parameters
+        }
+
+        return Milvus(
+            embedding_function=self.embeddings,
+            collection_name=MILVUS_COLLECTION,
+            connection_args=connection_args,
+            index_params=index_params
+        )
+
+    def init_faiss(self) -> FAISS:
+        """
+        Initialize an empty FAISS vector store (local fallback).
+
+        Returns:
+            FAISS vector store instance
+        """
+        # FAISS.from_documents([]) cannot infer the embedding dimension,
+        # so build an empty index explicitly
+        import faiss
+        from langchain_community.docstore.in_memory import InMemoryDocstore
+
+        dim = len(self.embeddings.embed_query("dimension probe"))
+        return FAISS(
+            embedding_function=self.embeddings,
+            index=faiss.IndexFlatIP(dim),  # inner product ~ cosine on normalized vectors
+            docstore=InMemoryDocstore({}),
+            index_to_docstore_id={},
+        )
+
+    def create_or_load(self, documents: Optional[List[Document]] = None) -> Any:
+        """
+        Create or load the vector store.
+
+        Args:
+            documents: Documents to index (creates an empty store when None)
+
+        Returns:
+            Vector store instance
+        """
+        if self.use_milvus:
+            if documents:
+                # Create a new collection when documents are provided
+                try:
+                    # Connection settings
+                    connection_args = {
+                        "host": MILVUS_HOST,
+                        "port": MILVUS_PORT,
+                    }
+
+                    # Search index settings
+                    index_params = {
+                        "index_type": "FLAT",     # accuracy first
+                        "metric_type": "COSINE",  # cosine similarity
+                        "params": {}
+                    }
+
+                    print(f"Creating Milvus collection: {MILVUS_COLLECTION} (dropping any existing collection)")
+
+                    # Build the Milvus collection from the documents
+                    self.vector_store = Milvus.from_documents(
+                        documents=documents,
+                        embedding=self.embeddings,
+                        collection_name=MILVUS_COLLECTION,
+                        connection_args=connection_args,
+                        index_params=index_params,
+                        drop_old=True  # drop the existing collection (rebuild)
+                    )
+
+                    print(f"Milvus collection created: {len(documents)} documents indexed")
+
+                except Exception as e:
+                    print(f"Failed to create Milvus collection: {e}")
+                    # Fall back to FAISS
+                    print("Falling back to FAISS")
+                    self.use_milvus = False
+                    self.vector_store = FAISS.from_documents(documents, self.embeddings)
+            else:
+                # Load the existing collection
+                try:
+                    self.vector_store = self.init_milvus()
+                except Exception as e:
+                    print(f"Failed to load Milvus collection: {e}")
+                    # Fall back to FAISS
+                    print("Falling back to FAISS")
+                    self.use_milvus = False
+                    self.vector_store = self.init_faiss()
+        else:
+            # Use FAISS
+            if documents:
+                print(f"Creating FAISS index: {len(documents)} documents")
+                self.vector_store = FAISS.from_documents(documents, self.embeddings)
+                print("FAISS index created")
+            else:
+                self.vector_store = self.init_faiss()
+                print("Empty FAISS index initialized")
+
+        return self.vector_store
+
+    def add_documents(self, documents: List[Document]) -> None:
+        """
+        Add documents to the vector store.
+
+        Args:
+            documents: Documents to add
+        """
+        if self.vector_store is None:
+            self.create_or_load(documents)
+        else:
+            # Milvus and FAISS expose the same add_documents interface
+            self.vector_store.add_documents(documents)
+
+    def similarity_search(self, query: str, k: int = 5) -> List[Document]:
+        """
+        Run a vector similarity search.
+
+        Args:
+            query: Search query
+            k: Number of results to return
+
+        Returns:
+            Most similar documents
+        """
+        if self.vector_store is None:
+            raise ValueError("The vector store has not been initialized.")
+
+        print(f"Search query: '{query}', requesting top {k} results")
+        results = self.vector_store.similarity_search(query, k=k)
+        print(f"Search complete: {len(results)} results found")
+
+        return results
+
+    def save_local(self, path: str = "faiss_index") -> None:
+        """
+        Save the FAISS index locally (when Milvus is not in use).
+
+        Args:
+            path: Save path
+        """
+        if not self.use_milvus and self.vector_store is not None:
+            self.vector_store.save_local(path)
+            print(f"FAISS index saved locally: {path}")
+
+    def load_local(self, path: str = "faiss_index") -> None:
+        """
+        Load a FAISS index from local storage (when Milvus is not in use).
+
+        FAISS pickles its docstore, so deserialization is explicitly enabled;
+        only load indexes this application created itself.
+
+        Args:
+            path: Path of the index to load
+        """
+        if not self.use_milvus:
+            try:
+                print(f"Loading FAISS index: {path}")
+
+                self.vector_store = FAISS.load_local(
+                    path,
+                    self.embeddings,
+                    allow_dangerous_deserialization=True  # allow deserialization
+                )
+                print(f"FAISS index loaded: {path}")
+            except Exception as e:
+                print(f"Failed to load FAISS index: {e}")
+
+                # Print error details
+                import traceback
+                traceback.print_exc()
+
+                # Initialize a fresh index
+                self.vector_store = self.init_faiss()
+                print("Initialized a new FAISS index")
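A quick FAISS round-trip through the class above (a sketch under the assumption that EMBEDDING_MODEL in config.py is downloadable; the document and index path are illustrative):

    from langchain.schema import Document
    from vector_store import VectorStore

    store = VectorStore(use_milvus=False)  # skip Milvus, go straight to FAISS
    docs = [Document(page_content="FAISS is a library for vector search.",
                     metadata={"source": "demo.txt"})]
    store.create_or_load(docs)
    store.save_local("faiss_index")
    store.load_local("faiss_index")        # relies on allow_dangerous_deserialization=True
    print(store.similarity_search("vector search", k=1)[0].page_content)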
voice_rag_app.py
ADDED
@@ -0,0 +1,670 @@
+"""
+RAG chatbot app with voice recognition support
+"""
+import os
+import time
+from typing import List, Dict, Tuple, Any, Optional
+import hashlib
+import pickle
+import json
+
+# Existing imports
+from config import PDF_DIRECTORY, CHUNK_SIZE, CHUNK_OVERLAP, LLM_MODEL
+from optimized_document_processor import OptimizedDocumentProcessor
+from vector_store import VectorStore
+from langchain.schema import Document
+
+# Clova STT module
+from clova_stt import ClovaSTT
+
+# Safe import
+try:
+    from rag_chain import RAGChain
+
+    RAG_CHAIN_AVAILABLE = True
+except ImportError:
+    print("Could not load the RAG chain module.")
+    RAG_CHAIN_AVAILABLE = False
+
+
+class VoiceRAGChatApp:
+    """
+    RAG chatbot application with voice recognition support
+    """
+
+    def __init__(self):
+        """
+        Initialize the voice-enabled RAG chatbot application.
+        """
+        # Define the data directories
+        self.pdf_directory = PDF_DIRECTORY
+        self.cache_directory = "cached_data"
+        self.index_file = os.path.join(self.cache_directory, "file_index.json")
+        self.chunks_dir = os.path.join(self.cache_directory, "chunks")
+        self.vector_index_dir = os.path.join(self.cache_directory, "vector_index")
+
+        # Create the directories
+        os.makedirs(self.pdf_directory, exist_ok=True)
+        os.makedirs(self.cache_directory, exist_ok=True)
+        os.makedirs(self.chunks_dir, exist_ok=True)
+        os.makedirs(self.vector_index_dir, exist_ok=True)
+
+        print(f"PDF document directory: '{self.pdf_directory}'")
+        print(f"Cache directory: '{self.cache_directory}'")
+
+        # Initialize the components
+        self.document_processor = OptimizedDocumentProcessor(
+            chunk_size=CHUNK_SIZE,
+            chunk_overlap=CHUNK_OVERLAP
+        )
+
+        # Initialize the vector store
+        self.vector_store = VectorStore(use_milvus=False)
+
+        # Load the document index
+        self.file_index = self._load_file_index()
+
+        # Initialize basic state
+        self.documents = []
+        self.processed_files = []
+        self.is_initialized = False
+
+        # Initialize the Clova STT client
+        self.stt_client = ClovaSTT()
+        print("Speech-to-text (STT) support initialized.")
+
+        # Automatically load and process documents on startup
+        print("Starting automatic document loading and processing...")
+        self.auto_process_documents()
+
+    def _load_file_index(self) -> Dict[str, Dict[str, Any]]:
+        """
+        Load the file index.
+
+        Returns:
+            Mapping of file path -> metadata
+        """
+        if os.path.exists(self.index_file):
+            try:
+                with open(self.index_file, 'r', encoding='utf-8') as f:
+                    return json.load(f)
+            except Exception as e:
+                print(f"Failed to load the index file: {e}")
+                return {}
+        return {}
+
+    def _save_file_index(self) -> None:
+        """
+        Save the file index.
+        """
+        with open(self.index_file, 'w', encoding='utf-8') as f:
+            json.dump(self.file_index, f, ensure_ascii=False, indent=2)
+
+    def _calculate_file_hash(self, file_path: str) -> str:
+        """
+        Compute a file hash.
+
+        Args:
+            file_path: File path
+
+        Returns:
+            MD5 hash value
+        """
+        hasher = hashlib.md5()
+        with open(file_path, 'rb') as f:
+            buf = f.read(65536)
+            while len(buf) > 0:
+                hasher.update(buf)
+                buf = f.read(65536)
+        return hasher.hexdigest()
+
+    def _is_file_processed(self, file_path: str) -> bool:
+        """
+        Check whether a file was already processed and is unchanged.
+
+        Args:
+            file_path: File path
+
+        Returns:
+            Whether the file counts as processed
+        """
+        if file_path not in self.file_index:
+            return False
+
+        # Compute the current hash
+        current_hash = self._calculate_file_hash(file_path)
+
+        # Compare with the stored hash
+        if self.file_index[file_path]['hash'] != current_hash:
+            print(f"File change detected: {file_path}")
+            return False
+
+        # Check that the chunk file still exists
+        chunks_path = self.file_index[file_path]['chunks_path']
+        if not os.path.exists(chunks_path):
+            return False
+
+        return True
+
+    def _get_chunks_path(self, file_hash: str) -> str:
+        """
+        Build the chunk file path.
+
+        Args:
+            file_hash: File hash value
+
+        Returns:
+            Chunk file path
+        """
+        return os.path.join(self.chunks_dir, f"{file_hash}.pkl")
+
+    def _save_chunks(self, file_path: str, chunks: List[Document]) -> None:
+        """
+        Save chunk data.
+
+        Args:
+            file_path: Source file path
+            chunks: Document chunks
+        """
+        # Compute the hash
+        file_hash = self._calculate_file_hash(file_path)
+
+        # Chunk file path
+        chunks_path = self._get_chunks_path(file_hash)
+
+        # Save the chunk data
+        with open(chunks_path, 'wb') as f:
+            pickle.dump(chunks, f)
+
+        # Update the index
+        self.file_index[file_path] = {
+            'hash': file_hash,
+            'chunks_path': chunks_path,
+            'last_processed': time.time(),
+            'chunks_count': len(chunks)
+        }
+
+        # Save the index
+        self._save_file_index()
+
+        print(f"Chunks saved: {file_path} ({len(chunks)} chunks)")
+
+    def _load_chunks(self, file_path: str) -> List[Document]:
+        """
+        Load stored chunk data.
+
+        Args:
+            file_path: File path
+
+        Returns:
+            Document chunks
+        """
+        chunks_path = self.file_index[file_path]['chunks_path']
+        with open(chunks_path, 'rb') as f:
+            chunks = pickle.load(f)
+
+        print(f"Chunks loaded: {file_path} ({len(chunks)} chunks)")
+        return chunks
+
+    def _process_pdf_file(self, file_path: str) -> List[Document]:
+        """
+        Process a PDF file, falling back to PyPDFLoader if docling fails.
+
+        Args:
+            file_path: Path of the PDF file to process
+
+        Returns:
+            Processed document chunks
+        """
+        try:
+            print(f"Trying docling: {file_path}")
+
+            # Attempt docling
+            try:
+                # Optional timeout so docling cannot hang indefinitely
+                import signal
+
+                def timeout_handler(signum, frame):
+                    raise TimeoutError("docling processing timed out")
+
+                # Only works on Linux/macOS (no SIGALRM on Windows)
+                try:
+                    signal.signal(signal.SIGALRM, timeout_handler)
+                    signal.alarm(60)  # 60-second timeout
+                except (AttributeError, ValueError):
+                    pass
+
+                # Process with docling
+                chunks = self.document_processor.process_pdf(file_path, use_docling=True)
+
+                # Cancel the timeout
+                try:
+                    signal.alarm(0)
+                except (AttributeError, ValueError):
+                    pass
+
+                return chunks
+
+            except Exception as e:
+                # Inspect the docling error
+                error_str = str(e)
+                if "Invalid code point" in error_str or "RuntimeError" in error_str:
+                    print(f"docling error (code point issue): {error_str}")
+                else:
+                    print(f"docling error: {error_str}")
+                print("Falling back to PyPDFLoader.")
+
+                # Fall back to PyPDFLoader
+                try:
+                    return self.document_processor.process_pdf(file_path, use_docling=False)
+                except Exception as inner_e:
+                    print(f"PyPDFLoader error: {inner_e}")
+                    raise  # re-raise when both methods fail
+
+        except Exception as e:
+            print(f"Fatal error while processing PDF: {e}")
+            # Return an empty chunk list so overall processing is not aborted
+            return []
+
+    def auto_process_documents(self) -> str:
+        """
+        Automatically process the PDF files in the documents folder.
+
+        Returns:
+            Status message
+        """
+        try:
+            start_time = time.time()
+
+            # Collect the PDF files
+            pdf_files = []
+            for filename in os.listdir(self.pdf_directory):
+                if filename.lower().endswith('.pdf'):
+                    pdf_files.append(os.path.join(self.pdf_directory, filename))
+
+            if not pdf_files:
+                return f"No PDF files found in the '{self.pdf_directory}' folder."
+
+            print(f"Found {len(pdf_files)} PDF files")
+
+            # Process the PDF files in the folder
+            new_files = []
+            updated_files = []
+            cached_files = []
+            failed_files = []
+            all_chunks = []
+
+            for file_path in pdf_files:
+                if self._is_file_processed(file_path):
+                    # Load the chunks from the cache
+                    chunks = self._load_chunks(file_path)
+                    all_chunks.extend(chunks)
+                    cached_files.append(file_path)
+                    self.processed_files.append(os.path.basename(file_path))
+                else:
+                    # Process new or changed files
+                    print(f"Processing: {file_path}")
+
+                    try:
+                        # Use the improved PDF processing method
+                        chunks = self._process_pdf_file(file_path)
+
+                        if chunks:  # only save when chunks were extracted
+                            # Save the chunks
+                            self._save_chunks(file_path, chunks)
+
+                            all_chunks.extend(chunks)
+                            if file_path in self.file_index:
+                                updated_files.append(file_path)
+                            else:
+                                new_files.append(file_path)
+
+                            self.processed_files.append(os.path.basename(file_path))
+                        else:
+                            print(f"Failed to process '{file_path}': no chunks extracted")
+                            failed_files.append(file_path)
+                    except Exception as e:
+                        print(f"Error while processing '{file_path}': {e}")
+                        failed_files.append(file_path)
+
+            # Keep all chunks
+            self.documents = all_chunks
+
+            processing_time = time.time() - start_time
+            print(f"Document processing complete: {len(all_chunks)} chunks, {processing_time:.2f}s")
+
+            # Check the vector index save path
+            if os.path.exists(self.vector_index_dir) and any(os.listdir(self.vector_index_dir)):
+                # Load the existing vector index
+                try:
+                    print("Loading saved vector index...")
+                    self.vector_store.load_local(self.vector_index_dir)
+
+                    # Check that the index loaded successfully
+                    if self.vector_store.vector_store is not None:
+                        # Update the index when there are new or changed documents
+                        if new_files or updated_files:
+                            print("Updating vector index...")
+                            self.vector_store.add_documents(self.documents)
+
+                        print("Vector index loaded")
+                    else:
+                        print("Vector index loaded but invalid; creating a new one.")
+                        self.vector_store.create_or_load(self.documents)
+
+                except Exception as e:
+                    print(f"Failed to load the vector index, creating a new one: {e}")
+                    # Print error details
+                    import traceback
+                    traceback.print_exc()
+
+                    # Create a new vector index
+                    self.vector_store.create_or_load(self.documents)
+            else:
+                # Create a new vector index
+                print("Creating new vector index...")
+                self.vector_store.create_or_load(self.documents)
+
+            # Save the vector index
+            if self.vector_store and self.vector_store.vector_store is not None:
+                try:
+                    print(f"Saving vector index: {self.vector_index_dir}")
+                    self.vector_store.save_local(self.vector_index_dir)
+                    print(f"Vector index saved: {self.vector_index_dir}")
+                except Exception as e:
+                    print(f"Failed to save the vector index: {e}")
+                    # Print error details
+                    import traceback
+                    traceback.print_exc()
+            else:
+                print("Vector index not initialized; skipping save.")
+
+            # Initialize the RAG chain
+            if RAG_CHAIN_AVAILABLE:
+                self.rag_chain = RAGChain(self.vector_store)
+                self.is_initialized = True
+
+                total_time = time.time() - start_time
+
+                status_message = (
+                    f"Document processing complete!\n"
+                    f"- Processed files: {len(self.processed_files)}\n"
+                    f"- Cached files: {len(cached_files)}\n"
+                    f"- New files: {len(new_files)}\n"
+                    f"- Updated files: {len(updated_files)}\n"
+                    f"- Failed files: {len(failed_files)}\n"
+                    f"- Total chunks: {len(self.documents)}\n"
+                    f"- Processing time: {total_time:.2f}s\n"
+                    f"Ready for questions!"
+                )
+
+                print(status_message)
+                return status_message
+            else:
+                return "Could not initialize the RAG chain. Check that the required libraries are installed."
+
+        except Exception as e:
+            error_message = f"Error while processing documents: {str(e)}"
+            print(error_message)
+            import traceback
+            traceback.print_exc()
+            return error_message
+
+    def reset_cache(self) -> str:
+        """
+        Reset the cache.
+
+        Returns:
+            Status message
+        """
+        try:
+            # Delete the chunk files
+            for filename in os.listdir(self.chunks_dir):
+                file_path = os.path.join(self.chunks_dir, filename)
+                if os.path.isfile(file_path):
+                    os.remove(file_path)
+
+            # Reset the index
+            self.file_index = {}
+            self._save_file_index()
+
+            # Delete the vector index files
+            for filename in os.listdir(self.vector_index_dir):
+                file_path = os.path.join(self.vector_index_dir, filename)
+                if os.path.isfile(file_path):
+                    os.remove(file_path)
+
+            self.documents = []
+            self.processed_files = []
+            self.is_initialized = False
+
+            return "Cache cleared. All documents will be reprocessed on the next run."
+        except Exception as e:
+            return f"Error while clearing the cache: {str(e)}"
+
+    def process_query(self, query: str, chat_history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
+        """
+        Process a user query.
+
+        Args:
+            query: User question
+            chat_history: Conversation history
+
+        Returns:
+            Response and updated conversation history
+        """
+        if not query:  # handle empty queries
+            return "", chat_history
+
+        if not self.is_initialized:
+            response = "Document loading has not been initialized. Attempting automatic loading."
+            chat_history.append((query, response))
+
+            # Attempt automatic loading
+            try:
+                self.auto_process_documents()
+                if not self.is_initialized:
+                    response = "Could not load any documents. Check that PDF files exist in the 'documents' folder."
+                    chat_history.append((query, response))
+                    return "", chat_history
+            except Exception as e:
+                response = f"Error while loading documents: {str(e)}"
+                chat_history.append((query, response))
+                return "", chat_history
+
+        try:
+            # Run the RAG chain and generate a response
+            start_time = time.time()
+            response = self.rag_chain.run(query)
+            end_time = time.time()
+
+            query_time = end_time - start_time
+            print(f"Query processing time: {query_time:.2f}s")
+
+            chat_history.append((query, response))
+            return "", chat_history
+        except Exception as e:
+            error_msg = f"Error: {str(e)}"
+            chat_history.append((query, error_msg))
+            return "", chat_history
+
+    def process_voice_query(self, audio, chat_history: List[Tuple[str, str]]) -> Tuple[Optional[str], List[Tuple[str, str]]]:
+        """
+        Process a voice query.
+
+        Args:
+            audio: Path of the recorded audio file (gr.Audio with type="filepath")
+            chat_history: Conversation history
+
+        Returns:
+            Cleared audio value and updated conversation history
+        """
+        if audio is None:
+            return None, chat_history
+
+        try:
+            print(f"[STT] Recorded audio file: {audio}")
+
+            # Run STT on the recorded file
+            result = self.stt_client.recognize_file(audio)
+
+            # Handle an STT error
+            if "error" in result:
+                error_msg = f"Speech recognition error: {result.get('error')}"
+                print(f"[STT] {error_msg}")
+                chat_history.append(("Voice message", error_msg))
+                return None, chat_history
+
+            # Extract the recognized text
+            recognized_text = result.get("text", "")
+            if not recognized_text:
+                error_msg = "Could not recognize any speech. Please try again."
+                print("[STT] No text recognized")
+                chat_history.append(("Voice message", error_msg))
+                return None, chat_history
+
+            print(f"[STT] Recognized text: {recognized_text}")
+
+            # Process the recognized text as a regular query, marked as voice input
+            return self.process_query(f"🎤 {recognized_text}", chat_history)
+
+        except Exception as e:
+            error_msg = f"Error while processing speech: {str(e)}"
+            print(f"[STT] {error_msg}")
+            chat_history.append(("Voice message", error_msg))
+            return None, chat_history
+
+    def launch_app(self) -> None:
+        """
+        Launch the Gradio app with voice input support.
+        """
+        import gradio as gr
+
+        with gr.Blocks(title="Voice-enabled PDF document RAG chatbot") as app:
+            gr.Markdown("# Voice-enabled PDF document RAG chatbot")
+            gr.Markdown(f"* Active LLM model: **{LLM_MODEL}**")
+            gr.Markdown(f"* PDF document folder: **{self.pdf_directory}**")
+            gr.Markdown("* Integrated with the Naver Clova speech recognition API")
+
+            with gr.Row():
+                with gr.Column(scale=1):
+                    # Document status section
+                    status_box = gr.Textbox(
+                        label="Document processing status",
+                        value=f"Processed documents ({len(self.processed_files)}): {', '.join(self.processed_files)}",
+                        lines=5,
+                        interactive=False
+                    )
+
+                    # Cache management buttons
+                    refresh_button = gr.Button("Reload documents", variant="primary")
+                    reset_button = gr.Button("Reset cache", variant="stop")
+
+                    # Processed file details
+                    with gr.Accordion("Cache details", open=False):
+                        file_info = ""
+                        for file_path, info in self.file_index.items():
+                            file_info += f"- {os.path.basename(file_path)}: {info['chunks_count']} chunks\n"
+
+                        cache_info = gr.Textbox(
+                            label="Cached file info",
+                            value=file_info or "No cached files.",
+                            lines=5,
+                            interactive=False
+                        )
+
+                with gr.Column(scale=2):
+                    # Chat interface
+                    chatbot = gr.Chatbot(
+                        label="Conversation",
+                        bubble_full_width=False,
+                        height=500,
+                        show_copy_button=True
+                    )
+
+                    with gr.Tabs() as input_tabs:
+                        # Text input tab
+                        with gr.Tab("Text input"):
+                            # Place the textbox and the send button side by side
+                            with gr.Row():
+                                query_box = gr.Textbox(
+                                    label="Question",
+                                    placeholder="Ask about the processed documents...",
+                                    lines=2,
+                                    scale=4
+                                )
+                                submit_btn = gr.Button("Send", variant="primary", scale=1)
+
+                        # Voice input tab
+                        with gr.Tab("Voice input"):
+                            audio_input = gr.Audio(
+                                label="Microphone input",
+                                sources=["microphone"],
+                                type="filepath",  # the handler receives the recording's file path
+                                format="wav"
+                            )
+                            voice_submit_btn = gr.Button("Send voice question", variant="primary")
+
+                    clear_chat_button = gr.Button("Clear conversation")
+
+            # Wire up the event handlers
+            refresh_button.click(
+                fn=self.auto_process_documents,
+                inputs=[],
+                outputs=[status_box]
+            )
+
+            reset_button.click(
+                # clear the cache, then show only the reload status in the single output
+                fn=lambda: (self.reset_cache(), self.auto_process_documents())[1],
+                inputs=[],
+                outputs=[status_box]
+            )
+
+            # Text send button click event
+            submit_btn.click(
+                fn=self.process_query,
+                inputs=[query_box, chatbot],
+                outputs=[query_box, chatbot]
+            )
+
+            # Enter key event
+            query_box.submit(
+                fn=self.process_query,
+                inputs=[query_box, chatbot],
+                outputs=[query_box, chatbot]
+            )
+
+            # Voice send button click event
+            voice_submit_btn.click(
+                fn=self.process_voice_query,
+                inputs=[audio_input, chatbot],
+                outputs=[audio_input, chatbot]
+            )
+
+            # Clear conversation button
+            clear_chat_button.click(
+                fn=lambda: [],
+                outputs=[chatbot]
+            )
+
+        # Launch the app
+        app.launch(share=False)
+
+
+if __name__ == "__main__":
+    app = VoiceRAGChatApp()
+    app.launch_app()
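For reference, a sketch of driving the app programmatically instead of through the Gradio UI (assumes config.py, clova_stt.py, and a populated documents/ folder are in place; the question is illustrative):

    from voice_rag_app import VoiceRAGChatApp

    app = VoiceRAGChatApp()                       # processes documents/ on startup
    _, history = app.process_query("Summarize the manual.", [])
    print(history[-1][1])                         # the chatbot's answer
    # app.launch_app()                            # or start the full Gradio UI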