openfree committed on
Commit
225994f
·
verified ·
1 Parent(s): 22b67c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +231 -175
app.py CHANGED
@@ -4,14 +4,33 @@ from typing import List, Dict, Any, Optional
4
  import hashlib
5
  import json
6
  from datetime import datetime
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- # PDF 처리 라이브러리
9
- import pymupdf # PyMuPDF
10
- import chromadb
11
- from chromadb.utils import embedding_functions
12
- from langchain.text_splitter import RecursiveCharacterTextSplitter
13
- from sentence_transformers import SentenceTransformer
14
  import numpy as np
 
15
 
16
  # Custom CSS (기존 CSS + 추가 스타일)
17
  custom_css = """
@@ -65,84 +84,96 @@ custom_css = """
65
  border: 1px solid rgba(251, 191, 36, 0.5);
66
  color: #f59e0b;
67
  }
68
- .document-card {
69
- padding: 12px;
70
- margin: 8px 0;
71
- border-radius: 8px;
72
- background: rgba(255, 255, 255, 0.1);
73
- border: 1px solid rgba(255, 255, 255, 0.2);
74
- cursor: pointer;
75
- transition: all 0.3s ease;
76
- }
77
- .document-card:hover {
78
- background: rgba(255, 255, 255, 0.2);
79
- transform: translateX(5px);
80
- }
81
  """
82
 
83
- class PDFRAGSystem:
84
- """PDF ๊ธฐ๋ฐ˜ RAG ์‹œ์Šคํ…œ ํด๋ž˜์Šค"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  def __init__(self):
87
  self.documents = {}
 
 
 
 
 
88
  self.embedder = None
89
- self.vector_store = None
90
- self.text_splitter = RecursiveCharacterTextSplitter(
91
- chunk_size=1000,
92
- chunk_overlap=200,
93
- length_function=len,
94
- separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
95
- )
96
- self.initialize_vector_store()
97
-
98
- def initialize_vector_store(self):
99
- """๋ฒกํ„ฐ ์ €์žฅ์†Œ ์ดˆ๊ธฐํ™”"""
100
- try:
101
- # Sentence Transformer ๋ชจ๋ธ ๋กœ๋“œ
102
- self.embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
103
-
104
- # ChromaDB ํด๋ผ์ด์–ธํŠธ ์ดˆ๊ธฐํ™”
105
- self.chroma_client = chromadb.Client()
106
- self.collection = self.chroma_client.create_collection(
107
- name="pdf_documents",
108
- metadata={"hnsw:space": "cosine"}
109
- )
110
- except Exception as e:
111
- print(f"Vector store initialization error: {e}")
112
 
113
  def extract_text_from_pdf(self, pdf_path: str) -> Dict[str, Any]:
114
  """PDF์—์„œ ํ…์ŠคํŠธ ์ถ”์ถœ"""
 
 
 
 
 
 
 
 
 
 
 
115
  try:
116
- doc = pymupdf.open(pdf_path)
117
  text_content = []
118
  metadata = {
119
  "title": doc.metadata.get("title", "Untitled"),
120
  "author": doc.metadata.get("author", "Unknown"),
121
  "pages": len(doc),
122
- "creation_date": doc.metadata.get("creationDate", ""),
123
  "file_name": os.path.basename(pdf_path)
124
  }
125
 
126
  for page_num, page in enumerate(doc):
127
  text = page.get_text()
128
  if text.strip():
129
- text_content.append({
130
- "page": page_num + 1,
131
- "content": text
132
- })
133
 
134
  doc.close()
135
 
136
  return {
137
  "metadata": metadata,
138
- "pages": text_content,
139
- "full_text": "\n\n".join([p["content"] for p in text_content])
140
  }
141
  except Exception as e:
142
  raise Exception(f"PDF ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {str(e)}")
143
 
144
- def process_and_index_pdf(self, pdf_path: str, doc_id: str) -> Dict[str, Any]:
145
- """PDF ์ฒ˜๋ฆฌ ๋ฐ ๋ฒกํ„ฐ ์ธ๋ฑ์‹ฑ"""
146
  try:
147
  # PDF ํ…์ŠคํŠธ ์ถ”์ถœ
148
  pdf_data = self.extract_text_from_pdf(pdf_path)
@@ -150,33 +181,20 @@ class PDFRAGSystem:
150
  # ํ…์ŠคํŠธ๋ฅผ ์ฒญํฌ๋กœ ๋ถ„ํ• 
151
  chunks = self.text_splitter.split_text(pdf_data["full_text"])
152
 
153
- # ๊ฐ ์ฒญํฌ์— ๋Œ€ํ•œ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ
154
- embeddings = self.embedder.encode(chunks)
155
-
156
- # ChromaDB์— ์ €์žฅ
157
- ids = [f"{doc_id}_{i}" for i in range(len(chunks))]
158
- metadatas = [
159
- {
160
- "doc_id": doc_id,
161
- "chunk_index": i,
162
- "source": pdf_data["metadata"]["file_name"],
163
- "page_count": pdf_data["metadata"]["pages"]
164
- }
165
- for i in range(len(chunks))
166
- ]
167
 
168
- self.collection.add(
169
- ids=ids,
170
- embeddings=embeddings.tolist(),
171
- documents=chunks,
172
- metadatas=metadatas
173
- )
174
 
175
  # ๋ฌธ์„œ ์ •๋ณด ์ €์žฅ
176
  self.documents[doc_id] = {
177
  "metadata": pdf_data["metadata"],
178
  "chunk_count": len(chunks),
179
- "upload_time": datetime.now().isoformat()
 
180
  }
181
 
182
  return {
@@ -193,59 +211,92 @@ class PDFRAGSystem:
193
  "error": str(e)
194
  }
195
 
196
- def search_relevant_chunks(self, query: str, top_k: int = 5) -> List[Dict]:
197
  """์ฟผ๋ฆฌ์™€ ๊ด€๋ จ๋œ ์ฒญํฌ ๊ฒ€์ƒ‰"""
198
- try:
199
- # ์ฟผ๋ฆฌ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ
200
- query_embedding = self.embedder.encode([query])
201
-
202
- # ์œ ์‚ฌํ•œ ๋ฌธ์„œ ๊ฒ€์ƒ‰
203
- results = self.collection.query(
204
- query_embeddings=query_embedding.tolist(),
205
- n_results=top_k
206
- )
207
 
208
- if results and results['documents']:
209
- chunks = []
210
- for i in range(len(results['documents'][0])):
211
- chunks.append({
212
- "content": results['documents'][0][i],
213
- "metadata": results['metadatas'][0][i],
214
- "distance": results['distances'][0][i] if 'distances' in results else None
215
- })
216
- return chunks
217
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
- except Exception as e:
220
- print(f"Search error: {e}")
221
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
- def generate_rag_prompt(self, query: str, context_chunks: List[Dict]) -> str:
224
- """RAG ํ”„๋กฌํ”„ํŠธ ์ƒ์„ฑ"""
225
- context = "\n\n---\n\n".join([
226
- f"[์ถœ์ฒ˜: {chunk['metadata']['source']}, ์ฒญํฌ {chunk['metadata']['chunk_index']+1}]\n{chunk['content']}"
227
- for chunk in context_chunks
228
- ])
 
 
 
 
229
 
230
- prompt = f"""๋‹ค์Œ ๋ฌธ์„œ ๋‚ด์šฉ์„ ์ฐธ๊ณ ํ•˜์—ฌ ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”.
231
- ๋‹ต๋ณ€์€ ์ œ๊ณต๋œ ๋ฌธ์„œ ๋‚ด์šฉ์„ ๋ฐ”ํƒ•์œผ๋กœ ์ž‘์„ฑํ•˜๋˜, ํ•„์š”์‹œ ์ถ”๊ฐ€ ์„ค๋ช…์„ ํฌํ•จํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
232
- ๋ฌธ์„œ์—์„œ ๊ด€๋ จ ์ •๋ณด๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†๋Š” ๊ฒฝ์šฐ, ๊ทธ ์‚ฌ์‹ค์„ ๋ช…์‹œํ•ด์ฃผ์„ธ์š”.
233
 
234
- ๐Ÿ“š ์ฐธ๊ณ  ๋ฌธ์„œ:
235
  {context}
236
 
237
- โ“ ์งˆ๋ฌธ: {query}
 
238
 
239
- ๐Ÿ’ก ๋‹ต๋ณ€:"""
 
240
 
241
- return prompt
242
 
243
  # RAG ์‹œ์Šคํ…œ ์ธ์Šคํ„ด์Šค ์ƒ์„ฑ
244
- rag_system = PDFRAGSystem()
245
 
246
  # State variables
247
  current_model = gr.State("openai/gpt-oss-120b")
248
- uploaded_documents = gr.State({})
249
  rag_enabled = gr.State(False)
250
 
251
  def upload_pdf(file):
@@ -260,8 +311,8 @@ def upload_pdf(file):
260
 
261
  doc_id = f"doc_{file_hash}"
262
 
263
- # PDF ์ฒ˜๋ฆฌ ๋ฐ ์ธ๋ฑ์‹ฑ
264
- result = rag_system.process_and_index_pdf(file.name, doc_id)
265
 
266
  if result["success"]:
267
  status_html = f"""
@@ -300,49 +351,67 @@ def upload_pdf(file):
300
  def clear_documents():
301
  """์—…๋กœ๋“œ๋œ ๋ฌธ์„œ ์ดˆ๊ธฐํ™”"""
302
  try:
303
- # ChromaDB ์ปฌ๋ ‰์…˜ ์žฌ์ƒ์„ฑ
304
- rag_system.chroma_client.delete_collection("pdf_documents")
305
- rag_system.collection = rag_system.chroma_client.create_collection(
306
- name="pdf_documents",
307
- metadata={"hnsw:space": "cosine"}
308
- )
309
  rag_system.documents = {}
 
 
310
 
311
  return gr.update(value="<div class='pdf-status pdf-success'>โœ… ๋ชจ๋“  ๋ฌธ์„œ๊ฐ€ ์‚ญ์ œ๋˜์—ˆ์Šต๋‹ˆ๋‹ค</div>"), gr.update(choices=[], value=[]), gr.update(value=False)
312
  except Exception as e:
313
  return gr.update(value=f"<div class='pdf-status pdf-error'>โŒ ์‚ญ์ œ ์‹คํŒจ: {str(e)}</div>"), gr.update(), gr.update()
314
 
315
- def process_with_rag(message: str, enable_rag: bool, selected_docs: List[str], top_k: int = 5):
316
- """RAG๋ฅผ ํ™œ์šฉํ•œ ๋ฉ”์‹œ์ง€ ์ฒ˜๋ฆฌ"""
317
- if not enable_rag or not selected_docs:
318
- return message # RAG ๋น„ํ™œ์„ฑํ™”์‹œ ์›๋ณธ ๋ฉ”์‹œ์ง€ ๋ฐ˜ํ™˜
 
 
 
 
 
319
 
320
- try:
 
 
 
 
321
  # ๊ด€๋ จ ์ฒญํฌ ๊ฒ€์ƒ‰
322
- relevant_chunks = rag_system.search_relevant_chunks(message, top_k=top_k)
323
 
324
  if relevant_chunks:
325
- # ์„ ํƒ๋œ ๋ฌธ์„œ์˜ ์ฒญํฌ๋งŒ ํ•„ํ„ฐ๋ง
326
- selected_doc_ids = [doc.split(":")[0] for doc in selected_docs]
327
- filtered_chunks = [
328
- chunk for chunk in relevant_chunks
329
- if chunk['metadata']['doc_id'] in selected_doc_ids
330
- ]
331
 
332
- if filtered_chunks:
333
- # RAG ํ”„๋กฌํ”„ํŠธ ์ƒ์„ฑ
334
- rag_prompt = rag_system.generate_rag_prompt(message, filtered_chunks[:top_k])
335
- return rag_prompt
336
-
337
- return message
338
-
339
- except Exception as e:
340
- print(f"RAG processing error: {e}")
341
- return message
342
 
343
- def switch_model(model_choice):
344
- """๋ชจ๋ธ ์ „ํ™˜ ํ•จ์ˆ˜"""
345
- return gr.update(visible=False), gr.update(visible=True), model_choice
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
 
347
  # Gradio ์ธํ„ฐํŽ˜์ด์Šค
348
  with gr.Blocks(fill_height=True, theme="Nymbo/Nymbo_Theme", css=custom_css) as demo:
@@ -403,14 +472,18 @@ with gr.Blocks(fill_height=True, theme="Nymbo/Nymbo_Theme", css=custom_css) as d
403
  info="๋‹ต๋ณ€ ์ƒ์„ฑ์‹œ ์ฐธ๊ณ ํ•  ๋ฌธ์„œ ์ฒญํฌ์˜ ๊ฐœ์ˆ˜"
404
  )
405
 
406
- chunk_size = gr.Slider(
407
- minimum=500,
408
- maximum=2000,
409
- value=1000,
410
- step=100,
411
- label="์ฒญํฌ ํฌ๊ธฐ",
412
- info="๋ฌธ์„œ๋ฅผ ๋ถ„ํ• ํ•˜๋Š” ์ฒญํฌ์˜ ํฌ๊ธฐ (๋ฌธ์ž ์ˆ˜)"
413
- )
 
 
 
 
414
 
415
  # ๊ณ ๊ธ‰ ์˜ต์…˜
416
  with gr.Accordion("โš™๏ธ ๋ชจ๋ธ ์„ค์ •", open=False):
@@ -443,7 +516,6 @@ with gr.Blocks(fill_height=True, theme="Nymbo/Nymbo_Theme", css=custom_css) as d
443
  # ๋ชจ๋ธ ์ธํ„ฐํŽ˜์ด์Šค ์ปจํ…Œ์ด๋„ˆ
444
  with gr.Column(visible=True) as model_120b_container:
445
  gr.Markdown("### Model: openai/gpt-oss-120b")
446
- # ์‹ค์ œ ๋ชจ๋ธ ๋กœ๋“œ๋Š” gr.load()๋กœ ์ฒ˜๋ฆฌ
447
  chatbot_120b = gr.Chatbot(height=400)
448
  msg_box_120b = gr.Textbox(
449
  label="๋ฉ”์‹œ์ง€ ์ž…๋ ฅ",
@@ -501,31 +573,15 @@ with gr.Blocks(fill_height=True, theme="Nymbo/Nymbo_Theme", css=custom_css) as d
501
  outputs=[]
502
  )
503
 
504
- # ์ฑ„ํŒ… ๊ธฐ๋Šฅ (RAG ํ†ตํ•ฉ)
505
- def chat_with_rag(message, history, enable_rag, selected_docs, top_k):
506
- """RAG๋ฅผ ํ™œ์šฉํ•œ ์ฑ„ํŒ…"""
507
- # RAG ์ฒ˜๋ฆฌ
508
- processed_message = process_with_rag(message, enable_rag, selected_docs, top_k)
509
-
510
- # ์—ฌ๊ธฐ์— ์‹ค์ œ ๋ชจ๋ธ API ํ˜ธ์ถœ ์ฝ”๋“œ๊ฐ€ ๋“ค์–ด๊ฐ€์•ผ ํ•จ
511
- # ํ˜„์žฌ๋Š” ์˜ˆ์‹œ ์‘๋‹ต
512
- if enable_rag and selected_docs:
513
- response = f"[RAG ํ™œ์„ฑํ™”] ์„ ํƒ๋œ {len(selected_docs)}๊ฐœ ๋ฌธ์„œ๋ฅผ ์ฐธ๊ณ ํ•˜์—ฌ ๋‹ต๋ณ€ํ•ฉ๋‹ˆ๋‹ค:\n\n{processed_message[:200]}..."
514
- else:
515
- response = f"[์ผ๋ฐ˜ ๋ชจ๋“œ] {message}์— ๋Œ€ํ•œ ๋‹ต๋ณ€์ž…๋‹ˆ๋‹ค."
516
-
517
- history.append((message, response))
518
- return "", history
519
-
520
  # 120b ๋ชจ๋ธ ์ฑ„ํŒ…
521
  msg_box_120b.submit(
522
- fn=chat_with_rag,
523
  inputs=[msg_box_120b, chatbot_120b, enable_rag, document_list, top_k_chunks],
524
  outputs=[msg_box_120b, chatbot_120b]
525
  )
526
 
527
  send_btn_120b.click(
528
- fn=chat_with_rag,
529
  inputs=[msg_box_120b, chatbot_120b, enable_rag, document_list, top_k_chunks],
530
  outputs=[msg_box_120b, chatbot_120b]
531
  )
@@ -537,13 +593,13 @@ with gr.Blocks(fill_height=True, theme="Nymbo/Nymbo_Theme", css=custom_css) as d
537
 
538
  # 20b ๋ชจ๋ธ ์ฑ„ํŒ…
539
  msg_box_20b.submit(
540
- fn=chat_with_rag,
541
  inputs=[msg_box_20b, chatbot_20b, enable_rag, document_list, top_k_chunks],
542
  outputs=[msg_box_20b, chatbot_20b]
543
  )
544
 
545
  send_btn_20b.click(
546
- fn=chat_with_rag,
547
  inputs=[msg_box_20b, chatbot_20b, enable_rag, document_list, top_k_chunks],
548
  outputs=[msg_box_20b, chatbot_20b]
549
  )
 
4
  import hashlib
5
  import json
6
  from datetime import datetime
7
+ import tempfile
8
+
9
+ # PDF 처리 라이브러리 (설치 필요한 경우를 위한 대체 구현 포함)
10
+ try:
11
+ import fitz # PyMuPDF
12
+ PDF_AVAILABLE = True
13
+ except ImportError:
14
+ PDF_AVAILABLE = False
15
+ print("PyMuPDF not installed. Install with: pip install pymupdf")
16
+
17
+ try:
18
+ import chromadb
19
+ from chromadb.utils import embedding_functions
20
+ CHROMA_AVAILABLE = True
21
+ except ImportError:
22
+ CHROMA_AVAILABLE = False
23
+ print("ChromaDB not installed. Install with: pip install chromadb")
24
+
25
+ try:
26
+ from sentence_transformers import SentenceTransformer
27
+ ST_AVAILABLE = True
28
+ except ImportError:
29
+ ST_AVAILABLE = False
30
+ print("Sentence Transformers not installed. Install with: pip install sentence-transformers")
31
 
 
 
 
 
 
 
32
  import numpy as np
33
+ from typing import Tuple
34
 
35
  # Custom CSS (기존 CSS + 추가 스타일)
36
  custom_css = """
 
84
  border: 1px solid rgba(251, 191, 36, 0.5);
85
  color: #f59e0b;
86
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  """
88
 
89
class SimpleTextSplitter:
    """Split text into overlapping, character-based chunks.

    Each chunk covers at most ``chunk_size`` characters of the input, and
    the window of the next chunk starts ``chunk_overlap`` characters before
    the end of the previous one. When a window does not reach the end of
    the text, it is cut right after the last '.' it contains, provided the
    resulting chunk is still longer than the overlap.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=200):
        """Store the chunking parameters.

        Args:
            chunk_size: Maximum number of characters per chunk (positive).
            chunk_overlap: Number of characters shared by consecutive
                windows; must be smaller than ``chunk_size``.

        Raises:
            ValueError: If the parameters would prevent the splitter from
                ever advancing through the text.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into chunks.

        Args:
            text: The text to split.

        Returns:
            The chunks in document order, each stripped of surrounding
            whitespace; chunks that are empty after stripping are dropped.
            An empty ``text`` yields an empty list.
        """
        chunks = []
        text_len = len(text)
        start = 0

        while start < text_len:
            end = min(start + self.chunk_size, text_len)

            if end < text_len:
                last_period = text.rfind('.', start, end)
                # Cut at the sentence end only if the chunk stays longer
                # than the overlap; otherwise the next window would not
                # start after the current one and the loop would never end.
                # (rfind's -1 "not found" result fails this test as well.)
                if last_period + 1 - start > self.chunk_overlap:
                    end = last_period + 1

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            # The window reached the end of the text: stop here instead of
            # emitting a tail that is fully contained in this chunk.
            if end >= text_len:
                break

            start = end - self.chunk_overlap

        return chunks
119
+
120
+ class SimplePDFRAGSystem:
121
+ """๊ฐ„๋‹จํ•œ PDF ๊ธฐ๋ฐ˜ RAG ์‹œ์Šคํ…œ"""
122
 
123
  def __init__(self):
124
  self.documents = {}
125
+ self.document_chunks = {}
126
+ self.embeddings_store = {}
127
+ self.text_splitter = SimpleTextSplitter(chunk_size=1000, chunk_overlap=200)
128
+
129
+ # ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ์ดˆ๊ธฐํ™” (๊ฐ€๋Šฅํ•œ ๊ฒฝ์šฐ)
130
  self.embedder = None
131
+ if ST_AVAILABLE:
132
+ try:
133
+ self.embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
134
+ print("Embedding model loaded successfully")
135
+ except Exception as e:
136
+ print(f"Failed to load embedding model: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  def extract_text_from_pdf(self, pdf_path: str) -> Dict[str, Any]:
139
  """PDF์—์„œ ํ…์ŠคํŠธ ์ถ”์ถœ"""
140
+ if not PDF_AVAILABLE:
141
+ # PyMuPDF๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ ๋Œ€์ฒด ๋ฐฉ๋ฒ•
142
+ return {
143
+ "metadata": {
144
+ "title": "PDF Reader Not Available",
145
+ "file_name": os.path.basename(pdf_path),
146
+ "pages": 0
147
+ },
148
+ "full_text": "PDF ์ฒ˜๋ฆฌ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๊ฐ€ ์„ค์น˜๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. 'pip install pymupdf'๋ฅผ ์‹คํ–‰ํ•ด์ฃผ์„ธ์š”."
149
+ }
150
+
151
  try:
152
+ doc = fitz.open(pdf_path)
153
  text_content = []
154
  metadata = {
155
  "title": doc.metadata.get("title", "Untitled"),
156
  "author": doc.metadata.get("author", "Unknown"),
157
  "pages": len(doc),
 
158
  "file_name": os.path.basename(pdf_path)
159
  }
160
 
161
  for page_num, page in enumerate(doc):
162
  text = page.get_text()
163
  if text.strip():
164
+ text_content.append(text)
 
 
 
165
 
166
  doc.close()
167
 
168
  return {
169
  "metadata": metadata,
170
+ "full_text": "\n\n".join(text_content)
 
171
  }
172
  except Exception as e:
173
  raise Exception(f"PDF ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {str(e)}")
174
 
175
+ def process_and_store_pdf(self, pdf_path: str, doc_id: str) -> Dict[str, Any]:
176
+ """PDF ์ฒ˜๋ฆฌ ๋ฐ ์ €์žฅ"""
177
  try:
178
  # PDF ํ…์ŠคํŠธ ์ถ”์ถœ
179
  pdf_data = self.extract_text_from_pdf(pdf_path)
 
181
  # ํ…์ŠคํŠธ๋ฅผ ์ฒญํฌ๋กœ ๋ถ„ํ• 
182
  chunks = self.text_splitter.split_text(pdf_data["full_text"])
183
 
184
+ # ์ฒญํฌ ์ €์žฅ
185
+ self.document_chunks[doc_id] = chunks
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
+ # ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ (๊ฐ€๋Šฅํ•œ ๊ฒฝ์šฐ)
188
+ if self.embedder:
189
+ embeddings = self.embedder.encode(chunks)
190
+ self.embeddings_store[doc_id] = embeddings
 
 
191
 
192
  # ๋ฌธ์„œ ์ •๋ณด ์ €์žฅ
193
  self.documents[doc_id] = {
194
  "metadata": pdf_data["metadata"],
195
  "chunk_count": len(chunks),
196
+ "upload_time": datetime.now().isoformat(),
197
+ "full_text": pdf_data["full_text"][:500] # ์ฒ˜์Œ 500์ž ์ €์žฅ
198
  }
199
 
200
  return {
 
211
  "error": str(e)
212
  }
213
 
214
def search_relevant_chunks(self, query: str, doc_ids: List[str], top_k: int = 5) -> List[Dict]:
    """Return the chunks of the selected documents that best match ``query``.

    When an embedder is loaded and embeddings are stored, chunks are ranked
    by cosine similarity between the query embedding and each chunk
    embedding. Otherwise chunks are ranked by the fraction of distinct
    whitespace-separated query words that occur in the lower-cased chunk.

    Args:
        query: The user question.
        doc_ids: Ids of the documents to search; unknown ids are ignored.
        top_k: Maximum number of chunks to return. Converted with ``int()``
            so numeric UI values are accepted; values <= 0 return ``[]``.

    Returns:
        Up to ``top_k`` dicts with keys ``content``, ``doc_id``,
        ``doc_name``, ``chunk_index`` (int) and ``similarity`` (float),
        sorted by descending similarity.
    """
    top_k = int(top_k)
    if top_k <= 0:
        return []

    candidates = []

    if self.embedder and self.embeddings_store:
        # Embedding-based search.
        query_vec = np.asarray(self.embedder.encode([query])[0], dtype=float)
        query_norm = np.linalg.norm(query_vec)

        for doc_id in doc_ids:
            if doc_id not in self.embeddings_store or doc_id not in self.document_chunks:
                continue

            chunks = self.document_chunks[doc_id]
            doc_matrix = np.asarray(self.embeddings_store[doc_id], dtype=float)
            if not chunks or doc_matrix.ndim != 2:
                # Nothing to rank, e.g. a document that produced no chunks.
                continue

            # Cosine similarity for all chunks at once. Rows where either
            # vector has zero norm get similarity 0.0 instead of NaN, so
            # the final sort stays well defined.
            denom = np.linalg.norm(doc_matrix, axis=1) * query_norm
            scores = np.divide(
                doc_matrix @ query_vec,
                denom,
                out=np.zeros_like(denom),
                where=denom > 0,
            )

            doc_name = self.documents[doc_id]["metadata"]["file_name"]
            # Best top_k chunks of this document, highest score first.
            for idx in np.argsort(scores)[-top_k:][::-1]:
                candidates.append({
                    "content": chunks[idx],
                    "doc_id": doc_id,
                    "doc_name": doc_name,
                    # Plain Python scalars keep the result serializable.
                    "chunk_index": int(idx),
                    "similarity": float(scores[idx]),
                })
    else:
        # No embeddings available: simple keyword-based search.
        query_words = set(query.lower().split())

        for doc_id in doc_ids:
            if doc_id not in self.document_chunks:
                continue

            for idx, chunk in enumerate(self.document_chunks[doc_id]):
                chunk_lower = chunk.lower()
                # Count the query words that appear anywhere in the chunk.
                hits = sum(1 for word in query_words if word in chunk_lower)
                if hits:
                    candidates.append({
                        "content": chunk,
                        "doc_id": doc_id,
                        "doc_name": self.documents[doc_id]["metadata"]["file_name"],
                        "chunk_index": idx,
                        "similarity": hits / len(query_words),
                    })

    # Merge the per-document candidates and keep the overall best top_k.
    candidates.sort(key=lambda item: item["similarity"], reverse=True)
    return candidates[:top_k]
268
 
269
+ def generate_context_prompt(self, query: str, chunks: List[Dict]) -> str:
270
+ """์ปจํ…์ŠคํŠธ๋ฅผ ํฌํ•จํ•œ ํ”„๋กฌํ”„ํŠธ ์ƒ์„ฑ"""
271
+ if not chunks:
272
+ return query
273
+
274
+ context_parts = []
275
+ for i, chunk in enumerate(chunks, 1):
276
+ context_parts.append(
277
+ f"[๋ฌธ์„œ: {chunk['doc_name']}, ์„น์…˜ {chunk['chunk_index']+1}]\n{chunk['content']}\n"
278
+ )
279
 
280
+ context = "\n---\n".join(context_parts)
281
+
282
+ enhanced_prompt = f"""๋‹ค์Œ ๋ฌธ์„œ ๋‚ด์šฉ์„ ์ฐธ๊ณ ํ•˜์—ฌ ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”.
283
 
284
+ ## ์ฐธ๊ณ  ๋ฌธ์„œ:
285
  {context}
286
 
287
+ ## ์งˆ๋ฌธ:
288
+ {query}
289
 
290
+ ## ๋‹ต๋ณ€:
291
+ ์œ„ ๋ฌธ์„œ ๋‚ด์šฉ์„ ๋ฐ”ํƒ•์œผ๋กœ ์งˆ๋ฌธ์— ๋Œ€ํ•ด ์ƒ์„ธํ•˜๊ณ  ์ •ํ™•ํ•˜๊ฒŒ ๋‹ต๋ณ€ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค."""
292
 
293
+ return enhanced_prompt
294
 
295
  # RAG ์‹œ์Šคํ…œ ์ธ์Šคํ„ด์Šค ์ƒ์„ฑ
296
+ rag_system = SimplePDFRAGSystem()
297
 
298
  # State variables
299
  current_model = gr.State("openai/gpt-oss-120b")
 
300
  rag_enabled = gr.State(False)
301
 
302
  def upload_pdf(file):
 
311
 
312
  doc_id = f"doc_{file_hash}"
313
 
314
+ # PDF ์ฒ˜๋ฆฌ ๋ฐ ์ €์žฅ
315
+ result = rag_system.process_and_store_pdf(file.name, doc_id)
316
 
317
  if result["success"]:
318
  status_html = f"""
 
351
  def clear_documents():
352
  """์—…๋กœ๋“œ๋œ ๋ฌธ์„œ ์ดˆ๊ธฐํ™”"""
353
  try:
 
 
 
 
 
 
354
  rag_system.documents = {}
355
+ rag_system.document_chunks = {}
356
+ rag_system.embeddings_store = {}
357
 
358
  return gr.update(value="<div class='pdf-status pdf-success'>โœ… ๋ชจ๋“  ๋ฌธ์„œ๊ฐ€ ์‚ญ์ œ๋˜์—ˆ์Šต๋‹ˆ๋‹ค</div>"), gr.update(choices=[], value=[]), gr.update(value=False)
359
  except Exception as e:
360
  return gr.update(value=f"<div class='pdf-status pdf-error'>โŒ ์‚ญ์ œ ์‹คํŒจ: {str(e)}</div>"), gr.update(), gr.update()
361
 
362
def switch_model(model_choice):
    """Build the updates for switching between the two models.

    Returns a 3-tuple: a ``gr.update`` whose ``visible`` flag is True only
    when ``model_choice`` is "openai/gpt-oss-120b", a second ``gr.update``
    with the opposite flag, and ``model_choice`` itself unchanged.
    """
    # NOTE(review): presumably the two updates target the 120b and 20b
    # containers in that order -- confirm against the event wiring.
    show_120b = model_choice == "openai/gpt-oss-120b"
    return gr.update(visible=show_120b), gr.update(visible=not show_120b), model_choice
368
+
369
+ def chat_with_model(message: str, history: List[Tuple[str, str]], enable_rag: bool, selected_docs: List[str], top_k: int, model: str):
370
+ """๋ชจ๋ธ๊ณผ ๋Œ€ํ™” (RAG ํฌํ•จ)"""
371
 
372
+ # RAG๊ฐ€ ํ™œ์„ฑํ™”๋˜๊ณ  ๋ฌธ์„œ๊ฐ€ ์„ ํƒ๋œ ๊ฒฝ์šฐ
373
+ if enable_rag and selected_docs:
374
+ # ์„ ํƒ๋œ ๋ฌธ์„œ ID ์ถ”์ถœ
375
+ doc_ids = [doc.split(":")[0] for doc in selected_docs]
376
+
377
  # ๊ด€๋ จ ์ฒญํฌ ๊ฒ€์ƒ‰
378
+ relevant_chunks = rag_system.search_relevant_chunks(message, doc_ids, top_k)
379
 
380
  if relevant_chunks:
381
+ # ์ปจํ…์ŠคํŠธ๋ฅผ ํฌํ•จํ•œ ํ”„๋กฌํ”„ํŠธ ์ƒ์„ฑ
382
+ enhanced_message = rag_system.generate_context_prompt(message, relevant_chunks)
 
 
 
 
383
 
384
+ # ๋””๋ฒ„๊ทธ ์ •๋ณด ํฌํ•จ ์‘๋‹ต (์‹ค์ œ ๊ตฌํ˜„์‹œ ๋ชจ๋ธ API ํ˜ธ์ถœ๋กœ ๋Œ€์ฒด)
385
+ response = f"""๐Ÿ“š RAG ๊ธฐ๋ฐ˜ ๋‹ต๋ณ€ (๋ชจ๋ธ: {model})
 
 
 
 
 
 
 
 
386
 
387
+ ์ฐพ์€ ๊ด€๋ จ ๋ฌธ์„œ ์„น์…˜: {len(relevant_chunks)}๊ฐœ
388
+
389
+ ์งˆ๋ฌธ: {message}
390
+
391
+ ๋‹ต๋ณ€:
392
+ {enhanced_message[:2000]}...
393
+
394
+ [์ฐธ๊ณ : ์‹ค์ œ ๊ตฌํ˜„์‹œ ์—ฌ๊ธฐ์„œ ๋ชจ๋ธ API๋ฅผ ํ˜ธ์ถœํ•˜์—ฌ enhanced_message๋ฅผ ์ „์†กํ•˜๊ณ  ์‘๋‹ต์„ ๋ฐ›์•„์•ผ ํ•ฉ๋‹ˆ๋‹ค]
395
+
396
+ ๊ด€๋ จ ๋ฌธ์„œ ์„น์…˜ ์š”์•ฝ:
397
+ """
398
+ for i, chunk in enumerate(relevant_chunks[:3], 1):
399
+ response += f"\n{i}. {chunk['doc_name']} - ์„น์…˜ {chunk['chunk_index']+1} (์œ ์‚ฌ๋„: {chunk['similarity']:.2f})"
400
+ response += f"\n ๋‚ด์šฉ: {chunk['content'][:200]}...\n"
401
+ else:
402
+ response = f"โš ๏ธ ์„ ํƒ๋œ ๋ฌธ์„œ์—์„œ '{message}'์™€ ๊ด€๋ จ๋œ ๋‚ด์šฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ๋‹ค๋ฅธ ์งˆ๋ฌธ์„ ์‹œ๋„ํ•ด๋ณด์„ธ์š”."
403
+ else:
404
+ # RAG ๋น„ํ™œ์„ฑํ™” ์ƒํƒœ
405
+ response = f"""์ผ๋ฐ˜ ๋‹ต๋ณ€ ๋ชจ๋“œ (๋ชจ๋ธ: {model})
406
+
407
+ ์งˆ๋ฌธ: {message}
408
+
409
+ [์ฐธ๊ณ : ์‹ค์ œ ๊ตฌํ˜„์‹œ ์—ฌ๊ธฐ์„œ ๋ชจ๋ธ API๋ฅผ ํ˜ธ์ถœํ•˜์—ฌ message๋ฅผ ์ „์†กํ•˜๊ณ  ์‘๋‹ต์„ ๋ฐ›์•„์•ผ ํ•ฉ๋‹ˆ๋‹ค]
410
+
411
+ PDF ๋ฌธ์„œ๋ฅผ ์—…๋กœ๋“œํ•˜๊ณ  RAG๋ฅผ ํ™œ์„ฑํ™”ํ•˜๋ฉด ๋ฌธ์„œ ๊ธฐ๋ฐ˜ ๋‹ต๋ณ€์„ ๋ฐ›์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค."""
412
+
413
+ history.append((message, response))
414
+ return "", history
415
 
416
  # Gradio ์ธํ„ฐํŽ˜์ด์Šค
417
  with gr.Blocks(fill_height=True, theme="Nymbo/Nymbo_Theme", css=custom_css) as demo:
 
472
  info="๋‹ต๋ณ€ ์ƒ์„ฑ์‹œ ์ฐธ๊ณ ํ•  ๋ฌธ์„œ ์ฒญํฌ์˜ ๊ฐœ์ˆ˜"
473
  )
474
 
475
+ gr.Markdown("""
476
+ ### ๐Ÿ“ RAG ์‚ฌ์šฉ ํŒ:
477
+ 1. PDF ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜์„ธ์š”
478
+ 2. ์—…๋กœ๋“œ๋œ ๋ฌธ์„œ๋ฅผ ์„ ํƒํ•˜์„ธ์š”
479
+ 3. RAG๋ฅผ ํ™œ์„ฑํ™”ํ•˜์„ธ์š”
480
+ 4. ๋ฌธ์„œ ๋‚ด์šฉ์— ๋Œ€ํ•ด ์งˆ๋ฌธํ•˜์„ธ์š”
481
+
482
+ ์˜ˆ์‹œ ์งˆ๋ฌธ:
483
+ - "๋ฌธ์„œ์˜ ์ฃผ์š” ๋‚ด์šฉ์„ ์š”์•ฝํ•ด์ฃผ์„ธ์š”"
484
+ - "์ด ๋ฌธ์„œ์—์„œ ์–ธ๊ธ‰๋œ ๋‚ ์งœ๋Š” ์–ธ์ œ์ธ๊ฐ€์š”?"
485
+ - "์ฐธ๊ฐ€ ์ž๊ฒฉ ์กฐ๊ฑด์€ ๋ฌด์—‡์ธ๊ฐ€์š”?"
486
+ """)
487
 
488
  # ๊ณ ๊ธ‰ ์˜ต์…˜
489
  with gr.Accordion("โš™๏ธ ๋ชจ๋ธ ์„ค์ •", open=False):
 
516
  # ๋ชจ๋ธ ์ธํ„ฐํŽ˜์ด์Šค ์ปจํ…Œ์ด๋„ˆ
517
  with gr.Column(visible=True) as model_120b_container:
518
  gr.Markdown("### Model: openai/gpt-oss-120b")
 
519
  chatbot_120b = gr.Chatbot(height=400)
520
  msg_box_120b = gr.Textbox(
521
  label="๋ฉ”์‹œ์ง€ ์ž…๋ ฅ",
 
573
  outputs=[]
574
  )
575
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
576
  # 120b ๋ชจ๋ธ ์ฑ„ํŒ…
577
  msg_box_120b.submit(
578
+ fn=lambda msg, hist, rag, docs, k: chat_with_model(msg, hist, rag, docs, k, "openai/gpt-oss-120b"),
579
  inputs=[msg_box_120b, chatbot_120b, enable_rag, document_list, top_k_chunks],
580
  outputs=[msg_box_120b, chatbot_120b]
581
  )
582
 
583
  send_btn_120b.click(
584
+ fn=lambda msg, hist, rag, docs, k: chat_with_model(msg, hist, rag, docs, k, "openai/gpt-oss-120b"),
585
  inputs=[msg_box_120b, chatbot_120b, enable_rag, document_list, top_k_chunks],
586
  outputs=[msg_box_120b, chatbot_120b]
587
  )
 
593
 
594
  # 20b ๋ชจ๋ธ ์ฑ„ํŒ…
595
  msg_box_20b.submit(
596
+ fn=lambda msg, hist, rag, docs, k: chat_with_model(msg, hist, rag, docs, k, "openai/gpt-oss-20b"),
597
  inputs=[msg_box_20b, chatbot_20b, enable_rag, document_list, top_k_chunks],
598
  outputs=[msg_box_20b, chatbot_20b]
599
  )
600
 
601
  send_btn_20b.click(
602
+ fn=lambda msg, hist, rag, docs, k: chat_with_model(msg, hist, rag, docs, k, "openai/gpt-oss-20b"),
603
  inputs=[msg_box_20b, chatbot_20b, enable_rag, document_list, top_k_chunks],
604
  outputs=[msg_box_20b, chatbot_20b]
605
  )