Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -173,13 +173,25 @@ class PDFRAGSystem:
|
|
| 173 |
# 텍스트를 청크로 분할
|
| 174 |
chunks = self.text_splitter.split_text(pdf_data["full_text"])
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
# 청크 저장
|
| 177 |
self.document_chunks[doc_id] = chunks
|
| 178 |
|
| 179 |
-
# 임베딩 생성
|
| 180 |
if self.embedder:
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
# 문서 정보 저장
|
| 185 |
self.documents[doc_id] = {
|
|
@@ -188,6 +200,9 @@ class PDFRAGSystem:
|
|
| 188 |
"upload_time": datetime.now().isoformat()
|
| 189 |
}
|
| 190 |
|
|
|
|
|
|
|
|
|
|
| 191 |
return {
|
| 192 |
"success": True,
|
| 193 |
"doc_id": doc_id,
|
|
@@ -197,6 +212,7 @@ class PDFRAGSystem:
|
|
| 197 |
}
|
| 198 |
|
| 199 |
except Exception as e:
|
|
|
|
| 200 |
return {"success": False, "error": str(e)}
|
| 201 |
|
| 202 |
def search_relevant_chunks(self, query: str, doc_ids: List[str], top_k: int = 3) -> List[Dict]:
|
|
@@ -205,54 +221,75 @@ class PDFRAGSystem:
|
|
| 205 |
|
| 206 |
print(f"Searching chunks for query: '{query[:50]}...' in {len(doc_ids)} documents")
|
| 207 |
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
| 214 |
doc_embeddings = self.embeddings_store[doc_id]
|
| 215 |
-
chunks = self.document_chunks[doc_id]
|
| 216 |
|
| 217 |
-
# 코사인 유사도 계산
|
| 218 |
similarities = []
|
| 219 |
-
for emb in doc_embeddings:
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
"similarity": score / len(query_keywords) if query_keywords else 0
|
| 250 |
-
})
|
| 251 |
|
| 252 |
-
#
|
| 253 |
all_relevant_chunks.sort(key=lambda x: x.get('similarity', 0), reverse=True)
|
|
|
|
|
|
|
| 254 |
result = all_relevant_chunks[:top_k]
|
| 255 |
print(f"Returning {len(result)} chunks")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
return result
|
| 257 |
|
| 258 |
def create_rag_prompt(self, query: str, doc_ids: List[str], top_k: int = 3) -> tuple:
|
|
@@ -262,10 +299,23 @@ class PDFRAGSystem:
|
|
| 262 |
relevant_chunks = self.search_relevant_chunks(query, doc_ids, top_k)
|
| 263 |
|
| 264 |
if not relevant_chunks:
|
| 265 |
-
print("No relevant chunks found")
|
| 266 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
-
print(f"
|
| 269 |
|
| 270 |
# 컨텍스트 구성
|
| 271 |
context_parts = []
|
|
@@ -274,15 +324,18 @@ class PDFRAGSystem:
|
|
| 274 |
|
| 275 |
for i, chunk in enumerate(relevant_chunks, 1):
|
| 276 |
context_parts.append(f"\n[Document Reference {i} - {chunk['doc_name']}]")
|
| 277 |
-
|
|
|
|
| 278 |
context_parts.append(content)
|
| 279 |
-
print(f"Added chunk {i} with similarity: {chunk.get('similarity', 0):.3f}")
|
| 280 |
|
| 281 |
context_parts.append("\n" + "=" * 40)
|
| 282 |
|
| 283 |
context = "\n".join(context_parts)
|
| 284 |
enhanced_query = f"{context}\n\nQuestion: {query}\n\nAnswer based on the document context provided above:"
|
| 285 |
|
|
|
|
|
|
|
| 286 |
return enhanced_query, context
|
| 287 |
|
| 288 |
# Initialize model and RAG system
|
|
|
|
| 173 |
# 텍스트를 청크로 분할
|
| 174 |
chunks = self.text_splitter.split_text(pdf_data["full_text"])
|
| 175 |
|
| 176 |
+
if not chunks:
|
| 177 |
+
print("Warning: No chunks created from PDF")
|
| 178 |
+
return {"success": False, "error": "No text content found in PDF"}
|
| 179 |
+
|
| 180 |
+
print(f"Created {len(chunks)} chunks from PDF")
|
| 181 |
+
|
| 182 |
# 청크 저장
|
| 183 |
self.document_chunks[doc_id] = chunks
|
| 184 |
|
| 185 |
+
# 임베딩 생성 (선택적)
|
| 186 |
if self.embedder:
|
| 187 |
+
try:
|
| 188 |
+
print("Generating embeddings...")
|
| 189 |
+
embeddings = self.embedder.encode(chunks)
|
| 190 |
+
self.embeddings_store[doc_id] = embeddings
|
| 191 |
+
print(f"Generated {len(embeddings)} embeddings")
|
| 192 |
+
except Exception as e:
|
| 193 |
+
print(f"Warning: Failed to generate embeddings: {e}")
|
| 194 |
+
# 임베딩 실패해도 계속 진행
|
| 195 |
|
| 196 |
# 문서 정보 저장
|
| 197 |
self.documents[doc_id] = {
|
|
|
|
| 200 |
"upload_time": datetime.now().isoformat()
|
| 201 |
}
|
| 202 |
|
| 203 |
+
# 디버그: 첫 번째 청크 출력
|
| 204 |
+
print(f"First chunk preview: {chunks[0][:200]}...")
|
| 205 |
+
|
| 206 |
return {
|
| 207 |
"success": True,
|
| 208 |
"doc_id": doc_id,
|
|
|
|
| 212 |
}
|
| 213 |
|
| 214 |
except Exception as e:
|
| 215 |
+
print(f"Error processing PDF: {e}")
|
| 216 |
return {"success": False, "error": str(e)}
|
| 217 |
|
| 218 |
def search_relevant_chunks(self, query: str, doc_ids: List[str], top_k: int = 3) -> List[Dict]:
|
|
|
|
| 221 |
|
| 222 |
print(f"Searching chunks for query: '{query[:50]}...' in {len(doc_ids)} documents")
|
| 223 |
|
| 224 |
+
# 먼저 문서가 있는지 확인
|
| 225 |
+
for doc_id in doc_ids:
|
| 226 |
+
if doc_id not in self.document_chunks:
|
| 227 |
+
print(f"Warning: Document {doc_id} not found in chunks")
|
| 228 |
+
continue
|
| 229 |
+
|
| 230 |
+
chunks = self.document_chunks[doc_id]
|
| 231 |
+
print(f"Document {doc_id} has {len(chunks)} chunks")
|
| 232 |
|
| 233 |
+
# 임베딩 기반 검색 시도
|
| 234 |
+
if self.embedder and doc_id in self.embeddings_store:
|
| 235 |
+
try:
|
| 236 |
+
query_embedding = self.embedder.encode([query])[0]
|
| 237 |
doc_embeddings = self.embeddings_store[doc_id]
|
|
|
|
| 238 |
|
| 239 |
+
# 코사인 유사도 계산 (안전하게)
|
| 240 |
similarities = []
|
| 241 |
+
for i, emb in enumerate(doc_embeddings):
|
| 242 |
+
try:
|
| 243 |
+
query_norm = np.linalg.norm(query_embedding)
|
| 244 |
+
emb_norm = np.linalg.norm(emb)
|
| 245 |
+
|
| 246 |
+
if query_norm > 0 and emb_norm > 0:
|
| 247 |
+
sim = np.dot(query_embedding, emb) / (query_norm * emb_norm)
|
| 248 |
+
similarities.append(sim)
|
| 249 |
+
else:
|
| 250 |
+
similarities.append(0.0)
|
| 251 |
+
except Exception as e:
|
| 252 |
+
print(f"Error calculating similarity for chunk {i}: {e}")
|
| 253 |
+
similarities.append(0.0)
|
| 254 |
|
| 255 |
+
# 상위 청크 선택
|
| 256 |
+
if similarities:
|
| 257 |
+
top_indices = np.argsort(similarities)[-min(top_k, len(similarities)):][::-1]
|
| 258 |
+
|
| 259 |
+
for idx in top_indices:
|
| 260 |
+
if idx < len(chunks): # 인덱스 범위 확인
|
| 261 |
+
all_relevant_chunks.append({
|
| 262 |
+
"content": chunks[idx],
|
| 263 |
+
"doc_name": self.documents[doc_id]["metadata"]["file_name"],
|
| 264 |
+
"similarity": similarities[idx]
|
| 265 |
+
})
|
| 266 |
+
print(f"Added chunk {idx} with similarity: {similarities[idx]:.3f}")
|
| 267 |
+
except Exception as e:
|
| 268 |
+
print(f"Error in embedding search: {e}")
|
| 269 |
+
# 임베딩 실패시 폴백
|
| 270 |
|
| 271 |
+
# 임베딩이 없거나 실패한 경우 - 간단히 처음 N개 청크 반환
|
| 272 |
+
if not all_relevant_chunks:
|
| 273 |
+
print(f"Falling back to simple chunk selection for {doc_id}")
|
| 274 |
+
for i in range(min(top_k, len(chunks))):
|
| 275 |
+
all_relevant_chunks.append({
|
| 276 |
+
"content": chunks[i],
|
| 277 |
+
"doc_name": self.documents[doc_id]["metadata"]["file_name"],
|
| 278 |
+
"similarity": 1.0 - (i * 0.1) # ์์๋๋ก ๊ฐ์ค์น
|
| 279 |
+
})
|
| 280 |
+
print(f"Added chunk {i} (fallback)")
|
|
|
|
|
|
|
| 281 |
|
| 282 |
+
# 유사도 기준 정렬
|
| 283 |
all_relevant_chunks.sort(key=lambda x: x.get('similarity', 0), reverse=True)
|
| 284 |
+
|
| 285 |
+
# 상위 K개 선택
|
| 286 |
result = all_relevant_chunks[:top_k]
|
| 287 |
print(f"Returning {len(result)} chunks")
|
| 288 |
+
|
| 289 |
+
# 디버그: 첫 번째 청크 내용 일부 출력
|
| 290 |
+
if result:
|
| 291 |
+
print(f"First chunk preview: {result[0]['content'][:100]}...")
|
| 292 |
+
|
| 293 |
return result
|
| 294 |
|
| 295 |
def create_rag_prompt(self, query: str, doc_ids: List[str], top_k: int = 3) -> tuple:
|
|
|
|
| 299 |
relevant_chunks = self.search_relevant_chunks(query, doc_ids, top_k)
|
| 300 |
|
| 301 |
if not relevant_chunks:
|
| 302 |
+
print("No relevant chunks found - checking if documents exist")
|
| 303 |
+
# 문서가 있는데 청크를 못 찾은 경우, 첫 번째 청크라도 사용
|
| 304 |
+
for doc_id in doc_ids:
|
| 305 |
+
if doc_id in self.document_chunks and self.document_chunks[doc_id]:
|
| 306 |
+
print(f"Using first chunk from {doc_id} as fallback")
|
| 307 |
+
relevant_chunks = [{
|
| 308 |
+
"content": self.document_chunks[doc_id][0],
|
| 309 |
+
"doc_name": self.documents[doc_id]["metadata"]["file_name"],
|
| 310 |
+
"similarity": 0.5
|
| 311 |
+
}]
|
| 312 |
+
break
|
| 313 |
+
|
| 314 |
+
if not relevant_chunks:
|
| 315 |
+
print("No documents or chunks available")
|
| 316 |
+
return query, ""
|
| 317 |
|
| 318 |
+
print(f"Using {len(relevant_chunks)} chunks for context")
|
| 319 |
|
| 320 |
# 컨텍스트 구성
|
| 321 |
context_parts = []
|
|
|
|
| 324 |
|
| 325 |
for i, chunk in enumerate(relevant_chunks, 1):
|
| 326 |
context_parts.append(f"\n[Document Reference {i} - {chunk['doc_name']}]")
|
| 327 |
+
# 청크 크기 증가
|
| 328 |
+
content = chunk['content'][:1000] if len(chunk['content']) > 1000 else chunk['content']
|
| 329 |
context_parts.append(content)
|
| 330 |
+
print(f"Added chunk {i} ({len(content)} chars) with similarity: {chunk.get('similarity', 0):.3f}")
|
| 331 |
|
| 332 |
context_parts.append("\n" + "=" * 40)
|
| 333 |
|
| 334 |
context = "\n".join(context_parts)
|
| 335 |
enhanced_query = f"{context}\n\nQuestion: {query}\n\nAnswer based on the document context provided above:"
|
| 336 |
|
| 337 |
+
print(f"Enhanced query length: {len(enhanced_query)} chars (original: {len(query)} chars)")
|
| 338 |
+
|
| 339 |
return enhanced_query, context
|
| 340 |
|
| 341 |
# Initialize model and RAG system
|