hugging2021 committed
Commit bd2e020 · verified · 1 Parent(s): 670c138

Update document_processor_image_test.py

Files changed (1):
  1. document_processor_image_test.py +66 -66
document_processor_image_test.py CHANGED
@@ -9,16 +9,16 @@ from langchain_core.documents import Document
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS

-# PyMuPDF 라이브러리
+# PyMuPDF library
 try:
     import fitz # PyMuPDF
     PYMUPDF_AVAILABLE = True
-    print("✅ PyMuPDF 라이브러리 사용 가능")
+    print("✅ PyMuPDF library available")
 except ImportError:
     PYMUPDF_AVAILABLE = False
-    print("⚠️ PyMuPDF 라이브러리가 설치되지 않음. pip install PyMuPDF로 설치하세요.")
+    print("⚠️ PyMuPDF library is not installed. Install with: pip install PyMuPDF")

-# PDF 처리용
+# PDF processing utilities
 import pytesseract
 from PIL import Image
 from pdf2image import convert_from_path
@@ -26,14 +26,14 @@ import pdfplumber
 from pymupdf4llm import LlamaMarkdownReader

 # --------------------------------
-# 로그 출력
+# Log Output
 # --------------------------------

 def log(msg):
     print(f"[{time.strftime('%H:%M:%S')}] {msg}")

 # --------------------------------
-# 텍스트 정제 함수
+# Text Cleaning Function
 # --------------------------------

 def clean_text(text):
@@ -41,7 +41,7 @@ def clean_text(text):

 def apply_corrections(text):
     corrections = {
-        'ΒΊΒ©': '정보', 'Ì': '의', 'Β½': '운영', 'Γƒ': '', 'Β©': '',
+        'ΒΊΒ©': 'info', 'Ì': 'of', 'Β½': 'operation', 'Γƒ': '', 'Β©': '',
         'Ò€ℒ': "'", 'Ò€œ': '"', 'Ò€': '"'
     }
     for k, v in corrections.items():
@@ -49,16 +49,16 @@ def apply_corrections(text):
     return text

 # --------------------------------
-# HWPX 처리 (섹션별 처리만 사용)
+# HWPX Processing (Section-wise Processing Only)
 # --------------------------------

 def load_hwpx(file_path):
-    """HWPX 파일 로딩 (XML 파싱 방식만 사용)"""
+    """Loading HWPX file (using XML parsing method only)"""
     import zipfile
     import xml.etree.ElementTree as ET
     import chardet

-    log(f"📥 HWPX 섹션별 처리 시작: {file_path}")
+    log(f"📥 Starting HWPX section-wise processing: {file_path}")
     start = time.time()
     documents = []

@@ -67,9 +67,9 @@ def load_hwpx(file_path):
             file_list = zip_ref.namelist()
             section_files = [f for f in file_list
                              if f.startswith('Contents/section') and f.endswith('.xml')]
-            section_files.sort() # section0.xml, section1.xml 순서로 정렬
+            section_files.sort() # Sort by section0.xml, section1.xml order

-            log(f"📄 발견된 섹션 파일: {len(section_files)}개")
+            log(f"📄 Found section files: {len(section_files)}")

             for section_idx, section_file in enumerate(section_files):
                 with zip_ref.open(section_file) as xml_file:
@@ -83,14 +83,14 @@ def load_hwpx(file_path):
                     tree = ET.ElementTree(ET.fromstring(text))
                     root = tree.getroot()

-                    # 네임스페이스 없이 텍스트 찾기
+                    # Find text without namespace
                     t_elements = [elem for elem in root.iter() if elem.tag.endswith('}t') or elem.tag == 't']
                     body_text = ""
                     for elem in t_elements:
                         if elem.text:
                             body_text += clean_text(elem.text) + " "

-                    # page 메타데이터는 빈 값으로 설정
+                    # Set page metadata to empty
                     page_value = ""

                     if body_text.strip():
@@ -104,9 +104,9 @@ def load_hwpx(file_path):
                                 "total_sections": len(section_files)
                             }
                         ))
-                        log(f"✅ 섹션 텍스트 추출 완료 (chars: {len(body_text)})")
+                        log(f"✅ Section text extraction complete (chars: {len(body_text)})")

-                    # 표 찾기
+                    # Find tables
                     table_elements = [elem for elem in root.iter() if elem.tag.endswith('}table') or elem.tag == 'table']
                     if table_elements:
                         table_text = ""
@@ -136,12 +136,12 @@ def load_hwpx(file_path):
                                 "total_sections": len(section_files)
                             }
                         ))
-                        log(f"📊 표 추출 완료")
+                        log(f"📊 Table extraction complete")

-                    # 이미지 찾기
+                    # Find images
                     if [elem for elem in root.iter() if elem.tag.endswith('}picture') or elem.tag == 'picture']:
                         documents.append(Document(
-                            page_content="[이미지 포함]",
+                            page_content="[Image included]",
                             metadata={
                                 "source": file_path,
                                 "filename": os.path.basename(file_path),
@@ -150,22 +150,22 @@ def load_hwpx(file_path):
                                 "total_sections": len(section_files)
                             }
                         ))
-                        log(f"🖼️ 이미지 발견")
+                        log(f"🖼️ Image found")

     except Exception as e:
-        log(f"❌ HWPX 처리 오류: {e}")
+        log(f"❌ HWPX processing error: {e}")

     duration = time.time() - start

-    # 문서 정보 요약 출력
+    # Print summary of document information
     if documents:
-        log(f"📋 추출된 문서 수: {len(documents)}")
+        log(f"📋 Number of extracted documents: {len(documents)}")

-    log(f"✅ HWPX 처리 완료: {file_path} ⏱️ {duration:.2f}초, 총 {len(documents)}개 문서")
+    log(f"✅ HWPX processing complete: {file_path} ⏱️ {duration:.2f}s, total {len(documents)} documents")
     return documents

 # --------------------------------
-# PDF 처리 함수들 (기존과 동일)
+# PDF Processing Functions (same as before)
 # --------------------------------

 def run_ocr_on_image(image: Image.Image, lang='kor+eng'):
@@ -182,7 +182,7 @@ def extract_images_with_ocr(pdf_path, lang='kor+eng'):
                 page_ocr_data[page_num] = text.strip()
         return page_ocr_data
     except Exception as e:
-        print(f"❌ 이미지 OCR 실패: {e}")
+        print(f"❌ Image OCR failed: {e}")
         return {}

 def extract_tables_with_pdfplumber(pdf_path):
@@ -203,7 +203,7 @@ def extract_tables_with_pdfplumber(pdf_path):
                 page_table_data[page_num] = table_text.strip()
         return page_table_data
     except Exception as e:
-        print(f"❌ 표 추출 실패: {e}")
+        print(f"❌ Table extraction failed: {e}")
         return {}

 def extract_body_text_with_pages(pdf_path):
@@ -239,57 +239,57 @@ def extract_body_text_with_pages(pdf_path):
                 start = end - 100

     except Exception as e:
-        print(f"❌ 본문 추출 실패: {e}")
+        print(f"❌ Body extraction failed: {e}")

     return page_body_data

 def load_pdf_with_metadata(pdf_path):
-    """PDF 파일에서 페이지별 정보를 추출"""
-    log(f"📑 PDF 페이지별 처리 시작: {pdf_path}")
+    """Extracts page-specific information from a PDF file"""
+    log(f"📑 Starting PDF page-wise processing: {pdf_path}")
     start = time.time()

-    # 먼저 PyPDFLoader로 실제 페이지 수 확인
+    # First, check the actual number of pages using PyPDFLoader
     try:
         from langchain_community.document_loaders import PyPDFLoader
         loader = PyPDFLoader(pdf_path)
         pdf_pages = loader.load()
         actual_total_pages = len(pdf_pages)
-        log(f"📄 PyPDFLoader로 확인한 실제 페이지 수: {actual_total_pages}")
+        log(f"📄 Actual page count as verified by PyPDFLoader: {actual_total_pages}")
     except Exception as e:
-        log(f"❌ PyPDFLoader 페이지 수 확인 실패: {e}")
+        log(f"❌ PyPDFLoader page count verification failed: {e}")
         actual_total_pages = 1

     try:
         page_tables = extract_tables_with_pdfplumber(pdf_path)
     except Exception as e:
         page_tables = {}
-        print(f"❌ 표 추출 실패: {e}")
+        print(f"❌ Table extraction failed: {e}")

     try:
         page_ocr = extract_images_with_ocr(pdf_path)
     except Exception as e:
         page_ocr = {}
-        print(f"❌ 이미지 OCR 실패: {e}")
+        print(f"❌ Image OCR failed: {e}")

     try:
         page_body = extract_body_text_with_pages(pdf_path)
     except Exception as e:
         page_body = {}
-        print(f"❌ 본문 추출 실패: {e}")
+        print(f"❌ Body extraction failed: {e}")

     duration = time.time() - start
-    log(f"✅ PDF 페이지별 처리 완료: {pdf_path} ⏱️ {duration:.2f}초")
+    log(f"✅ PDF page-wise processing complete: {pdf_path} ⏱️ {duration:.2f}s")

-    # 실제 페이지 수를 기준으로 설정
+    # Set the total number of pages based on the actual number of pages
     all_pages = set(page_tables.keys()) | set(page_ocr.keys()) | set(page_body.keys())
     if all_pages:
         max_extracted_page = max(all_pages)
-        # 실제 페이지 수와 추출된 페이지 수 중 큰 값 사용
+        # Use the greater of the actual and extracted page numbers
         total_pages = max(actual_total_pages, max_extracted_page)
     else:
         total_pages = actual_total_pages

-    log(f"📊 최종 설정된 총 페이지 수: {total_pages}")
+    log(f"📊 Final total page count set to: {total_pages}")

     docs = []

@@ -305,7 +305,7 @@ def load_pdf_with_metadata(pdf_path):
                 "total_pages": total_pages
             }
         ))
-        log(f"📊 페이지 {page_num}: 표 추출 완료")
+        log(f"📊 Page {page_num}: Table extraction complete")

     if page_num in page_body and page_body[page_num].strip():
         docs.append(Document(
@@ -318,7 +318,7 @@ def load_pdf_with_metadata(pdf_path):
                 "total_pages": total_pages
             }
         ))
-        log(f"📄 페이지 {page_num}: 본문 추출 완료")
+        log(f"📄 Page {page_num}: Body extraction complete")

     if page_num in page_ocr and page_ocr[page_num].strip():
         docs.append(Document(
@@ -331,11 +331,11 @@ def load_pdf_with_metadata(pdf_path):
                 "total_pages": total_pages
             }
         ))
-        log(f"🖼️ 페이지 {page_num}: OCR 추출 완료")
+        log(f"🖼️ Page {page_num}: OCR extraction complete")

     if not docs:
         docs.append(Document(
-            page_content="[내용 추출 실패]",
+            page_content="[Content extraction failed]",
             metadata={
                 "source": pdf_path,
                 "filename": os.path.basename(pdf_path),
@@ -345,36 +345,36 @@ def load_pdf_with_metadata(pdf_path):
             }
         ))

-    # 페이지 정보 요약 출력
+    # Print summary of page information
     if docs:
         page_numbers = [doc.metadata.get('page', 0) for doc in docs if doc.metadata.get('page')]
         if page_numbers:
-            log(f"📋 추출된 페이지 범위: {min(page_numbers)} ~ {max(page_numbers)}")
+            log(f"📋 Extracted page range: {min(page_numbers)} ~ {max(page_numbers)}")

-    log(f"📊 추출된 페이지별 PDF 문서: {len(docs)}개 (총 {total_pages}페이지)")
+    log(f"📊 PDF documents with extracted pages: {len(docs)} documents (total {total_pages} pages)")
     return docs

 # --------------------------------
-# 문서 로딩 및 분할
+# Document Loading and Splitting
 # --------------------------------

 def load_documents(folder_path):
     documents = []

     for file in glob.glob(os.path.join(folder_path, "*.hwpx")):
-        log(f"📄 HWPX 파일 확인: {file}")
+        log(f"📄 HWPX file found: {file}")
         docs = load_hwpx(file)
         documents.extend(docs)

     for file in glob.glob(os.path.join(folder_path, "*.pdf")):
-        log(f"📄 PDF 파일 확인: {file}")
+        log(f"📄 PDF file found: {file}")
         documents.extend(load_pdf_with_metadata(file))

-    log(f"📚 문서 로딩 전체 완료! 총 문서 수: {len(documents)}")
+    log(f"📚 Document loading complete! Total documents: {len(documents)}")
     return documents

 def split_documents(documents, chunk_size=800, chunk_overlap=100):
-    log("🔪 청크 분할 시작")
+    log("🔪 Starting chunk splitting")
     splitter = RecursiveCharacterTextSplitter(
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
@@ -389,21 +389,21 @@ def split_documents(documents, chunk_size=800, chunk_overlap=100):
             page_content=enriched_chunk,
             metadata={**doc.metadata, "chunk_index": i}
         ))
-    log(f"✅ 청크 분할 완료: 총 {len(chunks)}개 생성")
+    log(f"✅ Chunk splitting complete: Created {len(chunks)} chunks")
     return chunks

 # --------------------------------
-# 메인 실행
+# Main Execution
 # --------------------------------

 if __name__ == "__main__":
     folder = "dataset_test"
-    log("🚀 PyMuPDF 기반 문서 처리 시작")
+    log("🚀 PyMuPDF-based document processing started")
     docs = load_documents(folder)
-    log("📦 문서 로딩 완료")
+    log("📦 Document loading complete")

-    # 페이지 정보 확인
-    log("📄 페이지 정보 요약:")
+    # Page information check
+    log("📄 Page information summary:")
     page_info = {}
     for doc in docs:
         source = doc.metadata.get('source', 'unknown')
@@ -417,10 +417,10 @@ if __name__ == "__main__":

     for source, info in page_info.items():
         max_page = max(info['pages']) if info['pages'] and isinstance(max(info['pages']), int) else 'unknown'
-        log(f" 📄 {os.path.basename(source)}: {max_page}페이지, 타입: {info['types']}")
+        log(f" 📄 {os.path.basename(source)}: {max_page} pages, type: {info['types']}")

     chunks = split_documents(docs)
-    log("💡 E5-Large-Instruct 임베딩 준비 중")
+    log("💡 E5-Large-Instruct embedding preparation")
     embedding_model = HuggingFaceEmbeddings(
         model_name="intfloat/e5-large-v2",
         model_kwargs={"device": "cuda"}
@@ -429,12 +429,12 @@ if __name__ == "__main__":
     vectorstore = FAISS.from_documents(chunks, embedding_model)
     vectorstore.save_local("vector_db")

-    log(f"📊 전체 문서 수: {len(docs)}")
-    log(f"🔗 청크 총 수: {len(chunks)}")
-    log("✅ FAISS 저장 완료: vector_db")
+    log(f"📊 Total number of documents: {len(docs)}")
+    log(f"🔗 Total number of chunks: {len(chunks)}")
+    log("✅ FAISS save complete: vector_db")

-    # 페이지 정보가 포함된 샘플 출력
-    log("\n📋 실제 페이지 정보 포함 샘플:")
+    # Sample output with page information
+    log("\n📋 Sample including actual page information:")
     for i, chunk in enumerate(chunks[:5]):
         meta = chunk.metadata
-        log(f" 청크 {i+1}: {meta.get('type')} | 페이지 {meta.get('page')} | {os.path.basename(meta.get('source', 'unknown'))}")
+        log(f" Chunk {i+1}: {meta.get('type')} | Page {meta.get('page')} | {os.path.basename(meta.get('source', 'unknown'))}")