Spaces: Build error

Update document_processor_image_test.py

document_processor_image_test.py  (+66 -66)  CHANGED

The 66 removed lines carried comments, a docstring, log/print messages, and a line of the `corrections` mapping whose text had been corrupted into unreadable mojibake (fragments such as `# PyMuPDF`, `print("β`, `log(f"π₯ HWPX`, `'ΒΊΒ©': '`, `"""HWPX`, `page_content="[`). The commit replaces each of them with the English text marked `+` in the updated hunks below; the surrounding code is otherwise unchanged.

@@ -9,16 +9,16 @@ from langchain_core.documents import Document
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS

+# PyMuPDF library
 try:
     import fitz  # PyMuPDF
     PYMUPDF_AVAILABLE = True
+    print("β PyMuPDF library available")
 except ImportError:
     PYMUPDF_AVAILABLE = False
+    print("β οΈ PyMuPDF library is not installed. Install with: pip install PyMuPDF")

+# PDF processing utilities
 import pytesseract
 from PIL import Image
 from pdf2image import convert_from_path

@@ -26,14 +26,14 @@ import pdfplumber
 from pymupdf4llm import LlamaMarkdownReader

 # --------------------------------
+# Log Output
 # --------------------------------

 def log(msg):
     print(f"[{time.strftime('%H:%M:%S')}] {msg}")

 # --------------------------------
+# Text Cleaning Function
 # --------------------------------

 def clean_text(text):

@@ -41,7 +41,7 @@ def clean_text(text):

 def apply_corrections(text):
     corrections = {
+        'ΒΊΒ©': 'info', 'Γ': 'of', 'Β½': 'operation', 'Γ': '', 'Β©': '',
         'Γ’β¬β’': "'", 'Γ’β¬Ε': '"', 'Γ’β¬': '"'
     }
     for k, v in corrections.items():

@@ -49,16 +49,16 @@ def apply_corrections(text):
     return text

 # --------------------------------
+# HWPX Processing (Section-wise Processing Only)
 # --------------------------------

 def load_hwpx(file_path):
+    """Loading HWPX file (using XML parsing method only)"""
     import zipfile
     import xml.etree.ElementTree as ET
     import chardet

+    log(f"π₯ Starting HWPX section-wise processing: {file_path}")
     start = time.time()
     documents = []

@@ -67,9 +67,9 @@ def load_hwpx(file_path):
             file_list = zip_ref.namelist()
             section_files = [f for f in file_list
                              if f.startswith('Contents/section') and f.endswith('.xml')]
+            section_files.sort()  # Sort by section0.xml, section1.xml order

+            log(f"π Found section files: {len(section_files)}")

             for section_idx, section_file in enumerate(section_files):
                 with zip_ref.open(section_file) as xml_file:

@@ -83,14 +83,14 @@ def load_hwpx(file_path):
                     tree = ET.ElementTree(ET.fromstring(text))
                     root = tree.getroot()

+                    # Find text without namespace
                     t_elements = [elem for elem in root.iter() if elem.tag.endswith('}t') or elem.tag == 't']
                     body_text = ""
                     for elem in t_elements:
                         if elem.text:
                             body_text += clean_text(elem.text) + " "

+                    # Set page metadata to empty
                     page_value = ""

                     if body_text.strip():

@@ -104,9 +104,9 @@ def load_hwpx(file_path):
                                 "total_sections": len(section_files)
                             }
                         ))
+                        log(f"β Section text extraction complete (chars: {len(body_text)})")

+                    # Find tables
                     table_elements = [elem for elem in root.iter() if elem.tag.endswith('}table') or elem.tag == 'table']
                     if table_elements:
                         table_text = ""

@@ -136,12 +136,12 @@ def load_hwpx(file_path):
                                 "total_sections": len(section_files)
                             }
                         ))
+                        log(f"π Table extraction complete")

+                    # Find images
                     if [elem for elem in root.iter() if elem.tag.endswith('}picture') or elem.tag == 'picture']:
                         documents.append(Document(
+                            page_content="[Image included]",
                             metadata={
                                 "source": file_path,
                                 "filename": os.path.basename(file_path),

@@ -150,22 +150,22 @@ def load_hwpx(file_path):
                                 "total_sections": len(section_files)
                             }
                         ))
+                        log(f"πΌοΈ Image found")

     except Exception as e:
+        log(f"β HWPX processing error: {e}")

     duration = time.time() - start

+    # Print summary of document information
     if documents:
+        log(f"π Number of extracted documents: {len(documents)}")

+    log(f"β HWPX processing complete: {file_path} β±οΈ {duration:.2f}s, total {len(documents)} documents")
     return documents

 # --------------------------------
+# PDF Processing Functions (same as before)
 # --------------------------------

 def run_ocr_on_image(image: Image.Image, lang='kor+eng'):

@@ -182,7 +182,7 @@ def extract_images_with_ocr(pdf_path, lang='kor+eng'):
             page_ocr_data[page_num] = text.strip()
         return page_ocr_data
     except Exception as e:
+        print(f"β Image OCR failed: {e}")
         return {}

 def extract_tables_with_pdfplumber(pdf_path):

@@ -203,7 +203,7 @@ def extract_tables_with_pdfplumber(pdf_path):
             page_table_data[page_num] = table_text.strip()
         return page_table_data
     except Exception as e:
+        print(f"β Table extraction failed: {e}")
         return {}

 def extract_body_text_with_pages(pdf_path):

@@ -239,57 +239,57 @@ def extract_body_text_with_pages(pdf_path):
            start = end - 100

    except Exception as e:
+        print(f"β Body extraction failed: {e}")

    return page_body_data

 def load_pdf_with_metadata(pdf_path):
+    """Extracts page-specific information from a PDF file"""
+    log(f"π Starting PDF page-wise processing: {pdf_path}")
     start = time.time()

+    # First, check the actual number of pages using PyPDFLoader
     try:
         from langchain_community.document_loaders import PyPDFLoader
         loader = PyPDFLoader(pdf_path)
         pdf_pages = loader.load()
         actual_total_pages = len(pdf_pages)
+        log(f"π Actual page count as verified by PyPDFLoader: {actual_total_pages}")
     except Exception as e:
+        log(f"β PyPDFLoader page count verification failed: {e}")
         actual_total_pages = 1

     try:
         page_tables = extract_tables_with_pdfplumber(pdf_path)
     except Exception as e:
         page_tables = {}
+        print(f"β Table extraction failed: {e}")

     try:
         page_ocr = extract_images_with_ocr(pdf_path)
     except Exception as e:
         page_ocr = {}
+        print(f"β Image OCR failed: {e}")

     try:
         page_body = extract_body_text_with_pages(pdf_path)
     except Exception as e:
         page_body = {}
+        print(f"β Body extraction failed: {e}")

     duration = time.time() - start
+    log(f"β PDF page-wise processing complete: {pdf_path} β±οΈ {duration:.2f}s")

+    # Set the total number of pages based on the actual number of pages
     all_pages = set(page_tables.keys()) | set(page_ocr.keys()) | set(page_body.keys())
     if all_pages:
         max_extracted_page = max(all_pages)
+        # Use the greater of the actual and extracted page numbers
         total_pages = max(actual_total_pages, max_extracted_page)
     else:
         total_pages = actual_total_pages

+    log(f"π Final total page count set to: {total_pages}")

     docs = []

@@ -305,7 +305,7 @@ def load_pdf_with_metadata(pdf_path):
                     "total_pages": total_pages
                 }
             ))
+            log(f"π Page {page_num}: Table extraction complete")

         if page_num in page_body and page_body[page_num].strip():
             docs.append(Document(

@@ -318,7 +318,7 @@ def load_pdf_with_metadata(pdf_path):
                     "total_pages": total_pages
                 }
             ))
+            log(f"π Page {page_num}: Body extraction complete")

         if page_num in page_ocr and page_ocr[page_num].strip():
             docs.append(Document(

@@ -331,11 +331,11 @@ def load_pdf_with_metadata(pdf_path):
                     "total_pages": total_pages
                 }
             ))
+            log(f"πΌοΈ Page {page_num}: OCR extraction complete")

     if not docs:
         docs.append(Document(
+            page_content="[Content extraction failed]",
             metadata={
                 "source": pdf_path,
                 "filename": os.path.basename(pdf_path),

@@ -345,36 +345,36 @@ def load_pdf_with_metadata(pdf_path):
             }
         ))

+    # Print summary of page information
     if docs:
         page_numbers = [doc.metadata.get('page', 0) for doc in docs if doc.metadata.get('page')]
         if page_numbers:
+            log(f"π Extracted page range: {min(page_numbers)} ~ {max(page_numbers)}")

+    log(f"π PDF documents with extracted pages: {len(docs)} documents (total {total_pages} pages)")
     return docs

 # --------------------------------
+# Document Loading and Splitting
 # --------------------------------

 def load_documents(folder_path):
     documents = []

     for file in glob.glob(os.path.join(folder_path, "*.hwpx")):
+        log(f"π HWPX file found: {file}")
         docs = load_hwpx(file)
         documents.extend(docs)

     for file in glob.glob(os.path.join(folder_path, "*.pdf")):
+        log(f"π PDF file found: {file}")
         documents.extend(load_pdf_with_metadata(file))

+    log(f"π Document loading complete! Total documents: {len(documents)}")
     return documents

 def split_documents(documents, chunk_size=800, chunk_overlap=100):
+    log("πͺ Starting chunk splitting")
     splitter = RecursiveCharacterTextSplitter(
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,

@@ -389,21 +389,21 @@ def split_documents(documents, chunk_size=800, chunk_overlap=100):
             page_content=enriched_chunk,
             metadata={**doc.metadata, "chunk_index": i}
         ))
+    log(f"β Chunk splitting complete: Created {len(chunks)} chunks")
     return chunks

 # --------------------------------
+# Main Execution
 # --------------------------------

 if __name__ == "__main__":
     folder = "dataset_test"
+    log("π PyMuPDF-based document processing started")
     docs = load_documents(folder)
+    log("π¦ Document loading complete")

+    # Page information check
+    log("π Page information summary:")
     page_info = {}
     for doc in docs:
         source = doc.metadata.get('source', 'unknown')

@@ -417,10 +417,10 @@ if __name__ == "__main__":

     for source, info in page_info.items():
         max_page = max(info['pages']) if info['pages'] and isinstance(max(info['pages']), int) else 'unknown'
+        log(f"  π {os.path.basename(source)}: {max_page} pages, type: {info['types']}")

     chunks = split_documents(docs)
+    log("π‘ E5-Large-Instruct embedding preparation")
     embedding_model = HuggingFaceEmbeddings(
         model_name="intfloat/e5-large-v2",
         model_kwargs={"device": "cuda"}

@@ -429,12 +429,12 @@ if __name__ == "__main__":
     vectorstore = FAISS.from_documents(chunks, embedding_model)
     vectorstore.save_local("vector_db")

+    log(f"π Total number of documents: {len(docs)}")
+    log(f"π Total number of chunks: {len(chunks)}")
+    log("β FAISS save complete: vector_db")

+    # Sample output with page information
+    log("\nπ Sample including actual page information:")
     for i, chunk in enumerate(chunks[:5]):
         meta = chunk.metadata
+        log(f"  Chunk {i+1}: {meta.get('type')} | Page {meta.get('page')} | {os.path.basename(meta.get('source', 'unknown'))}")
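
Not part of this commit: a minimal sketch of how the index the script saves as "vector_db" might be reloaded and queried afterwards, assuming the same langchain_community FAISS/HuggingFaceEmbeddings API the script already uses. The query string and k value are placeholders, not anything taken from the repository.

# Minimal usage sketch (assumption, not from the commit): reload the saved
# FAISS index and run a similarity search over the stored chunks.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embedding_model = HuggingFaceEmbeddings(model_name="intfloat/e5-large-v2")
vectorstore = FAISS.load_local(
    "vector_db",
    embedding_model,
    allow_dangerous_deserialization=True,  # may be required by recent langchain_community releases
)
# Placeholder query; k=3 returns the three closest chunks with their metadata.
for doc in vectorstore.similarity_search("example query", k=3):
    print(doc.metadata.get("page"), doc.page_content[:80])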