Spaces: Build error
Update rag_system.py
Browse files
rag_system.py  CHANGED  +427 -214
@@ -1,227 +1,440 @@

Removed (old rag_system.py, lines 2-49): the previous import and setup block. Apart from a few stray "import" / "from" statements, this part of the removed code does not survive in the rendered diff; line 1 ("import os") is unchanged.
Removed (old rag_system.py, lines 50-227): the prompt templates, the refine-style RetrievalQA chain, and the interactive Q&A loop. Korean comments and messages are given in English; gaps in the rendered diff are marked in place.

    else:
        question_prompt = PromptTemplate(
            input_variables=["context_str", "question"],
            template="""
Here are the retrieved document fragments:

{context_str}

Please answer the question based on the above documents.

**Important rules:**
- Only use information explicitly stated in the documents
- If citing sources, only mention what is clearly indicated in the documents above
- Do not guess or infer page numbers not shown in the context
- If unsure, state "not confirmed in the provided documents"

Question: {question}
Answer:"""
        )

        refine_prompt = PromptTemplate(
            input_variables=["question", "existing_answer", "context_str"],
            template="""
Existing answer:
{existing_answer}

Additional documents:
{context_str}

Refine the existing answer using the additional documents.

**Rules:**
- Only use information explicitly stated in the additional documents
- Create one coherent final answer
- Do not mention uncertain sources or page numbers

Question: {question}
Answer:"""
        )

    return question_prompt, refine_prompt

def build_rag_chain(llm, vectorstore, language="ko", k=7):
    """Build the RAG chain"""
    question_prompt, refine_prompt = create_refine_prompts_with_pages(language)

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="refine",
        retriever=vectorstore.as_retriever(search_kwargs={"k": k}),
        chain_type_kwargs={
            "question_prompt": question_prompt,
            "refine_prompt": refine_prompt
        },
        return_source_documents=True
    )

    return qa_chain

def ask_question_with_pages(qa_chain, question):
    """Handle a question"""
    result = qa_chain.invoke({"query": question})

    # Extract only the text after "A:" from the result
    answer = result['result']
    final_answer = answer.split("A:")[-1].strip() if "A:" in answer else answer.strip()

    print(f"\n🟢 Final answer: {final_answer}")

    # debug_metadata_info(result["source_documents"])

    # Summarize the reference documents by page
    print("\nReference document summary:")
    source_info = {}

    for ...:  # (old lines 127-145: the loop header and the block that read each source
              #  document's metadata and initialized source_info[filename] are not
              #  recoverable from the diff; see the sketch after this listing)

        if page != 'N/A':
            if isinstance(page, str) and page.startswith('Section'):
                source_info[filename]['sections'].add(page)
            else:
                source_info[filename]['pages'].add(page)

        if section is not None:
            source_info[filename]['sections'].add(f"Section {section}")

        source_info[filename]['types'].add(doc_type)

    # Print the results
    total_chunks = len(result["source_documents"])
    print(f"Total number of chunks used: {total_chunks}")

    # (old lines 161-191: the per-file summary output and one further function
    #  definition are not recoverable from the diff)

if __name__ == "__main__":
    # (old lines 194-220: the LLM / vector store / qa_chain setup and the header of the
    #  interactive while/try input loop are not recoverable from the diff)
            if query:  # guard against empty input
                ask_question_with_pages(qa_chain, query)
        except KeyboardInterrupt:
            print("\n\nTerminating the program.")
            break
        except Exception as e:
            print(f"Error occurred: {e}\nPlease try again.")
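The loop that filled source_info (old lines 127-145) is mostly lost in the rendered diff. The following is a hedged sketch of what it plausibly did, based only on the surviving lines that use filename, page, section, and doc_type; the exact metadata field names are assumptions.

    # Hypothetical reconstruction of the lost loop head, not the author's exact code.
    for doc in result["source_documents"]:
        filename = doc.metadata.get("filename", "unknown")
        page = doc.metadata.get("page", "N/A")
        section = doc.metadata.get("section")        # assumed field name
        doc_type = doc.metadata.get("type", "unknown")

        if filename not in source_info:
            source_info[filename] = {"pages": set(), "sections": set(), "types": set()}
        # ...the surviving lines above then sort `page` into pages/sections,
        # record `section`, and add `doc_type` to the per-file type set.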
Added (new rag_system.py):

import os
import re
import glob
import time
from collections import defaultdict

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# PyMuPDF library
try:
    import fitz  # PyMuPDF
    PYMUPDF_AVAILABLE = True
    print("✅ PyMuPDF library available")
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("⚠️ PyMuPDF library is not installed. Install with: pip install PyMuPDF")

# PDF processing utilities
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import pdfplumber
from pymupdf4llm import LlamaMarkdownReader

# --------------------------------
# Log Output
# --------------------------------

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}")

# --------------------------------
# Text Cleaning Function
# --------------------------------

def clean_text(text):
    return re.sub(r"[^\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F\w\s.,!?\"'()$:\-]", "", text)

def apply_corrections(text):
    corrections = {
        'º©': 'info', 'Ã': 'of', '½': 'operation', '×': '', '©': '',
        'â€™': "'", 'â€œ': '"', 'â€': '"'
    }
    for k, v in corrections.items():
        text = text.replace(k, v)
    return text

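A quick illustration of what clean_text keeps and drops; the sample string is invented. Hangul syllables and jamo, word characters, whitespace, and the listed punctuation survive, while other symbols are stripped.

# Illustration only; the sample text is made up.
sample = "운영 계획 ◆ 예산: $1,200 ※ (2024)"
print(clean_text(sample))
# The ◆ and ※ marks are removed; Hangul, digits, "$", ":", "," and parentheses remain.
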
# --------------------------------
# HWPX Processing (Section-wise Processing Only)
# --------------------------------

def load_hwpx(file_path):
    """Loading HWPX file (using XML parsing method only)"""
    import zipfile
    import xml.etree.ElementTree as ET
    import chardet

    log(f"Starting HWPX section-wise processing: {file_path}")
    start = time.time()
    documents = []

    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            file_list = zip_ref.namelist()
            section_files = [f for f in file_list
                             if f.startswith('Contents/section') and f.endswith('.xml')]
            section_files.sort()  # Sort by section0.xml, section1.xml order

            log(f"Found section files: {len(section_files)} files")

            for section_idx, section_file in enumerate(section_files):
                with zip_ref.open(section_file) as xml_file:
                    raw = xml_file.read()
                    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
                    try:
                        text = raw.decode(encoding)
                    except UnicodeDecodeError:
                        text = raw.decode("cp949", errors="replace")

                tree = ET.ElementTree(ET.fromstring(text))
                root = tree.getroot()

                # Find text without namespace
                t_elements = [elem for elem in root.iter() if elem.tag.endswith('}t') or elem.tag == 't']
                body_text = ""
                for elem in t_elements:
                    if elem.text:
                        body_text += clean_text(elem.text) + " "

                # Set page metadata to empty
                page_value = ""

                if body_text.strip():
                    documents.append(Document(
                        page_content=apply_corrections(body_text),
                        metadata={
                            "source": file_path,
                            "filename": os.path.basename(file_path),
                            "type": "hwpx_body",
                            "page": page_value,
                            "total_sections": len(section_files)
                        }
                    ))
                    log(f"Section text extraction complete (chars: {len(body_text)})")

                # Find tables
                table_elements = [elem for elem in root.iter() if elem.tag.endswith('}table') or elem.tag == 'table']
                if table_elements:
                    table_text = ""
                    for table_idx, table in enumerate(table_elements):
                        table_text += f"[Table {table_idx + 1}]\n"
                        rows = [elem for elem in table.iter() if elem.tag.endswith('}tr') or elem.tag == 'tr']
                        for row in rows:
                            row_text = []
                            cells = [elem for elem in row.iter() if elem.tag.endswith('}tc') or elem.tag == 'tc']
                            for cell in cells:
                                cell_texts = []
                                for t_elem in cell.iter():
                                    if (t_elem.tag.endswith('}t') or t_elem.tag == 't') and t_elem.text:
                                        cell_texts.append(clean_text(t_elem.text))
                                row_text.append(" ".join(cell_texts))
                            if row_text:
                                table_text += "\t".join(row_text) + "\n"

                    if table_text.strip():
                        documents.append(Document(
                            page_content=apply_corrections(table_text),
                            metadata={
                                "source": file_path,
                                "filename": os.path.basename(file_path),
                                "type": "hwpx_table",
                                "page": page_value,
                                "total_sections": len(section_files)
                            }
                        ))
                        log(f"Table extraction complete")

                # Find images
                if [elem for elem in root.iter() if elem.tag.endswith('}picture') or elem.tag == 'picture']:
                    documents.append(Document(
                        page_content="[Image included]",
                        metadata={
                            "source": file_path,
                            "filename": os.path.basename(file_path),
                            "type": "hwpx_image",
                            "page": page_value,
                            "total_sections": len(section_files)
                        }
                    ))
                    log(f"Image found")

    except Exception as e:
        log(f"HWPX processing error: {e}")

    duration = time.time() - start

    # Print summary of document information
    if documents:
        log(f"Number of extracted documents: {len(documents)}")

    log(f"HWPX processing complete: {file_path} ⏱️ {duration:.2f}s, total {len(documents)} documents")
    return documents

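A usage sketch for the loader above; the file path is hypothetical. Each returned Document carries section-level text, or a table/image marker, plus its type, page, and total_sections metadata.

# Hypothetical path; prints one line per extracted Document.
for doc in load_hwpx("dataset_test/sample.hwpx"):
    print(doc.metadata["type"], doc.metadata["total_sections"], len(doc.page_content))
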
# --------------------------------
# PDF Processing Functions (same as before)
# --------------------------------

def run_ocr_on_image(image: Image.Image, lang='kor+eng'):
    return pytesseract.image_to_string(image, lang=lang)

def extract_images_with_ocr(pdf_path, lang='kor+eng'):
    try:
        images = convert_from_path(pdf_path)
        page_ocr_data = {}
        for idx, img in enumerate(images):
            page_num = idx + 1
            text = run_ocr_on_image(img, lang=lang)
            if text.strip():
                page_ocr_data[page_num] = text.strip()
        return page_ocr_data
    except Exception as e:
        print(f"Image OCR failed: {e}")
        return {}

def extract_tables_with_pdfplumber(pdf_path):
    page_table_data = {}
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                page_num = i + 1
                tables = page.extract_tables()
                table_text = ""
                for t_index, table in enumerate(tables):
                    if table:
                        table_text += f"[Table {t_index+1}]\n"
                        for row in table:
                            row_text = "\t".join(cell if cell else "" for cell in row)
                            table_text += row_text + "\n"
                if table_text.strip():
                    page_table_data[page_num] = table_text.strip()
        return page_table_data
    except Exception as e:
        print(f"Table extraction failed: {e}")
        return {}

def extract_body_text_with_pages(pdf_path):
    page_body_data = {}
    try:
        pdf_processor = LlamaMarkdownReader()
        docs = pdf_processor.load_data(file_path=pdf_path)

        combined_text = ""
        for d in docs:
            if isinstance(d, dict) and "text" in d:
                combined_text += d["text"]
            elif hasattr(d, "text"):
                combined_text += d.text

        if combined_text.strip():
            chars_per_page = 2000
            start = 0
            page_num = 1

            while start < len(combined_text):
                end = start + chars_per_page
                if end > len(combined_text):
                    end = len(combined_text)

                page_text = combined_text[start:end]
                if page_text.strip():
                    page_body_data[page_num] = page_text.strip()
                    page_num += 1

                if end == len(combined_text):
                    break
                start = end - 100

    except Exception as e:
        print(f"Body extraction failed: {e}")

    return page_body_data

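extract_body_text_with_pages has no real page boundaries to work with, so it slices the combined markdown into 2,000-character pseudo-pages with a 100-character overlap. A small worked example of that windowing, independent of any PDF:

# Reproduces the slicing logic above on a 5,000-character body:
# windows (0, 2000), (1900, 3900), (3800, 5000) -> three pseudo-pages.
spans, start, length = [], 0, 5000
while start < length:
    end = min(start + 2000, length)
    spans.append((start, end))
    if end == length:
        break
    start = end - 100
print(spans)
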
def load_pdf_with_metadata(pdf_path):
    """Extracts page-specific information from a PDF file"""
    log(f"Starting PDF page-wise processing: {pdf_path}")
    start = time.time()

    # First, check the actual number of pages using PyPDFLoader
    try:
        from langchain_community.document_loaders import PyPDFLoader
        loader = PyPDFLoader(pdf_path)
        pdf_pages = loader.load()
        actual_total_pages = len(pdf_pages)
        log(f"Actual page count as verified by PyPDFLoader: {actual_total_pages}")
    except Exception as e:
        log(f"PyPDFLoader page count verification failed: {e}")
        actual_total_pages = 1

    try:
        page_tables = extract_tables_with_pdfplumber(pdf_path)
    except Exception as e:
        page_tables = {}
        print(f"Table extraction failed: {e}")

    try:
        page_ocr = extract_images_with_ocr(pdf_path)
    except Exception as e:
        page_ocr = {}
        print(f"Image OCR failed: {e}")

    try:
        page_body = extract_body_text_with_pages(pdf_path)
    except Exception as e:
        page_body = {}
        print(f"Body extraction failed: {e}")

    duration = time.time() - start
    log(f"PDF page-wise processing complete: {pdf_path} ⏱️ {duration:.2f}s")

    # Set the total number of pages based on the actual number of pages
    all_pages = set(page_tables.keys()) | set(page_ocr.keys()) | set(page_body.keys())
    if all_pages:
        max_extracted_page = max(all_pages)
        # Use the greater of the actual and extracted page numbers
        total_pages = max(actual_total_pages, max_extracted_page)
    else:
        total_pages = actual_total_pages

    log(f"Final total page count set to: {total_pages}")

    docs = []

    for page_num in sorted(all_pages):
        if page_num in page_tables and page_tables[page_num].strip():
            docs.append(Document(
                page_content=clean_text(apply_corrections(page_tables[page_num])),
                metadata={
                    "source": pdf_path,
                    "filename": os.path.basename(pdf_path),
                    "type": "table",
                    "page": page_num,
                    "total_pages": total_pages
                }
            ))
            log(f"Page {page_num}: Table extraction complete")

        if page_num in page_body and page_body[page_num].strip():
            docs.append(Document(
                page_content=clean_text(apply_corrections(page_body[page_num])),
                metadata={
                    "source": pdf_path,
                    "filename": os.path.basename(pdf_path),
                    "type": "body",
                    "page": page_num,
                    "total_pages": total_pages
                }
            ))
            log(f"Page {page_num}: Body extraction complete")

        if page_num in page_ocr and page_ocr[page_num].strip():
            docs.append(Document(
                page_content=clean_text(apply_corrections(page_ocr[page_num])),
                metadata={
                    "source": pdf_path,
                    "filename": os.path.basename(pdf_path),
                    "type": "ocr",
                    "page": page_num,
                    "total_pages": total_pages
                }
            ))
            log(f"Page {page_num}: OCR extraction complete")

    if not docs:
        docs.append(Document(
            page_content="[Content extraction failed]",
            metadata={
                "source": pdf_path,
                "filename": os.path.basename(pdf_path),
                "type": "error",
                "page": 1,
                "total_pages": total_pages
            }
        ))

    # Print summary of page information
    if docs:
        page_numbers = [doc.metadata.get('page', 0) for doc in docs if doc.metadata.get('page')]
        if page_numbers:
            log(f"Extracted page range: {min(page_numbers)} ~ {max(page_numbers)}")

    log(f"PDF documents with extracted pages: {len(docs)} documents (total {total_pages} pages)")
    return docs

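Each PDF page can therefore yield up to three Documents (table, body, ocr). A short sketch, assuming a local sample file, that groups the returned Documents by page to see which extractors fired:

from collections import defaultdict

# Hypothetical file name; collects the Document types found per page number.
by_page = defaultdict(list)
for doc in load_pdf_with_metadata("dataset_test/sample.pdf"):
    by_page[doc.metadata["page"]].append(doc.metadata["type"])
for page, types in sorted(by_page.items()):
    print(page, types)
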
# --------------------------------
# Document Loading and Splitting
# --------------------------------

def load_documents(folder_path):
    documents = []

    for file in glob.glob(os.path.join(folder_path, "*.hwpx")):
        log(f"HWPX file found: {file}")
        docs = load_hwpx(file)
        documents.extend(docs)

    for file in glob.glob(os.path.join(folder_path, "*.pdf")):
        log(f"PDF file found: {file}")
        documents.extend(load_pdf_with_metadata(file))

    log(f"Document loading complete! Total documents: {len(documents)}")
    return documents

def split_documents(documents, chunk_size=800, chunk_overlap=100):
    log("Starting chunk splitting")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = []
    for doc in documents:
        split = splitter.split_text(doc.page_content)
        for i, chunk in enumerate(split):
            enriched_chunk = f"passage: {chunk}"
            chunks.append(Document(
                page_content=enriched_chunk,
                metadata={**doc.metadata, "chunk_index": i}
            ))
    log(f"Chunk splitting complete: Created {len(chunks)} chunks")
    return chunks

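The "passage: " prefix added to every chunk follows the E5 embedding convention, where indexed text is marked as a passage and search text as a query; at retrieval time the question should get the matching "query: " prefix, otherwise retrieval quality typically drops. A minimal helper sketch (the helper name is not part of this file):

def to_e5_query(question: str) -> str:
    # E5-family models are trained with "query: " / "passage: " input prefixes.
    return f"query: {question}"
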
# --------------------------------
# Main Execution
# --------------------------------

if __name__ == "__main__":
    folder = "dataset_test"
    log("PyMuPDF-based document processing started")
    docs = load_documents(folder)
    log("Document loading complete")

    # Page information check
    log("Page information summary:")
    page_info = {}
    for doc in docs:
        source = doc.metadata.get('source', 'unknown')
        page = doc.metadata.get('page', 'unknown')
        doc_type = doc.metadata.get('type', 'unknown')

        if source not in page_info:
            page_info[source] = {'pages': set(), 'types': set()}
        page_info[source]['pages'].add(page)
        page_info[source]['types'].add(doc_type)

    for source, info in page_info.items():
        max_page = max(info['pages']) if info['pages'] and isinstance(max(info['pages']), int) else 'unknown'
        log(f" {os.path.basename(source)}: {max_page} pages, type: {info['types']}")

    chunks = split_documents(docs)
    log("E5-Large-Instruct embedding preparation")
    embedding_model = HuggingFaceEmbeddings(
        model_name="intfloat/e5-large-v2",
        model_kwargs={"device": "cuda"}
    )

    vectorstore = FAISS.from_documents(chunks, embedding_model)
    vectorstore.save_local("vector_db")

    log(f"Total number of documents: {len(docs)}")
    log(f"Total number of chunks: {len(chunks)}")
    log("FAISS save complete: vector_db")

    # Sample output with page information
    log("\nSample including actual page information:")
    for i, chunk in enumerate(chunks[:5]):
        meta = chunk.metadata
        log(f" Chunk {i+1}: {meta.get('type')} | Page {meta.get('page')} | {os.path.basename(meta.get('source', 'unknown'))}")
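A hedged sketch of how a separate process could reopen the saved "vector_db" index and query it with the "query: " prefix noted earlier. The question string is an invented example, the model and device settings simply mirror the ones above, and depending on the installed langchain-community version load_local may or may not require allow_dangerous_deserialization=True.

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embedding_model = HuggingFaceEmbeddings(
    model_name="intfloat/e5-large-v2",
    model_kwargs={"device": "cuda"},
)
vectorstore = FAISS.load_local(
    "vector_db", embedding_model,
    allow_dangerous_deserialization=True,  # required by recent langchain-community releases
)
for doc in vectorstore.similarity_search("query: What is the total budget?", k=5):
    print(doc.metadata.get("page"), doc.page_content[:80])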