"""
๋ฌธ์„œ ์ฒ˜๋ฆฌ ์œ ํ‹ธ๋ฆฌํ‹ฐ ๋ชจ๋“ˆ
"""

import os
import re
import csv
import io
import logging
from typing import List, Dict, Any, Optional

logger = logging.getLogger("DocProcessor")
if not logger.hasHandlers():
    handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

class DocumentProcessor:
    """๋ฌธ์„œ ์ฒ˜๋ฆฌ ์œ ํ‹ธ๋ฆฌํ‹ฐ ํด๋ž˜์Šค"""
    
    @staticmethod
    def split_text(
        text: str, 
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        separator: str = "\n"
    ) -> List[str]:
        """
        ํ…์ŠคํŠธ๋ฅผ ๋” ์ž‘์€ ์ฒญํฌ๋กœ ๋ถ„ํ• 
        
        Args:
            text: ๋ถ„ํ• ํ•  ํ…์ŠคํŠธ
            chunk_size: ๊ฐ ์ฒญํฌ์˜ ์ตœ๋Œ€ ๋ฌธ์ž ์ˆ˜
            chunk_overlap: ์ฒญํฌ ๊ฐ„ ์ค‘์ฒฉ๋˜๋Š” ๋ฌธ์ž ์ˆ˜
            separator: ๋ถ„ํ•  ์‹œ ์‚ฌ์šฉํ•  ๊ตฌ๋ถ„์ž
        
        Returns:
            ๋ถ„ํ• ๋œ ํ…์ŠคํŠธ ์ฒญํฌ ๋ชฉ๋ก
        """
        if not text or chunk_size <= 0:
            return []
        
        # Split on the separator
        parts = text.split(separator)
        chunks = []
        current_chunk = []
        current_size = 0
        
        for part in parts:
            part_size = len(part)
            
            # len(current_chunk) approximates the separator characters that
            # joining would add (this assumes a single-character separator)
            if current_size + part_size + len(current_chunk) > chunk_size and current_chunk:
                # Adding this part would exceed chunk_size, so flush the current chunk
                chunks.append(separator.join(current_chunk))
                
                # Keep trailing parts so the next chunk overlaps with this one
                overlap_tokens = []
                overlap_size = 0
                for token in reversed(current_chunk):
                    if overlap_size + len(token) <= chunk_overlap:
                        overlap_tokens.insert(0, token)
                        overlap_size += len(token) + 1  # +1 for the separator
                    else:
                        break
                
                current_chunk = overlap_tokens
                current_size = overlap_size - len(current_chunk)  # subtract the separator allowance
            
            current_chunk.append(part)
            current_size += part_size
        
        # Append the final chunk
        if current_chunk:
            chunks.append(separator.join(current_chunk))
        
        return chunks
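
    # Illustrative behavior sketch (the sample values are assumptions, not from
    # the original source): with the default "\n" separator,
    #   DocumentProcessor.split_text("aaa\nbbb\nccc", chunk_size=8, chunk_overlap=3)
    # flushes "aaa\nbbb" once adding "ccc" would exceed 8 characters, keeps
    # "bbb" as overlap, and returns ["aaa\nbbb", "bbb\nccc"].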
    
    @staticmethod
    def clean_text(text: str, remove_urls: bool = True, remove_extra_whitespace: bool = True) -> str:
        """
        ํ…์ŠคํŠธ ์ •์ œ
        
        Args:
            text: ์ •์ œํ•  ํ…์ŠคํŠธ
            remove_urls: URL ์ œ๊ฑฐ ์—ฌ๋ถ€
            remove_extra_whitespace: ์—ฌ๋ถ„์˜ ๊ณต๋ฐฑ ์ œ๊ฑฐ ์—ฌ๋ถ€
        
        Returns:
            ์ •์ œ๋œ ํ…์ŠคํŠธ
        """
        if not text:
            return ""
        
        # Remove URLs
        if remove_urls:
            text = re.sub(r'https?://\S+|www\.\S+', '', text)
        
        # Strip HTML tags
        text = re.sub(r'<.*?>', '', text)
        
        # Collapse extra whitespace
        if remove_extra_whitespace:
            text = re.sub(r'\s+', ' ', text).strip()
        
        return text
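
    # Illustrative behavior sketch (the sample string is an assumption, not
    # from the original source):
    #   DocumentProcessor.clean_text("See https://example.com  <b>now</b>")
    # removes the URL, strips the tags, collapses whitespace, and returns
    # "See now".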
    
    @staticmethod
    def text_to_documents(
        text: str,
        metadata: Optional[Dict[str, Any]] = None,
        chunk_size: int = 512,
        chunk_overlap: int = 50
    ) -> List[Dict[str, Any]]:
        """
        ํ…์ŠคํŠธ๋ฅผ ๋ฌธ์„œ ๊ฐ์ฒด ๋ชฉ๋ก์œผ๋กœ ๋ณ€ํ™˜
        
        Args:
            text: ๋ณ€ํ™˜ํ•  ํ…์ŠคํŠธ
            metadata: ๋ฌธ์„œ์— ์ถ”๊ฐ€ํ•  ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ
            chunk_size: ๊ฐ ์ฒญํฌ์˜ ์ตœ๋Œ€ ๋ฌธ์ž ์ˆ˜
            chunk_overlap: ์ฒญํฌ ๊ฐ„ ์ค‘์ฒฉ๋˜๋Š” ๋ฌธ์ž ์ˆ˜
        
        Returns:
            ๋ฌธ์„œ ๊ฐ์ฒด ๋ชฉ๋ก
        """
        if not text:
            return []
        
        # Clean the text
        clean = DocumentProcessor.clean_text(text)
        
        # Split into chunks
        chunks = DocumentProcessor.split_text(
            clean, 
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        
        # Build document objects
        documents = []
        for i, chunk in enumerate(chunks):
            doc = {
                "text": chunk,
                "index": i,
                "chunk_count": len(chunks)
            }
            
            # Attach metadata
            if metadata:
                doc.update(metadata)
            
            documents.append(doc)
        
        return documents
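
    # Illustrative output shape (not from the original source): each returned
    # item looks like {"text": "...", "index": 0, "chunk_count": 3, **metadata}.
    # Note that doc.update(metadata) runs last, so metadata keys named "text",
    # "index", or "chunk_count" would overwrite the generated values.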
    
    @staticmethod
    def load_documents_from_directory(
        directory: str,
        extensions: Optional[List[str]] = None,
        recursive: bool = True,
        chunk_size: int = 512,
        chunk_overlap: int = 50
    ) -> List[Dict[str, Any]]:
        """
        ๋””๋ ‰ํ† ๋ฆฌ์—์„œ ๋ฌธ์„œ ๋กœ๋“œ ๋ฐ ์ฒ˜๋ฆฌ
        
        Args:
            directory: ๋กœ๋“œํ•  ๋””๋ ‰ํ† ๋ฆฌ ๊ฒฝ๋กœ
            extensions: ์ฒ˜๋ฆฌํ•  ํŒŒ์ผ ํ™•์žฅ์ž ๋ชฉ๋ก
            recursive: ํ•˜์œ„ ๋””๋ ‰ํ† ๋ฆฌ ๊ฒ€์ƒ‰ ์—ฌ๋ถ€
            chunk_size: ๊ฐ ์ฒญํฌ์˜ ์ตœ๋Œ€ ๋ฌธ์ž ์ˆ˜
            chunk_overlap: ์ฒญํฌ ๊ฐ„ ์ค‘์ฒฉ๋˜๋Š” ๋ฌธ์ž ์ˆ˜
        
        Returns:
            ๋ฌธ์„œ ๊ฐ์ฒด ๋ชฉ๋ก
        """
        # Avoid a mutable default argument
        if extensions is None:
            extensions = [".txt", ".md", ".csv"]
        
        if not os.path.isdir(directory):
            logger.error(f"Directory not found: {directory}")
            return []
        
        documents = []
        
        for root, dirs, files in os.walk(directory):
            if not recursive:
                dirs[:] = []  # prune traversal so only the top directory is scanned
            
            for file in files:
                _, ext = os.path.splitext(file)
                if ext.lower() not in extensions:
                    continue
                
                file_path = os.path.join(root, file)
                rel_path = os.path.relpath(file_path, directory)
                
                try:
                    logger.info(f"Loading file: {rel_path}")
                    # Try UTF-8 first
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            content = f.read()
                    except UnicodeDecodeError:
                        # Fall back to CP949 (the default Korean Windows encoding)
                        logger.info(f"UTF-8 decoding failed, retrying with CP949: {rel_path}")
                        with open(file_path, 'r', encoding='cp949') as f:
                            content = f.read()
                    
                    # Build metadata
                    metadata = {
                        "source": rel_path,
                        "filename": file,
                        "filetype": ext.lower()[1:],
                        "filepath": file_path
                    }
                    
                    # CSV files get special handling
                    if ext.lower() == '.csv':
                        logger.info(f"CSV file detected, splitting by row: {rel_path}")
                        file_docs = DocumentProcessor.csv_to_documents(content, metadata)
                    else:
                        # Plain text documents
                        file_docs = DocumentProcessor.text_to_documents(
                            content,
                            metadata=metadata,
                            chunk_size=chunk_size,
                            chunk_overlap=chunk_overlap
                        )
                    
                    documents.extend(file_docs)
                    logger.info(f"Extracted {len(file_docs)} chunks: {rel_path}")
                    
                except Exception as e:
                    logger.error(f"Error while processing file '{rel_path}': {e}")
                    continue
        
        logger.info(f"Loaded {len(documents)} document chunks in total.")
        return documents
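
    # Minimal usage sketch (the path is hypothetical, not from the original
    # source):
    #   docs = DocumentProcessor.load_documents_from_directory("./data", recursive=False)
    #   print(docs[0]["source"], docs[0]["text"][:80])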
    
    @staticmethod
    def prepare_rag_context(results: List[Dict[str, Any]], field: str = "text") -> List[str]:
        """
        ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์—์„œ RAG์— ์‚ฌ์šฉํ•  ์ปจํ…์ŠคํŠธ ์ถ”์ถœ
        
        Args:
            results: ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ๋ชฉ๋ก
            field: ํ…์ŠคํŠธ ๋‚ด์šฉ์ด ์žˆ๋Š” ํ•„๋“œ ์ด๋ฆ„
            
        Returns:
            ์ปจํ…์ŠคํŠธ ํ…์ŠคํŠธ ๋ชฉ๋ก
        """
        context = []
        
        for result in results:
            if field in result:
                context.append(result[field])
        
        return context
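
    # Illustrative behavior sketch (the sample values are assumptions, not from
    # the original source):
    #   DocumentProcessor.prepare_rag_context([{"text": "a", "score": 0.9}, {"score": 0.1}])
    # returns ["a"]; results missing the field are skipped silently.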
    
    @staticmethod
    def csv_to_documents(content: str, metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        CSV ํŒŒ์ผ ๋‚ด์šฉ์„ ํ–‰ ๋‹จ์œ„๋กœ ๋ถ„๋ฆฌํ•˜์—ฌ ๊ฐ ํ–‰์„ ๋ณ„๋„์˜ ๋ฌธ์„œ๋กœ ์ฒ˜๋ฆฌ
        
        Args:
            content: CSV ํŒŒ์ผ์˜ ๋‚ด์šฉ
            metadata: ๊ธฐ๋ณธ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ
            
        Returns:
            ๋ฌธ์„œ ๊ฐ์ฒด ๋ชฉ๋ก (๊ฐ ํ–‰์ด ๋ณ„๋„์˜ ๋ฌธ์„œ)
        """
        documents = []
        
        try:
            # Try standard CSV parsing first (comma-delimited)
            try:
                csv_reader = csv.reader(io.StringIO(content))
                rows = list(csv_reader)
                if len(rows) > 0 and len(rows[0]) > 1:
                    # More than one column, so the commas delimit properly
                    logger.info(f"Processing CSV file as comma-delimited: {metadata.get('source', 'unknown')}")
                    has_valid_format = True
                else:
                    # Not properly comma-delimited
                    has_valid_format = False
            except Exception:
                has_valid_format = False
            
            # Not comma-delimited; fall back to whitespace-delimited parsing
            if not has_valid_format:
                logger.warning(f"CSV file is not standard comma-delimited; processing with whitespace delimiters: {metadata.get('source', 'unknown')}")
                lines = content.strip().split('\n')
                
                for i, line in enumerate(lines):
                    # Only process lines starting with 'IT' (treated as data rows)
                    if not line.strip().startswith('IT'):
                        continue
                        
                    # Split on whitespace into at most 5 columns
                    parts = line.split(maxsplit=4)
                    
                    # Skip rows that are too short to be valid
                    if len(parts) < 5:
                        logger.warning(f"Row {i+1} has insufficient data: {line[:50]}...")
                        continue
                    
                    # Extract the fields
                    doc_id = parts[0].strip()      # IT number
                    query_type = parts[1].strip()  # query type
                    question = parts[2].strip()    # question
                    answer = parts[3].strip()      # answer
                    reference = parts[4].strip()   # reference (len(parts) == 5 is guaranteed above)
                    
                    # Build the document text with each field labeled
                    text = f"ID: {doc_id}\n"
                    text += f"Query type: {query_type}\n"
                    text += f"Question: {question}\n"
                    text += f"Answer: {answer}\n"
                    if reference:
                        text += f"Reference/Context: {reference}"
                    
                    # Build the document object
                    doc_metadata = metadata.copy()
                    doc_metadata.update({
                        "row": i,
                        "query_type": query_type,
                        "question": question,
                        "answer": answer,
                        "reference": reference
                    })
                    
                    document = {
                        "text": text,
                        "id": doc_id,  # use the IT number as the ID
                        **doc_metadata
                    }
                    
                    documents.append(document)
                    logger.debug(f"Processed IT document: {doc_id} - {question[:30]}...")
                
                logger.info(f"Converted {len(documents)} rows to documents from whitespace-delimited CSV file '{metadata.get('source', 'unknown')}'.")
                return documents
            
            # Standard comma-delimited CSV handling
            if not rows:
                logger.warning(f"CSV file contains no data: {metadata.get('source', 'unknown')}")
                return []
            
            # Use the first row as the header
            headers = rows[0]
            logger.debug(f"CSV headers: {headers}")
            
            # ๊ฐ ํ–‰์„ ๋ณ„๋„์˜ ๋ฌธ์„œ๋กœ ๋ณ€ํ™˜
            for i, row in enumerate(rows[1:], 1):  # ํ—ค๋” ์ œ์™ธ, 1๋ถ€ํ„ฐ ์‹œ์ž‘
                # ํ–‰์ด ํ—ค๋”๋ณด๋‹ค ์งง์œผ๋ฉด ๋นˆ ๊ฐ’์œผ๋กœ ์ฑ„์›€
                while len(row) < len(headers):
                    row.append("")
                
                # ํ–‰ ๋ฐ์ดํ„ฐ๋ฅผ ์‚ฌ์ „ํ˜•์œผ๋กœ ๋ณ€ํ™˜
                row_data = {headers[j]: value for j, value in enumerate(row) if j < len(headers)}
                
                # ์ฒซ ๋ฒˆ์งธ ์—ด์„ ID๋กœ ์‚ฌ์šฉ (์žˆ๋Š” ๊ฒฝ์šฐ)
                row_id = row[0] if row and len(row) > 0 else f"row_{i}"
                
                # Build the document text, one labeled field per line
                text_parts = []
                for j, header in enumerate(headers):
                    if j < len(row) and row[j]:
                        text_parts.append(f"{header}: {row[j]}")
                
                text = "\n".join(text_parts)
                
                # Build the document object
                doc_metadata = metadata.copy()
                doc_metadata.update({
                    "row": i,
                    "row_id": row_id,
                    "total_rows": len(rows) - 1,  # excluding the header
                    "csv_data": row_data  # keep the original row data as well
                })
                
                document = {
                    "text": text,
                    "id": row_id,
                    **doc_metadata
                }
                
                documents.append(document)
            
            logger.info(f"Converted {len(documents)} rows to documents from CSV file '{metadata.get('source', 'unknown')}'.")
            
        except Exception as e:
            logger.error(f"Error while processing CSV file: {e}")
        
        return documents
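

if __name__ == "__main__":
    # Quick self-check sketch (illustrative only; the sample text, CSV content,
    # and parameter values below are assumptions, not part of the original module).
    sample = "First paragraph with some text.\nSecond paragraph here.\nThird one."
    
    # Split into two chunks of at most ~40 characters
    chunks = DocumentProcessor.split_text(sample, chunk_size=40, chunk_overlap=10)
    print(f"{len(chunks)} chunks:", chunks)
    
    # URL and tag removal plus whitespace collapsing
    cleaned = DocumentProcessor.clean_text("Visit https://example.com  <b>today</b>")
    print("cleaned:", cleaned)  # -> "Visit today"
    
    # Full text -> documents -> RAG context pipeline
    docs = DocumentProcessor.text_to_documents(sample, metadata={"source": "inline"})
    context = DocumentProcessor.prepare_rag_context(docs)
    print(f"{len(docs)} documents, first context passage:", context[0][:40])
    
    # Comma-delimited CSV content becomes one document per data row
    csv_docs = DocumentProcessor.csv_to_documents("id,q\n1,hello\n2,world", {"source": "inline.csv"})
    print(f"{len(csv_docs)} CSV row documents")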