								"""
Utility functions for text processing.
Contains helper functions for working with text data from OCR.
"""
import re
import logging
import difflib
from typing import List, Dict, Any, Optional
# Configure logging
logging.basicConfig(level=logging.INFO, 
                   format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def format_ocr_text(text: str, for_display: bool = False) -> str:
    """
    Format OCR text for display or processing.
    This function maintains clean separation between data and presentation.
    
    Args:
        text: OCR text to format
        for_display: Whether to format for display (HTML) or plain text
        
    Returns:
        Formatted text
    """
    if not text:
        return ""
    
    # Clean the text first
    text = clean_raw_text(text)
    
    # Basic text formatting (line breaks, etc.)
    formatted_text = text.replace("\n", "<br>" if for_display else "\n")
    
    if for_display:
        # For display, wrap in paragraph tags but avoid unnecessary divs
        # to maintain content purity
        return f"<p>{formatted_text}</p>"
    else:
        # For processing, return clean text only - no markup
        return formatted_text
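# Minimal usage sketch for format_ocr_text; the sample string below is a
# hypothetical OCR result, not data from the original project.
def _demo_format_ocr_text() -> None:
    sample = "Invoice 2024-001\nTotal: 45.00 EUR"
    print(format_ocr_text(sample, for_display=True))   # "<p>Invoice 2024-001<br>Total: 45.00 EUR</p>"
    print(format_ocr_text(sample, for_display=False))  # plain text with the line break preserved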
def format_markdown_text(text: str, preserve_format: bool = True) -> str:
    """
    Format text as Markdown, preserving or enhancing its structure.
    Ensures that text has clean markdown formatting without introducing
    unnecessary presentation elements.
    
    Args:
        text: Raw text to format as Markdown
        preserve_format: Whether to preserve original formatting
        
    Returns:
        Markdown-formatted text
    """
    if not text:
        return ""
    
    # Clean the text first
    text = clean_raw_text(text)
    
    # Normalize line endings
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    
    # Preserve paragraphs if requested
    if preserve_format:
        # Ensure paragraphs are separated by double line breaks
        text = re.sub(r'\n{3,}', '\n\n', text)
    else:
        # Convert single line breaks within paragraphs to spaces
        text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
        # Ensure paragraphs are separated by double line breaks
        text = re.sub(r'\n{2,}', '\n\n', text)
    
    # Remove excess whitespace
    text = re.sub(r' {2,}', ' ', text)
    
    # Enhance markdown features if they exist
    
    # Make sure headers have space after # marks
    text = re.sub(r'(^|\n)(#{1,6})([^#\s])', r'\1\2 \3', text)
    
    # Make sure list items have space after markers
    text = re.sub(r'(^|\n)([*+-])([^\s])', r'\1\2 \3', text)
    text = re.sub(r'(^|\n)(\d+\.)([^\s])', r'\1\2 \3', text)
    
    return text.strip()
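# Minimal usage sketch for format_markdown_text; the raw string below is a
# made-up example of loosely formatted OCR markdown.
def _demo_format_markdown_text() -> None:
    raw = "#Title\n\n\n\n-item one\n-item two"
    print(format_markdown_text(raw))
    # Expected: "# Title\n\n- item one\n- item two" (a space is added after the
    # header and list markers, and extra blank lines collapse to one).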
def clean_raw_text(text: str) -> str:
    """
    Clean raw text by removing unnecessary whitespace and artifacts.
    
    Args:
        text: Raw text to clean
        
    Returns:
        Cleaned text
    """
    if not text:
        return ""
        
    # Remove inline base64 image references like ![alt](data:image/...;base64,...)
    text = re.sub(r'!\[.*?\]\(data:image/[^)]+\)', '', text)
    
    # Remove basic markdown image references like ![alt](url)
    text = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', text)
    
    # Remove base64 encoded image data
    text = re.sub(r'data:image/[^;]+;base64,[a-zA-Z0-9+/=]+', '', text)
    
    # Clean up any JSON-like image object references
    text = re.sub(r'{"image(_data)?":("[^"]*"|null|true|false|\{[^}]*\}|\[[^\]]*\])}', '', text)
    
    # Clean up excessive whitespace and line breaks created by removals
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'[ \t]{3,}', ' ', text)  # collapse space/tab runs without touching paragraph breaks
    return text.strip()
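# Minimal usage sketch for clean_raw_text; the noisy input below is a
# hypothetical OCR result containing an embedded image reference.
def _demo_clean_raw_text() -> None:
    noisy = "Report header\n\n![scan](data:image/png;base64,iVBORw0KGgo=)\n\nBody text."
    print(clean_raw_text(noisy))
    # Expected: "Report header\n\nBody text." (the image reference is stripped
    # and the leftover blank lines are collapsed).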
def detect_content_regions(image_np):
    """
    Detect content regions based on text density analysis.
    Returns regions with adaptive overlapping.
    
    Args:
        image_np: Numpy array image
        
    Returns:
        list: List of region tuples (x, y, width, height)
    """
    # Import necessary modules
    import numpy as np
    import cv2
    
    # Convert to grayscale for text detection
    if len(image_np.shape) > 2 and image_np.shape[2] == 3:
        gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    else:
        gray = image_np
    
    # Create text density profile
    # Sum pixel values horizontally to get vertical text density
    v_profile = np.sum(255 - gray, axis=1)
    
    # Normalize the profile
    v_profile = v_profile / np.max(v_profile) if np.max(v_profile) > 0 else v_profile
    
    # Find significant density changes
    changes = []
    threshold = 0.2
    for i in range(1, len(v_profile)):
        if abs(v_profile[i] - v_profile[i-1]) > threshold:
            changes.append(i)
    
    # Create adaptive regions based on density changes
    img_height, img_width = gray.shape
    
    # Default to at least 3 regions with overlap
    if len(changes) < 2:
        # If no significant changes, use default division with overlapping regions
        header_height = int(img_height * 0.3)
        middle_start = int(img_height * 0.2)
        middle_height = int(img_height * 0.4)
        body_start = int(img_height * 0.5)
        body_height = img_height - body_start
    else:
        # Use detected density changes for more precise regions
        changes = sorted(changes)
        header_height = changes[0] + int(img_height * 0.05)  # Add overlap
        middle_start = max(0, changes[0] - int(img_height * 0.05))
        
        if len(changes) > 1:
            middle_height = (changes[1] - middle_start) + int(img_height * 0.05)
            body_start = max(0, changes[1] - int(img_height * 0.05))
        else:
            middle_height = int(img_height * 0.4)
            body_start = int(img_height * 0.5)
            
        body_height = img_height - body_start
    
    # Define regions with adaptive overlap
    regions = [
        (0, 0, img_width, header_height),                  # Header region
        (0, middle_start, img_width, middle_height),       # Middle region with overlap
        (0, body_start, img_width, body_height)            # Body region with overlap
    ]
    
    return regions
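# Minimal usage sketch for detect_content_regions on a synthetic grayscale
# page; assumes numpy and opencv-python are installed, since the function
# imports both. The page layout below is made up for illustration.
def _demo_detect_content_regions() -> None:
    import numpy as np
    page = np.full((200, 100), 255, dtype=np.uint8)  # blank white page
    page[20:40, :] = 0     # dark band standing in for a header
    page[120:180, :] = 0   # dark band standing in for body text
    for x, y, w, h in detect_content_regions(page):
        print(f"region at ({x}, {y}), size {w}x{h}")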
def merge_region_texts(regions: List[Dict[str, Any]], min_similarity_threshold: float = 0.7) -> str:
    """
    Intelligently merge text from multiple document regions, handling overlapping content.
    Uses text similarity detection to avoid duplicating content from overlapping regions.
    
    Args:
        regions: List of region dictionaries, each containing 'text' and 'order' keys
        min_similarity_threshold: Minimum similarity ratio to consider text as duplicate
        
    Returns:
        Merged text with duplications removed
    """
    # If no regions, return empty string
    if not regions:
        return ""
    
    # If only one region, return its text directly
    if len(regions) == 1:
        return regions[0]['text']
    
    # Sort regions by their defined order
    sorted_regions = sorted(regions, key=lambda x: x.get('order', 0))
    
    # Extract text segments from each region
    texts = [region.get('text', '').strip() for region in sorted_regions]
    
    # Remove empty texts
    texts = [t for t in texts if t]
    
    if not texts:
        return ""
    
    # Start with the first region's text
    merged_text = texts[0]
    
    # Process each subsequent region
    for i in range(1, len(texts)):
        current_text = texts[i]
        
        # Skip if current text is empty
        if not current_text:
            continue
        
        # Find potential overlap with existing merged text
        # Split both texts into lines for line-by-line comparison
        merged_lines = merged_text.splitlines()
        current_lines = current_text.splitlines()
        
        # Initialize variables to track where to start appending
        append_from_line = 0  # Default: append all lines from current text
        max_similarity = 0.0
        max_similarity_pos = -1
        
        # Check for potential line duplications
        # Look at the last N lines of merged text (N = min(20, len(merged_lines)))
        # to see if they match the first N lines of current text
        check_lines = min(20, len(merged_lines))
        for j in range(1, check_lines + 1):
            # Get the last j lines from merged text
            merged_end = "\n".join(merged_lines[-j:])
            
            # Get the first j lines from current text
            current_start = "\n".join(current_lines[:j])
            
            # Skip comparison if either section is too short
            if len(merged_end) < 10 or len(current_start) < 10:
                continue
            
            # Calculate similarity ratio
            similarity = difflib.SequenceMatcher(None, merged_end, current_start).ratio()
            
            # If we found a better match, update
            if similarity > max_similarity and similarity >= min_similarity_threshold:
                max_similarity = similarity
                max_similarity_pos = j
        
        # If we found a good match, skip those lines from current text
        if max_similarity_pos > 0:
            logger.info(f"Found overlapping text with similarity {max_similarity:.2f}, skipping {max_similarity_pos} lines")
            append_from_line = max_similarity_pos
        
        # Append non-duplicated content with a separator
        if append_from_line < len(current_lines):
            remaining_text = "\n".join(current_lines[append_from_line:])
            if remaining_text.strip():
                merged_text += "\n\n" + remaining_text
    
    return merged_text
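# Minimal usage sketch for merge_region_texts with two hypothetical
# overlapping regions; the shared line should appear only once in the output.
def _demo_merge_region_texts() -> None:
    regions = [
        {"order": 0, "text": "First paragraph of the page.\nShared line that both regions captured."},
        {"order": 1, "text": "Shared line that both regions captured.\nSecond paragraph of the page."},
    ]
    print(merge_region_texts(regions))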