Spaces:
Running
Running
File size: 3,365 Bytes
c04ffe5 42dc069 c04ffe5 42dc069 c04ffe5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import re
import ast
from .text_utils import clean_raw_text, format_markdown_text
def classify_document_content(result):
    """Classify an OCR result by which structural pieces it contains.

    Args:
        result: OCR result, expected to be a dict holding an
            ``'ocr_contents'`` dict. Any other shape (including ``None``)
            yields the all-``False`` classification instead of raising.

    Returns:
        A dict of boolean flags:

        * ``has_title`` - ``ocr_contents['title']`` is present and truthy.
        * ``has_content`` - at least one of ``content``, ``transcript`` or
          ``text`` is present and truthy.
        * ``has_sections`` - more than two fields are truthy, not counting
          ``raw_text`` and ``error``.
        * ``is_structured`` - all three flags above are set.
    """
    classification = {
        'has_title': False,
        'has_content': False,
        'has_sections': False,
        'is_structured': False,
    }

    # A missing/None/non-dict result (or payload) has nothing to classify;
    # checking the type first avoids a TypeError on the lookups below.
    contents = result.get('ocr_contents') if isinstance(result, dict) else None
    if not isinstance(contents, dict):
        return classification

    classification['has_title'] = bool(contents.get('title'))
    classification['has_content'] = any(
        contents.get(field) for field in ('content', 'transcript', 'text')
    )

    # Every populated field counts as a section (title and content fields
    # included); only the bookkeeping keys are ignored.
    populated_fields = sum(
        1
        for key, value in contents.items()
        if key not in ('raw_text', 'error') and value
    )
    classification['has_sections'] = populated_fields > 2

    classification['is_structured'] = (
        classification['has_title']
        and classification['has_content']
        and classification['has_sections']
    )
    return classification
def extract_document_text(result):
    """Extract the main document text from an OCR result.

    Args:
        result: OCR result, expected to be a dict holding an
            ``'ocr_contents'`` dict. Any other shape (including ``None``)
            yields ``""`` instead of raising.

    Returns:
        The first non-empty string found among the ``main_text``,
        ``content``, ``transcript``, ``text`` and ``raw_text`` fields, tried
        in that order, or ``""`` when none of them holds a non-empty string.
    """
    # Checking the type first avoids a TypeError on None / non-dict input,
    # in line with the function's "fall back to empty string" contract.
    if not isinstance(result, dict):
        return ""
    contents = result.get('ocr_contents')
    if not isinstance(contents, dict):
        return ""

    # Preferred order: main_text wins over the more generic fields.
    for field in ('main_text', 'content', 'transcript', 'text', 'raw_text'):
        value = contents.get(field)
        # Populated but non-string values (lists, dicts, ...) are skipped so
        # that a later field can still supply the text.
        if value and isinstance(value, str):
            return value
    return ""
def extract_image_description(image_data):
    """Return the first available description for an image.

    The ``alt_text``, ``caption`` and ``description`` keys of *image_data*
    are consulted in that order and the first truthy value is returned
    unchanged. An empty string is returned when *image_data* is falsy, is
    not a dict, or none of those keys holds a truthy value.
    """
    if not (image_data and isinstance(image_data, dict)):
        return ""

    # Lazily look up each candidate key so the scan stops at the first hit.
    candidates = (
        image_data.get(key) for key in ('alt_text', 'caption', 'description')
    )
    return next((value for value in candidates if value), "")
def format_structured_data(content):
    """Format structured data such as lists and dicts as readable markdown.

    Args:
        content: The value to format. Strings are returned unchanged, lists
            become one ``- item`` bullet per element, dicts become one
            ``**key**: value`` line per entry, and any other truthy value is
            converted with ``str()``.

    Returns:
        The formatted markdown text, or ``""`` for any falsy input
        (``None``, empty string, empty list/dict, ``0``, ...).
    """
    # This single guard covers None as well as empty strings, lists and
    # dicts, so the branches below need no emptiness checks of their own.
    if not content:
        return ""

    # Strings are returned as-is to maintain content purity: JSON-like text
    # must not be parsed or transformed.
    if isinstance(content, str):
        return content

    # Native list -> markdown bullet points.
    if isinstance(content, list):
        return "\n".join(f"- {item}" for item in content)

    # Native dict -> markdown key-value pairs.
    if isinstance(content, dict):
        return "\n".join(f"**{k}**: {v}" for k, v in content.items())

    # Anything else falls back to its string representation.
    return str(content)
|