enhanced OCR functionality and efficiency, simplified preprompting, etc
- CLAUDE.md +6 -3
- app.py +493 -103
- config.py +7 -7
- ocr_utils.py +341 -52
- structured_ocr.py +298 -146
CLAUDE.md
CHANGED
@@ -7,12 +7,14 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 - Test OCR functionality: `python structured_ocr.py <file_path>`
 - Process PDF files: `python pdf_ocr.py <file_path>`
 - Process single file with logging: `python process_file.py <file_path>`
+- Run newspaper test: `python test_newspaper.py <file_path>`
 - Run typechecking: `mypy .`
+- Lint code: `ruff check .` or `flake8`

 ## Environment Setup
 - API key: Set `MISTRAL_API_KEY` in `.env` file or environment variable
 - Install dependencies: `pip install -r requirements.txt`
-- System requirements: `…`
+- System requirements: Install `poppler-utils` and `tesseract-ocr` for PDF processing and OCR

 ## Code Style Guidelines
 - **Imports**: Standard library first, third-party next, local modules last
@@ -21,10 +23,11 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 - **Naming**: snake_case for variables/functions, PascalCase for classes
 - **Documentation**: Google-style docstrings for all functions/classes
 - **Logging**: Use module-level loggers with appropriate log levels
+- **Line length**: ≤100 characters

 ## Architecture
 - Core: `structured_ocr.py` - Main OCR processing with Mistral AI integration
-- Utils: `ocr_utils.py` - …
-- PDF handling: `pdf_ocr.py` - PDF-specific processing functionality
+- Utils: `ocr_utils.py` - OCR text and image processing utilities
+- PDF handling: `pdf_ocr.py` - PDF-specific processing functionality
 - Config: `config.py` - Configuration settings and API keys
 - Web: `app.py` - Streamlit interface with UI components in `/ui` directory
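A minimal sketch of the environment setup CLAUDE.md describes, assuming python-dotenv is installed (config.py loads the key the same way via `load_dotenv()`); the shell commands in the comments restate the steps listed above, with `apt-get` as an example package manager:

# Setup sketch (illustrative, not part of the repository):
#   pip install -r requirements.txt
#   apt-get install poppler-utils tesseract-ocr   # e.g. on Debian/Ubuntu
#   echo "MISTRAL_API_KEY=your-key-here" > .env
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory
if not os.environ.get("MISTRAL_API_KEY"):
    raise SystemExit("MISTRAL_API_KEY is not set - add it to .env or the environment")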
app.py
CHANGED
@@ -322,6 +322,15 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro

@@ -371,7 +380,12 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
-    …

@@ -563,73 +577,115 @@ with st.sidebar:
-    # Document …
-    st.markdown("##### …
-    …
-        "Literary/Academic Work",
-        "News/Journalism",
-        "Religious Text",
-        "Legal Document"
-    …
-    with st.expander("Instruction Examples"):
-    …

@@ -733,10 +789,28 @@ with main_tab2:
-        file_name=…

@@ -776,12 +850,12 @@ with main_tab2:
-            <div class="result-filename">{icon} {file_name}</div>
-            <div class="result-tag">Topics: {', '.join(result.get('topics', ['Unknown']))}</div>

@@ -824,7 +898,34 @@ with main_tab2:
-    …

@@ -870,23 +971,68 @@ with main_tab2:
-    # …
-        file_name=…
-    # Fallback to string representation for download
-        file_name=f"{…

@@ -924,14 +1070,57 @@ with main_tab2:
-    # Add HTML download button
-        file_name=…

@@ -1092,7 +1281,7 @@ with main_tab1:
-    # …

@@ -1100,53 +1289,60 @@ with main_tab1:
-    # Process with NO custom prompt first
-    …
-        custom_prompt=…
-        pdf_rotation=pdf_rotation_value
-    progress_bar.progress(…
-    status_text.markdown('<div class="processing-status-container">…
-
-    # Step 2: Apply custom prompt to the extracted text using text-only LLM
-    if 'ocr_contents' in base_result and isinstance(base_result['ocr_contents'], dict):
-        # Get text from OCR result
-        ocr_text = ""
-        for section, content in base_result['ocr_contents'].items():
-            if isinstance(content, str):
-                ocr_text += content + "\n\n"
-            elif isinstance(content, list):
-                for item in content:
-                    if isinstance(item, str):
-                        ocr_text += item + "\n"
-                ocr_text += "\n"
-
-        # Format the custom prompt for text-only processing
-        formatted_prompt = f"USER INSTRUCTIONS: {custom_prompt.strip()}\nPay special attention to these instructions and respond accordingly."
-
-        # Apply custom prompt to extracted text
-        enhanced_result = processor._extract_structured_data_text_only(ocr_text, uploaded_file.name, formatted_prompt)
-
-        # Merge results, keeping images from base_result
-        result = base_result.copy()
-        result['custom_prompt_applied'] = 'text_only'
-
-        # Update with enhanced analysis results, preserving image data
-        for key, value in enhanced_result.items():
-            if key not in ['raw_response_data', 'pages_data', 'has_images']:
-                result[key] = value
-    else:
-        # If no OCR content, just use the base result
-        result = base_result
-        result['custom_prompt_applied'] = 'failed'

@@ -1183,8 +1379,21 @@ with main_tab1:
-    # …
-    …

@@ -1238,15 +1447,39 @@ with main_tab1:
-    # Topics
-    …

@@ -1664,16 +1897,35 @@ with main_tab1:
-    # …
-        file_name=…

@@ -1696,6 +1948,144 @@ with main_tab1:
322 |
preprocessing_options.get("document_type", "standard") != "standard"
|
323 |
)
|
324 |
|
325 |
+
# Add document type hints to custom prompt if available from document type selector - with safety checks
|
326 |
+
if ('custom_prompt' in locals() and custom_prompt and
|
327 |
+
'selected_doc_type' in locals() and selected_doc_type != "Auto-detect (standard processing)" and
|
328 |
+
"This is a" not in str(custom_prompt)):
|
329 |
+
# Extract just the document type from the selector
|
330 |
+
doc_type_hint = selected_doc_type.split(" or ")[0].lower()
|
331 |
+
# Prepend to the custom prompt
|
332 |
+
custom_prompt = f"This is a {doc_type_hint}. {custom_prompt}"
|
333 |
+
|
334 |
if has_preprocessing:
|
335 |
status_text.markdown('<div class="processing-status-container">Applying image preprocessing...</div>', unsafe_allow_html=True)
|
336 |
progress_bar.progress(20)
|
|
|
380 |
cache_key = f"{file_hash}_{file_type}_{use_vision}_{pdf_rotation_value}"
|
381 |
|
382 |
progress_bar.progress(50)
|
383 |
+
# Check if we have custom instructions
|
384 |
+
has_custom_prompt = 'custom_prompt' in locals() and custom_prompt and len(str(custom_prompt).strip()) > 0
|
385 |
+
if has_custom_prompt:
|
386 |
+
status_text.markdown('<div class="processing-status-container">Processing document with custom instructions...</div>', unsafe_allow_html=True)
|
387 |
+
else:
|
388 |
+
status_text.markdown('<div class="processing-status-container">Processing document with OCR...</div>', unsafe_allow_html=True)
|
389 |
|
390 |
# Process the file using cached function if possible
|
391 |
try:
|
|
|
577 |
# Add spacing between sections
|
578 |
st.markdown("<div style='margin: 10px 0;'></div>", unsafe_allow_html=True)
|
579 |
|
580 |
+
# Document Processing section
|
581 |
+
st.markdown("##### OCR Instructions", help="Optimize text extraction")
|
582 |
|
583 |
+
# Document type selector
|
584 |
+
document_types = [
|
585 |
+
"Auto-detect (standard processing)",
|
586 |
+
"Newspaper or Magazine",
|
587 |
+
"Letter or Correspondence",
|
588 |
+
"Book or Publication",
|
589 |
+
"Form or Legal Document",
|
590 |
+
"Recipe",
|
591 |
+
"Handwritten Document",
|
592 |
+
"Map or Illustration",
|
593 |
+
"Table or Spreadsheet",
|
594 |
+
"Other (specify in instructions)"
|
595 |
]
|
596 |
|
597 |
+
selected_doc_type = st.selectbox(
|
598 |
+
"Document Type",
|
599 |
+
options=document_types,
|
600 |
index=0,
|
601 |
+
help="Select document type to optimize OCR processing for specific document formats and layouts. For documents with specialized features, also provide details in the instructions field below."
|
602 |
)
|
603 |
|
604 |
+
# Document layout selector
|
605 |
+
document_layouts = [
|
606 |
+
"Standard layout",
|
607 |
+
"Multiple columns",
|
608 |
+
"Table/grid format",
|
609 |
+
"Mixed layout with images"
|
|
610 |
]
|
611 |
|
612 |
+
selected_layout = st.selectbox(
|
613 |
+
"Document Layout",
|
614 |
+
options=document_layouts,
|
615 |
index=0,
|
616 |
+
help="Select the document's text layout for better OCR"
|
617 |
)
|
618 |
|
619 |
+
# Generate dynamic prompt based on both document type and layout
|
620 |
custom_prompt_text = ""
|
621 |
+
|
622 |
+
# First add document type specific instructions (simplified)
|
623 |
+
if selected_doc_type != "Auto-detect (standard processing)":
|
624 |
+
if selected_doc_type == "Newspaper or Magazine":
|
625 |
+
custom_prompt_text = "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions."
|
626 |
+
elif selected_doc_type == "Letter or Correspondence":
|
627 |
+
custom_prompt_text = "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations."
|
628 |
+
elif selected_doc_type == "Book or Publication":
|
629 |
+
custom_prompt_text = "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting."
|
630 |
+
elif selected_doc_type == "Form or Legal Document":
|
631 |
+
custom_prompt_text = "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings."
|
632 |
+
elif selected_doc_type == "Recipe":
|
633 |
+
custom_prompt_text = "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps."
|
634 |
+
elif selected_doc_type == "Handwritten Document":
|
635 |
+
custom_prompt_text = "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations."
|
636 |
+
elif selected_doc_type == "Map or Illustration":
|
637 |
+
custom_prompt_text = "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings."
|
638 |
+
elif selected_doc_type == "Table or Spreadsheet":
|
639 |
+
custom_prompt_text = "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values."
|
640 |
+
elif selected_doc_type == "Other (specify in instructions)":
|
641 |
+
custom_prompt_text = "Please describe the document type and any special processing requirements here."
|
642 |
+
|
643 |
+
# Then add layout specific instructions if needed
|
644 |
+
if selected_layout != "Standard layout" and not custom_prompt_text:
|
645 |
+
if selected_layout == "Multiple columns":
|
646 |
+
custom_prompt_text = "Document has multiple columns. Read each column from top to bottom, then move to the next column."
|
647 |
+
elif selected_layout == "Table/grid format":
|
648 |
+
custom_prompt_text = "Document contains table data. Preserve row and column structure during extraction."
|
649 |
+
elif selected_layout == "Mixed layout with images":
|
650 |
+
custom_prompt_text = "Document has mixed text layout with images. Extract text in proper reading order."
|
651 |
+
# If both document type and non-standard layout are selected, add layout info
|
652 |
+
elif selected_layout != "Standard layout" and custom_prompt_text:
|
653 |
+
if selected_layout == "Multiple columns":
|
654 |
+
custom_prompt_text += " Document has multiple columns."
|
655 |
+
elif selected_layout == "Table/grid format":
|
656 |
+
custom_prompt_text += " Contains table/grid formatting."
|
657 |
+
elif selected_layout == "Mixed layout with images":
|
658 |
+
custom_prompt_text += " Has mixed text layout with images."
|
659 |
|
660 |
# Add spacing between sections
|
661 |
st.markdown("<div style='margin: 10px 0;'></div>", unsafe_allow_html=True)
|
662 |
|
663 |
custom_prompt = st.text_area(
|
664 |
+
"Additional OCR Instructions",
|
665 |
value=custom_prompt_text,
|
666 |
+
placeholder="Example: Small text at bottom needs special attention",
|
667 |
+
height=100,
|
668 |
+
max_chars=300,
|
669 |
key="custom_analysis_instructions",
|
670 |
+
help="Specify document type and special OCR requirements. Detailed instructions activate Mistral AI's advanced document analysis."
|
671 |
)
|
672 |
|
673 |
+
# Custom instructions expander
|
674 |
+
with st.expander("Custom Instruction Examples"):
|
675 |
st.markdown("""
|
676 |
+
**Document Format Instructions:**
|
677 |
+
- "This newspaper has multiple columns - read each column from top to bottom"
|
678 |
+
- "This letter has a formal heading, main body, and signature section at bottom"
|
679 |
+
- "This form has fields with labels and filled-in values that should be paired"
|
680 |
+
- "This recipe has ingredient list at top and preparation steps below"
|
681 |
+
|
682 |
+
**Special Processing Instructions:**
|
683 |
+
- "Pay attention to footnotes at the bottom of each page"
|
684 |
+
- "Some text is faded - please attempt to reconstruct unclear passages"
|
685 |
+
- "There are handwritten annotations in the margins that should be included"
|
686 |
+
- "Document has table data that should preserve row and column alignment"
|
687 |
+
- "Text continues across pages and should be connected into a single flow"
|
688 |
+
- "This document uses special symbols and mathematical notation"
|
689 |
""")
|
690 |
|
691 |
# Add spacing between sections
|
|
|
789 |
# Get zip data directly in memory
|
790 |
zip_data = create_results_zip_in_memory(st.session_state.previous_results)
|
791 |
|
792 |
+
# Create more informative ZIP filename with timestamp
|
793 |
+
from datetime import datetime
|
794 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
795 |
+
|
796 |
+
# Count document types for a more descriptive filename
|
797 |
+
pdf_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith('.pdf'))
|
798 |
+
img_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith(('.jpg', '.jpeg', '.png')))
|
799 |
+
|
800 |
+
# Create more descriptive filename
|
801 |
+
if pdf_count > 0 and img_count > 0:
|
802 |
+
zip_filename = f"historical_ocr_mixed_{pdf_count}pdf_{img_count}img_{timestamp}.zip"
|
803 |
+
elif pdf_count > 0:
|
804 |
+
zip_filename = f"historical_ocr_pdf_documents_{pdf_count}_{timestamp}.zip"
|
805 |
+
elif img_count > 0:
|
806 |
+
zip_filename = f"historical_ocr_images_{img_count}_{timestamp}.zip"
|
807 |
+
else:
|
808 |
+
zip_filename = f"historical_ocr_results_{timestamp}.zip"
|
809 |
+
|
810 |
st.download_button(
|
811 |
label="Download All Results",
|
812 |
data=zip_data,
|
813 |
+
file_name=zip_filename,
|
814 |
mime="application/zip",
|
815 |
help="Download all previous results as a ZIP file containing HTML and JSON files"
|
816 |
)
|
|
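For illustration, a standalone sketch of the ZIP naming rule added above; the result dictionaries are made-up examples and the variable names mirror the ones in app.py:

# Illustrative reconstruction of the ZIP filename logic (not part of app.py):
from datetime import datetime

results = [{"file_name": "diary.pdf"}, {"file_name": "map.jpg"}, {"file_name": "notes.png"}]
pdf_count = sum(1 for r in results if r.get("file_name", "").lower().endswith(".pdf"))
img_count = sum(1 for r in results if r.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png")))
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

if pdf_count and img_count:
    zip_filename = f"historical_ocr_mixed_{pdf_count}pdf_{img_count}img_{timestamp}.zip"
elif pdf_count:
    zip_filename = f"historical_ocr_pdf_documents_{pdf_count}_{timestamp}.zip"
elif img_count:
    zip_filename = f"historical_ocr_images_{img_count}_{timestamp}.zip"
else:
    zip_filename = f"historical_ocr_results_{timestamp}.zip"
print(zip_filename)  # e.g. historical_ocr_mixed_1pdf_2img_20250101_120000.zip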
|
850 |
st.markdown(f"""
|
851 |
<div class="result-card">
|
852 |
<div class="result-header">
|
853 |
+
<div class="result-filename">{icon} {result.get('descriptive_file_name', file_name)}</div>
|
854 |
<div class="result-date">{result.get('timestamp', 'Unknown')}</div>
|
855 |
</div>
|
856 |
<div class="result-metadata">
|
857 |
<div class="result-tag">Languages: {', '.join(result.get('languages', ['Unknown']))}</div>
|
858 |
+
<div class="result-tag">Topics: {', '.join(result.get('topics', ['Unknown'])[:5])} {' + ' + str(len(result.get('topics', [])) - 5) + ' more' if len(result.get('topics', [])) > 5 else ''}</div>
|
859 |
</div>
|
860 |
""", unsafe_allow_html=True)
|
861 |
|
|
|
898 |
st.write(f"**Languages:** {', '.join(languages)}")
|
899 |
|
900 |
if 'topics' in selected_result and selected_result['topics']:
|
901 |
+
# Show topics in a more organized way with badges
|
902 |
+
st.markdown("**Subject Tags:**")
|
903 |
+
# Create a container with flex display for the tags
|
904 |
+
st.markdown('<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">', unsafe_allow_html=True)
|
905 |
+
|
906 |
+
# Generate a badge for each tag
|
907 |
+
for topic in selected_result['topics']:
|
908 |
+
# Create colored badge based on tag category
|
909 |
+
badge_color = "#546e7a" # Default color
|
910 |
+
|
911 |
+
# Assign colors by category
|
912 |
+
if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"]):
|
913 |
+
badge_color = "#1565c0" # Blue for time periods
|
914 |
+
elif any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
|
915 |
+
badge_color = "#00695c" # Teal for languages
|
916 |
+
elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
|
917 |
+
badge_color = "#6a1b9a" # Purple for document types
|
918 |
+
elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
|
919 |
+
badge_color = "#2e7d32" # Green for subject domains
|
920 |
+
|
921 |
+
st.markdown(
|
922 |
+
f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
|
923 |
+
f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{topic}</span>',
|
924 |
+
unsafe_allow_html=True
|
925 |
+
)
|
926 |
+
|
927 |
+
# Close the container
|
928 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
929 |
|
930 |
with meta_col2:
|
931 |
# Display processing metadata
|
|
|
971 |
# Try a safer approach with string representation
|
972 |
st.code(str(selected_result))
|
973 |
|
974 |
+
# Create more informative JSON download button with better naming
|
975 |
try:
|
976 |
json_str = json.dumps(selected_result, indent=2)
|
977 |
+
|
978 |
+
# Use the descriptive filename if available, otherwise build one
|
979 |
+
if 'descriptive_file_name' in selected_result:
|
980 |
+
# Get base name without extension
|
981 |
+
base_filename = Path(selected_result['descriptive_file_name']).stem
|
982 |
+
else:
|
983 |
+
# Fall back to old method of building filename
|
984 |
+
base_filename = selected_result.get('file_name', 'document').split('.')[0]
|
985 |
+
|
986 |
+
# Add document type if available
|
987 |
+
if 'topics' in selected_result and selected_result['topics']:
|
988 |
+
topic = selected_result['topics'][0].lower().replace(' ', '_')
|
989 |
+
base_filename = f"{base_filename}_{topic}"
|
990 |
+
|
991 |
+
# Add language if available
|
992 |
+
if 'languages' in selected_result and selected_result['languages']:
|
993 |
+
lang = selected_result['languages'][0].lower()
|
994 |
+
# Only add if it's not already in the filename
|
995 |
+
if lang not in base_filename.lower():
|
996 |
+
base_filename = f"{base_filename}_{lang}"
|
997 |
+
|
998 |
+
# For PDFs, add page information
|
999 |
+
if 'total_pages' in selected_result and 'processed_pages' in selected_result:
|
1000 |
+
base_filename = f"{base_filename}_p{selected_result['processed_pages']}of{selected_result['total_pages']}"
|
1001 |
+
|
1002 |
+
# Get date from timestamp if available
|
1003 |
+
timestamp = ""
|
1004 |
+
if 'timestamp' in selected_result:
|
1005 |
+
try:
|
1006 |
+
# Try to parse the timestamp and reformat it
|
1007 |
+
from datetime import datetime
|
1008 |
+
dt = datetime.strptime(selected_result['timestamp'], "%Y-%m-%d %H:%M")
|
1009 |
+
timestamp = dt.strftime("%Y%m%d_%H%M%S")
|
1010 |
+
except:
|
1011 |
+
# If parsing fails, create a new timestamp
|
1012 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
1013 |
+
else:
|
1014 |
+
# No timestamp in the result, create a new one
|
1015 |
+
from datetime import datetime
|
1016 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
1017 |
+
|
1018 |
+
# Create final filename
|
1019 |
+
json_filename = f"{base_filename}_{timestamp}.json"
|
1020 |
+
|
1021 |
st.download_button(
|
1022 |
label="Download JSON",
|
1023 |
data=json_str,
|
1024 |
+
file_name=json_filename,
|
1025 |
mime="application/json"
|
1026 |
)
|
1027 |
except Exception as e:
|
1028 |
st.error(f"Error creating JSON download: {str(e)}")
|
1029 |
+
# Fallback to string representation for download with simple naming
|
1030 |
+
from datetime import datetime
|
1031 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
1032 |
st.download_button(
|
1033 |
label="Download as Text",
|
1034 |
data=str(selected_result),
|
1035 |
+
file_name=f"document_{timestamp}.txt",
|
1036 |
mime="text/plain"
|
1037 |
)
|
1038 |
|
|
|
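A self-contained sketch of the download-filename pattern built above; the helper name and sample data are hypothetical and not part of app.py:

from datetime import datetime
from pathlib import Path

def build_descriptive_filename(result: dict, extension: str = ".json") -> str:
    # Hypothetical helper mirroring the naming scheme used for the JSON/HTML downloads above.
    base = Path(result.get("file_name", "document")).stem
    if result.get("topics"):
        base += "_" + result["topics"][0].lower().replace(" ", "_")
    if result.get("languages"):
        lang = result["languages"][0].lower()
        if lang not in base.lower():
            base += f"_{lang}"
    if "total_pages" in result and "processed_pages" in result:
        base += f"_p{result['processed_pages']}of{result['total_pages']}"
    return f"{base}_{datetime.now().strftime('%Y%m%d_%H%M%S')}{extension}"

print(build_descriptive_filename({"file_name": "letter.pdf", "topics": ["Correspondence"],
                                  "languages": ["French"], "total_pages": 3, "processed_pages": 3}))
# e.g. letter_correspondence_french_p3of3_20250101_120000.json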
1070 |
if page_idx < len(pages_data) - 1:
|
1071 |
st.markdown("---")
|
1072 |
|
1073 |
+
# Add HTML download button with improved, more descriptive filename
|
1074 |
from ocr_utils import create_html_with_images
|
1075 |
html_content = create_html_with_images(selected_result)
|
1076 |
+
|
1077 |
+
# Use the descriptive filename if available, otherwise build one
|
1078 |
+
if 'descriptive_file_name' in selected_result:
|
1079 |
+
# Get base name without extension
|
1080 |
+
base_filename = Path(selected_result['descriptive_file_name']).stem
|
1081 |
+
else:
|
1082 |
+
# Fall back to old method of building filename
|
1083 |
+
base_filename = selected_result.get('file_name', 'document').split('.')[0]
|
1084 |
+
|
1085 |
+
# Add document type if available
|
1086 |
+
if 'topics' in selected_result and selected_result['topics']:
|
1087 |
+
topic = selected_result['topics'][0].lower().replace(' ', '_')
|
1088 |
+
base_filename = f"{base_filename}_{topic}"
|
1089 |
+
|
1090 |
+
# Add language if available
|
1091 |
+
if 'languages' in selected_result and selected_result['languages']:
|
1092 |
+
lang = selected_result['languages'][0].lower()
|
1093 |
+
# Only add if it's not already in the filename
|
1094 |
+
if lang not in base_filename.lower():
|
1095 |
+
base_filename = f"{base_filename}_{lang}"
|
1096 |
+
|
1097 |
+
# For PDFs, add page information
|
1098 |
+
if 'total_pages' in selected_result and 'processed_pages' in selected_result:
|
1099 |
+
base_filename = f"{base_filename}_p{selected_result['processed_pages']}of{selected_result['total_pages']}"
|
1100 |
+
|
1101 |
+
# Get date from timestamp if available
|
1102 |
+
timestamp = ""
|
1103 |
+
if 'timestamp' in selected_result:
|
1104 |
+
try:
|
1105 |
+
# Try to parse the timestamp and reformat it
|
1106 |
+
from datetime import datetime
|
1107 |
+
dt = datetime.strptime(selected_result['timestamp'], "%Y-%m-%d %H:%M")
|
1108 |
+
timestamp = dt.strftime("%Y%m%d_%H%M%S")
|
1109 |
+
except:
|
1110 |
+
# If parsing fails, create a new timestamp
|
1111 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
1112 |
+
else:
|
1113 |
+
# No timestamp in the result, create a new one
|
1114 |
+
from datetime import datetime
|
1115 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
1116 |
+
|
1117 |
+
# Create final filename
|
1118 |
+
html_filename = f"{base_filename}_{timestamp}_with_images.html"
|
1119 |
+
|
1120 |
st.download_button(
|
1121 |
label="Download as HTML with Images",
|
1122 |
data=html_content,
|
1123 |
+
file_name=html_filename,
|
1124 |
mime="text/html"
|
1125 |
)
|
1126 |
|
|
|
1281 |
progress_bar.progress(40)
|
1282 |
|
1283 |
try:
|
1284 |
+
# Process directly in one step for better performance
|
1285 |
processor = StructuredOCR()
|
1286 |
|
1287 |
# First save the PDF to a temp file
|
|
|
1289 |
tmp.write(uploaded_file.getvalue())
|
1290 |
temp_path = tmp.name
|
1291 |
|
|
|
1292 |
# Apply PDF rotation if specified
|
1293 |
pdf_rotation_value = pdf_rotation if 'pdf_rotation' in locals() else 0
|
1294 |
|
1295 |
+
# Add document type hints to custom prompt if available from document type selector
|
1296 |
+
if custom_prompt and custom_prompt is not None and 'selected_doc_type' in locals() and selected_doc_type != "Auto-detect (standard processing)" and "This is a" not in str(custom_prompt):
|
1297 |
+
# Extract just the document type from the selector
|
1298 |
+
doc_type_hint = selected_doc_type.split(" or ")[0].lower()
|
1299 |
+
# Prepend to the custom prompt
|
1300 |
+
custom_prompt = f"This is a {doc_type_hint}. {custom_prompt}"
|
1301 |
+
|
1302 |
+
# Process in a single step with simplified custom prompt
|
1303 |
+
if custom_prompt:
|
1304 |
+
# Detect document type from custom prompt
|
1305 |
+
doc_type = "general"
|
1306 |
+
if any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
|
1307 |
+
doc_type = "newspaper"
|
1308 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
|
1309 |
+
doc_type = "letter"
|
1310 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
|
1311 |
+
doc_type = "book"
|
1312 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
|
1313 |
+
doc_type = "form"
|
1314 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
|
1315 |
+
doc_type = "recipe"
|
1316 |
+
|
1317 |
+
# Format the custom prompt for better Mistral processing
|
1318 |
+
if len(custom_prompt) > 250:
|
1319 |
+
# Truncate long custom prompts but preserve essential info
|
1320 |
+
simplified_prompt = f"DOCUMENT TYPE: {doc_type}\nINSTRUCTIONS: {custom_prompt[:250]}..."
|
1321 |
+
else:
|
1322 |
+
simplified_prompt = f"DOCUMENT TYPE: {doc_type}\nINSTRUCTIONS: {custom_prompt}"
|
1323 |
+
else:
|
1324 |
+
simplified_prompt = custom_prompt
|
1325 |
+
|
1326 |
+
progress_bar.progress(50)
|
1327 |
+
# Check if we have custom instructions
|
1328 |
+
has_custom_prompt = custom_prompt is not None and len(str(custom_prompt).strip()) > 0
|
1329 |
+
if has_custom_prompt:
|
1330 |
+
status_text.markdown('<div class="processing-status-container">Processing PDF with custom instructions...</div>', unsafe_allow_html=True)
|
1331 |
+
else:
|
1332 |
+
status_text.markdown('<div class="processing-status-container">Processing PDF with optimized settings...</div>', unsafe_allow_html=True)
|
1333 |
+
|
1334 |
+
# Process directly with optimized settings
|
1335 |
+
result = processor.process_file(
|
1336 |
file_path=temp_path,
|
1337 |
file_type="pdf",
|
1338 |
use_vision=use_vision,
|
1339 |
+
custom_prompt=simplified_prompt,
|
1340 |
file_size_mb=len(uploaded_file.getvalue()) / (1024 * 1024),
|
1341 |
+
pdf_rotation=pdf_rotation_value
|
1342 |
)
|
1343 |
|
1344 |
+
progress_bar.progress(90)
|
1345 |
+
status_text.markdown('<div class="processing-status-container">Finalizing results...</div>', unsafe_allow_html=True)
|
1346 |
|
1347 |
# Clean up temp file
|
1348 |
if os.path.exists(temp_path):
|
|
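A worked example of the prompt shaping used above, runnable on its own; the sample instruction text is invented, while the keyword list and prompt template are the ones shown in the diff:

custom_prompt = "Read each column top to bottom and keep the headlines with their articles."
keywords = ["newspaper", "column", "article", "magazine"]
doc_type = "newspaper" if any(k in custom_prompt.lower() for k in keywords) else "general"
simplified_prompt = f"DOCUMENT TYPE: {doc_type}\nINSTRUCTIONS: {custom_prompt}"
print(simplified_prompt)
# DOCUMENT TYPE: newspaper
# INSTRUCTIONS: Read each column top to bottom and keep the headlines with their articles.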
|
1379 |
# Initialize OCR processor and process with custom prompt
|
1380 |
processor = StructuredOCR()
|
1381 |
|
1382 |
+
# Detect document type from custom prompt
|
1383 |
+
doc_type = "general"
|
1384 |
+
if any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
|
1385 |
+
doc_type = "newspaper"
|
1386 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
|
1387 |
+
doc_type = "letter"
|
1388 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
|
1389 |
+
doc_type = "book"
|
1390 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
|
1391 |
+
doc_type = "form"
|
1392 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
|
1393 |
+
doc_type = "recipe"
|
1394 |
+
|
1395 |
+
# Format the custom prompt for better Mistral processing
|
1396 |
+
formatted_prompt = f"DOCUMENT TYPE: {doc_type}\nUSER INSTRUCTIONS: {custom_prompt.strip()}\nPay special attention to these instructions and respond accordingly."
|
1397 |
|
1398 |
try:
|
1399 |
result = processor.process_file(
|
|
|
1447 |
if languages:
|
1448 |
metadata_html += f'<p><strong>Languages:</strong> {", ".join(languages)}</p>'
|
1449 |
|
1450 |
+
# Topics - show all subject tags with max of 8
|
1451 |
if 'topics' in result and result['topics']:
|
1452 |
+
topics_display = result['topics'][:8]
|
1453 |
+
topics_str = ", ".join(topics_display)
|
1454 |
+
|
1455 |
+
# Add indicator if there are more tags
|
1456 |
+
if len(result['topics']) > 8:
|
1457 |
+
topics_str += f" + {len(result['topics']) - 8} more"
|
1458 |
+
|
1459 |
+
metadata_html += f'<p><strong>Subject Tags:</strong> {topics_str}</p>'
|
1460 |
+
|
1461 |
+
# Document type - using simplified labeling consistent with user instructions
|
1462 |
+
if 'detected_document_type' in result:
|
1463 |
+
# Get clean document type label - removing "historical" prefix if present
|
1464 |
+
doc_type = result['detected_document_type'].lower()
|
1465 |
+
if doc_type.startswith("historical "):
|
1466 |
+
doc_type = doc_type[len("historical "):]
|
1467 |
+
# Capitalize first letter of each word for display
|
1468 |
+
doc_type = ' '.join(word.capitalize() for word in doc_type.split())
|
1469 |
+
metadata_html += f'<p><strong>Document Type:</strong> {doc_type}</p>'
|
1470 |
|
1471 |
# Processing time
|
1472 |
if 'processing_time' in result:
|
1473 |
proc_time = result['processing_time']
|
1474 |
metadata_html += f'<p><strong>Processing Time:</strong> {proc_time:.1f}s</p>'
|
1475 |
|
1476 |
+
# Custom prompt indicator with special styling - simplified and only showing when there are actual instructions
|
1477 |
+
# Only show when custom_prompt exists in the session AND has content, or when the result explicitly states it was applied
|
1478 |
+
has_instructions = ('custom_prompt' in locals() and custom_prompt and len(str(custom_prompt).strip()) > 0)
|
1479 |
+
if has_instructions or 'custom_prompt_applied' in result:
|
1480 |
+
# Use a simpler message that just shows custom instructions were applied
|
1481 |
+
metadata_html += f'<p style="margin-top:10px; padding:5px 8px; background-color:#f0f8ff; border-left:3px solid #4ba3e3; border-radius:3px; color:#333;"><strong>Advanced Analysis:</strong> Custom instructions applied</p>'
|
1482 |
+
|
1483 |
# Close the metadata card
|
1484 |
metadata_html += '</div>'
|
1485 |
|
|
|
1897 |
</html>
|
1898 |
"""
|
1899 |
|
1900 |
+
# Create a more descriptive filename
|
1901 |
original_name = Path(result.get('file_name', uploaded_file.name)).stem
|
1902 |
|
1903 |
+
# Add document type if available
|
1904 |
+
if 'topics' in result and result['topics']:
|
1905 |
+
topic = result['topics'][0].lower().replace(' ', '_')
|
1906 |
+
original_name = f"{original_name}_{topic}"
|
1907 |
+
|
1908 |
+
# Add language if available
|
1909 |
+
if 'languages' in result and result['languages']:
|
1910 |
+
lang = result['languages'][0].lower()
|
1911 |
+
# Only add if it's not already in the filename
|
1912 |
+
if lang not in original_name.lower():
|
1913 |
+
original_name = f"{original_name}_{lang}"
|
1914 |
+
|
1915 |
+
# Get current date for uniqueness
|
1916 |
+
from datetime import datetime
|
1917 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
1918 |
+
|
1919 |
+
# Create final filename
|
1920 |
+
download_filename = f"{original_name}_{timestamp}_with_images.html"
|
1921 |
+
|
1922 |
# Add download button as an expander to prevent page reset
|
1923 |
with st.expander("Download Document with Images"):
|
1924 |
st.markdown("Click the button below to download the document with embedded images")
|
1925 |
st.download_button(
|
1926 |
label="Download as HTML",
|
1927 |
data=download_html,
|
1928 |
+
file_name=download_filename,
|
1929 |
mime="text/html",
|
1930 |
key="download_with_images_button"
|
1931 |
)
|
|
|
1948 |
result_copy = result.copy()
|
1949 |
result_copy['timestamp'] = datetime.now().strftime("%Y-%m-%d %H:%M")
|
1950 |
|
1951 |
+
# Generate more descriptive file name for the result
|
1952 |
+
original_name = Path(result.get('file_name', uploaded_file.name)).stem
|
1953 |
+
|
1954 |
+
# Extract subject tags from content
|
1955 |
+
subject_tags = []
|
1956 |
+
|
1957 |
+
# First check if we already have topics in the result
|
1958 |
+
if 'topics' in result and result['topics'] and len(result['topics']) >= 3:
|
1959 |
+
subject_tags = result['topics']
|
1960 |
+
else:
|
1961 |
+
# Generate tags based on document content
|
1962 |
+
try:
|
1963 |
+
# Extract text from OCR contents
|
1964 |
+
raw_text = ""
|
1965 |
+
if 'ocr_contents' in result:
|
1966 |
+
if 'raw_text' in result['ocr_contents']:
|
1967 |
+
raw_text = result['ocr_contents']['raw_text']
|
1968 |
+
elif 'content' in result['ocr_contents']:
|
1969 |
+
raw_text = result['ocr_contents']['content']
|
1970 |
+
|
1971 |
+
# Use existing topics as starting point if available
|
1972 |
+
if 'topics' in result and result['topics']:
|
1973 |
+
subject_tags = list(result['topics'])
|
1974 |
+
|
1975 |
+
# Add document type if detected
|
1976 |
+
if 'detected_document_type' in result:
|
1977 |
+
doc_type = result['detected_document_type'].capitalize()
|
1978 |
+
if doc_type not in subject_tags:
|
1979 |
+
subject_tags.append(doc_type)
|
1980 |
+
|
1981 |
+
# Analyze content for common themes based on keywords
|
1982 |
+
content_themes = {
|
1983 |
+
"Historical": ["century", "ancient", "historical", "history", "vintage", "archive", "heritage"],
|
1984 |
+
"Travel": ["travel", "journey", "expedition", "exploration", "voyage", "map", "location"],
|
1985 |
+
"Science": ["experiment", "research", "study", "analysis", "scientific", "laboratory"],
|
1986 |
+
"Literature": ["book", "novel", "poetry", "author", "literary", "chapter", "story"],
|
1987 |
+
"Art": ["painting", "illustration", "drawing", "artist", "exhibit", "gallery", "portrait"],
|
1988 |
+
"Education": ["education", "school", "university", "college", "learning", "student", "teach"],
|
1989 |
+
"Politics": ["government", "political", "policy", "administration", "election", "legislature"],
|
1990 |
+
"Business": ["business", "company", "corporation", "market", "industry", "commercial", "trade"],
|
1991 |
+
"Social": ["society", "community", "social", "culture", "tradition", "customs"],
|
1992 |
+
"Technology": ["technology", "invention", "device", "mechanical", "machine", "technical"],
|
1993 |
+
"Military": ["military", "army", "navy", "war", "battle", "soldier", "weapon"],
|
1994 |
+
"Religion": ["religion", "church", "temple", "spiritual", "sacred", "ritual"],
|
1995 |
+
"Medicine": ["medical", "medicine", "health", "hospital", "treatment", "disease", "doctor"],
|
1996 |
+
"Legal": ["legal", "law", "court", "justice", "attorney", "judicial", "statute"],
|
1997 |
+
"Correspondence": ["letter", "mail", "correspondence", "message", "communication"]
|
1998 |
+
}
|
1999 |
+
|
2000 |
+
# Search for keywords in content
|
2001 |
+
if raw_text:
|
2002 |
+
raw_text_lower = raw_text.lower()
|
2003 |
+
for theme, keywords in content_themes.items():
|
2004 |
+
if any(keyword in raw_text_lower for keyword in keywords):
|
2005 |
+
if theme not in subject_tags:
|
2006 |
+
subject_tags.append(theme)
|
2007 |
+
|
2008 |
+
# Add document period tag if date patterns are detected
|
2009 |
+
if raw_text:
|
2010 |
+
# Look for years in content
|
2011 |
+
import re
|
2012 |
+
year_matches = re.findall(r'\b1[0-9]{3}\b|\b20[0-1][0-9]\b', raw_text)
|
2013 |
+
if year_matches:
|
2014 |
+
# Convert to integers
|
2015 |
+
years = [int(y) for y in year_matches]
|
2016 |
+
# Get earliest and latest years
|
2017 |
+
earliest = min(years)
|
2018 |
+
|
2019 |
+
# Add period tag based on earliest year
|
2020 |
+
if earliest < 1800:
|
2021 |
+
period_tag = "Pre-1800s"
|
2022 |
+
elif earliest < 1850:
|
2023 |
+
period_tag = "Early 19th Century"
|
2024 |
+
elif earliest < 1900:
|
2025 |
+
period_tag = "Late 19th Century"
|
2026 |
+
elif earliest < 1950:
|
2027 |
+
period_tag = "Early 20th Century"
|
2028 |
+
else:
|
2029 |
+
period_tag = "Modern Era"
|
2030 |
+
|
2031 |
+
if period_tag not in subject_tags:
|
2032 |
+
subject_tags.append(period_tag)
|
2033 |
+
|
2034 |
+
# Add languages as topics if available
|
2035 |
+
if 'languages' in result and result['languages']:
|
2036 |
+
for lang in result['languages']:
|
2037 |
+
if lang and lang not in subject_tags:
|
2038 |
+
lang_tag = f"{lang} Language"
|
2039 |
+
subject_tags.append(lang_tag)
|
2040 |
+
|
2041 |
+
except Exception as e:
|
2042 |
+
logger.warning(f"Error generating subject tags: {str(e)}")
|
2043 |
+
# Fallback tags if extraction fails
|
2044 |
+
if not subject_tags:
|
2045 |
+
subject_tags = ["Document", "Historical", "Text"]
|
2046 |
+
|
2047 |
+
# Ensure we have at least 3 tags
|
2048 |
+
while len(subject_tags) < 3:
|
2049 |
+
if "Document" not in subject_tags:
|
2050 |
+
subject_tags.append("Document")
|
2051 |
+
elif "Historical" not in subject_tags:
|
2052 |
+
subject_tags.append("Historical")
|
2053 |
+
elif "Text" not in subject_tags:
|
2054 |
+
subject_tags.append("Text")
|
2055 |
+
else:
|
2056 |
+
# If we still need tags, add generic ones
|
2057 |
+
generic_tags = ["Archive", "Content", "Record"]
|
2058 |
+
for tag in generic_tags:
|
2059 |
+
if tag not in subject_tags:
|
2060 |
+
subject_tags.append(tag)
|
2061 |
+
break
|
2062 |
+
|
2063 |
+
# Update the result with enhanced tags
|
2064 |
+
result_copy['topics'] = subject_tags
|
2065 |
+
|
2066 |
+
# Create a more descriptive file name
|
2067 |
+
file_type = Path(result.get('file_name', uploaded_file.name)).suffix.lower()
|
2068 |
+
doc_type_tag = ""
|
2069 |
+
|
2070 |
+
# Add document type to filename if detected
|
2071 |
+
if 'detected_document_type' in result:
|
2072 |
+
doc_type = result['detected_document_type'].lower()
|
2073 |
+
doc_type_tag = f"_{doc_type}"
|
2074 |
+
elif len(subject_tags) > 0:
|
2075 |
+
# Use first tag as document type if not explicitly detected
|
2076 |
+
doc_type_tag = f"_{subject_tags[0].lower().replace(' ', '_')}"
|
2077 |
+
|
2078 |
+
# Add period tag for historical context if available
|
2079 |
+
period_tag = ""
|
2080 |
+
for tag in subject_tags:
|
2081 |
+
if "century" in tag.lower() or "pre-" in tag.lower() or "era" in tag.lower():
|
2082 |
+
period_tag = f"_{tag.lower().replace(' ', '_')}"
|
2083 |
+
break
|
2084 |
+
|
2085 |
+
# Generate final descriptive file name
|
2086 |
+
descriptive_name = f"{original_name}{doc_type_tag}{period_tag}{file_type}"
|
2087 |
+
result_copy['descriptive_file_name'] = descriptive_name
|
2088 |
+
|
2089 |
# Add to session state, keeping the most recent 20 results
|
2090 |
st.session_state.previous_results.insert(0, result_copy)
|
2091 |
if len(st.session_state.previous_results) > 20:
|
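A standalone sketch of the period-tagging logic added above; the function name is hypothetical, while the year regex and the era cutoffs are taken from the app.py code shown in the diff:

import re
from typing import Optional

def period_tag_for(text: str) -> Optional[str]:
    # Hypothetical wrapper around the year matching and period cutoffs used in the subject-tag code.
    years = [int(y) for y in re.findall(r'\b1[0-9]{3}\b|\b20[0-1][0-9]\b', text)]
    if not years:
        return None
    earliest = min(years)
    if earliest < 1800:
        return "Pre-1800s"
    if earliest < 1850:
        return "Early 19th Century"
    if earliest < 1900:
        return "Late 19th Century"
    if earliest < 1950:
        return "Early 20th Century"
    return "Modern Era"

print(period_tag_for("Letter dated 12 March 1847, posted from Boston"))  # Early 19th Century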
config.py
CHANGED
@@ -19,7 +19,7 @@ load_dotenv()
-                                 os.environ.get("MISTRAL_API_KEY", "")).strip()

@@ -35,7 +35,7 @@ if TEST_MODE:
-VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-…

@@ -48,11 +48,11 @@ IMAGE_PREPROCESSING = {
-# OCR settings optimized for …
-    "timeout_ms": int(os.environ.get("OCR_TIMEOUT_MS", "…
-    "max_retries": int(os.environ.get("OCR_MAX_RETRIES", "…
-    "retry_delay": int(os.environ.get("OCR_RETRY_DELAY", "…
-    "thread_count": int(os.environ.get("OCR_THREAD_COUNT", "…
|
19 |
# 2. MISTRAL_API_KEY environment var (standard environment variable)
|
20 |
# 3. Empty string (will show warning in app)
|
21 |
MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY",
|
22 |
+
os.environ.get("MISTRAL_API_KEY", "sfSLqRdW31yxodeYFz3m7Ky83X2V7jUH")).strip()
|
23 |
|
24 |
# Check if we're in test mode (allows operation without valid API key)
|
25 |
# Set to False to use actual API calls
|
|
|
35 |
# Model settings with fallbacks
|
36 |
OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "mistral-ocr-latest")
|
37 |
TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "mistral-small-latest") # Updated from ministral-8b-latest
|
38 |
+
VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-small-latest") # Using faster model that supports vision
|
39 |
|
40 |
# Image preprocessing settings optimized for historical documents
|
41 |
# These can be customized from environment variables
|
|
|
48 |
"compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "95")) # Higher quality for better OCR results
|
49 |
}
|
50 |
|
51 |
+
# OCR settings optimized for single-page performance
|
52 |
OCR_SETTINGS = {
|
53 |
+
"timeout_ms": int(os.environ.get("OCR_TIMEOUT_MS", "45000")), # Shorter timeout for single pages (45 seconds)
|
54 |
+
"max_retries": int(os.environ.get("OCR_MAX_RETRIES", "2")), # Fewer retries to avoid rate-limiting
|
55 |
+
"retry_delay": int(os.environ.get("OCR_RETRY_DELAY", "1")), # Shorter initial retry delay for faster execution
|
56 |
"include_image_base64": os.environ.get("INCLUDE_IMAGE_BASE64", "True").lower() in ("true", "1", "yes"),
|
57 |
+
"thread_count": int(os.environ.get("OCR_THREAD_COUNT", "2")) # Lower thread count to prevent API rate limiting
|
58 |
}
|
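Because config.py reads these settings from the environment at import time, they can be overridden without editing the file; a small sketch with example values only (it assumes config.py is importable from the working directory):

import os

# Example values - any variable read in OCR_SETTINGS or the model settings can be set this way.
os.environ["OCR_TIMEOUT_MS"] = "60000"   # longer per-request timeout
os.environ["OCR_MAX_RETRIES"] = "3"
os.environ["OCR_THREAD_COUNT"] = "1"     # serialize requests if rate limits are still hit

import config  # config.py evaluates os.environ.get(...) when imported
print(config.OCR_SETTINGS["timeout_ms"], config.OCR_SETTINGS["max_retries"])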
ocr_utils.py
CHANGED
@@ -31,6 +31,7 @@ except ImportError as e:

@@ -198,18 +199,46 @@ def create_results_zip_in_memory(results):
-            # …
-            zipf.writestr(f"…
-            …
-            zipf.writestr(f"{filename}_with_images.html", html_content)
-            zipf.writestr(f"…

@@ -237,18 +266,52 @@ def create_results_zip_in_memory(results):
-            # …
-            zipf.writestr("…
-            # Add HTML content
-            …
-            zipf.writestr(f"{filename}_with_images.html", html_content)
-            zipf.writestr("…

@@ -305,19 +368,47 @@ def create_results_zip(results, output_dir=None, zip_name=None):
         if is_list:
-            # For list of results, …
-            …
         else:
-            …

@@ -343,6 +434,7 @@ def create_results_zip(results, output_dir=None, zip_name=None):

@@ -406,6 +498,27 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,

@@ -416,6 +529,9 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,

@@ -432,25 +548,73 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
-    …
-    # …
-    if …
         if max_dimension > 5000:
-            scale_factor = 0.…
         elif max_dimension > 3000:
-            scale_factor = 0.…
         else:
-            scale_factor = 0.…

@@ -556,7 +720,7 @@ def _detect_document_type_impl(img_hash=None) -> bool:
-    Enhanced to better detect handwritten documents.

@@ -677,7 +841,7 @@ def preprocess_document_image(img: Image.Image) -> Image.Image:
-    Enhanced for better handwritten document processing.

@@ -689,28 +853,113 @@ def _preprocess_document_image_impl() -> Image.Image:
-    # …
     is_handwritten = False
-    …
     if CV2_AVAILABLE:
-        …

@@ -996,9 +1245,9 @@ def resize_image_impl(target_dpi: int = 300) -> Image.Image:
-    # Using …
-    max_width = int(8.5 …
-    max_height = int(…

@@ -1044,6 +1293,7 @@ def calculate_image_entropy(img: Image.Image) -> float:

@@ -1051,6 +1301,8 @@ def create_html_with_images(result):

@@ -1265,6 +1517,43 @@ def generate_document_thumbnail(image_path: Union[str, Path], max_size: int = 30
 def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str = None) -> str:
     """
     Attempt to use local pytesseract OCR as a fallback when API fails
31 |
CV2_AVAILABLE = False
|
32 |
|
33 |
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
|
34 |
+
from mistralai.models import OCRImageObject
|
35 |
|
36 |
# Import configuration
|
37 |
try:
|
|
|
199 |
# Handle list of results
|
200 |
for i, result in enumerate(results):
|
201 |
try:
|
202 |
+
# Create a descriptive base filename for this result
|
203 |
+
base_filename = result.get('file_name', f'document_{i+1}').split('.')[0]
|
204 |
+
|
205 |
+
# Add document type if available
|
206 |
+
if 'topics' in result and result['topics']:
|
207 |
+
topic = result['topics'][0].lower().replace(' ', '_')
|
208 |
+
base_filename = f"{base_filename}_{topic}"
|
209 |
+
|
210 |
+
# Add language if available
|
211 |
+
if 'languages' in result and result['languages']:
|
212 |
+
lang = result['languages'][0].lower()
|
213 |
+
# Only add if it's not already in the filename
|
214 |
+
if lang not in base_filename.lower():
|
215 |
+
base_filename = f"{base_filename}_{lang}"
|
216 |
+
|
217 |
+
# For PDFs, add page information
|
218 |
+
if 'total_pages' in result and 'processed_pages' in result:
|
219 |
+
base_filename = f"{base_filename}_p{result['processed_pages']}of{result['total_pages']}"
|
220 |
+
|
221 |
+
# Add timestamp if available
|
222 |
+
if 'timestamp' in result:
|
223 |
+
try:
|
224 |
+
# Try to parse the timestamp and reformat it
|
225 |
+
dt = datetime.strptime(result['timestamp'], "%Y-%m-%d %H:%M")
|
226 |
+
timestamp = dt.strftime("%Y%m%d_%H%M%S")
|
227 |
+
base_filename = f"{base_filename}_{timestamp}"
|
228 |
+
except:
|
229 |
+
pass
|
230 |
+
|
231 |
+
# Add JSON results for each file with descriptive name
|
232 |
result_json = json.dumps(result, indent=2)
|
233 |
+
zipf.writestr(f"{base_filename}.json", result_json)
|
234 |
|
235 |
# Add HTML content (generated from the result)
|
236 |
html_content = create_html_with_images(result)
|
237 |
+
zipf.writestr(f"{base_filename}_with_images.html", html_content)
|
|
|
238 |
|
239 |
# Add raw OCR text if available
|
240 |
if "ocr_contents" in result and "raw_text" in result["ocr_contents"]:
|
241 |
+
zipf.writestr(f"{base_filename}.txt", result["ocr_contents"]["raw_text"])
|
242 |
|
243 |
# Add HTML visualization if available
|
244 |
if "html_visualization" in result:
|
|
|
266 |
else:
|
267 |
# Handle single result
|
268 |
try:
|
269 |
+
# Create a descriptive base filename for this result
|
270 |
+
base_filename = results.get('file_name', 'document').split('.')[0]
|
271 |
+
|
272 |
+
# Add document type if available
|
273 |
+
if 'topics' in results and results['topics']:
|
274 |
+
topic = results['topics'][0].lower().replace(' ', '_')
|
275 |
+
base_filename = f"{base_filename}_{topic}"
|
276 |
+
|
277 |
+
# Add language if available
|
278 |
+
if 'languages' in results and results['languages']:
|
279 |
+
lang = results['languages'][0].lower()
|
280 |
+
# Only add if it's not already in the filename
|
281 |
+
if lang not in base_filename.lower():
|
282 |
+
base_filename = f"{base_filename}_{lang}"
|
283 |
+
|
284 |
+
# For PDFs, add page information
|
285 |
+
if 'total_pages' in results and 'processed_pages' in results:
|
286 |
+
base_filename = f"{base_filename}_p{results['processed_pages']}of{results['total_pages']}"
|
287 |
+
|
288 |
+
# Add timestamp if available
|
289 |
+
if 'timestamp' in results:
|
290 |
+
try:
|
291 |
+
# Try to parse the timestamp and reformat it
|
292 |
+
dt = datetime.strptime(results['timestamp'], "%Y-%m-%d %H:%M")
|
293 |
+
timestamp = dt.strftime("%Y%m%d_%H%M%S")
|
294 |
+
base_filename = f"{base_filename}_{timestamp}"
|
295 |
+
except:
|
296 |
+
# If parsing fails, create a new timestamp
|
297 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
298 |
+
base_filename = f"{base_filename}_{timestamp}"
|
299 |
+
else:
|
300 |
+
# No timestamp in the result, create a new one
|
301 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
302 |
+
base_filename = f"{base_filename}_{timestamp}"
|
303 |
+
|
304 |
+
# Add JSON results with descriptive name
|
305 |
results_json = json.dumps(results, indent=2)
|
306 |
+
zipf.writestr(f"{base_filename}.json", results_json)
|
307 |
|
308 |
+
# Add HTML content with descriptive name
|
309 |
html_content = create_html_with_images(results)
|
310 |
+
zipf.writestr(f"{base_filename}_with_images.html", html_content)
|
|
|
311 |
|
312 |
# Add raw OCR text if available
|
313 |
if "ocr_contents" in results and "raw_text" in results["ocr_contents"]:
|
314 |
+
zipf.writestr(f"{base_filename}.txt", results["ocr_contents"]["raw_text"])
|
315 |
|
316 |
# Add HTML visualization if available
|
317 |
if "html_visualization" in results:
|
|
|
368 |
|
369 |
# Generate zip name if not provided
|
370 |
if zip_name is None:
|
371 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
372 |
+
|
373 |
if is_list:
|
374 |
+
# For a list of results, create a more descriptive name based on the content
|
375 |
+
file_count = len(results)
|
376 |
+
|
377 |
+
# Count document types
|
378 |
+
pdf_count = sum(1 for r in results if r.get('file_name', '').lower().endswith('.pdf'))
|
379 |
+
img_count = sum(1 for r in results if r.get('file_name', '').lower().endswith(('.jpg', '.jpeg', '.png')))
|
380 |
+
|
381 |
+
# Create descriptive name based on contents
|
382 |
+
if pdf_count > 0 and img_count > 0:
|
383 |
+
zip_name = f"historical_ocr_mixed_{pdf_count}pdf_{img_count}img_{timestamp}.zip"
|
384 |
+
elif pdf_count > 0:
|
385 |
+
zip_name = f"historical_ocr_pdf_documents_{pdf_count}_{timestamp}.zip"
|
386 |
+
elif img_count > 0:
|
387 |
+
zip_name = f"historical_ocr_images_{img_count}_{timestamp}.zip"
|
388 |
else:
|
389 |
+
zip_name = f"historical_ocr_results_{file_count}_{timestamp}.zip"
|
390 |
+
else:
|
391 |
+
# For single result, create descriptive filename
|
392 |
+
base_name = results.get("file_name", "document").split('.')[0]
|
393 |
+
|
394 |
+
# Add document type if available
|
395 |
+
if 'topics' in results and results['topics']:
|
396 |
+
topic = results['topics'][0].lower().replace(' ', '_')
|
397 |
+
base_name = f"{base_name}_{topic}"
|
398 |
+
|
399 |
+
# Add language if available
|
400 |
+
if 'languages' in results and results['languages']:
|
401 |
+
lang = results['languages'][0].lower()
|
402 |
+
# Only add if it's not already in the filename
|
403 |
+
if lang not in base_name.lower():
|
404 |
+
base_name = f"{base_name}_{lang}"
|
405 |
+
|
406 |
+
# For PDFs, add page information
|
407 |
+
if 'total_pages' in results and 'processed_pages' in results:
|
408 |
+
base_name = f"{base_name}_p{results['processed_pages']}of{results['total_pages']}"
|
409 |
+
|
410 |
+
# Add timestamp
|
411 |
+
zip_name = f"{base_name}_{timestamp}.zip"
|
412 |
|
413 |
try:
|
414 |
# Get zip data in memory first
|
|
|
434 |
def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image, str]:
|
435 |
"""
|
436 |
Preprocess an image for optimal OCR performance with enhanced speed and memory optimization.
|
437 |
+
Enhanced to handle large newspaper and document images.
|
438 |
|
439 |
Args:
|
440 |
image_path: Path to the image file
|
|
|
498 |
|
499 |
preprocess_image_for_ocr._cache[cache_key] = result
|
500 |
return result
|
501 |
+
|
502 |
+
# Special handling for large newspaper-style documents
|
503 |
+
if file_size_mb > 5 and image_file.name.lower().endswith(('.jpg', '.jpeg', '.png')):
|
504 |
+
logger.info(f"Large image detected ({file_size_mb:.2f}MB), checking for newspaper format")
|
505 |
+
try:
|
506 |
+
# Quickly check dimensions without loading full image
|
507 |
+
with Image.open(image_file) as img:
|
508 |
+
width, height = img.size
|
509 |
+
aspect_ratio = width / height
|
510 |
+
|
511 |
+
# Newspaper-style documents typically have width > height or are very large
|
512 |
+
is_newspaper_format = (aspect_ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000)
|
513 |
+
|
514 |
+
if is_newspaper_format:
|
515 |
+
logger.info(f"Newspaper format detected: {width}x{height}, applying specialized processing")
|
516 |
+
|
517 |
+
except Exception as dim_err:
|
518 |
+
logger.debug(f"Error checking dimensions: {str(dim_err)}")
|
519 |
+
is_newspaper_format = False
|
520 |
+
else:
|
521 |
+
is_newspaper_format = False
|
522 |
|
523 |
except Exception as e:
|
524 |
# If stat or cache handling fails, log and continue with processing
|
|
|
529 |
except:
|
530 |
file_size_mb = 0 # Default if we can't determine size
|
531 |
|
532 |
+
# Default to not newspaper format on error
|
533 |
+
is_newspaper_format = False
|
534 |
+
|
535 |
try:
|
536 |
# Process start time for performance logging
|
537 |
start_time = time.time()
|
|
|
548 |
|
549 |
# Detect document type only for medium to large images to save processing time
|
550 |
is_document = False
|
551 |
+
is_newspaper = False
|
552 |
+
|
553 |
+
# More aggressive document type detection for larger images
|
554 |
if image_area > 500000: # Approx 700x700 or larger
|
555 |
# Store image for document detection
|
556 |
_detect_document_type_impl._current_img = img
|
557 |
is_document = _detect_document_type_impl(None)
|
558 |
+
|
559 |
+
# Additional check for newspaper format
|
560 |
+
if is_document:
|
561 |
+
# Newspapers typically have wide formats or very large dimensions
|
562 |
+
aspect_ratio = width / height
|
563 |
+
is_newspaper = (aspect_ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000)
|
564 |
+
|
565 |
+
logger.debug(f"Document type detection for {image_file.name}: " +
|
566 |
+
f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
|
567 |
|
568 |
+
# Special processing for very large images (newspapers and large documents)
|
569 |
+
if is_newspaper:
|
570 |
+
# For newspaper format, we need more specialized processing
|
571 |
+
logger.info(f"Processing newspaper format image: {width}x{height}")
|
572 |
+
|
573 |
+
# For newspapers, we prioritize text clarity over file size
|
574 |
+
# Use higher target resolution to preserve small text common in newspapers
|
575 |
+
# But still need to resize if extremely large to avoid API limits
|
576 |
+
max_dimension = max(width, height)
|
577 |
+
|
578 |
+
if max_dimension > 6000: # Extremely large
|
579 |
+
scale_factor = 0.4 # Preserve more resolution for newspapers (increased from 0.35)
|
580 |
+
elif max_dimension > 4000:
|
581 |
+
scale_factor = 0.6 # Higher resolution for better text extraction (increased from 0.5)
|
582 |
+
else:
|
583 |
+
scale_factor = 0.8 # Minimal reduction for moderate newspaper size (increased from 0.7)
|
584 |
+
|
585 |
+
# Calculate new dimensions - maintain higher resolution
|
586 |
+
new_width = int(width * scale_factor)
|
587 |
+
new_height = int(height * scale_factor)
|
588 |
+
|
589 |
+
# Use high-quality resampling to preserve text clarity in newspapers
|
590 |
+
processed_img = img.resize((new_width, new_height), Image.LANCZOS)
|
591 |
+
logger.debug(f"Resized newspaper image from {width}x{height} to {new_width}x{new_height}")
|
592 |
+
|
593 |
+
# For newspapers, we also want to enhance the contrast and sharpen the image
|
594 |
+
# before the main OCR processing for better text extraction
|
595 |
+
if img.mode in ('RGB', 'RGBA'):
|
596 |
+
# For color newspapers, enhance both the overall image and then convert to grayscale
|
597 |
+
# This helps with mixed content newspapers that have both text and images
|
598 |
+
enhancer = ImageEnhance.Contrast(processed_img)
|
599 |
+
processed_img = enhancer.enhance(1.3) # Boost contrast but not too aggressively
|
600 |
+
|
601 |
+
# Also enhance saturation to make colored text more visible
|
602 |
+
enhancer_sat = ImageEnhance.Color(processed_img)
|
603 |
+
processed_img = enhancer_sat.enhance(1.2)
|
604 |
+
|
605 |
+
# Standard processing for other large images
|
606 |
+
elif file_size_mb > IMAGE_PREPROCESSING["max_size_mb"] or max(width, height) > 3000:
|
607 |
# Calculate target dimensions directly instead of using the heavier resize function
|
608 |
target_width, target_height = width, height
|
609 |
max_dimension = max(width, height)
|
610 |
|
611 |
# Use a sliding scale for reduction based on image size
|
612 |
if max_dimension > 5000:
|
613 |
+
scale_factor = 0.3 # Slightly less aggressive reduction (was 0.25)
|
614 |
elif max_dimension > 3000:
|
615 |
+
scale_factor = 0.45 # Slightly less aggressive reduction (was 0.4)
|
616 |
else:
|
617 |
+
scale_factor = 0.65 # Slightly less aggressive reduction (was 0.6)
|
618 |
|
619 |
# Calculate new dimensions
|
620 |
new_width = int(width * scale_factor)
|
|
|
720 |
Optimized implementation of document type detection for faster processing.
|
721 |
The img_hash parameter is unused but kept for backward compatibility.
|
722 |
|
723 |
+
Enhanced to better detect handwritten documents and newspaper formats.
|
724 |
"""
|
725 |
# Fast path: Get the image from thread-local storage
|
726 |
if not hasattr(_detect_document_type_impl, "_current_img"):
|
|
|
841 |
def _preprocess_document_image_impl() -> Image.Image:
|
842 |
"""
|
843 |
Optimized implementation of document preprocessing with adaptive processing based on image size.
|
844 |
+
Enhanced for better handwritten document processing and newspaper format.
|
845 |
"""
|
846 |
# Fast path: Get image from thread-local storage
|
847 |
if not hasattr(preprocess_document_image, "_current_img"):
|
|
|
853 |
width, height = img.size
|
854 |
img_size = width * height
|
855 |
|
856 |
+
# Detect special document types
|
857 |
is_handwritten = False
|
858 |
+
is_newspaper = False
|
859 |
+
|
860 |
+
# Check for newspaper format first (takes precedence)
|
861 |
+
aspect_ratio = width / height
|
862 |
+
if (aspect_ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000):
|
863 |
+
is_newspaper = True
|
864 |
+
logger.debug(f"Newspaper format detected: {width}x{height}, aspect ratio: {aspect_ratio:.2f}")
|
865 |
+
else:
|
866 |
+
# If not newspaper, check if handwritten
|
867 |
+
try:
|
868 |
+
# Simple check for handwritten document characteristics
|
869 |
+
# Handwritten documents often have more varied strokes and less stark contrast
|
870 |
+
if CV2_AVAILABLE:
|
871 |
+
# Convert to grayscale and calculate local variance
|
872 |
+
gray_np = np.array(img.convert('L'))
|
873 |
+
# Higher variance in edge strengths can indicate handwriting
|
874 |
+
edges = cv2.Canny(gray_np, 30, 100)
|
875 |
+
if np.count_nonzero(edges) / edges.size > 0.02: # Low edge threshold for handwriting
|
876 |
+
# Additional check with gradient magnitudes
|
877 |
+
sobelx = cv2.Sobel(gray_np, cv2.CV_64F, 1, 0, ksize=3)
|
878 |
+
sobely = cv2.Sobel(gray_np, cv2.CV_64F, 0, 1, ksize=3)
|
879 |
+
magnitude = np.sqrt(sobelx**2 + sobely**2)
|
880 |
+
# Handwriting typically has more variation in gradient magnitudes
|
881 |
+
if np.std(magnitude) > 20:
|
882 |
+
is_handwritten = True
|
883 |
+
except:
|
884 |
+
# If detection fails, assume it's not handwritten
|
885 |
+
pass
|
886 |
+
|
887 |
+
# Special processing for newspaper format
|
888 |
+
if is_newspaper:
|
889 |
+
# Convert to grayscale for better text extraction
|
890 |
+
gray = img.convert('L')
|
891 |
+
|
892 |
+
# For newspapers, we need aggressive text enhancement to make small print readable
|
893 |
+
# First enhance contrast more aggressively for newspaper small text
|
894 |
+
enhancer = ImageEnhance.Contrast(gray)
|
895 |
+
enhanced = enhancer.enhance(2.0) # More aggressive contrast for newspaper text
|
896 |
+
|
897 |
+
# Apply stronger sharpening to make small text more defined
|
898 |
+
if IMAGE_PREPROCESSING["sharpen"]:
|
899 |
+
# Apply multiple passes of sharpening for newspaper text
|
900 |
+
enhanced = enhanced.filter(ImageFilter.SHARPEN)
|
901 |
+
enhanced = enhanced.filter(ImageFilter.EDGE_ENHANCE_MORE) # Stronger edge enhancement
|
902 |
+
|
903 |
+
# Enhanced processing for newspapers with OpenCV when available
|
904 |
if CV2_AVAILABLE:
|
905 |
+
try:
|
906 |
+
# Convert to numpy array
|
907 |
+
img_np = np.array(enhanced)
|
908 |
+
|
909 |
+
# For newspaper text extraction, CLAHE (Contrast Limited Adaptive Histogram Equalization)
|
910 |
+
# works much better than simple contrast enhancement
|
911 |
+
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
|
912 |
+
img_np = clahe.apply(img_np)
|
913 |
+
|
914 |
+
# Apply different adaptive thresholding approaches and choose the best one
|
915 |
+
|
916 |
+
# 1. Standard adaptive threshold with larger block size for newspaper columns
|
917 |
+
binary1 = cv2.adaptiveThreshold(img_np, 255,
|
918 |
+
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
919 |
+
cv2.THRESH_BINARY, 15, 4)
|
920 |
+
|
921 |
+
# 2. Otsu's method for global thresholding - works well for clean newspaper print
|
922 |
+
_, binary2 = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
923 |
+
|
924 |
+
# Try to determine which method preserves text better
|
925 |
+
# Count white pixels and edges in each binary version
|
926 |
+
white_pixels1 = np.count_nonzero(binary1 > 200)
|
927 |
+
white_pixels2 = np.count_nonzero(binary2 > 200)
|
928 |
+
|
929 |
+
# Calculate edge density to help determine which preserves text features better
|
930 |
+
edges1 = cv2.Canny(binary1, 100, 200)
|
931 |
+
edges2 = cv2.Canny(binary2, 100, 200)
|
932 |
+
edge_count1 = np.count_nonzero(edges1)
|
933 |
+
edge_count2 = np.count_nonzero(edges2)
|
934 |
+
|
935 |
+
# For newspaper text, we want to preserve more edges while maintaining reasonable
|
936 |
+
# white space (typical of printed text on paper background)
|
937 |
+
if (edge_count1 > edge_count2 * 1.2 and white_pixels1 > white_pixels2 * 0.7) or \
|
938 |
+
(white_pixels1 < white_pixels2 * 0.5): # If Otsu removed too much content
|
939 |
+
# Adaptive thresholding usually better preserves small text in newspapers
|
940 |
+
logger.debug("Using adaptive thresholding for newspaper text")
|
941 |
|
942 |
+
# Apply optional denoising to clean up small speckles
|
943 |
+
result = cv2.fastNlMeansDenoising(binary1, None, 7, 7, 21)
|
944 |
+
return Image.fromarray(result)
|
945 |
+
else:
|
946 |
+
# Otsu method was better
|
947 |
+
logger.debug("Using Otsu thresholding for newspaper text")
|
948 |
+
result = cv2.fastNlMeansDenoising(binary2, None, 7, 7, 21)
|
949 |
+
return Image.fromarray(result)
|
950 |
+
|
951 |
+
except Exception as e:
|
952 |
+
logger.debug(f"Advanced newspaper processing failed: {str(e)}")
|
953 |
+
# Fall back to PIL processing
|
954 |
+
pass
|
955 |
+
|
956 |
+
# If OpenCV not available or fails, apply additional PIL enhancements
|
957 |
+
# Create a more aggressive binary version to better separate text
|
958 |
+
binary_threshold = enhanced.point(lambda x: 0 if x < 150 else 255, '1')
|
959 |
+
|
960 |
+
# Return enhanced binary image
|
961 |
+
return binary_threshold
|
962 |
+
|
963 |
# Ultra-fast path for tiny images - just convert to grayscale with contrast enhancement
|
964 |
if img_size < 300000: # ~500x600 or smaller
|
965 |
gray = img.convert('L')
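The newspaper branch above picks between adaptive Gaussian thresholding and Otsu's global threshold by comparing edge density and white-pixel counts after a CLAHE pass. A condensed sketch of that decision, assuming OpenCV, NumPy, and Pillow as imported in this module (the helper name and literal thresholds are illustrative only):

```python
import cv2
import numpy as np
from PIL import Image

def threshold_newspaper(gray: Image.Image) -> Image.Image:
    """Illustrative only: pick the binarization that preserves more newspaper text."""
    img_np = np.array(gray.convert('L'))

    # Boost local contrast before thresholding; CLAHE copes with uneven page lighting.
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    img_np = clahe.apply(img_np)

    # Candidate 1: adaptive Gaussian threshold with a large block size for columns.
    binary_adaptive = cv2.adaptiveThreshold(
        img_np, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 4)
    # Candidate 2: global Otsu threshold.
    _, binary_otsu = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Compare edge counts and white-pixel counts to keep the version that
    # preserves small print without erasing the paper background.
    edges_adaptive = np.count_nonzero(cv2.Canny(binary_adaptive, 100, 200))
    edges_otsu = np.count_nonzero(cv2.Canny(binary_otsu, 100, 200))
    white_adaptive = np.count_nonzero(binary_adaptive > 200)
    white_otsu = np.count_nonzero(binary_otsu > 200)

    if (edges_adaptive > edges_otsu * 1.2 and white_adaptive > white_otsu * 0.7) or \
            (white_adaptive < white_otsu * 0.5):
        chosen = binary_adaptive  # adaptive kept more text detail
    else:
        chosen = binary_otsu

    # Light denoising removes speckle left behind by the thresholding.
    return Image.fromarray(cv2.fastNlMeansDenoising(chosen, None, 7, 7, 21))
```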
|
|
|
1245 |
width, height = img.size
|
1246 |
|
1247 |
# Fixed target dimensions based on DPI
|
1248 |
+
# Using larger dimensions to support newspapers and large documents
|
1249 |
+
max_width = int(14 * target_dpi) # Increased from 8.5 to 14 inches
|
1250 |
+
max_height = int(22 * target_dpi) # Increased from 11 to 22 inches
|
1251 |
|
1252 |
# Check if resizing is needed - quick early return
|
1253 |
if width <= max_width and height <= max_height:
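The 14 in × 22 in page budget above translates into pixel ceilings that scale with the requested DPI. A quick check of the arithmetic (values are just the math, not an API):

```python
# Pixel limits implied by the 14" x 22" page budget at common DPI settings.
for target_dpi in (150, 300, 600):
    max_width = int(14 * target_dpi)    # 2100, 4200, 8400
    max_height = int(22 * target_dpi)   # 3300, 6600, 13200
    print(f"{target_dpi} dpi -> {max_width} x {max_height} px")
```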
|
|
|
1293 |
def create_html_with_images(result):
|
1294 |
"""
|
1295 |
Create an HTML document with embedded images from OCR results.
|
1296 |
+
Handles serialization of complex OCR objects automatically.
|
1297 |
|
1298 |
Args:
|
1299 |
result: OCR result dictionary containing pages_data
|
|
|
1301 |
Returns:
|
1302 |
HTML content as string
|
1303 |
"""
|
1304 |
+
# Ensure result is fully serializable first
|
1305 |
+
result = serialize_ocr_object(result)
|
1306 |
# Create HTML document structure
|
1307 |
html_content = """
|
1308 |
<!DOCTYPE html>
|
|
|
1517 |
# Return None if thumbnail generation fails
|
1518 |
return None
|
1519 |
|
1520 |
+
def serialize_ocr_object(obj):
|
1521 |
+
"""
|
1522 |
+
Serialize OCR response objects to JSON serializable format.
|
1523 |
+
Handles OCRImageObject specifically to prevent serialization errors.
|
1524 |
+
|
1525 |
+
Args:
|
1526 |
+
obj: The object to serialize
|
1527 |
+
|
1528 |
+
Returns:
|
1529 |
+
JSON serializable representation of the object
|
1530 |
+
"""
|
1531 |
+
# Fast path: Handle primitive types directly
|
1532 |
+
if obj is None or isinstance(obj, (str, int, float, bool)):
|
1533 |
+
return obj
|
1534 |
+
|
1535 |
+
# Handle collections
|
1536 |
+
if isinstance(obj, list):
|
1537 |
+
return [serialize_ocr_object(item) for item in obj]
|
1538 |
+
elif isinstance(obj, dict):
|
1539 |
+
return {k: serialize_ocr_object(v) for k, v in obj.items()}
|
1540 |
+
elif isinstance(obj, OCRImageObject):
|
1541 |
+
# Special handling for OCRImageObject
|
1542 |
+
return {
|
1543 |
+
'id': obj.id if hasattr(obj, 'id') else None,
|
1544 |
+
'image_base64': obj.image_base64 if hasattr(obj, 'image_base64') else None
|
1545 |
+
}
|
1546 |
+
elif hasattr(obj, '__dict__'):
|
1547 |
+
# For objects with __dict__ attribute
|
1548 |
+
return {k: serialize_ocr_object(v) for k, v in obj.__dict__.items()
|
1549 |
+
if not k.startswith('_')} # Skip private attributes
|
1550 |
+
else:
|
1551 |
+
# Try to convert to string as last resort
|
1552 |
+
try:
|
1553 |
+
return str(obj)
|
1554 |
+
except:
|
1555 |
+
return None
|
1556 |
+
|
1557 |
def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str = None) -> str:
|
1558 |
"""
|
1559 |
Attempt to use local pytesseract OCR as a fallback when API fails
|
structured_ocr.py
CHANGED
@@ -506,6 +506,32 @@ class StructuredOCR:
506 | if 'ocr_contents' in result:
507 | result['ocr_contents']['raw_text'] = all_text
508 |
509 | except Exception as e:
510 | logger.warning(f"Custom prompt processing failed: {str(e)}. Using standard processing.")
511 | # Fall back to standard processing
@@ -901,6 +927,25 @@ class StructuredOCR:
901 | "confidence_score": 0.0
902 | }
903 |
904 | try:
905 | # Check file size
906 | file_size_mb = file_path.stat().st_size / (1024 * 1024)
@@ -992,8 +1037,8 @@ class StructuredOCR:
992 | logger.info(f"Processing image with OCR using {OCR_MODEL}")
993 |
994 | # Add retry logic with more retries and longer backoff periods for rate limit issues
995 | - max_retries = …
996 | - retry_delay = …
997 |
998 | for retry in range(max_retries):
999 | try:
@@ -1001,7 +1046,7 @@ class StructuredOCR:
1001 | document=ImageURLChunk(image_url=base64_data_url),
1002 | model=OCR_MODEL,
1003 | include_image_base64=True,
1004 | - timeout_ms=…
1005 | )
1006 | break # Success, exit retry loop
1007 | except Exception as e:
@@ -1079,7 +1124,8 @@ class StructuredOCR:
1079 | image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
1080 |
1081 | # Optimize: Skip vision model step if ocr_markdown is very small or empty
1082 | - …
1083 | logger.warning("OCR produced minimal or no text. Returning basic result.")
1084 | return {
1085 | "file_name": file_path.name,
@@ -1090,6 +1136,14 @@ class StructuredOCR:
1090 | },
1091 | "processing_note": "OCR produced minimal text content"
1092 | }
1093 |
1094 | # Extract structured data using the appropriate model, with a single API call
1095 | if use_vision:
@@ -1182,17 +1236,37 @@ class StructuredOCR:
1182 | logger = logging.getLogger("vision_processor")
1183 |
1184 | try:
1185 | - # …
1186-1187 | - …
1188 | return {
1189 | "file_name": filename,
1190 | "topics": ["Document"],
1191 | "languages": ["English"],
1192 | "ocr_contents": {
1193 | - "raw_text": ocr_markdown
1194 | }
1195 | }
1196 |
1197 | # Fast path: Skip if in test mode or no API key
1198 | if self.test_mode or not self.api_key:
@@ -1203,25 +1277,10 @@ class StructuredOCR:
1203 | doc_type = self._detect_document_type(custom_prompt, ocr_markdown)
1204 | logger.info(f"Detected document type: {doc_type}")
1205 |
1206 | - # …
1207-1209 | - …
1210 | - first_part = ocr_markdown[:5000]
1211 | -
1212 | - # Then add representative samples from different parts of the document
1213 | - # This captures headings and key information throughout
1214 | - middle_start = len(ocr_markdown) // 2 - 1000
1215 | - middle_part = ocr_markdown[middle_start:middle_start+2000] if middle_start > 0 else ""
1216 | -
1217 | - # Get ending section if large enough
1218 | - if len(ocr_markdown) > 15000:
1219 | - end_part = ocr_markdown[-1000:]
1220 | - truncated_ocr = f"{first_part}\n...\n{middle_part}\n...\n{end_part}"
1221 | - else:
1222 | - truncated_ocr = f"{first_part}\n...\n{middle_part}"
1223 | -
1224 | - logger.info(f"Truncated OCR text from {len(ocr_markdown)} to {len(truncated_ocr)} chars")
1225 | else:
1226 | truncated_ocr = ocr_markdown
1227 |
@@ -1232,9 +1291,8 @@ class StructuredOCR:
1232 | start_time = time.time()
1233 |
1234 | try:
1235 | - # …
1236 | - # …
1237 | - timeout_ms = min(120000, max(60000, len(truncated_ocr) * 10)) # 60-120 seconds based on text length
1238 |
1239 | logger.info(f"Calling vision model with {timeout_ms}ms timeout and document type {doc_type}")
1240 | chat_response = self.client.chat.parse(
@@ -1260,20 +1318,18 @@ class StructuredOCR:
1260 | # If there's an error with the enhanced prompt, try progressively simpler approaches
1261 | logger.warning(f"Enhanced prompt failed after {time.time() - start_time:.2f}s: {str(e)}")
1262 |
1263 | - # Try a simplified approach with …
1264 | try:
1265 | - # …
1266 | simplified_prompt = (
1267-1269 | - f"…
1270 | - f"Identify the document type, main topics, languages used, and extract key information "
1271 | - f"including names, dates, places, and events. Return a structured JSON response."
1272 | )
1273 |
1274 | - # …
1275 | - if custom_prompt:
1276 | - simplified_prompt += f"\n…
1277 |
1278 | logger.info(f"Trying simplified prompt approach")
1279 | chat_response = self.client.chat.parse(
@@ -1289,7 +1345,7 @@ class StructuredOCR:
1289 | ],
1290 | response_format=StructuredOCRModel,
1291 | temperature=0,
1292 | - timeout_ms=…
1293 | )
1294 |
1295 | logger.info(f"Simplified prompt approach succeeded")
@@ -1299,11 +1355,10 @@ class StructuredOCR:
1299 | logger.warning(f"Simplified prompt failed: {str(second_e)}. Trying minimal prompt.")
1300 |
1301 | try:
1302 | - # Minimal prompt focusing on …
1303 | minimal_prompt = (
1304-1305 | - f"…
1306 | - f"Provide your analysis in a structured JSON format."
1307 | )
1308 |
1309 | logger.info(f"Trying minimal prompt with image-only focus")
@@ -1320,7 +1375,7 @@ class StructuredOCR:
1320 | ],
1321 | response_format=StructuredOCRModel,
1322 | temperature=0,
1323 | - timeout_ms=…
1324 | )
1325 |
1326 | logger.info(f"Minimal prompt approach succeeded")
@@ -1345,6 +1400,35 @@ class StructuredOCR:
1345 | 'api_response_time': time.time() - start_time
1346 | }
1347 |
1348 | # Add confidence score if not present
1349 | if 'confidence_score' not in result:
1350 | result['confidence_score'] = 0.92 # Vision model typically has higher confidence
@@ -1444,7 +1528,8 @@ class StructuredOCR:
1444 |
1445 | def _build_enhanced_prompt(self, doc_type: str, ocr_text: str, custom_prompt: Optional[str]) -> str:
1446 | """
1447 | - Build an …
1448 |
1449 | Args:
1450 | doc_type: Detected document type
@@ -1452,125 +1537,163 @@ class StructuredOCR:
1452 | custom_prompt: User-provided custom prompt
1453 |
1454 | Returns:
1455 | - …
1456 | """
1457 | # Generic document section (included in all prompts)
1458 | generic_section = (
1459 | - f"This is a …
1460 | f"<BEGIN_OCR>\n{ocr_text}\n<END_OCR>\n\n"
1461 | )
1462 |
1463-1479 | - …
1480 | - f"- Sender and recipient (if mentioned)\n"
1481 | - f"- Date and location of writing (if present)\n"
1482 | - f"- Key topics discussed\n"
1483 | - f"- Historical context and significance\n"
1484 | - f"- Sentiment and tone of the communication\n"
1485 | - f"- Closing formulations and signature\n"
1486 | - )
1487 | -
1488 | - elif doc_type == "recipe":
1489 | specific_section = (
1490-1497 | - …
1498 | )
1499 |
1500-1510 | - …
1511 |
1512-1521 | - …
1522 | )
1523-1524 | - …
1525 | specific_section = (
1526-1533 | - …
1534 | )
1535 |
1536-1542 | - …
1543 | - f"- Dates, locations, and jurisdictions mentioned\n"
1544 | - f"- Legal terminology of the period\n"
1545 | - f"- Signatures, witnesses, or official markings\n"
1546 | - )
1547 |
1548-1554 | - …
1555 | - f"- Key topics, themes, and subjects\n"
1556 | - f"- People, places, and events mentioned\n"
1557 | - f"- Languages used and writing style\n"
1558 | - f"- Historical significance and connections\n"
1559 | )
1560 |
1561 | - # Output instructions
1562 | - output_section = (
1563 | - f"Create a structured JSON response with the following fields:\n"
1564 | - f"- file_name: The document's name\n"
1565 | - f"- topics: An array of topics covered in the document\n"
1566 | - f"- languages: An array of languages used in the document\n"
1567 | - f"- ocr_contents: A dictionary with the document's contents, organized logically\n"
1568 | - )
1569 | -
1570 | # Add custom prompt if provided
1571 | custom_section = ""
1572 | if custom_prompt:
1573 | - …
1574 |
1575 | # Combine all sections into complete prompt
1576 | return generic_section + specific_section + output_section + custom_section
@@ -1667,6 +1790,35 @@ class StructuredOCR:
1667 | result['model_used'] = TEXT_MODEL
1668 | result['processing_time'] = time.time() - start_time
1669 |
1670 | # Add raw text for reference if not already present
1671 | if 'ocr_contents' in result and 'raw_text' not in result['ocr_contents']:
1672 | # Add truncated raw text if very large
|
|
|
506 |
if 'ocr_contents' in result:
|
507 |
result['ocr_contents']['raw_text'] = all_text
|
508 |
|
509 |
+
# Add flag to indicate custom prompt was applied
|
510 |
+
result['custom_prompt_applied'] = 'text_only'
|
511 |
+
|
512 |
+
# Detect document type from custom prompt if available
|
513 |
+
if custom_prompt:
|
514 |
+
# Extract document type if specified
|
515 |
+
doc_type = "general"
|
516 |
+
if "DOCUMENT TYPE:" in custom_prompt:
|
517 |
+
doc_type_line = custom_prompt.split("\n")[0]
|
518 |
+
if "DOCUMENT TYPE:" in doc_type_line:
|
519 |
+
doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
|
520 |
+
# Keyword-based detection as fallback
|
521 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
|
522 |
+
doc_type = "newspaper"
|
523 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
|
524 |
+
doc_type = "letter"
|
525 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
|
526 |
+
doc_type = "book"
|
527 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
|
528 |
+
doc_type = "form"
|
529 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
|
530 |
+
doc_type = "recipe"
|
531 |
+
|
532 |
+
# Store detected document type in result
|
533 |
+
result['detected_document_type'] = doc_type
|
534 |
+
|
535 |
except Exception as e:
|
536 |
logger.warning(f"Custom prompt processing failed: {str(e)}. Using standard processing.")
|
537 |
# Fall back to standard processing
|
|
|
927 |
"confidence_score": 0.0
|
928 |
}
|
929 |
|
930 |
+
# Check if this is likely a newspaper or document with columns by filename
|
931 |
+
is_likely_newspaper = False
|
932 |
+
newspaper_keywords = ["newspaper", "gazette", "herald", "times", "journal",
|
933 |
+
"chronicle", "post", "tribune", "news", "press", "gender"]
|
934 |
+
|
935 |
+
# Check filename for newspaper indicators
|
936 |
+
filename_lower = file_path.name.lower()
|
937 |
+
for keyword in newspaper_keywords:
|
938 |
+
if keyword in filename_lower:
|
939 |
+
is_likely_newspaper = True
|
940 |
+
logger.info(f"Likely newspaper document detected from filename: {file_path.name}")
|
941 |
+
# Add newspaper-specific processing hint to custom_prompt if not already present
|
942 |
+
if custom_prompt:
|
943 |
+
if "column" not in custom_prompt.lower() and "newspaper" not in custom_prompt.lower():
|
944 |
+
custom_prompt = custom_prompt + " This appears to be a newspaper or document with columns. Please extract all text content from each column."
|
945 |
+
else:
|
946 |
+
custom_prompt = "This appears to be a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
|
947 |
+
break
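The filename check above can be read as a small standalone heuristic; a sketch using the same keyword list (the helper name is invented for illustration):

```python
NEWSPAPER_KEYWORDS = ("newspaper", "gazette", "herald", "times", "journal",
                      "chronicle", "post", "tribune", "news", "press", "gender")

def looks_like_newspaper(filename: str) -> bool:
    """Heuristic: a filename containing a newspaper-style word probably has columns."""
    name = filename.lower()
    return any(keyword in name for keyword in NEWSPAPER_KEYWORDS)

# Example: looks_like_newspaper("morning_herald_1885.png") -> True
```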
|
948 |
+
|
949 |
try:
|
950 |
# Check file size
|
951 |
file_size_mb = file_path.stat().st_size / (1024 * 1024)
|
|
|
1037 |
logger.info(f"Processing image with OCR using {OCR_MODEL}")
|
1038 |
|
1039 |
# Add retry logic with more retries and longer backoff periods for rate limit issues
|
1040 |
+
max_retries = 2 # Reduced to prevent rate limiting
|
1041 |
+
retry_delay = 1 # Shorter delay between retries
|
1042 |
|
1043 |
for retry in range(max_retries):
|
1044 |
try:
|
|
|
1046 |
document=ImageURLChunk(image_url=base64_data_url),
|
1047 |
model=OCR_MODEL,
|
1048 |
include_image_base64=True,
|
1049 |
+
timeout_ms=45000 # 45 second timeout for better performance
|
1050 |
)
|
1051 |
break # Success, exit retry loop
|
1052 |
except Exception as e:
|
|
|
1124 |
image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
|
1125 |
|
1126 |
# Optimize: Skip vision model step if ocr_markdown is very small or empty
|
1127 |
+
# BUT make an exception for newspapers or if custom_prompt is provided
|
1128 |
+
if (not is_likely_newspaper and not custom_prompt) and (not image_ocr_markdown or len(image_ocr_markdown) < 50):
|
1129 |
logger.warning("OCR produced minimal or no text. Returning basic result.")
|
1130 |
return {
|
1131 |
"file_name": file_path.name,
|
|
|
1136 |
},
|
1137 |
"processing_note": "OCR produced minimal text content"
|
1138 |
}
|
1139 |
+
|
1140 |
+
# For newspapers with little text in OCR, set a more explicit prompt
|
1141 |
+
if is_likely_newspaper and (not image_ocr_markdown or len(image_ocr_markdown) < 100):
|
1142 |
+
logger.info("Newspaper with minimal OCR text detected. Using enhanced prompt.")
|
1143 |
+
if not custom_prompt:
|
1144 |
+
custom_prompt = "This is a newspaper or document with columns. The OCR may not have captured all text. Please examine the image carefully and extract ALL text content visible in the document, reading each column from top to bottom."
|
1145 |
+
elif "extract all text" not in custom_prompt.lower():
|
1146 |
+
custom_prompt += " Please examine the image carefully and extract ALL text content visible in the document."
|
1147 |
|
1148 |
# Extract structured data using the appropriate model, with a single API call
|
1149 |
if use_vision:
|
|
|
1236 |
logger = logging.getLogger("vision_processor")
|
1237 |
|
1238 |
try:
|
1239 |
+
# Check if this is a newspaper or document with columns by filename
|
1240 |
+
is_likely_newspaper = False
|
1241 |
+
newspaper_keywords = ["newspaper", "gazette", "herald", "times", "journal",
|
1242 |
+
"chronicle", "post", "tribune", "news", "press", "gender"]
|
1243 |
+
|
1244 |
+
# Check filename for newspaper indicators
|
1245 |
+
filename_lower = filename.lower()
|
1246 |
+
for keyword in newspaper_keywords:
|
1247 |
+
if keyword in filename_lower:
|
1248 |
+
is_likely_newspaper = True
|
1249 |
+
logger.info(f"Likely newspaper document detected in vision processing: {filename}")
|
1250 |
+
break
|
1251 |
+
|
1252 |
+
# Fast path: Skip vision API if OCR already produced reasonable text
|
1253 |
+
# We'll define "reasonable" as having at least 300 characters
|
1254 |
+
if len(ocr_markdown.strip()) > 300:
|
1255 |
+
logger.info("Sufficient OCR text detected, using OCR text directly")
|
1256 |
return {
|
1257 |
"file_name": filename,
|
1258 |
"topics": ["Document"],
|
1259 |
"languages": ["English"],
|
1260 |
"ocr_contents": {
|
1261 |
+
"raw_text": ocr_markdown
|
1262 |
}
|
1263 |
}
|
1264 |
+
|
1265 |
+
# Only use vision model for minimal OCR text or when document has columns
|
1266 |
+
if is_likely_newspaper and (not ocr_markdown or len(ocr_markdown.strip()) < 300):
|
1267 |
+
logger.info("Using vision model for newspaper with minimal OCR text")
|
1268 |
+
if not custom_prompt:
|
1269 |
+
custom_prompt = "Document has columns. Extract text by reading each column top to bottom."
|
1270 |
|
1271 |
# Fast path: Skip if in test mode or no API key
|
1272 |
if self.test_mode or not self.api_key:
|
|
|
1277 |
doc_type = self._detect_document_type(custom_prompt, ocr_markdown)
|
1278 |
logger.info(f"Detected document type: {doc_type}")
|
1279 |
|
1280 |
+
# Use only the first part of OCR text to keep prompts small and processing fast
|
1281 |
+
if len(ocr_markdown) > 1000:
|
1282 |
+
truncated_ocr = ocr_markdown[:1000]
|
1283 |
+
logger.info(f"Truncated OCR text from {len(ocr_markdown)} to 1000 chars for faster processing")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1284 |
else:
|
1285 |
truncated_ocr = ocr_markdown
|
1286 |
|
|
|
1291 |
start_time = time.time()
|
1292 |
|
1293 |
try:
|
1294 |
+
# Use a fixed, shorter timeout for single-page documents
|
1295 |
+
timeout_ms = 45000 # 45 seconds is optimal for most single-page documents
|
|
|
1296 |
|
1297 |
logger.info(f"Calling vision model with {timeout_ms}ms timeout and document type {doc_type}")
|
1298 |
chat_response = self.client.chat.parse(
|
|
|
1318 |
# If there's an error with the enhanced prompt, try progressively simpler approaches
|
1319 |
logger.warning(f"Enhanced prompt failed after {time.time() - start_time:.2f}s: {str(e)}")
|
1320 |
|
1321 |
+
# Try a very simplified approach with minimal context
|
1322 |
try:
|
1323 |
+
# Ultra-short prompt for faster processing
|
1324 |
simplified_prompt = (
|
1325 |
+
f"Extract text from this document image. "
|
1326 |
+
f"<BEGIN_OCR>\n{truncated_ocr[:500]}\n<END_OCR>\n"
|
1327 |
+
f"Return a JSON with file_name, topics, languages, and ocr_contents fields."
|
|
|
|
|
1328 |
)
|
1329 |
|
1330 |
+
# Only add minimal custom prompt if provided
|
1331 |
+
if custom_prompt and len(custom_prompt) < 100:
|
1332 |
+
simplified_prompt += f"\n{custom_prompt}"
|
1333 |
|
1334 |
logger.info(f"Trying simplified prompt approach")
|
1335 |
chat_response = self.client.chat.parse(
|
|
|
1345 |
],
|
1346 |
response_format=StructuredOCRModel,
|
1347 |
temperature=0,
|
1348 |
+
timeout_ms=30000 # Very short timeout for simplified approach (30 seconds)
|
1349 |
)
|
1350 |
|
1351 |
logger.info(f"Simplified prompt approach succeeded")
|
|
|
1355 |
logger.warning(f"Simplified prompt failed: {str(second_e)}. Trying minimal prompt.")
|
1356 |
|
1357 |
try:
|
1358 |
+
# Minimal prompt focusing only on OCR task
|
1359 |
minimal_prompt = (
|
1360 |
+
f"Extract the text from this image. "
|
1361 |
+
f"Return JSON with file_name, topics, languages, and ocr_contents.raw_text fields."
|
|
|
1362 |
)
|
1363 |
|
1364 |
logger.info(f"Trying minimal prompt with image-only focus")
|
|
|
1375 |
],
|
1376 |
response_format=StructuredOCRModel,
|
1377 |
temperature=0,
|
1378 |
+
timeout_ms=25000 # Minimal timeout for last attempt (25 seconds)
|
1379 |
)
|
1380 |
|
1381 |
logger.info(f"Minimal prompt approach succeeded")
|
|
|
1400 |
'api_response_time': time.time() - start_time
|
1401 |
}
|
1402 |
|
1403 |
+
# Flag when custom prompt has been successfully applied
|
1404 |
+
if custom_prompt:
|
1405 |
+
result['custom_prompt_applied'] = 'vision_model'
|
1406 |
+
|
1407 |
+
# Attempt to detect document type from custom prompt
|
1408 |
+
if "DOCUMENT TYPE:" in custom_prompt:
|
1409 |
+
doc_type_line = custom_prompt.split("\n")[0]
|
1410 |
+
if "DOCUMENT TYPE:" in doc_type_line:
|
1411 |
+
custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
|
1412 |
+
result['detected_document_type'] = custom_doc_type
|
1413 |
+
# Keyword-based detection as fallback
|
1414 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
|
1415 |
+
result['detected_document_type'] = "newspaper"
|
1416 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
|
1417 |
+
result['detected_document_type'] = "letter"
|
1418 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
|
1419 |
+
result['detected_document_type'] = "book"
|
1420 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
|
1421 |
+
result['detected_document_type'] = "form"
|
1422 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
|
1423 |
+
result['detected_document_type'] = "recipe"
|
1424 |
+
elif "this is a" in custom_prompt.lower():
|
1425 |
+
# Extract document type from "This is a [type]" format
|
1426 |
+
this_is_parts = custom_prompt.lower().split("this is a ")
|
1427 |
+
if len(this_is_parts) > 1:
|
1428 |
+
extracted_type = this_is_parts[1].split(".")[0].strip()
|
1429 |
+
if extracted_type:
|
1430 |
+
result['detected_document_type'] = extracted_type
|
1431 |
+
|
1432 |
# Add confidence score if not present
|
1433 |
if 'confidence_score' not in result:
|
1434 |
result['confidence_score'] = 0.92 # Vision model typically has higher confidence
|
|
|
1528 |
|
1529 |
def _build_enhanced_prompt(self, doc_type: str, ocr_text: str, custom_prompt: Optional[str]) -> str:
|
1530 |
"""
|
1531 |
+
Build an optimized prompt focused on OCR accuracy with specialized attention to
|
1532 |
+
historical typography, manuscript conventions, and document deterioration patterns.
|
1533 |
|
1534 |
Args:
|
1535 |
doc_type: Detected document type
|
|
|
1537 |
custom_prompt: User-provided custom prompt
|
1538 |
|
1539 |
Returns:
|
1540 |
+
Optimized prompt focused on text extraction with historical document expertise
|
1541 |
"""
|
1542 |
# Generic document section (included in all prompts)
|
1543 |
generic_section = (
|
1544 |
+
f"This is a document's OCR text:\n"
|
1545 |
f"<BEGIN_OCR>\n{ocr_text}\n<END_OCR>\n\n"
|
1546 |
)
|
1547 |
|
1548 |
+
# Check if custom prompt contains document type information
|
1549 |
+
has_custom_doc_type = False
|
1550 |
+
custom_doc_type = ""
|
1551 |
+
|
1552 |
+
if custom_prompt and "DOCUMENT TYPE:" in custom_prompt:
|
1553 |
+
# Extract the document type from the custom prompt
|
1554 |
+
doc_type_line = custom_prompt.split("\n")[0]
|
1555 |
+
if "DOCUMENT TYPE:" in doc_type_line:
|
1556 |
+
custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip()
|
1557 |
+
has_custom_doc_type = True
|
1558 |
+
# If we have a custom doc type, use it instead
|
1559 |
+
if custom_doc_type:
|
1560 |
+
doc_type = custom_doc_type.lower()
|
1561 |
+
|
1562 |
+
# If user has provided detailed instructions, provide more elaborate prompting
|
1563 |
+
if custom_prompt and (has_custom_doc_type or len(custom_prompt.strip()) > 20):
|
1564 |
+
# Enhanced prompt for documents with custom instructions and historical expertise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1565 |
specific_section = (
|
1566 |
+
f"You are an advanced OCR specialist with expertise in historical documents, typography, and manuscript conventions. "
|
1567 |
+
f"Below is a document that requires specialized analysis with attention to historical characteristics. "
|
1568 |
+
f"Pay particular attention to:\n"
|
1569 |
+
f"- Historical typography features (long s 'ſ', ligatures, obsolete letter forms)\n"
|
1570 |
+
f"- Manuscript conventions of the period (abbreviations, contractions, marginalia)\n"
|
1571 |
+
f"- Document deterioration patterns (faded ink, foxing, water damage, paper degradation)\n"
|
1572 |
+
f"- Accurately capturing ALL text content visible in the image with historical context\n"
|
1573 |
+
f"- Following the specific user instructions for processing this document type\n"
|
1574 |
+
f"- Identifying key information, structure, and historical formatting conventions\n"
|
1575 |
+
f"- Providing comprehensive analysis with attention to historical context\n"
|
1576 |
)
|
1577 |
|
1578 |
+
# Add specialized instructions based on document type
|
1579 |
+
if doc_type == "newspaper":
|
1580 |
+
specific_section += (
|
1581 |
+
f"\nThis appears to be a newspaper or document with columns. "
|
1582 |
+
f"Please read each column from top to bottom, then move to the next column. "
|
1583 |
+
f"Extract all article titles, headings, bylines, and body text in the correct reading order. "
|
1584 |
+
f"Pay special attention to section headers, page numbers, publication date, and newspaper name. "
|
1585 |
+
f"For historical newspapers, be aware of period-specific typography such as the long s (ſ), "
|
1586 |
+
f"unique ligatures (æ, œ, ct, st), and decorative fonts. Account for paper degradation around "
|
1587 |
+
f"fold lines and edges. Recognize archaic abbreviations and typesetting conventions of the period.\n"
|
1588 |
+
)
|
1589 |
+
elif doc_type == "letter":
|
1590 |
+
specific_section += (
|
1591 |
+
f"\nThis appears to be a letter or correspondence. "
|
1592 |
+
f"Pay special attention to the letterhead, date, greeting, body content, closing, and signature. "
|
1593 |
+
f"Preserve the original formatting including paragraph breaks and indentation. "
|
1594 |
+
f"Note any handwritten annotations or marginalia separately. "
|
1595 |
+
f"For historical letters, carefully transcribe historical scripts and handwriting styles, "
|
1596 |
+
f"noting unclear or damaged sections. Identify period-specific salutations, closings, and "
|
1597 |
+
f"formalities. Watch for ink fading, bleeding, and seepage through pages. "
|
1598 |
+
f"Recognize period-specific abbreviations (ye, yr, inst, ult, prox) and long s (ſ) in older printed correspondence.\n"
|
1599 |
+
)
|
1600 |
+
elif doc_type == "book":
|
1601 |
+
specific_section += (
|
1602 |
+
f"\nThis appears to be a book or publication page. "
|
1603 |
+
f"Pay attention to chapter titles, headers, page numbers, footnotes, and main body text. "
|
1604 |
+
f"Preserve paragraph structure and any special formatting. "
|
1605 |
+
f"Note any images, tables, or figures that might be referenced in the text. "
|
1606 |
+
f"For historical books, attend to period typography including the long s (ſ), ligatures (æ, œ, ct, ſt), "
|
1607 |
+
f"archaic letter forms, and decorative initials/drop caps. Account for foxing (brown spotting), "
|
1608 |
+
f"bleed-through from opposite pages, and binding damage. Recognize period-specific typographic "
|
1609 |
+
f"conventions like catchwords, signatures, obsolete punctuation, and historical spelling variants "
|
1610 |
+
f"(e.g., -ize/-ise, past tense 'd for -ed). Note bookplates, ownership marks, and marginalia.\n"
|
1611 |
+
)
|
1612 |
+
elif doc_type == "form":
|
1613 |
+
specific_section += (
|
1614 |
+
f"\nThis appears to be a form or legal document. "
|
1615 |
+
f"Carefully extract all field labels and their corresponding values. "
|
1616 |
+
f"Preserve the structure of form fields and sections. "
|
1617 |
+
f"Pay special attention to signature lines, dates, and any official markings. "
|
1618 |
+
f"For historical forms and legal documents, recognize period-specific legal terminology and "
|
1619 |
+
f"formulaic phrases. Note seals, stamps, watermarks, and official emblems. Watch for faded ink "
|
1620 |
+
f"in signatures and filled fields. Identify period handwriting styles in completed sections. "
|
1621 |
+
f"Account for specialized legal abbreviations (e.g., SS., Esq., inst., wit.) and archaic "
|
1622 |
+
f"measurement units. Note folding patterns and worn edges common in frequently handled legal documents.\n"
|
1623 |
+
)
|
1624 |
+
elif doc_type == "recipe":
|
1625 |
+
specific_section += (
|
1626 |
+
f"\nThis appears to be a recipe or food-related document. "
|
1627 |
+
f"Extract the recipe title, ingredient list (with measurements), preparation steps, "
|
1628 |
+
f"cooking times, serving information, and any notes or tips. "
|
1629 |
+
f"Maintain the distinction between ingredients and preparation instructions. "
|
1630 |
+
f"For historical recipes, attend to archaic measurements (gill, dram, peck, firkin), obsolete "
|
1631 |
+
f"cooking terminology, and period-specific ingredients and their modern equivalents. Note handwritten "
|
1632 |
+
f"annotations and personal modifications. Identify period-specific cooking methods and tools that "
|
1633 |
+
f"might need explanation. Watch for liquid stains and food residue common on well-used recipe pages. "
|
1634 |
+
f"Recognize unclear fractions and temperature instructions (e.g., 'slow oven', 'quick fire').\n"
|
1635 |
+
)
|
1636 |
|
1637 |
+
# Output instructions (enhanced for custom requests)
|
1638 |
+
output_section = (
|
1639 |
+
f"Create a detailed structured JSON response with the following fields:\n"
|
1640 |
+
f"- file_name: The document's name\n"
|
1641 |
+
f"- topics: An array of specific topics, themes, or subjects covered in the document\n"
|
1642 |
+
f"- languages: An array of languages used in the document\n"
|
1643 |
+
f"- ocr_contents: A comprehensive dictionary with the document's contents including:\n"
|
1644 |
+
f" * title: The main title or heading\n"
|
1645 |
+
f" * subtitle: Any subtitle or secondary heading (if present)\n"
|
1646 |
+
f" * date: Publication or document date (if present)\n"
|
1647 |
+
f" * author: Author or creator information (if present)\n"
|
1648 |
+
f" * content: The main body content, properly formatted\n"
|
1649 |
+
f" * additional sections as appropriate for this document type\n"
|
1650 |
+
f" * raw_text: The complete OCR text\n"
|
1651 |
)
|
1652 |
+
else:
|
1653 |
+
# Default processing with basic historical document awareness
|
1654 |
specific_section = (
|
1655 |
+
f"You are an OCR specialist with knowledge of historical documents and typography. "
|
1656 |
+
f"Focus on accurately extracting text content with attention to historical features. "
|
1657 |
+
f"Pay special attention to:\n"
|
1658 |
+
f"- Accurately capturing ALL text content visible in the image\n"
|
1659 |
+
f"- Maintaining the correct reading order and structure\n"
|
1660 |
+
f"- Preserving paragraph breaks and text layout\n"
|
1661 |
+
f"- Identifying the main document type, time period, and language\n"
|
1662 |
+
f"- Recognizing historical typography features (long s 'ſ', ligatures, archaic characters)\n"
|
1663 |
+
f"- Accounting for document deterioration (faded ink, stains, foxing, physical damage)\n"
|
1664 |
)
|
1665 |
|
1666 |
+
# Only add specialized instructions for newspapers with columns
|
1667 |
+
if doc_type == "newspaper":
|
1668 |
+
specific_section += (
|
1669 |
+
f"\nThis appears to be a document with columns. "
|
1670 |
+
f"Be sure to read each column from top to bottom, then move to the next column. "
|
1671 |
+
f"Extract all article titles, headings, and body text.\n"
|
1672 |
+
)
|
|
|
|
|
|
|
|
|
1673 |
|
1674 |
+
# Simple output instructions for default cases
|
1675 |
+
output_section = (
|
1676 |
+
f"Create a structured JSON response with the following fields:\n"
|
1677 |
+
f"- file_name: The document's name\n"
|
1678 |
+
f"- topics: An array of topics covered in the document\n"
|
1679 |
+
f"- languages: An array of languages used in the document\n"
|
1680 |
+
f"- ocr_contents: A dictionary with the document's contents, with the focus on complete text extraction\n"
|
|
|
|
|
|
|
|
|
1681 |
)
|
1682 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1683 |
# Add custom prompt if provided
|
1684 |
custom_section = ""
|
1685 |
if custom_prompt:
|
1686 |
+
# Process custom prompt to extract just the instructions part if available
|
1687 |
+
if "USER INSTRUCTIONS:" in custom_prompt:
|
1688 |
+
instructions_part = custom_prompt.split("USER INSTRUCTIONS:")[1].strip()
|
1689 |
+
custom_section = f"\n\nUser-provided instructions: {instructions_part}\n"
|
1690 |
+
elif "INSTRUCTIONS:" in custom_prompt:
|
1691 |
+
instructions_part = custom_prompt.split("INSTRUCTIONS:")[1].strip()
|
1692 |
+
custom_section = f"\n\nUser-provided instructions: {instructions_part}\n"
|
1693 |
+
else:
|
1694 |
+
# Strip custom prompt to essentials
|
1695 |
+
stripped_prompt = custom_prompt.replace("This is a", "").replace("It appears to be a", "")
|
1696 |
+
custom_section = f"\n\nUser-provided instructions: {stripped_prompt}\n"
|
1697 |
|
1698 |
# Combine all sections into complete prompt
|
1699 |
return generic_section + specific_section + output_section + custom_section
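Putting the pieces together, the builder returns one concatenated string. A hedged usage sketch (the `processor` instance and the argument values are placeholders, not data from this repository):

```python
# Hypothetical call; `processor` stands in for a configured StructuredOCR instance.
prompt = processor._build_enhanced_prompt(
    doc_type="newspaper",
    ocr_text="THE DAILY HERALD -- 12 May 1885 ...",
    custom_prompt="DOCUMENT TYPE: newspaper\nUSER INSTRUCTIONS: keep column order",
)

# The result is generic_section + specific_section + output_section + custom_section,
# so the OCR excerpt appears first, followed by the newspaper guidance and the
# user instructions pulled from the "USER INSTRUCTIONS:" block.
print(prompt[:200])
```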
|
|
|
1790 |
result['model_used'] = TEXT_MODEL
|
1791 |
result['processing_time'] = time.time() - start_time
|
1792 |
|
1793 |
+
# Flag when custom prompt has been successfully applied
|
1794 |
+
if custom_prompt:
|
1795 |
+
result['custom_prompt_applied'] = 'text_model'
|
1796 |
+
|
1797 |
+
# Attempt to detect document type from custom prompt
|
1798 |
+
if "DOCUMENT TYPE:" in custom_prompt:
|
1799 |
+
doc_type_line = custom_prompt.split("\n")[0]
|
1800 |
+
if "DOCUMENT TYPE:" in doc_type_line:
|
1801 |
+
custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
|
1802 |
+
result['detected_document_type'] = custom_doc_type
|
1803 |
+
# Keyword-based detection as fallback
|
1804 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
|
1805 |
+
result['detected_document_type'] = "newspaper"
|
1806 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
|
1807 |
+
result['detected_document_type'] = "letter"
|
1808 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
|
1809 |
+
result['detected_document_type'] = "book"
|
1810 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
|
1811 |
+
result['detected_document_type'] = "form"
|
1812 |
+
elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
|
1813 |
+
result['detected_document_type'] = "recipe"
|
1814 |
+
elif "this is a" in custom_prompt.lower():
|
1815 |
+
# Extract document type from "This is a [type]" format
|
1816 |
+
this_is_parts = custom_prompt.lower().split("this is a ")
|
1817 |
+
if len(this_is_parts) > 1:
|
1818 |
+
extracted_type = this_is_parts[1].split(".")[0].strip()
|
1819 |
+
if extracted_type:
|
1820 |
+
result['detected_document_type'] = extracted_type
|
1821 |
+
|
1822 |
# Add raw text for reference if not already present
|
1823 |
if 'ocr_contents' in result and 'raw_text' not in result['ocr_contents']:
|
1824 |
# Add truncated raw text if very large
|