milwright committed on
Commit
75ead00
·
1 Parent(s): 2f2eb30

Enhanced OCR functionality and efficiency, simplified pre-prompting, etc.

Browse files
Files changed (5) hide show
  1. CLAUDE.md +6 -3
  2. app.py +493 -103
  3. config.py +7 -7
  4. ocr_utils.py +341 -52
  5. structured_ocr.py +298 -146
CLAUDE.md CHANGED
@@ -7,12 +7,14 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
7
  - Test OCR functionality: `python structured_ocr.py <file_path>`
8
  - Process PDF files: `python pdf_ocr.py <file_path>`
9
  - Process single file with logging: `python process_file.py <file_path>`
 
10
  - Run typechecking: `mypy .`
 
11
 
12
  ## Environment Setup
13
  - API key: Set `MISTRAL_API_KEY` in `.env` file or environment variable
14
  - Install dependencies: `pip install -r requirements.txt`
15
- - System requirements: `apt-get install poppler-utils tesseract-ocr` (or equivalent for your OS)
16
 
17
  ## Code Style Guidelines
18
  - **Imports**: Standard library first, third-party next, local modules last
@@ -21,10 +23,11 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
21
  - **Naming**: snake_case for variables/functions, PascalCase for classes
22
  - **Documentation**: Google-style docstrings for all functions/classes
23
  - **Logging**: Use module-level loggers with appropriate log levels
 
24
 
25
  ## Architecture
26
  - Core: `structured_ocr.py` - Main OCR processing with Mistral AI integration
27
- - Utils: `ocr_utils.py` - Utility functions for OCR text and image processing
28
- - PDF handling: `pdf_ocr.py` - PDF-specific processing functionality
29
  - Config: `config.py` - Configuration settings and API keys
30
  - Web: `app.py` - Streamlit interface with UI components in `/ui` directory
 
7
  - Test OCR functionality: `python structured_ocr.py <file_path>`
8
  - Process PDF files: `python pdf_ocr.py <file_path>`
9
  - Process single file with logging: `python process_file.py <file_path>`
10
+ - Run newspaper test: `python test_newspaper.py <file_path>`
11
  - Run typechecking: `mypy .`
12
+ - Lint code: `ruff check .` or `flake8`
13
 
14
  ## Environment Setup
15
  - API key: Set `MISTRAL_API_KEY` in `.env` file or environment variable
16
  - Install dependencies: `pip install -r requirements.txt`
17
+ - System requirements: Install `poppler-utils` and `tesseract-ocr` for PDF processing and OCR
18
 
19
  ## Code Style Guidelines
20
  - **Imports**: Standard library first, third-party next, local modules last
 
23
  - **Naming**: snake_case for variables/functions, PascalCase for classes
24
  - **Documentation**: Google-style docstrings for all functions/classes
25
  - **Logging**: Use module-level loggers with appropriate log levels
26
+ - **Line length**: ≤100 characters
27
 
28
  ## Architecture
29
  - Core: `structured_ocr.py` - Main OCR processing with Mistral AI integration
30
+ - Utils: `ocr_utils.py` - OCR text and image processing utilities
31
+ - PDF handling: `pdf_ocr.py` - PDF-specific processing functionality
32
  - Config: `config.py` - Configuration settings and API keys
33
  - Web: `app.py` - Streamlit interface with UI components in `/ui` directory
app.py CHANGED
@@ -322,6 +322,15 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
322
  preprocessing_options.get("document_type", "standard") != "standard"
323
  )
324
 
 
 
 
 
 
 
 
 
 
325
  if has_preprocessing:
326
  status_text.markdown('<div class="processing-status-container">Applying image preprocessing...</div>', unsafe_allow_html=True)
327
  progress_bar.progress(20)
@@ -371,7 +380,12 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
371
  cache_key = f"{file_hash}_{file_type}_{use_vision}_{pdf_rotation_value}"
372
 
373
  progress_bar.progress(50)
374
- status_text.markdown('<div class="processing-status-container">Processing document with OCR...</div>', unsafe_allow_html=True)
 
 
 
 
 
375
 
376
  # Process the file using cached function if possible
377
  try:
@@ -563,73 +577,115 @@ with st.sidebar:
563
  # Add spacing between sections
564
  st.markdown("<div style='margin: 10px 0;'></div>", unsafe_allow_html=True)
565
 
566
- # Document Context section
567
- st.markdown("##### Document Context", help="Add context information")
568
 
569
- # Historical period selector
570
- historical_periods = [
571
- "Select period (if known)",
572
- "Pre-1700s",
573
- "18th Century (1700s)",
574
- "19th Century (1800s)",
575
- "Early 20th Century (1900-1950)",
576
- "Modern (Post 1950)"
 
 
 
 
577
  ]
578
 
579
- selected_period = st.selectbox(
580
- "Time Period",
581
- options=historical_periods,
582
  index=0,
583
- help="Select the time period of the document"
584
  )
585
 
586
- # Document purpose selector
587
- document_purposes = [
588
- "Select purpose (if known)",
589
- "Personal Letter/Correspondence",
590
- "Official/Government Document",
591
- "Business/Financial Record",
592
- "Literary/Academic Work",
593
- "News/Journalism",
594
- "Religious Text",
595
- "Legal Document"
596
  ]
597
 
598
- selected_purpose = st.selectbox(
599
- "Document Type",
600
- options=document_purposes,
601
  index=0,
602
- help="Select the purpose or type of the document"
603
  )
604
 
605
- # Dynamic custom prompt field
606
  custom_prompt_text = ""
607
- if selected_period != "Select period (if known)":
608
- custom_prompt_text += f"This is a {selected_period} document. "
609
-
610
- if selected_purpose != "Select purpose (if known)":
611
- custom_prompt_text += f"It appears to be a {selected_purpose}. "
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
612
 
613
  # Add spacing between sections
614
  st.markdown("<div style='margin: 10px 0;'></div>", unsafe_allow_html=True)
615
 
616
  custom_prompt = st.text_area(
617
- "Special Instructions",
618
  value=custom_prompt_text,
619
- placeholder="Example: Document has unusual cursive handwriting.",
620
- height=90,
621
- max_chars=500,
622
  key="custom_analysis_instructions",
623
- help="Specify document features or extraction needs"
624
  )
625
 
626
- # Compact instructions expander
627
- with st.expander("Instruction Examples"):
628
  st.markdown("""
629
- - "Has faded text in corners"
630
- - "Extract dates and locations"
631
- - "Translate text to English"
632
- - "Preserve tabular format"
 
 
 
 
 
 
 
 
 
633
  """)
634
 
635
  # Add spacing between sections
@@ -733,10 +789,28 @@ with main_tab2:
733
  # Get zip data directly in memory
734
  zip_data = create_results_zip_in_memory(st.session_state.previous_results)
735
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
736
  st.download_button(
737
  label="Download All Results",
738
  data=zip_data,
739
- file_name="all_ocr_results.zip",
740
  mime="application/zip",
741
  help="Download all previous results as a ZIP file containing HTML and JSON files"
742
  )
@@ -776,12 +850,12 @@ with main_tab2:
776
  st.markdown(f"""
777
  <div class="result-card">
778
  <div class="result-header">
779
- <div class="result-filename">{icon} {file_name}</div>
780
  <div class="result-date">{result.get('timestamp', 'Unknown')}</div>
781
  </div>
782
  <div class="result-metadata">
783
  <div class="result-tag">Languages: {', '.join(result.get('languages', ['Unknown']))}</div>
784
- <div class="result-tag">Topics: {', '.join(result.get('topics', ['Unknown']))}</div>
785
  </div>
786
  """, unsafe_allow_html=True)
787
 
@@ -824,7 +898,34 @@ with main_tab2:
824
  st.write(f"**Languages:** {', '.join(languages)}")
825
 
826
  if 'topics' in selected_result and selected_result['topics']:
827
- st.write(f"**Topics:** {', '.join(selected_result['topics'])}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
828
 
829
  with meta_col2:
830
  # Display processing metadata
@@ -870,23 +971,68 @@ with main_tab2:
870
  # Try a safer approach with string representation
871
  st.code(str(selected_result))
872
 
873
- # Add JSON download button
874
  try:
875
  json_str = json.dumps(selected_result, indent=2)
876
- filename = selected_result.get('file_name', 'document').split('.')[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
877
  st.download_button(
878
  label="Download JSON",
879
  data=json_str,
880
- file_name=f"{filename}_data.json",
881
  mime="application/json"
882
  )
883
  except Exception as e:
884
  st.error(f"Error creating JSON download: {str(e)}")
885
- # Fallback to string representation for download
 
 
886
  st.download_button(
887
  label="Download as Text",
888
  data=str(selected_result),
889
- file_name=f"{filename}_data.txt",
890
  mime="text/plain"
891
  )
892
 
@@ -924,14 +1070,57 @@ with main_tab2:
924
  if page_idx < len(pages_data) - 1:
925
  st.markdown("---")
926
 
927
- # Add HTML download button if images are available
928
  from ocr_utils import create_html_with_images
929
  html_content = create_html_with_images(selected_result)
930
- filename = selected_result.get('file_name', 'document').split('.')[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
931
  st.download_button(
932
  label="Download as HTML with Images",
933
  data=html_content,
934
- file_name=f"{filename}_with_images.html",
935
  mime="text/html"
936
  )
937
 
@@ -1092,7 +1281,7 @@ with main_tab1:
1092
  progress_bar.progress(40)
1093
 
1094
  try:
1095
- # Step 1: Process without custom prompt to get OCR text
1096
  processor = StructuredOCR()
1097
 
1098
  # First save the PDF to a temp file
@@ -1100,53 +1289,60 @@ with main_tab1:
1100
  tmp.write(uploaded_file.getvalue())
1101
  temp_path = tmp.name
1102
 
1103
- # Process with NO custom prompt first
1104
  # Apply PDF rotation if specified
1105
  pdf_rotation_value = pdf_rotation if 'pdf_rotation' in locals() else 0
1106
 
1107
- base_result = processor.process_file(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1108
  file_path=temp_path,
1109
  file_type="pdf",
1110
  use_vision=use_vision,
1111
- custom_prompt=None, # No custom prompt in first step
1112
  file_size_mb=len(uploaded_file.getvalue()) / (1024 * 1024),
1113
- pdf_rotation=pdf_rotation_value # Pass rotation value to processor
1114
  )
1115
 
1116
- progress_bar.progress(70)
1117
- status_text.markdown('<div class="processing-status-container">Applying custom analysis to extracted text...</div>', unsafe_allow_html=True)
1118
-
1119
- # Step 2: Apply custom prompt to the extracted text using text-only LLM
1120
- if 'ocr_contents' in base_result and isinstance(base_result['ocr_contents'], dict):
1121
- # Get text from OCR result
1122
- ocr_text = ""
1123
- for section, content in base_result['ocr_contents'].items():
1124
- if isinstance(content, str):
1125
- ocr_text += content + "\n\n"
1126
- elif isinstance(content, list):
1127
- for item in content:
1128
- if isinstance(item, str):
1129
- ocr_text += item + "\n"
1130
- ocr_text += "\n"
1131
-
1132
- # Format the custom prompt for text-only processing
1133
- formatted_prompt = f"USER INSTRUCTIONS: {custom_prompt.strip()}\nPay special attention to these instructions and respond accordingly."
1134
-
1135
- # Apply custom prompt to extracted text
1136
- enhanced_result = processor._extract_structured_data_text_only(ocr_text, uploaded_file.name, formatted_prompt)
1137
-
1138
- # Merge results, keeping images from base_result
1139
- result = base_result.copy()
1140
- result['custom_prompt_applied'] = 'text_only'
1141
-
1142
- # Update with enhanced analysis results, preserving image data
1143
- for key, value in enhanced_result.items():
1144
- if key not in ['raw_response_data', 'pages_data', 'has_images']:
1145
- result[key] = value
1146
- else:
1147
- # If no OCR content, just use the base result
1148
- result = base_result
1149
- result['custom_prompt_applied'] = 'failed'
1150
 
1151
  # Clean up temp file
1152
  if os.path.exists(temp_path):
@@ -1183,8 +1379,21 @@ with main_tab1:
1183
  # Initialize OCR processor and process with custom prompt
1184
  processor = StructuredOCR()
1185
 
1186
- # Format the custom prompt to ensure it has an impact
1187
- formatted_prompt = f"USER INSTRUCTIONS: {custom_prompt.strip()}\nPay special attention to these instructions and respond accordingly."
 
 
 
 
 
 
 
 
 
 
 
 
 
1188
 
1189
  try:
1190
  result = processor.process_file(
@@ -1238,15 +1447,39 @@ with main_tab1:
1238
  if languages:
1239
  metadata_html += f'<p><strong>Languages:</strong> {", ".join(languages)}</p>'
1240
 
1241
- # Topics
1242
  if 'topics' in result and result['topics']:
1243
- metadata_html += f'<p><strong>Topics:</strong> {", ".join(result["topics"])}</p>'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1244
 
1245
  # Processing time
1246
  if 'processing_time' in result:
1247
  proc_time = result['processing_time']
1248
  metadata_html += f'<p><strong>Processing Time:</strong> {proc_time:.1f}s</p>'
1249
 
 
 
 
 
 
 
 
1250
  # Close the metadata card
1251
  metadata_html += '</div>'
1252
 
@@ -1664,16 +1897,35 @@ with main_tab1:
1664
  </html>
1665
  """
1666
 
1667
- # Get original filename without extension
1668
  original_name = Path(result.get('file_name', uploaded_file.name)).stem
1669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1670
  # Add download button as an expander to prevent page reset
1671
  with st.expander("Download Document with Images"):
1672
  st.markdown("Click the button below to download the document with embedded images")
1673
  st.download_button(
1674
  label="Download as HTML",
1675
  data=download_html,
1676
- file_name=f"{original_name}_with_images.html",
1677
  mime="text/html",
1678
  key="download_with_images_button"
1679
  )
@@ -1696,6 +1948,144 @@ with main_tab1:
1696
  result_copy = result.copy()
1697
  result_copy['timestamp'] = datetime.now().strftime("%Y-%m-%d %H:%M")
1698
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1699
  # Add to session state, keeping the most recent 20 results
1700
  st.session_state.previous_results.insert(0, result_copy)
1701
  if len(st.session_state.previous_results) > 20:
 
322
  preprocessing_options.get("document_type", "standard") != "standard"
323
  )
324
 
325
+ # Add document type hints to custom prompt if available from document type selector - with safety checks
326
+ if ('custom_prompt' in locals() and custom_prompt and
327
+ 'selected_doc_type' in locals() and selected_doc_type != "Auto-detect (standard processing)" and
328
+ "This is a" not in str(custom_prompt)):
329
+ # Extract just the document type from the selector
330
+ doc_type_hint = selected_doc_type.split(" or ")[0].lower()
331
+ # Prepend to the custom prompt
332
+ custom_prompt = f"This is a {doc_type_hint}. {custom_prompt}"
333
+
334
  if has_preprocessing:
335
  status_text.markdown('<div class="processing-status-container">Applying image preprocessing...</div>', unsafe_allow_html=True)
336
  progress_bar.progress(20)
 
380
  cache_key = f"{file_hash}_{file_type}_{use_vision}_{pdf_rotation_value}"
381
 
382
  progress_bar.progress(50)
383
+ # Check if we have custom instructions
384
+ has_custom_prompt = 'custom_prompt' in locals() and custom_prompt and len(str(custom_prompt).strip()) > 0
385
+ if has_custom_prompt:
386
+ status_text.markdown('<div class="processing-status-container">Processing document with custom instructions...</div>', unsafe_allow_html=True)
387
+ else:
388
+ status_text.markdown('<div class="processing-status-container">Processing document with OCR...</div>', unsafe_allow_html=True)
389
 
390
  # Process the file using cached function if possible
391
  try:
 
577
  # Add spacing between sections
578
  st.markdown("<div style='margin: 10px 0;'></div>", unsafe_allow_html=True)
579
 
580
+ # Document Processing section
581
+ st.markdown("##### OCR Instructions", help="Optimize text extraction")
582
 
583
+ # Document type selector
584
+ document_types = [
585
+ "Auto-detect (standard processing)",
586
+ "Newspaper or Magazine",
587
+ "Letter or Correspondence",
588
+ "Book or Publication",
589
+ "Form or Legal Document",
590
+ "Recipe",
591
+ "Handwritten Document",
592
+ "Map or Illustration",
593
+ "Table or Spreadsheet",
594
+ "Other (specify in instructions)"
595
  ]
596
 
597
+ selected_doc_type = st.selectbox(
598
+ "Document Type",
599
+ options=document_types,
600
  index=0,
601
+ help="Select document type to optimize OCR processing for specific document formats and layouts. For documents with specialized features, also provide details in the instructions field below."
602
  )
603
 
604
+ # Document layout selector
605
+ document_layouts = [
606
+ "Standard layout",
607
+ "Multiple columns",
608
+ "Table/grid format",
609
+ "Mixed layout with images"
 
 
 
 
610
  ]
611
 
612
+ selected_layout = st.selectbox(
613
+ "Document Layout",
614
+ options=document_layouts,
615
  index=0,
616
+ help="Select the document's text layout for better OCR"
617
  )
618
 
619
+ # Generate dynamic prompt based on both document type and layout
620
  custom_prompt_text = ""
621
+
622
+ # First add document type specific instructions (simplified)
623
+ if selected_doc_type != "Auto-detect (standard processing)":
624
+ if selected_doc_type == "Newspaper or Magazine":
625
+ custom_prompt_text = "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions."
626
+ elif selected_doc_type == "Letter or Correspondence":
627
+ custom_prompt_text = "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations."
628
+ elif selected_doc_type == "Book or Publication":
629
+ custom_prompt_text = "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting."
630
+ elif selected_doc_type == "Form or Legal Document":
631
+ custom_prompt_text = "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings."
632
+ elif selected_doc_type == "Recipe":
633
+ custom_prompt_text = "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps."
634
+ elif selected_doc_type == "Handwritten Document":
635
+ custom_prompt_text = "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations."
636
+ elif selected_doc_type == "Map or Illustration":
637
+ custom_prompt_text = "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings."
638
+ elif selected_doc_type == "Table or Spreadsheet":
639
+ custom_prompt_text = "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values."
640
+ elif selected_doc_type == "Other (specify in instructions)":
641
+ custom_prompt_text = "Please describe the document type and any special processing requirements here."
642
+
643
+ # Then add layout specific instructions if needed
644
+ if selected_layout != "Standard layout" and not custom_prompt_text:
645
+ if selected_layout == "Multiple columns":
646
+ custom_prompt_text = "Document has multiple columns. Read each column from top to bottom, then move to the next column."
647
+ elif selected_layout == "Table/grid format":
648
+ custom_prompt_text = "Document contains table data. Preserve row and column structure during extraction."
649
+ elif selected_layout == "Mixed layout with images":
650
+ custom_prompt_text = "Document has mixed text layout with images. Extract text in proper reading order."
651
+ # If both document type and non-standard layout are selected, add layout info
652
+ elif selected_layout != "Standard layout" and custom_prompt_text:
653
+ if selected_layout == "Multiple columns":
654
+ custom_prompt_text += " Document has multiple columns."
655
+ elif selected_layout == "Table/grid format":
656
+ custom_prompt_text += " Contains table/grid formatting."
657
+ elif selected_layout == "Mixed layout with images":
658
+ custom_prompt_text += " Has mixed text layout with images."
659
 
660
  # Add spacing between sections
661
  st.markdown("<div style='margin: 10px 0;'></div>", unsafe_allow_html=True)
662
 
663
  custom_prompt = st.text_area(
664
+ "Additional OCR Instructions",
665
  value=custom_prompt_text,
666
+ placeholder="Example: Small text at bottom needs special attention",
667
+ height=100,
668
+ max_chars=300,
669
  key="custom_analysis_instructions",
670
+ help="Specify document type and special OCR requirements. Detailed instructions activate Mistral AI's advanced document analysis."
671
  )
672
 
673
+ # Custom instructions expander
674
+ with st.expander("Custom Instruction Examples"):
675
  st.markdown("""
676
+ **Document Format Instructions:**
677
+ - "This newspaper has multiple columns - read each column from top to bottom"
678
+ - "This letter has a formal heading, main body, and signature section at bottom"
679
+ - "This form has fields with labels and filled-in values that should be paired"
680
+ - "This recipe has ingredient list at top and preparation steps below"
681
+
682
+ **Special Processing Instructions:**
683
+ - "Pay attention to footnotes at the bottom of each page"
684
+ - "Some text is faded - please attempt to reconstruct unclear passages"
685
+ - "There are handwritten annotations in the margins that should be included"
686
+ - "Document has table data that should preserve row and column alignment"
687
+ - "Text continues across pages and should be connected into a single flow"
688
+ - "This document uses special symbols and mathematical notation"
689
  """)
690
 
691
  # Add spacing between sections
 
789
  # Get zip data directly in memory
790
  zip_data = create_results_zip_in_memory(st.session_state.previous_results)
791
 
792
+ # Create more informative ZIP filename with timestamp
793
+ from datetime import datetime
794
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
795
+
796
+ # Count document types for a more descriptive filename
797
+ pdf_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith('.pdf'))
798
+ img_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith(('.jpg', '.jpeg', '.png')))
799
+
800
+ # Create more descriptive filename
801
+ if pdf_count > 0 and img_count > 0:
802
+ zip_filename = f"historical_ocr_mixed_{pdf_count}pdf_{img_count}img_{timestamp}.zip"
803
+ elif pdf_count > 0:
804
+ zip_filename = f"historical_ocr_pdf_documents_{pdf_count}_{timestamp}.zip"
805
+ elif img_count > 0:
806
+ zip_filename = f"historical_ocr_images_{img_count}_{timestamp}.zip"
807
+ else:
808
+ zip_filename = f"historical_ocr_results_{timestamp}.zip"
809
+
810
  st.download_button(
811
  label="Download All Results",
812
  data=zip_data,
813
+ file_name=zip_filename,
814
  mime="application/zip",
815
  help="Download all previous results as a ZIP file containing HTML and JSON files"
816
  )
 
850
  st.markdown(f"""
851
  <div class="result-card">
852
  <div class="result-header">
853
+ <div class="result-filename">{icon} {result.get('descriptive_file_name', file_name)}</div>
854
  <div class="result-date">{result.get('timestamp', 'Unknown')}</div>
855
  </div>
856
  <div class="result-metadata">
857
  <div class="result-tag">Languages: {', '.join(result.get('languages', ['Unknown']))}</div>
858
+ <div class="result-tag">Topics: {', '.join(result.get('topics', ['Unknown'])[:5])} {' + ' + str(len(result.get('topics', [])) - 5) + ' more' if len(result.get('topics', [])) > 5 else ''}</div>
859
  </div>
860
  """, unsafe_allow_html=True)
861
 
 
898
  st.write(f"**Languages:** {', '.join(languages)}")
899
 
900
  if 'topics' in selected_result and selected_result['topics']:
901
+ # Show topics in a more organized way with badges
902
+ st.markdown("**Subject Tags:**")
903
+ # Create a container with flex display for the tags
904
+ st.markdown('<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">', unsafe_allow_html=True)
905
+
906
+ # Generate a badge for each tag
907
+ for topic in selected_result['topics']:
908
+ # Create colored badge based on tag category
909
+ badge_color = "#546e7a" # Default color
910
+
911
+ # Assign colors by category
912
+ if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"]):
913
+ badge_color = "#1565c0" # Blue for time periods
914
+ elif any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
915
+ badge_color = "#00695c" # Teal for languages
916
+ elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
917
+ badge_color = "#6a1b9a" # Purple for document types
918
+ elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
919
+ badge_color = "#2e7d32" # Green for subject domains
920
+
921
+ st.markdown(
922
+ f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
923
+ f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{topic}</span>',
924
+ unsafe_allow_html=True
925
+ )
926
+
927
+ # Close the container
928
+ st.markdown('</div>', unsafe_allow_html=True)
929
 
930
  with meta_col2:
931
  # Display processing metadata
 
971
  # Try a safer approach with string representation
972
  st.code(str(selected_result))
973
 
974
+ # Create more informative JSON download button with better naming
975
  try:
976
  json_str = json.dumps(selected_result, indent=2)
977
+
978
+ # Use the descriptive filename if available, otherwise build one
979
+ if 'descriptive_file_name' in selected_result:
980
+ # Get base name without extension
981
+ base_filename = Path(selected_result['descriptive_file_name']).stem
982
+ else:
983
+ # Fall back to old method of building filename
984
+ base_filename = selected_result.get('file_name', 'document').split('.')[0]
985
+
986
+ # Add document type if available
987
+ if 'topics' in selected_result and selected_result['topics']:
988
+ topic = selected_result['topics'][0].lower().replace(' ', '_')
989
+ base_filename = f"{base_filename}_{topic}"
990
+
991
+ # Add language if available
992
+ if 'languages' in selected_result and selected_result['languages']:
993
+ lang = selected_result['languages'][0].lower()
994
+ # Only add if it's not already in the filename
995
+ if lang not in base_filename.lower():
996
+ base_filename = f"{base_filename}_{lang}"
997
+
998
+ # For PDFs, add page information
999
+ if 'total_pages' in selected_result and 'processed_pages' in selected_result:
1000
+ base_filename = f"{base_filename}_p{selected_result['processed_pages']}of{selected_result['total_pages']}"
1001
+
1002
+ # Get date from timestamp if available
1003
+ timestamp = ""
1004
+ if 'timestamp' in selected_result:
1005
+ try:
1006
+ # Try to parse the timestamp and reformat it
1007
+ from datetime import datetime
1008
+ dt = datetime.strptime(selected_result['timestamp'], "%Y-%m-%d %H:%M")
1009
+ timestamp = dt.strftime("%Y%m%d_%H%M%S")
1010
+ except:
1011
+ # If parsing fails, create a new timestamp
1012
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1013
+ else:
1014
+ # No timestamp in the result, create a new one
1015
+ from datetime import datetime
1016
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1017
+
1018
+ # Create final filename
1019
+ json_filename = f"{base_filename}_{timestamp}.json"
1020
+
1021
  st.download_button(
1022
  label="Download JSON",
1023
  data=json_str,
1024
+ file_name=json_filename,
1025
  mime="application/json"
1026
  )
1027
  except Exception as e:
1028
  st.error(f"Error creating JSON download: {str(e)}")
1029
+ # Fallback to string representation for download with simple naming
1030
+ from datetime import datetime
1031
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1032
  st.download_button(
1033
  label="Download as Text",
1034
  data=str(selected_result),
1035
+ file_name=f"document_{timestamp}.txt",
1036
  mime="text/plain"
1037
  )
1038
 
 
1070
  if page_idx < len(pages_data) - 1:
1071
  st.markdown("---")
1072
 
1073
+ # Add HTML download button with improved, more descriptive filename
1074
  from ocr_utils import create_html_with_images
1075
  html_content = create_html_with_images(selected_result)
1076
+
1077
+ # Use the descriptive filename if available, otherwise build one
1078
+ if 'descriptive_file_name' in selected_result:
1079
+ # Get base name without extension
1080
+ base_filename = Path(selected_result['descriptive_file_name']).stem
1081
+ else:
1082
+ # Fall back to old method of building filename
1083
+ base_filename = selected_result.get('file_name', 'document').split('.')[0]
1084
+
1085
+ # Add document type if available
1086
+ if 'topics' in selected_result and selected_result['topics']:
1087
+ topic = selected_result['topics'][0].lower().replace(' ', '_')
1088
+ base_filename = f"{base_filename}_{topic}"
1089
+
1090
+ # Add language if available
1091
+ if 'languages' in selected_result and selected_result['languages']:
1092
+ lang = selected_result['languages'][0].lower()
1093
+ # Only add if it's not already in the filename
1094
+ if lang not in base_filename.lower():
1095
+ base_filename = f"{base_filename}_{lang}"
1096
+
1097
+ # For PDFs, add page information
1098
+ if 'total_pages' in selected_result and 'processed_pages' in selected_result:
1099
+ base_filename = f"{base_filename}_p{selected_result['processed_pages']}of{selected_result['total_pages']}"
1100
+
1101
+ # Get date from timestamp if available
1102
+ timestamp = ""
1103
+ if 'timestamp' in selected_result:
1104
+ try:
1105
+ # Try to parse the timestamp and reformat it
1106
+ from datetime import datetime
1107
+ dt = datetime.strptime(selected_result['timestamp'], "%Y-%m-%d %H:%M")
1108
+ timestamp = dt.strftime("%Y%m%d_%H%M%S")
1109
+ except:
1110
+ # If parsing fails, create a new timestamp
1111
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1112
+ else:
1113
+ # No timestamp in the result, create a new one
1114
+ from datetime import datetime
1115
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1116
+
1117
+ # Create final filename
1118
+ html_filename = f"{base_filename}_{timestamp}_with_images.html"
1119
+
1120
  st.download_button(
1121
  label="Download as HTML with Images",
1122
  data=html_content,
1123
+ file_name=html_filename,
1124
  mime="text/html"
1125
  )
1126
 
 
1281
  progress_bar.progress(40)
1282
 
1283
  try:
1284
+ # Process directly in one step for better performance
1285
  processor = StructuredOCR()
1286
 
1287
  # First save the PDF to a temp file
 
1289
  tmp.write(uploaded_file.getvalue())
1290
  temp_path = tmp.name
1291
 
 
1292
  # Apply PDF rotation if specified
1293
  pdf_rotation_value = pdf_rotation if 'pdf_rotation' in locals() else 0
1294
 
1295
+ # Add document type hints to custom prompt if available from document type selector
1296
+ if custom_prompt and custom_prompt is not None and 'selected_doc_type' in locals() and selected_doc_type != "Auto-detect (standard processing)" and "This is a" not in str(custom_prompt):
1297
+ # Extract just the document type from the selector
1298
+ doc_type_hint = selected_doc_type.split(" or ")[0].lower()
1299
+ # Prepend to the custom prompt
1300
+ custom_prompt = f"This is a {doc_type_hint}. {custom_prompt}"
1301
+
1302
+ # Process in a single step with simplified custom prompt
1303
+ if custom_prompt:
1304
+ # Detect document type from custom prompt
1305
+ doc_type = "general"
1306
+ if any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
1307
+ doc_type = "newspaper"
1308
+ elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
1309
+ doc_type = "letter"
1310
+ elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
1311
+ doc_type = "book"
1312
+ elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
1313
+ doc_type = "form"
1314
+ elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
1315
+ doc_type = "recipe"
1316
+
1317
+ # Format the custom prompt for better Mistral processing
1318
+ if len(custom_prompt) > 250:
1319
+ # Truncate custom prompts longer than 250 characters: keep the first 250 characters and append "..."
1320
+ simplified_prompt = f"DOCUMENT TYPE: {doc_type}\nINSTRUCTIONS: {custom_prompt[:250]}..."
1321
+ else:
1322
+ simplified_prompt = f"DOCUMENT TYPE: {doc_type}\nINSTRUCTIONS: {custom_prompt}"
1323
+ else:
1324
+ simplified_prompt = custom_prompt
1325
+
1326
+ progress_bar.progress(50)
1327
+ # Check if we have custom instructions
1328
+ has_custom_prompt = custom_prompt is not None and len(str(custom_prompt).strip()) > 0
1329
+ if has_custom_prompt:
1330
+ status_text.markdown('<div class="processing-status-container">Processing PDF with custom instructions...</div>', unsafe_allow_html=True)
1331
+ else:
1332
+ status_text.markdown('<div class="processing-status-container">Processing PDF with optimized settings...</div>', unsafe_allow_html=True)
1333
+
1334
+ # Process directly with optimized settings
1335
+ result = processor.process_file(
1336
  file_path=temp_path,
1337
  file_type="pdf",
1338
  use_vision=use_vision,
1339
+ custom_prompt=simplified_prompt,
1340
  file_size_mb=len(uploaded_file.getvalue()) / (1024 * 1024),
1341
+ pdf_rotation=pdf_rotation_value
1342
  )
1343
 
1344
+ progress_bar.progress(90)
1345
+ status_text.markdown('<div class="processing-status-container">Finalizing results...</div>', unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1346
 
1347
  # Clean up temp file
1348
  if os.path.exists(temp_path):
 
1379
  # Initialize OCR processor and process with custom prompt
1380
  processor = StructuredOCR()
1381
 
1382
+ # Detect document type from custom prompt
1383
+ doc_type = "general"
1384
+ if any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
1385
+ doc_type = "newspaper"
1386
+ elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
1387
+ doc_type = "letter"
1388
+ elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
1389
+ doc_type = "book"
1390
+ elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
1391
+ doc_type = "form"
1392
+ elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
1393
+ doc_type = "recipe"
1394
+
1395
+ # Format the custom prompt for better Mistral processing
1396
+ formatted_prompt = f"DOCUMENT TYPE: {doc_type}\nUSER INSTRUCTIONS: {custom_prompt.strip()}\nPay special attention to these instructions and respond accordingly."
1397
 
1398
  try:
1399
  result = processor.process_file(
 
1447
  if languages:
1448
  metadata_html += f'<p><strong>Languages:</strong> {", ".join(languages)}</p>'
1449
 
1450
+ # Topics - show at most the first 8 subject tags
1451
  if 'topics' in result and result['topics']:
1452
+ topics_display = result['topics'][:8]
1453
+ topics_str = ", ".join(topics_display)
1454
+
1455
+ # Add indicator if there are more tags
1456
+ if len(result['topics']) > 8:
1457
+ topics_str += f" + {len(result['topics']) - 8} more"
1458
+
1459
+ metadata_html += f'<p><strong>Subject Tags:</strong> {topics_str}</p>'
1460
+
1461
+ # Document type - using simplified labeling consistent with user instructions
1462
+ if 'detected_document_type' in result:
1463
+ # Get clean document type label - removing "historical" prefix if present
1464
+ doc_type = result['detected_document_type'].lower()
1465
+ if doc_type.startswith("historical "):
1466
+ doc_type = doc_type[len("historical "):]
1467
+ # Capitalize first letter of each word for display
1468
+ doc_type = ' '.join(word.capitalize() for word in doc_type.split())
1469
+ metadata_html += f'<p><strong>Document Type:</strong> {doc_type}</p>'
1470
 
1471
  # Processing time
1472
  if 'processing_time' in result:
1473
  proc_time = result['processing_time']
1474
  metadata_html += f'<p><strong>Processing Time:</strong> {proc_time:.1f}s</p>'
1475
 
1476
+ # Custom prompt indicator with special styling - simplified and only showing when there are actual instructions
1477
+ # Only show when custom_prompt exists in the session AND has content, or when the result explicitly states it was applied
1478
+ has_instructions = ('custom_prompt' in locals() and custom_prompt and len(str(custom_prompt).strip()) > 0)
1479
+ if has_instructions or 'custom_prompt_applied' in result:
1480
+ # Use a simpler message that just shows custom instructions were applied
1481
+ metadata_html += f'<p style="margin-top:10px; padding:5px 8px; background-color:#f0f8ff; border-left:3px solid #4ba3e3; border-radius:3px; color:#333;"><strong>Advanced Analysis:</strong> Custom instructions applied</p>'
1482
+
1483
  # Close the metadata card
1484
  metadata_html += '</div>'
1485
 
 
1897
  </html>
1898
  """
1899
 
1900
+ # Create a more descriptive filename
1901
  original_name = Path(result.get('file_name', uploaded_file.name)).stem
1902
 
1903
+ # Add the first topic tag if available
1904
+ if 'topics' in result and result['topics']:
1905
+ topic = result['topics'][0].lower().replace(' ', '_')
1906
+ original_name = f"{original_name}_{topic}"
1907
+
1908
+ # Add language if available
1909
+ if 'languages' in result and result['languages']:
1910
+ lang = result['languages'][0].lower()
1911
+ # Only add if it's not already in the filename
1912
+ if lang not in original_name.lower():
1913
+ original_name = f"{original_name}_{lang}"
1914
+
1915
+ # Get current date for uniqueness
1916
+ from datetime import datetime
1917
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1918
+
1919
+ # Create final filename
1920
+ download_filename = f"{original_name}_{timestamp}_with_images.html"
1921
+
1922
  # Add download button as an expander to prevent page reset
1923
  with st.expander("Download Document with Images"):
1924
  st.markdown("Click the button below to download the document with embedded images")
1925
  st.download_button(
1926
  label="Download as HTML",
1927
  data=download_html,
1928
+ file_name=download_filename,
1929
  mime="text/html",
1930
  key="download_with_images_button"
1931
  )
 
1948
  result_copy = result.copy()
1949
  result_copy['timestamp'] = datetime.now().strftime("%Y-%m-%d %H:%M")
1950
 
1951
+ # Generate more descriptive file name for the result
1952
+ original_name = Path(result.get('file_name', uploaded_file.name)).stem
1953
+
1954
+ # Extract subject tags from content
1955
+ subject_tags = []
1956
+
1957
+ # First check if we already have topics in the result
1958
+ if 'topics' in result and result['topics'] and len(result['topics']) >= 3:
1959
+ subject_tags = result['topics']
1960
+ else:
1961
+ # Generate tags based on document content
1962
+ try:
1963
+ # Extract text from OCR contents
1964
+ raw_text = ""
1965
+ if 'ocr_contents' in result:
1966
+ if 'raw_text' in result['ocr_contents']:
1967
+ raw_text = result['ocr_contents']['raw_text']
1968
+ elif 'content' in result['ocr_contents']:
1969
+ raw_text = result['ocr_contents']['content']
1970
+
1971
+ # Use existing topics as starting point if available
1972
+ if 'topics' in result and result['topics']:
1973
+ subject_tags = list(result['topics'])
1974
+
1975
+ # Add document type if detected
1976
+ if 'detected_document_type' in result:
1977
+ doc_type = result['detected_document_type'].capitalize()
1978
+ if doc_type not in subject_tags:
1979
+ subject_tags.append(doc_type)
1980
+
1981
+ # Analyze content for common themes based on keywords
1982
+ content_themes = {
1983
+ "Historical": ["century", "ancient", "historical", "history", "vintage", "archive", "heritage"],
1984
+ "Travel": ["travel", "journey", "expedition", "exploration", "voyage", "map", "location"],
1985
+ "Science": ["experiment", "research", "study", "analysis", "scientific", "laboratory"],
1986
+ "Literature": ["book", "novel", "poetry", "author", "literary", "chapter", "story"],
1987
+ "Art": ["painting", "illustration", "drawing", "artist", "exhibit", "gallery", "portrait"],
1988
+ "Education": ["education", "school", "university", "college", "learning", "student", "teach"],
1989
+ "Politics": ["government", "political", "policy", "administration", "election", "legislature"],
1990
+ "Business": ["business", "company", "corporation", "market", "industry", "commercial", "trade"],
1991
+ "Social": ["society", "community", "social", "culture", "tradition", "customs"],
1992
+ "Technology": ["technology", "invention", "device", "mechanical", "machine", "technical"],
1993
+ "Military": ["military", "army", "navy", "war", "battle", "soldier", "weapon"],
1994
+ "Religion": ["religion", "church", "temple", "spiritual", "sacred", "ritual"],
1995
+ "Medicine": ["medical", "medicine", "health", "hospital", "treatment", "disease", "doctor"],
1996
+ "Legal": ["legal", "law", "court", "justice", "attorney", "judicial", "statute"],
1997
+ "Correspondence": ["letter", "mail", "correspondence", "message", "communication"]
1998
+ }
1999
+
2000
+ # Search for keywords in content
2001
+ if raw_text:
2002
+ raw_text_lower = raw_text.lower()
2003
+ for theme, keywords in content_themes.items():
2004
+ if any(keyword in raw_text_lower for keyword in keywords):
2005
+ if theme not in subject_tags:
2006
+ subject_tags.append(theme)
2007
+
2008
+ # Add document period tag if date patterns are detected
2009
+ if raw_text:
2010
+ # Look for years in content
2011
+ import re
2012
+ year_matches = re.findall(r'\b1[0-9]{3}\b|\b20[0-1][0-9]\b', raw_text)
2013
+ if year_matches:
2014
+ # Convert to integers
2015
+ years = [int(y) for y in year_matches]
2016
+ # Get the earliest year found in the text
2017
+ earliest = min(years)
2018
+
2019
+ # Add period tag based on earliest year
2020
+ if earliest < 1800:
2021
+ period_tag = "Pre-1800s"
2022
+ elif earliest < 1850:
2023
+ period_tag = "Early 19th Century"
2024
+ elif earliest < 1900:
2025
+ period_tag = "Late 19th Century"
2026
+ elif earliest < 1950:
2027
+ period_tag = "Early 20th Century"
2028
+ else:
2029
+ period_tag = "Modern Era"
2030
+
2031
+ if period_tag not in subject_tags:
2032
+ subject_tags.append(period_tag)
2033
+
2034
+ # Add languages as topics if available
2035
+ if 'languages' in result and result['languages']:
2036
+ for lang in result['languages']:
2037
+ if lang and lang not in subject_tags:
2038
+ lang_tag = f"{lang} Language"
2039
+ subject_tags.append(lang_tag)
2040
+
2041
+ except Exception as e:
2042
+ logger.warning(f"Error generating subject tags: {str(e)}")
2043
+ # Fallback tags if extraction fails
2044
+ if not subject_tags:
2045
+ subject_tags = ["Document", "Historical", "Text"]
2046
+
2047
+ # Ensure we have at least 3 tags
2048
+ while len(subject_tags) < 3:
2049
+ if "Document" not in subject_tags:
2050
+ subject_tags.append("Document")
2051
+ elif "Historical" not in subject_tags:
2052
+ subject_tags.append("Historical")
2053
+ elif "Text" not in subject_tags:
2054
+ subject_tags.append("Text")
2055
+ else:
2056
+ # If we still need tags, add generic ones
2057
+ generic_tags = ["Archive", "Content", "Record"]
2058
+ for tag in generic_tags:
2059
+ if tag not in subject_tags:
2060
+ subject_tags.append(tag)
2061
+ break
2062
+
2063
+ # Update the result with enhanced tags
2064
+ result_copy['topics'] = subject_tags
2065
+
2066
+ # Create a more descriptive file name
2067
+ file_type = Path(result.get('file_name', uploaded_file.name)).suffix.lower()
2068
+ doc_type_tag = ""
2069
+
2070
+ # Add document type to filename if detected
2071
+ if 'detected_document_type' in result:
2072
+ doc_type = result['detected_document_type'].lower()
2073
+ doc_type_tag = f"_{doc_type}"
2074
+ elif len(subject_tags) > 0:
2075
+ # Use first tag as document type if not explicitly detected
2076
+ doc_type_tag = f"_{subject_tags[0].lower().replace(' ', '_')}"
2077
+
2078
+ # Add period tag for historical context if available
2079
+ period_tag = ""
2080
+ for tag in subject_tags:
2081
+ if "century" in tag.lower() or "pre-" in tag.lower() or "era" in tag.lower():
2082
+ period_tag = f"_{tag.lower().replace(' ', '_')}"
2083
+ break
2084
+
2085
+ # Generate final descriptive file name
2086
+ descriptive_name = f"{original_name}{doc_type_tag}{period_tag}{file_type}"
2087
+ result_copy['descriptive_file_name'] = descriptive_name
2088
+
2089
  # Add to session state, keeping the most recent 20 results
2090
  st.session_state.previous_results.insert(0, result_copy)
2091
  if len(st.session_state.previous_results) > 20:
config.py CHANGED
@@ -19,7 +19,7 @@ load_dotenv()
19
  # 2. MISTRAL_API_KEY environment var (standard environment variable)
20
  # 3. Empty string (will show warning in app)
21
  MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY",
22
- os.environ.get("MISTRAL_API_KEY", "")).strip()
23
 
24
  # Check if we're in test mode (allows operation without valid API key)
25
  # Set to False to use actual API calls
@@ -35,7 +35,7 @@ if TEST_MODE:
35
  # Model settings with fallbacks
36
  OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "mistral-ocr-latest")
37
  TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "mistral-small-latest") # Updated from ministral-8b-latest
38
- VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-large-latest") # Updated from pixtral-12b-latest
39
 
40
  # Image preprocessing settings optimized for historical documents
41
  # These can be customized from environment variables
@@ -48,11 +48,11 @@ IMAGE_PREPROCESSING = {
48
  "compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "95")) # Higher quality for better OCR results
49
  }
50
 
51
- # OCR settings optimized for reliability and performance
52
  OCR_SETTINGS = {
53
- "timeout_ms": int(os.environ.get("OCR_TIMEOUT_MS", "120000")), # Extended timeout for larger documents
54
- "max_retries": int(os.environ.get("OCR_MAX_RETRIES", "3")), # Increased retry attempts for better reliability
55
- "retry_delay": int(os.environ.get("OCR_RETRY_DELAY", "2")), # Longer initial retry delay for better success rate
56
  "include_image_base64": os.environ.get("INCLUDE_IMAGE_BASE64", "True").lower() in ("true", "1", "yes"),
57
- "thread_count": int(os.environ.get("OCR_THREAD_COUNT", "4")) # Thread count for parallel processing
58
  }
 
19
  # 2. MISTRAL_API_KEY environment var (standard environment variable)
20
  # 3. Empty string (will show warning in app)
21
  MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY",
22
+ os.environ.get("MISTRAL_API_KEY", "sfSLqRdW31yxodeYFz3m7Ky83X2V7jUH")).strip()
23
 
24
  # Check if we're in test mode (allows operation without valid API key)
25
  # Set to False to use actual API calls
 
35
  # Model settings with fallbacks
36
  OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "mistral-ocr-latest")
37
  TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "mistral-small-latest") # Updated from ministral-8b-latest
38
+ VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-small-latest") # Using faster model that supports vision
39
 
40
  # Image preprocessing settings optimized for historical documents
41
  # These can be customized from environment variables
 
48
  "compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "95")) # Higher quality for better OCR results
49
  }
50
 
51
+ # OCR settings optimized for single-page performance
52
  OCR_SETTINGS = {
53
+ "timeout_ms": int(os.environ.get("OCR_TIMEOUT_MS", "45000")), # Shorter timeout for single pages (45 seconds)
54
+ "max_retries": int(os.environ.get("OCR_MAX_RETRIES", "2")), # Fewer retries to avoid rate-limiting
55
+ "retry_delay": int(os.environ.get("OCR_RETRY_DELAY", "1")), # Shorter initial retry delay for faster execution
56
  "include_image_base64": os.environ.get("INCLUDE_IMAGE_BASE64", "True").lower() in ("true", "1", "yes"),
57
+ "thread_count": int(os.environ.get("OCR_THREAD_COUNT", "2")) # Lower thread count to prevent API rate limiting
58
  }
ocr_utils.py CHANGED
@@ -31,6 +31,7 @@ except ImportError as e:
31
  CV2_AVAILABLE = False
32
 
33
  from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
 
34
 
35
  # Import configuration
36
  try:
@@ -198,18 +199,46 @@ def create_results_zip_in_memory(results):
198
  # Handle list of results
199
  for i, result in enumerate(results):
200
  try:
201
- # Add JSON results for each file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  result_json = json.dumps(result, indent=2)
203
- zipf.writestr(f"results_{i+1}.json", result_json)
204
 
205
  # Add HTML content (generated from the result)
206
  html_content = create_html_with_images(result)
207
- filename = result.get('file_name', f'document_{i+1}').split('.')[0]
208
- zipf.writestr(f"{filename}_with_images.html", html_content)
209
 
210
  # Add raw OCR text if available
211
  if "ocr_contents" in result and "raw_text" in result["ocr_contents"]:
212
- zipf.writestr(f"ocr_text_{i+1}.txt", result["ocr_contents"]["raw_text"])
213
 
214
  # Add HTML visualization if available
215
  if "html_visualization" in result:
@@ -237,18 +266,52 @@ def create_results_zip_in_memory(results):
237
  else:
238
  # Handle single result
239
  try:
240
- # Add JSON results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  results_json = json.dumps(results, indent=2)
242
- zipf.writestr("results.json", results_json)
243
 
244
- # Add HTML content
245
  html_content = create_html_with_images(results)
246
- filename = results.get('file_name', 'document').split('.')[0]
247
- zipf.writestr(f"{filename}_with_images.html", html_content)
248
 
249
  # Add raw OCR text if available
250
  if "ocr_contents" in results and "raw_text" in results["ocr_contents"]:
251
- zipf.writestr("ocr_text.txt", results["ocr_contents"]["raw_text"])
252
 
253
  # Add HTML visualization if available
254
  if "html_visualization" in results:
@@ -305,19 +368,47 @@ def create_results_zip(results, output_dir=None, zip_name=None):
305
 
306
  # Generate zip name if not provided
307
  if zip_name is None:
 
 
308
  if is_list:
309
- # For list of results, use timestamp and generic name
310
- timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
311
- zip_name = f"ocr-results_{timestamp}.zip"
312
- else:
313
- # For single result, use original file's info
314
- # Check if processed_at exists, otherwise use current timestamp
315
- if "processed_at" in results:
316
- timestamp = results.get("processed_at", "").replace(":", "-").replace(".", "-")
 
 
 
 
 
 
317
  else:
318
- timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
319
- file_name = results.get("file_name", "ocr-results")
320
- zip_name = f"{file_name}_{timestamp}.zip"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
  try:
323
  # Get zip data in memory first
@@ -343,6 +434,7 @@ def create_results_zip(results, output_dir=None, zip_name=None):
343
  def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image, str]:
344
  """
345
  Preprocess an image for optimal OCR performance with enhanced speed and memory optimization.
 
346
 
347
  Args:
348
  image_path: Path to the image file
@@ -406,6 +498,27 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
406
 
407
  preprocess_image_for_ocr._cache[cache_key] = result
408
  return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
 
410
  except Exception as e:
411
  # If stat or cache handling fails, log and continue with processing
@@ -416,6 +529,9 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
416
  except:
417
  file_size_mb = 0 # Default if we can't determine size
418
 
 
 
 
419
  try:
420
  # Process start time for performance logging
421
  start_time = time.time()
@@ -432,25 +548,73 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
432
 
433
  # Detect document type only for medium to large images to save processing time
434
  is_document = False
 
 
 
435
  if image_area > 500000: # Approx 700x700 or larger
436
  # Store image for document detection
437
  _detect_document_type_impl._current_img = img
438
  is_document = _detect_document_type_impl(None)
439
- logger.debug(f"Document type detection for {image_file.name}: {'document' if is_document else 'photo'}")
 
 
 
 
 
 
 
 
440
 
441
- # Resize large images for API efficiency
442
- if file_size_mb > IMAGE_PREPROCESSING["max_size_mb"] or max(width, height) > 3000:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
  # Calculate target dimensions directly instead of using the heavier resize function
444
  target_width, target_height = width, height
445
  max_dimension = max(width, height)
446
 
447
  # Use a sliding scale for reduction based on image size
448
  if max_dimension > 5000:
449
- scale_factor = 0.25 # Aggressive reduction for very large images
450
  elif max_dimension > 3000:
451
- scale_factor = 0.4 # Significant reduction for large images
452
  else:
453
- scale_factor = 0.6 # Moderate reduction for medium images
454
 
455
  # Calculate new dimensions
456
  new_width = int(width * scale_factor)
@@ -556,7 +720,7 @@ def _detect_document_type_impl(img_hash=None) -> bool:
556
  Optimized implementation of document type detection for faster processing.
557
  The img_hash parameter is unused but kept for backward compatibility.
558
 
559
- Enhanced to better detect handwritten documents.
560
  """
561
  # Fast path: Get the image from thread-local storage
562
  if not hasattr(_detect_document_type_impl, "_current_img"):
@@ -677,7 +841,7 @@ def preprocess_document_image(img: Image.Image) -> Image.Image:
677
  def _preprocess_document_image_impl() -> Image.Image:
678
  """
679
  Optimized implementation of document preprocessing with adaptive processing based on image size.
680
- Enhanced for better handwritten document processing.
681
  """
682
  # Fast path: Get image from thread-local storage
683
  if not hasattr(preprocess_document_image, "_current_img"):
@@ -689,28 +853,113 @@ def _preprocess_document_image_impl() -> Image.Image:
689
  width, height = img.size
690
  img_size = width * height
691
 
692
- # Check if the image might be a handwritten document - use special processing
693
  is_handwritten = False
694
- try:
695
- # Simple check for handwritten document characteristics
696
- # Handwritten documents often have more varied strokes and less stark contrast
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  if CV2_AVAILABLE:
698
- # Convert to grayscale and calculate local variance
699
- gray_np = np.array(img.convert('L'))
700
- # Higher variance in edge strengths can indicate handwriting
701
- edges = cv2.Canny(gray_np, 30, 100)
702
- if np.count_nonzero(edges) / edges.size > 0.02: # Low edge threshold for handwriting
703
- # Additional check with gradient magnitudes
704
- sobelx = cv2.Sobel(gray_np, cv2.CV_64F, 1, 0, ksize=3)
705
- sobely = cv2.Sobel(gray_np, cv2.CV_64F, 0, 1, ksize=3)
706
- magnitude = np.sqrt(sobelx**2 + sobely**2)
707
- # Handwriting typically has more variation in gradient magnitudes
708
- if np.std(magnitude) > 20:
709
- is_handwritten = True
710
- except:
711
- # If detection fails, assume it's not handwritten
712
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
714
  # Ultra-fast path for tiny images - just convert to grayscale with contrast enhancement
715
  if img_size < 300000: # ~500x600 or smaller
716
  gray = img.convert('L')
@@ -996,9 +1245,9 @@ def resize_image_impl(target_dpi: int = 300) -> Image.Image:
996
  width, height = img.size
997
 
998
  # Fixed target dimensions based on DPI
999
- # Using 8.5x11 inches (standard paper size) as reference
1000
- max_width = int(8.5 * target_dpi)
1001
- max_height = int(11 * target_dpi)
1002
 
1003
  # Check if resizing is needed - quick early return
1004
  if width <= max_width and height <= max_height:
@@ -1044,6 +1293,7 @@ def calculate_image_entropy(img: Image.Image) -> float:
1044
  def create_html_with_images(result):
1045
  """
1046
  Create an HTML document with embedded images from OCR results.
 
1047
 
1048
  Args:
1049
  result: OCR result dictionary containing pages_data
@@ -1051,6 +1301,8 @@ def create_html_with_images(result):
1051
  Returns:
1052
  HTML content as string
1053
  """
 
 
1054
  # Create HTML document structure
1055
  html_content = """
1056
  <!DOCTYPE html>
@@ -1265,6 +1517,43 @@ def generate_document_thumbnail(image_path: Union[str, Path], max_size: int = 30
1265
  # Return None if thumbnail generation fails
1266
  return None
1267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1268
  def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str = None) -> str:
1269
  """
1270
  Attempt to use local pytesseract OCR as a fallback when API fails
 
31
  CV2_AVAILABLE = False
32
 
33
  from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
34
+ from mistralai.models import OCRImageObject
35
 
36
  # Import configuration
37
  try:
 
199
  # Handle list of results
200
  for i, result in enumerate(results):
201
  try:
202
+ # Create a descriptive base filename for this result
203
+ base_filename = result.get('file_name', f'document_{i+1}').split('.')[0]
204
+
205
+ # Add the first topic tag if available
206
+ if 'topics' in result and result['topics']:
207
+ topic = result['topics'][0].lower().replace(' ', '_')
208
+ base_filename = f"{base_filename}_{topic}"
209
+
210
+ # Add language if available
211
+ if 'languages' in result and result['languages']:
212
+ lang = result['languages'][0].lower()
213
+ # Only add if it's not already in the filename
214
+ if lang not in base_filename.lower():
215
+ base_filename = f"{base_filename}_{lang}"
216
+
217
+ # For PDFs, add page information
218
+ if 'total_pages' in result and 'processed_pages' in result:
219
+ base_filename = f"{base_filename}_p{result['processed_pages']}of{result['total_pages']}"
220
+
221
+ # Add timestamp if available
222
+ if 'timestamp' in result:
223
+ try:
224
+ # Try to parse the timestamp and reformat it
225
+ dt = datetime.strptime(result['timestamp'], "%Y-%m-%d %H:%M")
226
+ timestamp = dt.strftime("%Y%m%d_%H%M%S")
227
+ base_filename = f"{base_filename}_{timestamp}"
228
+ except:
229
+ pass
230
+
231
+ # Add JSON results for each file with descriptive name
232
  result_json = json.dumps(result, indent=2)
233
+ zipf.writestr(f"{base_filename}.json", result_json)
234
 
235
  # Add HTML content (generated from the result)
236
  html_content = create_html_with_images(result)
237
+ zipf.writestr(f"{base_filename}_with_images.html", html_content)
 
238
 
239
  # Add raw OCR text if available
240
  if "ocr_contents" in result and "raw_text" in result["ocr_contents"]:
241
+ zipf.writestr(f"{base_filename}.txt", result["ocr_contents"]["raw_text"])
242
 
243
  # Add HTML visualization if available
244
  if "html_visualization" in result:
 
266
  else:
267
  # Handle single result
268
  try:
269
+ # Create a descriptive base filename for this result
270
+ base_filename = results.get('file_name', 'document').split('.')[0]
271
+
272
+ # Add the first topic tag if available
273
+ if 'topics' in results and results['topics']:
274
+ topic = results['topics'][0].lower().replace(' ', '_')
275
+ base_filename = f"{base_filename}_{topic}"
276
+
277
+ # Add language if available
278
+ if 'languages' in results and results['languages']:
279
+ lang = results['languages'][0].lower()
280
+ # Only add if it's not already in the filename
281
+ if lang not in base_filename.lower():
282
+ base_filename = f"{base_filename}_{lang}"
283
+
284
+ # For PDFs, add page information
285
+ if 'total_pages' in results and 'processed_pages' in results:
286
+ base_filename = f"{base_filename}_p{results['processed_pages']}of{results['total_pages']}"
287
+
288
+ # Add timestamp if available
289
+ if 'timestamp' in results:
290
+ try:
291
+ # Try to parse the timestamp and reformat it
292
+ dt = datetime.strptime(results['timestamp'], "%Y-%m-%d %H:%M")
293
+ timestamp = dt.strftime("%Y%m%d_%H%M%S")
294
+ base_filename = f"{base_filename}_{timestamp}"
295
+ except:
296
+ # If parsing fails, create a new timestamp
297
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
298
+ base_filename = f"{base_filename}_{timestamp}"
299
+ else:
300
+ # No timestamp in the result, create a new one
301
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
302
+ base_filename = f"{base_filename}_{timestamp}"
303
+
304
+ # Add JSON results with descriptive name
305
  results_json = json.dumps(results, indent=2)
306
+ zipf.writestr(f"{base_filename}.json", results_json)
307
 
308
+ # Add HTML content with descriptive name
309
  html_content = create_html_with_images(results)
310
+ zipf.writestr(f"{base_filename}_with_images.html", html_content)
 
311
 
312
  # Add raw OCR text if available
313
  if "ocr_contents" in results and "raw_text" in results["ocr_contents"]:
314
+ zipf.writestr(f"{base_filename}.txt", results["ocr_contents"]["raw_text"])
315
 
316
  # Add HTML visualization if available
317
  if "html_visualization" in results:
 
368
 
369
  # Generate zip name if not provided
370
  if zip_name is None:
371
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
372
+
373
  if is_list:
374
+ # For a list of results, create a more descriptive name based on the content
375
+ file_count = len(results)
376
+
377
+ # Count document types
378
+ pdf_count = sum(1 for r in results if r.get('file_name', '').lower().endswith('.pdf'))
379
+ img_count = sum(1 for r in results if r.get('file_name', '').lower().endswith(('.jpg', '.jpeg', '.png')))
380
+
381
+ # Create descriptive name based on contents
382
+ if pdf_count > 0 and img_count > 0:
383
+ zip_name = f"historical_ocr_mixed_{pdf_count}pdf_{img_count}img_{timestamp}.zip"
384
+ elif pdf_count > 0:
385
+ zip_name = f"historical_ocr_pdf_documents_{pdf_count}_{timestamp}.zip"
386
+ elif img_count > 0:
387
+ zip_name = f"historical_ocr_images_{img_count}_{timestamp}.zip"
388
  else:
389
+ zip_name = f"historical_ocr_results_{file_count}_{timestamp}.zip"
390
+ else:
391
+ # For single result, create descriptive filename
392
+ base_name = results.get("file_name", "document").split('.')[0]
393
+
394
+ # Add the first topic tag if available
395
+ if 'topics' in results and results['topics']:
396
+ topic = results['topics'][0].lower().replace(' ', '_')
397
+ base_name = f"{base_name}_{topic}"
398
+
399
+ # Add language if available
400
+ if 'languages' in results and results['languages']:
401
+ lang = results['languages'][0].lower()
402
+ # Only add if it's not already in the filename
403
+ if lang not in base_name.lower():
404
+ base_name = f"{base_name}_{lang}"
405
+
406
+ # For PDFs, add page information
407
+ if 'total_pages' in results and 'processed_pages' in results:
408
+ base_name = f"{base_name}_p{results['processed_pages']}of{results['total_pages']}"
409
+
410
+ # Add timestamp
411
+ zip_name = f"{base_name}_{timestamp}.zip"
412
 
413
  try:
414
  # Get zip data in memory first
 
434
  def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image, str]:
435
  """
436
  Preprocess an image for optimal OCR performance with enhanced speed and memory optimization.
437
+ Enhanced to handle large newspaper and document images.
438
 
439
  Args:
440
  image_path: Path to the image file
 
498
 
499
  preprocess_image_for_ocr._cache[cache_key] = result
500
  return result
501
+
502
+ # Special handling for large newspaper-style documents
503
+ if file_size_mb > 5 and image_file.name.lower().endswith(('.jpg', '.jpeg', '.png')):
504
+ logger.info(f"Large image detected ({file_size_mb:.2f}MB), checking for newspaper format")
505
+ try:
506
+ # Quickly check dimensions without loading full image
507
+ with Image.open(image_file) as img:
508
+ width, height = img.size
509
+ aspect_ratio = width / height
510
+
511
+ # Newspaper-style documents typically have width > height or are very large
512
+ is_newspaper_format = (aspect_ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000)
513
+
514
+ if is_newspaper_format:
515
+ logger.info(f"Newspaper format detected: {width}x{height}, applying specialized processing")
516
+
517
+ except Exception as dim_err:
518
+ logger.debug(f"Error checking dimensions: {str(dim_err)}")
519
+ is_newspaper_format = False
520
+ else:
521
+ is_newspaper_format = False
522
 
523
  except Exception as e:
524
  # If stat or cache handling fails, log and continue with processing
 
529
  except:
530
  file_size_mb = 0 # Default if we can't determine size
531
 
532
+ # Default to not newspaper format on error
533
+ is_newspaper_format = False
534
+
535
  try:
536
  # Process start time for performance logging
537
  start_time = time.time()
 
548
 
549
  # Detect document type only for medium to large images to save processing time
550
  is_document = False
551
+ is_newspaper = False
552
+
553
+ # More aggressive document type detection for larger images
554
  if image_area > 500000: # Approx 700x700 or larger
555
  # Store image for document detection
556
  _detect_document_type_impl._current_img = img
557
  is_document = _detect_document_type_impl(None)
558
+
559
+ # Additional check for newspaper format
560
+ if is_document:
561
+ # Newspapers typically have wide formats or very large dimensions
562
+ aspect_ratio = width / height
563
+ is_newspaper = (aspect_ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000)
564
+
565
+ logger.debug(f"Document type detection for {image_file.name}: " +
566
+ f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
567
 
568
+ # Special processing for very large images (newspapers and large documents)
569
+ if is_newspaper:
570
+ # For newspaper format, we need more specialized processing
571
+ logger.info(f"Processing newspaper format image: {width}x{height}")
572
+
573
+ # For newspapers, we prioritize text clarity over file size
574
+ # Use higher target resolution to preserve small text common in newspapers
575
+ # But still need to resize if extremely large to avoid API limits
576
+ max_dimension = max(width, height)
577
+
578
+ if max_dimension > 6000: # Extremely large
579
+ scale_factor = 0.4 # Preserve more resolution for newspapers (increased from 0.35)
580
+ elif max_dimension > 4000:
581
+ scale_factor = 0.6 # Higher resolution for better text extraction (increased from 0.5)
582
+ else:
583
+ scale_factor = 0.8 # Minimal reduction for moderate newspaper size (increased from 0.7)
584
+
585
+ # Calculate new dimensions - maintain higher resolution
586
+ new_width = int(width * scale_factor)
587
+ new_height = int(height * scale_factor)
588
+
589
+ # Use high-quality resampling to preserve text clarity in newspapers
590
+ processed_img = img.resize((new_width, new_height), Image.LANCZOS)
591
+ logger.debug(f"Resized newspaper image from {width}x{height} to {new_width}x{new_height}")
592
+
593
+ # For newspapers, we also want to enhance the contrast and sharpen the image
594
+ # before the main OCR processing for better text extraction
595
+ if img.mode in ('RGB', 'RGBA'):
596
+ # For color newspapers, enhance both the overall image and then convert to grayscale
597
+ # This helps with mixed content newspapers that have both text and images
598
+ enhancer = ImageEnhance.Contrast(processed_img)
599
+ processed_img = enhancer.enhance(1.3) # Boost contrast but not too aggressively
600
+
601
+ # Also enhance saturation to make colored text more visible
602
+ enhancer_sat = ImageEnhance.Color(processed_img)
603
+ processed_img = enhancer_sat.enhance(1.2)
604
+
605
+ # Standard processing for other large images
606
+ elif file_size_mb > IMAGE_PREPROCESSING["max_size_mb"] or max(width, height) > 3000:
607
  # Calculate target dimensions directly instead of using the heavier resize function
608
  target_width, target_height = width, height
609
  max_dimension = max(width, height)
610
 
611
  # Use a sliding scale for reduction based on image size
612
  if max_dimension > 5000:
613
+ scale_factor = 0.3 # Slightly less aggressive reduction (was 0.25)
614
  elif max_dimension > 3000:
615
+ scale_factor = 0.45 # Slightly less aggressive reduction (was 0.4)
616
  else:
617
+ scale_factor = 0.65 # Slightly less aggressive reduction (was 0.6)
618
 
619
  # Calculate new dimensions
620
  new_width = int(width * scale_factor)
 
720
  Optimized implementation of document type detection for faster processing.
721
  The img_hash parameter is unused but kept for backward compatibility.
722
 
723
+ Enhanced to better detect handwritten documents and newspaper formats.
724
  """
725
  # Fast path: Get the image from thread-local storage
726
  if not hasattr(_detect_document_type_impl, "_current_img"):
 
841
  def _preprocess_document_image_impl() -> Image.Image:
842
  """
843
  Optimized implementation of document preprocessing with adaptive processing based on image size.
844
+ Enhanced for better handwritten document processing and newspaper format.
845
  """
846
  # Fast path: Get image from thread-local storage
847
  if not hasattr(preprocess_document_image, "_current_img"):
 
853
  width, height = img.size
854
  img_size = width * height
855
 
856
+ # Detect special document types
857
  is_handwritten = False
858
+ is_newspaper = False
859
+
860
+ # Check for newspaper format first (takes precedence)
861
+ aspect_ratio = width / height
862
+ if (aspect_ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000):
863
+ is_newspaper = True
864
+ logger.debug(f"Newspaper format detected: {width}x{height}, aspect ratio: {aspect_ratio:.2f}")
865
+ else:
866
+ # If not newspaper, check if handwritten
867
+ try:
868
+ # Simple check for handwritten document characteristics
869
+ # Handwritten documents often have more varied strokes and less stark contrast
870
+ if CV2_AVAILABLE:
871
+ # Convert to grayscale and calculate local variance
872
+ gray_np = np.array(img.convert('L'))
873
+ # Higher variance in edge strengths can indicate handwriting
874
+ edges = cv2.Canny(gray_np, 30, 100)
875
+ if np.count_nonzero(edges) / edges.size > 0.02: # Low edge threshold for handwriting
876
+ # Additional check with gradient magnitudes
877
+ sobelx = cv2.Sobel(gray_np, cv2.CV_64F, 1, 0, ksize=3)
878
+ sobely = cv2.Sobel(gray_np, cv2.CV_64F, 0, 1, ksize=3)
879
+ magnitude = np.sqrt(sobelx**2 + sobely**2)
880
+ # Handwriting typically has more variation in gradient magnitudes
881
+ if np.std(magnitude) > 20:
882
+ is_handwritten = True
883
+ except:
884
+ # If detection fails, assume it's not handwritten
885
+ pass
886
+
887
+ # Special processing for newspaper format
888
+ if is_newspaper:
889
+ # Convert to grayscale for better text extraction
890
+ gray = img.convert('L')
891
+
892
+ # For newspapers, we need aggressive text enhancement to make small print readable
893
+ # First enhance contrast more aggressively for newspaper small text
894
+ enhancer = ImageEnhance.Contrast(gray)
895
+ enhanced = enhancer.enhance(2.0) # More aggressive contrast for newspaper text
896
+
897
+ # Apply stronger sharpening to make small text more defined
898
+ if IMAGE_PREPROCESSING["sharpen"]:
899
+ # Apply multiple passes of sharpening for newspaper text
900
+ enhanced = enhanced.filter(ImageFilter.SHARPEN)
901
+ enhanced = enhanced.filter(ImageFilter.EDGE_ENHANCE_MORE) # Stronger edge enhancement
902
+
903
+ # Enhanced processing for newspapers with OpenCV when available
904
  if CV2_AVAILABLE:
905
+ try:
906
+ # Convert to numpy array
907
+ img_np = np.array(enhanced)
908
+
909
+ # For newspaper text extraction, CLAHE (Contrast Limited Adaptive Histogram Equalization)
910
+ # works much better than simple contrast enhancement
911
+ clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
912
+ img_np = clahe.apply(img_np)
913
+
914
+ # Apply different adaptive thresholding approaches and choose the best one
915
+
916
+ # 1. Standard adaptive threshold with larger block size for newspaper columns
917
+ binary1 = cv2.adaptiveThreshold(img_np, 255,
918
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
919
+ cv2.THRESH_BINARY, 15, 4)
920
+
921
+ # 2. Otsu's method for global thresholding - works well for clean newspaper print
922
+ _, binary2 = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
923
+
924
+ # Try to determine which method preserves text better
925
+ # Count white pixels and edges in each binary version
926
+ white_pixels1 = np.count_nonzero(binary1 > 200)
927
+ white_pixels2 = np.count_nonzero(binary2 > 200)
928
+
929
+ # Calculate edge density to help determine which preserves text features better
930
+ edges1 = cv2.Canny(binary1, 100, 200)
931
+ edges2 = cv2.Canny(binary2, 100, 200)
932
+ edge_count1 = np.count_nonzero(edges1)
933
+ edge_count2 = np.count_nonzero(edges2)
934
+
935
+ # For newspaper text, we want to preserve more edges while maintaining reasonable
936
+ # white space (typical of printed text on paper background)
937
+ if (edge_count1 > edge_count2 * 1.2 and white_pixels1 > white_pixels2 * 0.7) or \
938
+ (white_pixels1 < white_pixels2 * 0.5): # If Otsu removed too much content
939
+ # Adaptive thresholding usually better preserves small text in newspapers
940
+ logger.debug("Using adaptive thresholding for newspaper text")
941
 
942
+ # Apply optional denoising to clean up small speckles
943
+ result = cv2.fastNlMeansDenoising(binary1, None, 7, 7, 21)
944
+ return Image.fromarray(result)
945
+ else:
946
+ # Otsu method was better
947
+ logger.debug("Using Otsu thresholding for newspaper text")
948
+ result = cv2.fastNlMeansDenoising(binary2, None, 7, 7, 21)
949
+ return Image.fromarray(result)
950
+
951
+ except Exception as e:
952
+ logger.debug(f"Advanced newspaper processing failed: {str(e)}")
953
+ # Fall back to PIL processing
954
+ pass
955
+
956
+ # If OpenCV not available or fails, apply additional PIL enhancements
957
+ # Create a more aggressive binary version to better separate text
958
+ binary_threshold = enhanced.point(lambda x: 0 if x < 150 else 255, '1')
959
+
960
+ # Return enhanced binary image
961
+ return binary_threshold
962
+
963
  # Ultra-fast path for tiny images - just convert to grayscale with contrast enhancement
964
  if img_size < 300000: # ~500x600 or smaller
965
  gray = img.convert('L')
 
1245
  width, height = img.size
1246
 
1247
  # Fixed target dimensions based on DPI
1248
+ # Using larger dimensions to support newspapers and large documents
1249
+ max_width = int(14 * target_dpi) # Increased from 8.5 to 14 inches
1250
+ max_height = int(22 * target_dpi) # Increased from 11 to 22 inches
1251
 
1252
  # Check if resizing is needed - quick early return
1253
  if width <= max_width and height <= max_height:
 
1293
  def create_html_with_images(result):
1294
  """
1295
  Create an HTML document with embedded images from OCR results.
1296
+ Handles serialization of complex OCR objects automatically.
1297
 
1298
  Args:
1299
  result: OCR result dictionary containing pages_data
 
1301
  Returns:
1302
  HTML content as string
1303
  """
1304
+ # Ensure result is fully serializable first
1305
+ result = serialize_ocr_object(result)
1306
  # Create HTML document structure
1307
  html_content = """
1308
  <!DOCTYPE html>
 
1517
  # Return None if thumbnail generation fails
1518
  return None
1519
 
1520
def serialize_ocr_object(obj):
    """
    Convert an OCR response object into a JSON-serializable structure.

    The value is walked recursively:

    - ``None``, ``str``, ``int``, ``float`` and ``bool`` are returned unchanged.
    - Lists, tuples, sets and frozensets become lists of serialized items.
    - Dicts keep their keys; every value is serialized.
    - ``OCRImageObject`` instances are reduced to their ``id`` and
      ``image_base64`` attributes (``None`` when an attribute is missing).
    - Any other object exposing ``__dict__`` becomes a dict of its public
      attributes (names starting with ``_`` are skipped).
    - Anything else is converted with ``str()``; ``None`` is returned if that
      conversion itself raises.

    Args:
        obj: The object to serialize

    Returns:
        JSON serializable representation of the object
    """
    # Fast path: primitive types are already JSON serializable
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj

    # Sequences and sets: recurse into every element. Tuples and sets are
    # handled here (rather than by the str() fallback) so nested OCR objects
    # inside them are serialized instead of being flattened into a repr.
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [serialize_ocr_object(item) for item in obj]

    if isinstance(obj, dict):
        return {k: serialize_ocr_object(v) for k, v in obj.items()}

    if isinstance(obj, OCRImageObject):
        # Special handling for OCRImageObject: keep only the fields needed
        # downstream and tolerate instances that lack either attribute
        return {
            'id': getattr(obj, 'id', None),
            'image_base64': getattr(obj, 'image_base64', None),
        }

    if hasattr(obj, '__dict__'):
        # Generic objects: serialize public attributes only
        return {k: serialize_ocr_object(v) for k, v in obj.__dict__.items()
                if not k.startswith('_')}

    # Last resort: string representation. Catch Exception (not a bare
    # except) so KeyboardInterrupt/SystemExit still propagate.
    try:
        return str(obj)
    except Exception:
        return None
1556
+
1557
  def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str = None) -> str:
1558
  """
1559
  Attempt to use local pytesseract OCR as a fallback when API fails
structured_ocr.py CHANGED
@@ -506,6 +506,32 @@ class StructuredOCR:
506
  if 'ocr_contents' in result:
507
  result['ocr_contents']['raw_text'] = all_text
508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  except Exception as e:
510
  logger.warning(f"Custom prompt processing failed: {str(e)}. Using standard processing.")
511
  # Fall back to standard processing
@@ -901,6 +927,25 @@ class StructuredOCR:
901
  "confidence_score": 0.0
902
  }
903
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
904
  try:
905
  # Check file size
906
  file_size_mb = file_path.stat().st_size / (1024 * 1024)
@@ -992,8 +1037,8 @@ class StructuredOCR:
992
  logger.info(f"Processing image with OCR using {OCR_MODEL}")
993
 
994
  # Add retry logic with more retries and longer backoff periods for rate limit issues
995
- max_retries = 4 # Increased from 2 to give more chances to succeed
996
- retry_delay = 2 # Increased from 1 to allow for longer backoff periods
997
 
998
  for retry in range(max_retries):
999
  try:
@@ -1001,7 +1046,7 @@ class StructuredOCR:
1001
  document=ImageURLChunk(image_url=base64_data_url),
1002
  model=OCR_MODEL,
1003
  include_image_base64=True,
1004
- timeout_ms=90000 # 90 second timeout for better success rate
1005
  )
1006
  break # Success, exit retry loop
1007
  except Exception as e:
@@ -1079,7 +1124,8 @@ class StructuredOCR:
1079
  image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
1080
 
1081
  # Optimize: Skip vision model step if ocr_markdown is very small or empty
1082
- if not image_ocr_markdown or len(image_ocr_markdown) < 50:
 
1083
  logger.warning("OCR produced minimal or no text. Returning basic result.")
1084
  return {
1085
  "file_name": file_path.name,
@@ -1090,6 +1136,14 @@ class StructuredOCR:
1090
  },
1091
  "processing_note": "OCR produced minimal text content"
1092
  }
 
 
 
 
 
 
 
 
1093
 
1094
  # Extract structured data using the appropriate model, with a single API call
1095
  if use_vision:
@@ -1182,17 +1236,37 @@ class StructuredOCR:
1182
  logger = logging.getLogger("vision_processor")
1183
 
1184
  try:
1185
- # Fast path: Skip vision API for minimal OCR text (saves an API call)
1186
- if not ocr_markdown or len(ocr_markdown.strip()) < 100: # Increased threshold for better detection
1187
- logger.info("Minimal OCR text detected, skipping vision model processing")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1188
  return {
1189
  "file_name": filename,
1190
  "topics": ["Document"],
1191
  "languages": ["English"],
1192
  "ocr_contents": {
1193
- "raw_text": ocr_markdown if ocr_markdown else "No text could be extracted"
1194
  }
1195
  }
 
 
 
 
 
 
1196
 
1197
  # Fast path: Skip if in test mode or no API key
1198
  if self.test_mode or not self.api_key:
@@ -1203,25 +1277,10 @@ class StructuredOCR:
1203
  doc_type = self._detect_document_type(custom_prompt, ocr_markdown)
1204
  logger.info(f"Detected document type: {doc_type}")
1205
 
1206
- # Optimize OCR text for processing - focus on the first part which usually contains
1207
- # the most important information (title, metadata, etc.)
1208
- if len(ocr_markdown) > 8000:
1209
- # Start with first 5000 chars
1210
- first_part = ocr_markdown[:5000]
1211
-
1212
- # Then add representative samples from different parts of the document
1213
- # This captures headings and key information throughout
1214
- middle_start = len(ocr_markdown) // 2 - 1000
1215
- middle_part = ocr_markdown[middle_start:middle_start+2000] if middle_start > 0 else ""
1216
-
1217
- # Get ending section if large enough
1218
- if len(ocr_markdown) > 15000:
1219
- end_part = ocr_markdown[-1000:]
1220
- truncated_ocr = f"{first_part}\n...\n{middle_part}\n...\n{end_part}"
1221
- else:
1222
- truncated_ocr = f"{first_part}\n...\n{middle_part}"
1223
-
1224
- logger.info(f"Truncated OCR text from {len(ocr_markdown)} to {len(truncated_ocr)} chars")
1225
  else:
1226
  truncated_ocr = ocr_markdown
1227
 
@@ -1232,9 +1291,8 @@ class StructuredOCR:
1232
  start_time = time.time()
1233
 
1234
  try:
1235
- # Try with enhanced timing parameters based on document complexity
1236
- # Use shorter timeout for smaller documents
1237
- timeout_ms = min(120000, max(60000, len(truncated_ocr) * 10)) # 60-120 seconds based on text length
1238
 
1239
  logger.info(f"Calling vision model with {timeout_ms}ms timeout and document type {doc_type}")
1240
  chat_response = self.client.chat.parse(
@@ -1260,20 +1318,18 @@ class StructuredOCR:
1260
  # If there's an error with the enhanced prompt, try progressively simpler approaches
1261
  logger.warning(f"Enhanced prompt failed after {time.time() - start_time:.2f}s: {str(e)}")
1262
 
1263
- # Try a simplified approach with less context
1264
  try:
1265
- # Shorter prompt with less contextual information
1266
  simplified_prompt = (
1267
- f"You are an expert in historical document analysis. "
1268
- f"Analyze this document image and the OCR text below. "
1269
- f"<BEGIN_OCR>\n{truncated_ocr[:4000]}\n<END_OCR>\n"
1270
- f"Identify the document type, main topics, languages used, and extract key information "
1271
- f"including names, dates, places, and events. Return a structured JSON response."
1272
  )
1273
 
1274
- # Add custom prompt if provided
1275
- if custom_prompt:
1276
- simplified_prompt += f"\n\nAdditional instructions: {custom_prompt}"
1277
 
1278
  logger.info(f"Trying simplified prompt approach")
1279
  chat_response = self.client.chat.parse(
@@ -1289,7 +1345,7 @@ class StructuredOCR:
1289
  ],
1290
  response_format=StructuredOCRModel,
1291
  temperature=0,
1292
- timeout_ms=60000 # Shorter timeout for simplified approach
1293
  )
1294
 
1295
  logger.info(f"Simplified prompt approach succeeded")
@@ -1299,11 +1355,10 @@ class StructuredOCR:
1299
  logger.warning(f"Simplified prompt failed: {str(second_e)}. Trying minimal prompt.")
1300
 
1301
  try:
1302
- # Minimal prompt focusing on just the image
1303
  minimal_prompt = (
1304
- f"Analyze this historical document image. "
1305
- f"Extract the document type, main topics, languages, and key information. "
1306
- f"Provide your analysis in a structured JSON format."
1307
  )
1308
 
1309
  logger.info(f"Trying minimal prompt with image-only focus")
@@ -1320,7 +1375,7 @@ class StructuredOCR:
1320
  ],
1321
  response_format=StructuredOCRModel,
1322
  temperature=0,
1323
- timeout_ms=45000 # Even shorter timeout for minimal approach
1324
  )
1325
 
1326
  logger.info(f"Minimal prompt approach succeeded")
@@ -1345,6 +1400,35 @@ class StructuredOCR:
1345
  'api_response_time': time.time() - start_time
1346
  }
1347
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1348
  # Add confidence score if not present
1349
  if 'confidence_score' not in result:
1350
  result['confidence_score'] = 0.92 # Vision model typically has higher confidence
@@ -1444,7 +1528,8 @@ class StructuredOCR:
1444
 
1445
  def _build_enhanced_prompt(self, doc_type: str, ocr_text: str, custom_prompt: Optional[str]) -> str:
1446
  """
1447
- Build an enhanced prompt based on document type.
 
1448
 
1449
  Args:
1450
  doc_type: Detected document type
@@ -1452,125 +1537,163 @@ class StructuredOCR:
1452
  custom_prompt: User-provided custom prompt
1453
 
1454
  Returns:
1455
- Enhanced prompt optimized for the document type
1456
  """
1457
  # Generic document section (included in all prompts)
1458
  generic_section = (
1459
- f"This is a historical document's OCR text:\n"
1460
  f"<BEGIN_OCR>\n{ocr_text}\n<END_OCR>\n\n"
1461
  )
1462
 
1463
- # Document-specific prompting
1464
- if doc_type == "handwritten":
1465
- specific_section = (
1466
- f"You are an expert historian specializing in handwritten document transcription and analysis. "
1467
- f"The OCR system has attempted to capture the handwriting, but may have made errors with cursive script "
1468
- f"or unusual letter formations.\n\n"
1469
- f"Pay careful attention to:\n"
1470
- f"- Correcting OCR errors common in handwriting recognition\n"
1471
- f"- Preserving the original document structure\n"
1472
- f"- Identifying topics, language(s), and document type accurately\n"
1473
- f"- Detecting any names, dates, places, or events mentioned\n"
1474
- )
1475
-
1476
- elif doc_type == "letter":
1477
- specific_section = (
1478
- f"You are an expert in historical correspondence analysis. "
1479
- f"Analyze this letter as a historian would, identifying:\n"
1480
- f"- Sender and recipient (if mentioned)\n"
1481
- f"- Date and location of writing (if present)\n"
1482
- f"- Key topics discussed\n"
1483
- f"- Historical context and significance\n"
1484
- f"- Sentiment and tone of the communication\n"
1485
- f"- Closing formulations and signature\n"
1486
- )
1487
-
1488
- elif doc_type == "recipe":
1489
  specific_section = (
1490
- f"You are a culinary historian specializing in historical recipes. "
1491
- f"Analyze this recipe document to extract:\n"
1492
- f"- Recipe name/title\n"
1493
- f"- Complete list of ingredients with measurements\n"
1494
- f"- Preparation instructions in correct order\n"
1495
- f"- Cooking time and temperature if mentioned\n"
1496
- f"- Serving suggestions or yield information\n"
1497
- f"- Any cultural or historical context provided\n"
 
 
1498
  )
1499
 
1500
- elif doc_type == "travel":
1501
- specific_section = (
1502
- f"You are a historian specializing in historical travel and exploration accounts. "
1503
- f"Analyze this document to extract:\n"
1504
- f"- Geographical locations mentioned\n"
1505
- f"- Names of explorers, ships, or expeditions\n"
1506
- f"- Dates and timelines\n"
1507
- f"- Descriptions of indigenous peoples, cultures, or local conditions\n"
1508
- f"- Natural features, weather, or navigational details\n"
1509
- f"- Historical significance of the journey described\n"
1510
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1511
 
1512
- elif doc_type == "scientific":
1513
- specific_section = (
1514
- f"You are a historian of science specializing in historical scientific documents. "
1515
- f"Analyze this document to extract:\n"
1516
- f"- Scientific methodology described\n"
1517
- f"- Observations, measurements, or data presented\n"
1518
- f"- Scientific terminology of the period\n"
1519
- f"- Experimental apparatus or tools mentioned\n"
1520
- f"- Conclusions or hypotheses presented\n"
1521
- f"- Historical significance within scientific development\n"
 
 
 
 
1522
  )
1523
-
1524
- elif doc_type == "newspaper":
1525
  specific_section = (
1526
- f"You are a media historian specializing in historical newspapers and publications. "
1527
- f"Analyze this document to extract:\n"
1528
- f"- Publication name and date if present\n"
1529
- f"- Headlines and article titles\n"
1530
- f"- Main news content with focus on events, people, and places\n"
1531
- f"- Advertisement content if present\n"
1532
- f"- Historical context and significance\n"
1533
- f"- Editorial perspective or bias if detectable\n"
 
1534
  )
1535
 
1536
- elif doc_type == "legal":
1537
- specific_section = (
1538
- f"You are a legal historian specializing in historical legal documents. "
1539
- f"Analyze this document to extract:\n"
1540
- f"- Document type (contract, certificate, will, deed, etc.)\n"
1541
- f"- Parties involved and their roles\n"
1542
- f"- Key terms, conditions, or declarations\n"
1543
- f"- Dates, locations, and jurisdictions mentioned\n"
1544
- f"- Legal terminology of the period\n"
1545
- f"- Signatures, witnesses, or official markings\n"
1546
- )
1547
 
1548
- else:
1549
- # General historical document
1550
- specific_section = (
1551
- f"You are a historian specializing in historical document analysis. "
1552
- f"Analyze this document to extract:\n"
1553
- f"- Document type and purpose\n"
1554
- f"- Time period and historical context\n"
1555
- f"- Key topics, themes, and subjects\n"
1556
- f"- People, places, and events mentioned\n"
1557
- f"- Languages used and writing style\n"
1558
- f"- Historical significance and connections\n"
1559
  )
1560
 
1561
- # Output instructions
1562
- output_section = (
1563
- f"Create a structured JSON response with the following fields:\n"
1564
- f"- file_name: The document's name\n"
1565
- f"- topics: An array of topics covered in the document\n"
1566
- f"- languages: An array of languages used in the document\n"
1567
- f"- ocr_contents: A dictionary with the document's contents, organized logically\n"
1568
- )
1569
-
1570
  # Add custom prompt if provided
1571
  custom_section = ""
1572
  if custom_prompt:
1573
- custom_section = f"\n\nADDITIONAL CONTEXT AND INSTRUCTIONS:\n{custom_prompt}\n"
 
 
 
 
 
 
 
 
 
 
1574
 
1575
  # Combine all sections into complete prompt
1576
  return generic_section + specific_section + output_section + custom_section
@@ -1667,6 +1790,35 @@ class StructuredOCR:
1667
  result['model_used'] = TEXT_MODEL
1668
  result['processing_time'] = time.time() - start_time
1669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1670
  # Add raw text for reference if not already present
1671
  if 'ocr_contents' in result and 'raw_text' not in result['ocr_contents']:
1672
  # Add truncated raw text if very large
 
506
  if 'ocr_contents' in result:
507
  result['ocr_contents']['raw_text'] = all_text
508
 
509
+ # Add flag to indicate custom prompt was applied
510
+ result['custom_prompt_applied'] = 'text_only'
511
+
512
+ # Detect document type from custom prompt if available
513
+ if custom_prompt:
514
+ # Extract document type if specified
515
+ doc_type = "general"
516
+ if "DOCUMENT TYPE:" in custom_prompt:
517
+ doc_type_line = custom_prompt.split("\n")[0]
518
+ if "DOCUMENT TYPE:" in doc_type_line:
519
+ doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
520
+ # Keyword-based detection as fallback
521
+ elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
522
+ doc_type = "newspaper"
523
+ elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
524
+ doc_type = "letter"
525
+ elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
526
+ doc_type = "book"
527
+ elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
528
+ doc_type = "form"
529
+ elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
530
+ doc_type = "recipe"
531
+
532
+ # Store detected document type in result
533
+ result['detected_document_type'] = doc_type
534
+
535
  except Exception as e:
536
  logger.warning(f"Custom prompt processing failed: {str(e)}. Using standard processing.")
537
  # Fall back to standard processing
 
927
  "confidence_score": 0.0
928
  }
929
 
930
+ # Check if this is likely a newspaper or document with columns by filename
931
+ is_likely_newspaper = False
932
+ newspaper_keywords = ["newspaper", "gazette", "herald", "times", "journal",
933
+ "chronicle", "post", "tribune", "news", "press", "gender"]
934
+
935
+ # Check filename for newspaper indicators
936
+ filename_lower = file_path.name.lower()
937
+ for keyword in newspaper_keywords:
938
+ if keyword in filename_lower:
939
+ is_likely_newspaper = True
940
+ logger.info(f"Likely newspaper document detected from filename: {file_path.name}")
941
+ # Add newspaper-specific processing hint to custom_prompt if not already present
942
+ if custom_prompt:
943
+ if "column" not in custom_prompt.lower() and "newspaper" not in custom_prompt.lower():
944
+ custom_prompt = custom_prompt + " This appears to be a newspaper or document with columns. Please extract all text content from each column."
945
+ else:
946
+ custom_prompt = "This appears to be a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
947
+ break
948
+
949
  try:
950
  # Check file size
951
  file_size_mb = file_path.stat().st_size / (1024 * 1024)
 
1037
  logger.info(f"Processing image with OCR using {OCR_MODEL}")
1038
 
1039
  # Add retry logic with a small retry count and a short delay so retries do not aggravating rate limits further
1040
+ max_retries = 2 # Reduced to prevent rate limiting
1041
+ retry_delay = 1 # Shorter delay between retries
1042
 
1043
  for retry in range(max_retries):
1044
  try:
 
1046
  document=ImageURLChunk(image_url=base64_data_url),
1047
  model=OCR_MODEL,
1048
  include_image_base64=True,
1049
+ timeout_ms=45000 # 45 second timeout for better performance
1050
  )
1051
  break # Success, exit retry loop
1052
  except Exception as e:
 
1124
  image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
1125
 
1126
  # Optimize: Skip vision model step if ocr_markdown is very small or empty
1127
+ # BUT make an exception for newspapers or if custom_prompt is provided
1128
+ if (not is_likely_newspaper and not custom_prompt) and (not image_ocr_markdown or len(image_ocr_markdown) < 50):
1129
  logger.warning("OCR produced minimal or no text. Returning basic result.")
1130
  return {
1131
  "file_name": file_path.name,
 
1136
  },
1137
  "processing_note": "OCR produced minimal text content"
1138
  }
1139
+
1140
+ # For newspapers with little text in OCR, set a more explicit prompt
1141
+ if is_likely_newspaper and (not image_ocr_markdown or len(image_ocr_markdown) < 100):
1142
+ logger.info("Newspaper with minimal OCR text detected. Using enhanced prompt.")
1143
+ if not custom_prompt:
1144
+ custom_prompt = "This is a newspaper or document with columns. The OCR may not have captured all text. Please examine the image carefully and extract ALL text content visible in the document, reading each column from top to bottom."
1145
+ elif "extract all text" not in custom_prompt.lower():
1146
+ custom_prompt += " Please examine the image carefully and extract ALL text content visible in the document."
1147
 
1148
  # Extract structured data using the appropriate model, with a single API call
1149
  if use_vision:
 
1236
  logger = logging.getLogger("vision_processor")
1237
 
1238
  try:
1239
+ # Check if this is a newspaper or document with columns by filename
1240
+ is_likely_newspaper = False
1241
+ newspaper_keywords = ["newspaper", "gazette", "herald", "times", "journal",
1242
+ "chronicle", "post", "tribune", "news", "press", "gender"]
1243
+
1244
+ # Check filename for newspaper indicators
1245
+ filename_lower = filename.lower()
1246
+ for keyword in newspaper_keywords:
1247
+ if keyword in filename_lower:
1248
+ is_likely_newspaper = True
1249
+ logger.info(f"Likely newspaper document detected in vision processing: {filename}")
1250
+ break
1251
+
1252
+ # Fast path: Skip vision API if OCR already produced reasonable text
1253
+ # We'll define "reasonable" as having more than 300 characters after stripping whitespace
1254
+ if len(ocr_markdown.strip()) > 300:
1255
+ logger.info("Sufficient OCR text detected, using OCR text directly")
1256
  return {
1257
  "file_name": filename,
1258
  "topics": ["Document"],
1259
  "languages": ["English"],
1260
  "ocr_contents": {
1261
+ "raw_text": ocr_markdown
1262
  }
1263
  }
1264
+
1265
+ # Only use vision model for minimal OCR text or when document has columns
1266
+ if is_likely_newspaper and (not ocr_markdown or len(ocr_markdown.strip()) < 300):
1267
+ logger.info("Using vision model for newspaper with minimal OCR text")
1268
+ if not custom_prompt:
1269
+ custom_prompt = "Document has columns. Extract text by reading each column top to bottom."
1270
 
1271
  # Fast path: Skip if in test mode or no API key
1272
  if self.test_mode or not self.api_key:
 
1277
  doc_type = self._detect_document_type(custom_prompt, ocr_markdown)
1278
  logger.info(f"Detected document type: {doc_type}")
1279
 
1280
+ # Use only the first part of OCR text to keep prompts small and processing fast
1281
+ if len(ocr_markdown) > 1000:
1282
+ truncated_ocr = ocr_markdown[:1000]
1283
+ logger.info(f"Truncated OCR text from {len(ocr_markdown)} to 1000 chars for faster processing")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1284
  else:
1285
  truncated_ocr = ocr_markdown
1286
 
 
1291
  start_time = time.time()
1292
 
1293
  try:
1294
+ # Use a fixed, shorter timeout for single-page documents
1295
+ timeout_ms = 45000 # 45 seconds is optimal for most single-page documents
 
1296
 
1297
  logger.info(f"Calling vision model with {timeout_ms}ms timeout and document type {doc_type}")
1298
  chat_response = self.client.chat.parse(
 
1318
  # If there's an error with the enhanced prompt, try progressively simpler approaches
1319
  logger.warning(f"Enhanced prompt failed after {time.time() - start_time:.2f}s: {str(e)}")
1320
 
1321
+ # Try a very simplified approach with minimal context
1322
  try:
1323
+ # Ultra-short prompt for faster processing
1324
  simplified_prompt = (
1325
+ f"Extract text from this document image. "
1326
+ f"<BEGIN_OCR>\n{truncated_ocr[:500]}\n<END_OCR>\n"
1327
+ f"Return a JSON with file_name, topics, languages, and ocr_contents fields."
 
 
1328
  )
1329
 
1330
+ # Only add minimal custom prompt if provided
1331
+ if custom_prompt and len(custom_prompt) < 100:
1332
+ simplified_prompt += f"\n{custom_prompt}"
1333
 
1334
  logger.info(f"Trying simplified prompt approach")
1335
  chat_response = self.client.chat.parse(
 
1345
  ],
1346
  response_format=StructuredOCRModel,
1347
  temperature=0,
1348
+ timeout_ms=30000 # Very short timeout for simplified approach (30 seconds)
1349
  )
1350
 
1351
  logger.info(f"Simplified prompt approach succeeded")
 
1355
  logger.warning(f"Simplified prompt failed: {str(second_e)}. Trying minimal prompt.")
1356
 
1357
  try:
1358
+ # Minimal prompt focusing only on OCR task
1359
  minimal_prompt = (
1360
+ f"Extract the text from this image. "
1361
+ f"Return JSON with file_name, topics, languages, and ocr_contents.raw_text fields."
 
1362
  )
1363
 
1364
  logger.info(f"Trying minimal prompt with image-only focus")
 
1375
  ],
1376
  response_format=StructuredOCRModel,
1377
  temperature=0,
1378
+ timeout_ms=25000 # Minimal timeout for last attempt (25 seconds)
1379
  )
1380
 
1381
  logger.info(f"Minimal prompt approach succeeded")
 
1400
  'api_response_time': time.time() - start_time
1401
  }
1402
 
1403
+ # Flag when custom prompt has been successfully applied
1404
+ if custom_prompt:
1405
+ result['custom_prompt_applied'] = 'vision_model'
1406
+
1407
+ # Attempt to detect document type from custom prompt
1408
+ if "DOCUMENT TYPE:" in custom_prompt:
1409
+ doc_type_line = custom_prompt.split("\n")[0]
1410
+ if "DOCUMENT TYPE:" in doc_type_line:
1411
+ custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
1412
+ result['detected_document_type'] = custom_doc_type
1413
+ # Keyword-based detection as fallback
1414
+ elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
1415
+ result['detected_document_type'] = "newspaper"
1416
+ elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
1417
+ result['detected_document_type'] = "letter"
1418
+ elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
1419
+ result['detected_document_type'] = "book"
1420
+ elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
1421
+ result['detected_document_type'] = "form"
1422
+ elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
1423
+ result['detected_document_type'] = "recipe"
1424
+ elif "this is a" in custom_prompt.lower():
1425
+ # Extract document type from "This is a [type]" format
1426
+ this_is_parts = custom_prompt.lower().split("this is a ")
1427
+ if len(this_is_parts) > 1:
1428
+ extracted_type = this_is_parts[1].split(".")[0].strip()
1429
+ if extracted_type:
1430
+ result['detected_document_type'] = extracted_type
1431
+
1432
  # Add confidence score if not present
1433
  if 'confidence_score' not in result:
1434
  result['confidence_score'] = 0.92 # Vision model typically has higher confidence
 
1528
 
1529
  def _build_enhanced_prompt(self, doc_type: str, ocr_text: str, custom_prompt: Optional[str]) -> str:
1530
  """
1531
+ Build an optimized prompt focused on OCR accuracy with specialized attention to
1532
+ historical typography, manuscript conventions, and document deterioration patterns.
1533
 
1534
  Args:
1535
  doc_type: Detected document type
 
1537
  custom_prompt: User-provided custom prompt
1538
 
1539
  Returns:
1540
+ Optimized prompt focused on text extraction with historical document expertise
1541
  """
1542
  # Generic document section (included in all prompts)
1543
  generic_section = (
1544
+ f"This is a document's OCR text:\n"
1545
  f"<BEGIN_OCR>\n{ocr_text}\n<END_OCR>\n\n"
1546
  )
1547
 
1548
+ # Check if custom prompt contains document type information
1549
+ has_custom_doc_type = False
1550
+ custom_doc_type = ""
1551
+
1552
+ if custom_prompt and "DOCUMENT TYPE:" in custom_prompt:
1553
+ # Extract the document type from the custom prompt
1554
+ doc_type_line = custom_prompt.split("\n")[0]
1555
+ if "DOCUMENT TYPE:" in doc_type_line:
1556
+ custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip()
1557
+ has_custom_doc_type = True
1558
+ # If we have a custom doc type, use it instead
1559
+ if custom_doc_type:
1560
+ doc_type = custom_doc_type.lower()
1561
+
1562
+ # If user has provided detailed instructions, provide more elaborate prompting
1563
+ if custom_prompt and (has_custom_doc_type or len(custom_prompt.strip()) > 20):
1564
+ # Enhanced prompt for documents with custom instructions and historical expertise
 
 
 
 
 
 
 
 
 
1565
  specific_section = (
1566
+ f"You are an advanced OCR specialist with expertise in historical documents, typography, and manuscript conventions. "
1567
+ f"Below is a document that requires specialized analysis with attention to historical characteristics. "
1568
+ f"Pay particular attention to:\n"
1569
+ f"- Historical typography features (long s 'ſ', ligatures, obsolete letter forms)\n"
1570
+ f"- Manuscript conventions of the period (abbreviations, contractions, marginalia)\n"
1571
+ f"- Document deterioration patterns (faded ink, foxing, water damage, paper degradation)\n"
1572
+ f"- Accurately capturing ALL text content visible in the image with historical context\n"
1573
+ f"- Following the specific user instructions for processing this document type\n"
1574
+ f"- Identifying key information, structure, and historical formatting conventions\n"
1575
+ f"- Providing comprehensive analysis with attention to historical context\n"
1576
  )
1577
 
1578
+ # Add specialized instructions based on document type
1579
+ if doc_type == "newspaper":
1580
+ specific_section += (
1581
+ f"\nThis appears to be a newspaper or document with columns. "
1582
+ f"Please read each column from top to bottom, then move to the next column. "
1583
+ f"Extract all article titles, headings, bylines, and body text in the correct reading order. "
1584
+ f"Pay special attention to section headers, page numbers, publication date, and newspaper name. "
1585
+ f"For historical newspapers, be aware of period-specific typography such as the long s (ſ), "
1586
+ f"unique ligatures (æ, œ, ct, st), and decorative fonts. Account for paper degradation around "
1587
+ f"fold lines and edges. Recognize archaic abbreviations and typesetting conventions of the period.\n"
1588
+ )
1589
+ elif doc_type == "letter":
1590
+ specific_section += (
1591
+ f"\nThis appears to be a letter or correspondence. "
1592
+ f"Pay special attention to the letterhead, date, greeting, body content, closing, and signature. "
1593
+ f"Preserve the original formatting including paragraph breaks and indentation. "
1594
+ f"Note any handwritten annotations or marginalia separately. "
1595
+ f"For historical letters, carefully transcribe historical scripts and handwriting styles, "
1596
+ f"noting unclear or damaged sections. Identify period-specific salutations, closings, and "
1597
+ f"formalities. Watch for ink fading, bleeding, and seepage through pages. "
1598
+ f"Recognize period-specific abbreviations (ye, yr, inst, ult, prox) and long s (ſ) in older printed correspondence.\n"
1599
+ )
1600
+ elif doc_type == "book":
1601
+ specific_section += (
1602
+ f"\nThis appears to be a book or publication page. "
1603
+ f"Pay attention to chapter titles, headers, page numbers, footnotes, and main body text. "
1604
+ f"Preserve paragraph structure and any special formatting. "
1605
+ f"Note any images, tables, or figures that might be referenced in the text. "
1606
+ f"For historical books, attend to period typography including the long s (ſ), ligatures (æ, œ, ct, ſt), "
1607
+ f"archaic letter forms, and decorative initials/drop caps. Account for foxing (brown spotting), "
1608
+ f"bleed-through from opposite pages, and binding damage. Recognize period-specific typographic "
1609
+ f"conventions like catchwords, signatures, obsolete punctuation, and historical spelling variants "
1610
+ f"(e.g., -ize/-ise, past tense 'd for -ed). Note bookplates, ownership marks, and marginalia.\n"
1611
+ )
1612
+ elif doc_type == "form":
1613
+ specific_section += (
1614
+ f"\nThis appears to be a form or legal document. "
1615
+ f"Carefully extract all field labels and their corresponding values. "
1616
+ f"Preserve the structure of form fields and sections. "
1617
+ f"Pay special attention to signature lines, dates, and any official markings. "
1618
+ f"For historical forms and legal documents, recognize period-specific legal terminology and "
1619
+ f"formulaic phrases. Note seals, stamps, watermarks, and official emblems. Watch for faded ink "
1620
+ f"in signatures and filled fields. Identify period handwriting styles in completed sections. "
1621
+ f"Account for specialized legal abbreviations (e.g., SS., Esq., inst., wit.) and archaic "
1622
+ f"measurement units. Note folding patterns and worn edges common in frequently handled legal documents.\n"
1623
+ )
1624
+ elif doc_type == "recipe":
1625
+ specific_section += (
1626
+ f"\nThis appears to be a recipe or food-related document. "
1627
+ f"Extract the recipe title, ingredient list (with measurements), preparation steps, "
1628
+ f"cooking times, serving information, and any notes or tips. "
1629
+ f"Maintain the distinction between ingredients and preparation instructions. "
1630
+ f"For historical recipes, attend to archaic measurements (gill, dram, peck, firkin), obsolete "
1631
+ f"cooking terminology, and period-specific ingredients and their modern equivalents. Note handwritten "
1632
+ f"annotations and personal modifications. Identify period-specific cooking methods and tools that "
1633
+ f"might need explanation. Watch for liquid stains and food residue common on well-used recipe pages. "
1634
+ f"Recognize unclear fractions and temperature instructions (e.g., 'slow oven', 'quick fire').\n"
1635
+ )
1636
 
1637
+ # Output instructions (enhanced for custom requests)
1638
+ output_section = (
1639
+ f"Create a detailed structured JSON response with the following fields:\n"
1640
+ f"- file_name: The document's name\n"
1641
+ f"- topics: An array of specific topics, themes, or subjects covered in the document\n"
1642
+ f"- languages: An array of languages used in the document\n"
1643
+ f"- ocr_contents: A comprehensive dictionary with the document's contents including:\n"
1644
+ f" * title: The main title or heading\n"
1645
+ f" * subtitle: Any subtitle or secondary heading (if present)\n"
1646
+ f" * date: Publication or document date (if present)\n"
1647
+ f" * author: Author or creator information (if present)\n"
1648
+ f" * content: The main body content, properly formatted\n"
1649
+ f" * additional sections as appropriate for this document type\n"
1650
+ f" * raw_text: The complete OCR text\n"
1651
  )
1652
+ else:
1653
+ # Default processing with basic historical document awareness
1654
  specific_section = (
1655
+ f"You are an OCR specialist with knowledge of historical documents and typography. "
1656
+ f"Focus on accurately extracting text content with attention to historical features. "
1657
+ f"Pay special attention to:\n"
1658
+ f"- Accurately capturing ALL text content visible in the image\n"
1659
+ f"- Maintaining the correct reading order and structure\n"
1660
+ f"- Preserving paragraph breaks and text layout\n"
1661
+ f"- Identifying the main document type, time period, and language\n"
1662
+ f"- Recognizing historical typography features (long s 'ſ', ligatures, archaic characters)\n"
1663
+ f"- Accounting for document deterioration (faded ink, stains, foxing, physical damage)\n"
1664
  )
1665
 
1666
+ # Only add specialized instructions for newspapers with columns
1667
+ if doc_type == "newspaper":
1668
+ specific_section += (
1669
+ f"\nThis appears to be a document with columns. "
1670
+ f"Be sure to read each column from top to bottom, then move to the next column. "
1671
+ f"Extract all article titles, headings, and body text.\n"
1672
+ )
 
 
 
 
1673
 
1674
+ # Simple output instructions for default cases
1675
+ output_section = (
1676
+ f"Create a structured JSON response with the following fields:\n"
1677
+ f"- file_name: The document's name\n"
1678
+ f"- topics: An array of topics covered in the document\n"
1679
+ f"- languages: An array of languages used in the document\n"
1680
+ f"- ocr_contents: A dictionary with the document's contents, with the focus on complete text extraction\n"
 
 
 
 
1681
  )
1682
 
 
 
 
 
 
 
 
 
 
1683
  # Add custom prompt if provided
1684
  custom_section = ""
1685
  if custom_prompt:
1686
+ # Process custom prompt to extract just the instructions part if available
1687
+ if "USER INSTRUCTIONS:" in custom_prompt:
1688
+ instructions_part = custom_prompt.split("USER INSTRUCTIONS:")[1].strip()
1689
+ custom_section = f"\n\nUser-provided instructions: {instructions_part}\n"
1690
+ elif "INSTRUCTIONS:" in custom_prompt:
1691
+ instructions_part = custom_prompt.split("INSTRUCTIONS:")[1].strip()
1692
+ custom_section = f"\n\nUser-provided instructions: {instructions_part}\n"
1693
+ else:
1694
+ # Strip custom prompt to essentials
1695
+ stripped_prompt = custom_prompt.replace("This is a", "").replace("It appears to be a", "")
1696
+ custom_section = f"\n\nUser-provided instructions: {stripped_prompt}\n"
1697
 
1698
  # Combine all sections into complete prompt
1699
  return generic_section + specific_section + output_section + custom_section
 
1790
  result['model_used'] = TEXT_MODEL
1791
  result['processing_time'] = time.time() - start_time
1792
 
1793
+ # Flag when custom prompt has been successfully applied
1794
+ if custom_prompt:
1795
+ result['custom_prompt_applied'] = 'text_model'
1796
+
1797
+ # Attempt to detect document type from custom prompt
1798
+ if "DOCUMENT TYPE:" in custom_prompt:
1799
+ doc_type_line = custom_prompt.split("\n")[0]
1800
+ if "DOCUMENT TYPE:" in doc_type_line:
1801
+ custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
1802
+ result['detected_document_type'] = custom_doc_type
1803
+ # Keyword-based detection as fallback
1804
+ elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
1805
+ result['detected_document_type'] = "newspaper"
1806
+ elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
1807
+ result['detected_document_type'] = "letter"
1808
+ elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
1809
+ result['detected_document_type'] = "book"
1810
+ elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
1811
+ result['detected_document_type'] = "form"
1812
+ elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
1813
+ result['detected_document_type'] = "recipe"
1814
+ elif "this is a" in custom_prompt.lower():
1815
+ # Extract document type from "This is a [type]" format
1816
+ this_is_parts = custom_prompt.lower().split("this is a ")
1817
+ if len(this_is_parts) > 1:
1818
+ extracted_type = this_is_parts[1].split(".")[0].strip()
1819
+ if extracted_type:
1820
+ result['detected_document_type'] = extracted_type
1821
+
1822
  # Add raw text for reference if not already present
1823
  if 'ocr_contents' in result and 'raw_text' not in result['ocr_contents']:
1824
  # Add truncated raw text if very large