milwright committed on
Commit
622c90f
·
1 Parent(s): 75ead00

Fix sample document loading and processing pipeline

Browse files

- Fixed sample document loading to automatically process after selection
- Enhanced SampleDocument class with better file emulation
- Added session state management for reliable sample processing
- Improved user feedback during sample document processing
- Updated CLAUDE.md with improved documentation

Files changed (2) hide show
  1. CLAUDE.md +4 -1
  2. app.py +246 -14
CLAUDE.md CHANGED
@@ -8,6 +8,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
8
  - Process PDF files: `python pdf_ocr.py <file_path>`
9
  - Process single file with logging: `python process_file.py <file_path>`
10
  - Run newspaper test: `python test_newspaper.py <file_path>`
 
11
  - Run typechecking: `mypy .`
12
  - Lint code: `ruff check .` or `flake8`
13
 
@@ -23,6 +24,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
23
  - **Naming**: snake_case for variables/functions, PascalCase for classes
24
  - **Documentation**: Google-style docstrings for all functions/classes
25
  - **Logging**: Use module-level loggers with appropriate log levels
 
26
  - **Line length**: ≤100 characters
27
 
28
  ## Architecture
@@ -30,4 +32,5 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
30
  - Utils: `ocr_utils.py` - OCR text and image processing utilities
31
  - PDF handling: `pdf_ocr.py` - PDF-specific processing functionality
32
  - Config: `config.py` - Configuration settings and API keys
33
- - Web: `app.py` - Streamlit interface with UI components in `/ui` directory
 
 
8
  - Process PDF files: `python pdf_ocr.py <file_path>`
9
  - Process single file with logging: `python process_file.py <file_path>`
10
  - Run newspaper test: `python test_newspaper.py <file_path>`
11
+ - Run notebook demo: `jupyter notebook notebook_demo.ipynb`
12
  - Run typechecking: `mypy .`
13
  - Lint code: `ruff check .` or `flake8`
14
 
 
24
  - **Naming**: snake_case for variables/functions, PascalCase for classes
25
  - **Documentation**: Google-style docstrings for all functions/classes
26
  - **Logging**: Use module-level loggers with appropriate log levels
27
+ - **Exception handling**: Implement graceful fallbacks for API errors
28
  - **Line length**: ≤100 characters
29
 
30
  ## Architecture
 
32
  - Utils: `ocr_utils.py` - OCR text and image processing utilities
33
  - PDF handling: `pdf_ocr.py` - PDF-specific processing functionality
34
  - Config: `config.py` - Configuration settings and API keys
35
+ - Web: `app.py` - Streamlit interface with UI components in `/ui` directory
36
+ - Demo: `notebook_demo.ipynb` - Interactive notebook with educational examples
app.py CHANGED
@@ -511,12 +511,12 @@ with main_tab1:
511
  # Add heading for the file uploader (just text, no container)
512
  st.markdown('### Upload Document')
513
 
514
- # Model info below the heading
515
- st.markdown("Using the latest `mistral-ocr-latest` model for advanced document understanding.")
516
 
517
  # Enhanced file uploader with better help text
518
  uploaded_file = st.file_uploader("Drag and drop PDFs or images here", type=["pdf", "png", "jpg", "jpeg"],
519
- help="Supports PDFs, JPGs, PNGs and other image formats")
520
 
521
  # Removed seed prompt instructions from here, moving to sidebar
522
 
@@ -917,6 +917,8 @@ with main_tab2:
917
  badge_color = "#6a1b9a" # Purple for document types
918
  elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
919
  badge_color = "#2e7d32" # Green for subject domains
 
 
920
 
921
  st.markdown(
922
  f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
@@ -1193,6 +1195,27 @@ with main_tab3:
1193
  """)
1194
 
1195
  with main_tab1:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1196
  if uploaded_file is not None:
1197
  # Check file size (cap at 50MB)
1198
  file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
@@ -1247,8 +1270,21 @@ with main_tab1:
1247
  # No extra spacing needed as it will be managed programmatically
1248
  metadata_placeholder = st.empty()
1249
 
1250
- # Results section
1251
- if process_button:
 
 
 
 
 
 
 
 
 
 
 
 
 
1252
  # Move the progress indicator reference to just below the button
1253
  progress_container = progress_placeholder
1254
  try:
@@ -1477,8 +1513,8 @@ with main_tab1:
1477
  # Only show when custom_prompt exists in the session AND has content, or when the result explicitly states it was applied
1478
  has_instructions = ('custom_prompt' in locals() and custom_prompt and len(str(custom_prompt).strip()) > 0)
1479
  if has_instructions or 'custom_prompt_applied' in result:
1480
- # Use a simpler message that just shows custom instructions were applied
1481
- metadata_html += f'<p style="margin-top:10px; padding:5px 8px; background-color:#f0f8ff; border-left:3px solid #4ba3e3; border-radius:3px; color:#333;"><strong>Advanced Analysis:</strong> Custom instructions applied</p>'
1482
 
1483
  # Close the metadata card
1484
  metadata_html += '</div>'
@@ -1936,6 +1972,63 @@ with main_tab1:
1936
 
1937
  if 'ocr_contents' not in result:
1938
  st.error("No OCR content was extracted from the document.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1939
 
1940
  # Close document content div
1941
  st.markdown('</div>', unsafe_allow_html=True)
@@ -2038,6 +2131,41 @@ with main_tab1:
2038
  lang_tag = f"{lang} Language"
2039
  subject_tags.append(lang_tag)
2040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2041
  except Exception as e:
2042
  logger.warning(f"Error generating subject tags: {str(e)}")
2043
  # Fallback tags if extraction fails
@@ -2094,9 +2222,7 @@ with main_tab1:
2094
  except Exception as e:
2095
  st.error(f"Error processing document: {str(e)}")
2096
  else:
2097
- # Empty placeholder - we've moved the upload instruction to the file_uploader
2098
-
2099
- # Show example images in a simpler layout
2100
  st.subheader("Example Documents")
2101
 
2102
  # Add a simplified info message about examples
@@ -2106,9 +2232,115 @@ with main_tab1:
2106
  - Handwritten letters and documents
2107
  - Printed books and articles
2108
  - Multi-page PDFs
2109
-
2110
- Upload your own document to get started or explore the 'About' tab for more information.
2111
  """)
2112
 
2113
- # Display a direct message about sample documents
2114
- st.info("Sample documents are available in the input directory. Upload a document to begin analysis.")# Minor update
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  # Add heading for the file uploader (just text, no container)
512
  st.markdown('### Upload Document')
513
 
514
+ # Model info with clearer instructions
515
+ st.markdown("Using the latest `mistral-ocr-latest` model for advanced document understanding. To get started upload your own document, use an example document, or explore the 'About' tab for more info.")
516
 
517
  # Enhanced file uploader with better help text
518
  uploaded_file = st.file_uploader("Drag and drop PDFs or images here", type=["pdf", "png", "jpg", "jpeg"],
519
+ help="Limit 200MB per file PDF, PNG, JPG, JPEG")
520
 
521
  # Removed seed prompt instructions from here, moving to sidebar
522
 
 
917
  badge_color = "#6a1b9a" # Purple for document types
918
  elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
919
  badge_color = "#2e7d32" # Green for subject domains
920
+ elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
921
+ badge_color = "#e65100" # Orange for preprocessing-related tags
922
 
923
  st.markdown(
924
  f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
 
1195
  """)
1196
 
1197
  with main_tab1:
1198
+ # Initialize session states if needed
1199
+ if 'auto_process_sample' not in st.session_state:
1200
+ st.session_state.auto_process_sample = False
1201
+ if 'sample_just_loaded' not in st.session_state:
1202
+ st.session_state.sample_just_loaded = False
1203
+
1204
+ # Use uploaded_file or sample_document if available
1205
+ if 'sample_document' in st.session_state and st.session_state.sample_document is not None:
1206
+ # Use the sample document
1207
+ uploaded_file = st.session_state.sample_document
1208
+ # Add a notice about using sample document
1209
+ st.success(f"Using sample document: {uploaded_file.name}")
1210
+
1211
+ # Set auto-process flag in session state if this is a newly loaded sample
1212
+ if st.session_state.sample_just_loaded:
1213
+ st.session_state.auto_process_sample = True
1214
+ st.session_state.sample_just_loaded = False
1215
+
1216
+ # Clear sample document after use to avoid interference with future uploads
1217
+ st.session_state.sample_document = None
1218
+
1219
  if uploaded_file is not None:
1220
  # Check file size (cap at 50MB)
1221
  file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
 
1270
  # No extra spacing needed as it will be managed programmatically
1271
  metadata_placeholder = st.empty()
1272
 
1273
+ # Check if we need to auto-process a sample document
1274
+ if 'auto_process_sample' not in st.session_state:
1275
+ st.session_state.auto_process_sample = False
1276
+
1277
+ # Results section - process if button clicked or auto-process flag is set
1278
+ process_now = process_button or st.session_state.auto_process_sample
1279
+
1280
+ # Show a message if auto-processing
1281
+ if st.session_state.auto_process_sample:
1282
+ st.info("Automatically processing sample document...")
1283
+
1284
+ if process_now:
1285
+ # Reset auto-process flag to avoid processing on next rerun
1286
+ if st.session_state.auto_process_sample:
1287
+ st.session_state.auto_process_sample = False
1288
  # Move the progress indicator reference to just below the button
1289
  progress_container = progress_placeholder
1290
  try:
 
1513
  # Only show when custom_prompt exists in the session AND has content, or when the result explicitly states it was applied
1514
  has_instructions = ('custom_prompt' in locals() and custom_prompt and len(str(custom_prompt).strip()) > 0)
1515
  if has_instructions or 'custom_prompt_applied' in result:
1516
+ # Use consistent styling with other metadata fields
1517
+ metadata_html += f'<p><strong>Advanced Analysis:</strong> Custom instructions applied</p>'
1518
 
1519
  # Close the metadata card
1520
  metadata_html += '</div>'
 
1972
 
1973
  if 'ocr_contents' not in result:
1974
  st.error("No OCR content was extracted from the document.")
1975
+ else:
1976
+ # Check for minimal text content in OCR results
1977
+ has_minimal_text = False
1978
+ total_text_length = 0
1979
+
1980
+ # Check if the document is an image (not a PDF)
1981
+ is_image = result.get('file_name', '').lower().endswith(('.jpg', '.jpeg', '.png', '.gif'))
1982
+
1983
+ # If image file with raw_text only
1984
+ if is_image and 'ocr_contents' in result:
1985
+ ocr_contents = result['ocr_contents']
1986
+
1987
+ # Check if only raw_text exists with minimal content
1988
+ has_raw_text_only = False
1989
+ if 'raw_text' in ocr_contents:
1990
+ raw_text = ocr_contents['raw_text']
1991
+ total_text_length += len(raw_text.strip())
1992
+
1993
+ # Check if raw_text is the only significant field
1994
+ other_content_fields = [k for k in ocr_contents.keys()
1995
+ if k not in ['raw_text', 'error', 'partial_text']
1996
+ and isinstance(ocr_contents[k], (str, list))
1997
+ and ocr_contents[k]]
1998
+
1999
+ if len(other_content_fields) <= 1: # Only raw_text or one other field
2000
+ has_raw_text_only = True
2001
+
2002
+ # Check if minimal text was extracted (less than 50 characters)
2003
+ if total_text_length < 50 and has_raw_text_only:
2004
+ has_minimal_text = True
2005
+
2006
+ # Check if any meaningful preprocessing options were used
2007
+ preprocessing_used = False
2008
+ if preprocessing_options.get("document_type", "standard") != "standard":
2009
+ preprocessing_used = True
2010
+ if preprocessing_options.get("grayscale", False):
2011
+ preprocessing_used = True
2012
+ if preprocessing_options.get("denoise", False):
2013
+ preprocessing_used = True
2014
+ if preprocessing_options.get("contrast", 0) != 0:
2015
+ preprocessing_used = True
2016
+ if preprocessing_options.get("rotation", 0) != 0:
2017
+ preprocessing_used = True
2018
+
2019
+ # If minimal text was found and preprocessing options weren't used
2020
+ if has_minimal_text and not preprocessing_used and uploaded_file.type.startswith('image/'):
2021
+ st.warning("""
2022
+ **Limited text extracted from this image.**
2023
+
2024
+ Try using preprocessing options in the sidebar to improve results:
2025
+ - Convert to grayscale for clearer text
2026
+ - Use denoising for aged or degraded documents
2027
+ - Adjust contrast for faded text
2028
+ - Try different rotation if text orientation is unclear
2029
+
2030
+ Click the "Preprocessing Options" section in the sidebar under "Image Processing".
2031
+ """)
2032
 
2033
  # Close document content div
2034
  st.markdown('</div>', unsafe_allow_html=True)
 
2131
  lang_tag = f"{lang} Language"
2132
  subject_tags.append(lang_tag)
2133
 
2134
+ # Add preprocessing information as tags if preprocessing was applied
2135
+ if uploaded_file.type.startswith('image/'):
2136
+ # Check if meaningful preprocessing options were used
2137
+ if preprocessing_options.get("document_type", "standard") != "standard":
2138
+ doc_type = preprocessing_options["document_type"].capitalize()
2139
+ preprocessing_tag = f"Enhanced ({doc_type})"
2140
+ if preprocessing_tag not in subject_tags:
2141
+ subject_tags.append(preprocessing_tag)
2142
+
2143
+ preprocessing_methods = []
2144
+ if preprocessing_options.get("grayscale", False):
2145
+ preprocessing_methods.append("Grayscale")
2146
+ if preprocessing_options.get("denoise", False):
2147
+ preprocessing_methods.append("Denoised")
2148
+ if preprocessing_options.get("contrast", 0) != 0:
2149
+ contrast_val = preprocessing_options.get("contrast", 0)
2150
+ if contrast_val > 0:
2151
+ preprocessing_methods.append("Contrast Enhanced")
2152
+ else:
2153
+ preprocessing_methods.append("Contrast Reduced")
2154
+ if preprocessing_options.get("rotation", 0) != 0:
2155
+ preprocessing_methods.append("Rotated")
2156
+
2157
+ # Add a combined preprocessing tag if methods were applied
2158
+ if preprocessing_methods:
2159
+ prep_tag = "Preprocessed"
2160
+ if prep_tag not in subject_tags:
2161
+ subject_tags.append(prep_tag)
2162
+
2163
+ # Add the specific method as a tag if only one was used
2164
+ if len(preprocessing_methods) == 1:
2165
+ method_tag = preprocessing_methods[0]
2166
+ if method_tag not in subject_tags:
2167
+ subject_tags.append(method_tag)
2168
+
2169
  except Exception as e:
2170
  logger.warning(f"Error generating subject tags: {str(e)}")
2171
  # Fallback tags if extraction fails
 
2222
  except Exception as e:
2223
  st.error(f"Error processing document: {str(e)}")
2224
  else:
2225
+ # Example Documents section after file uploader
 
 
2226
  st.subheader("Example Documents")
2227
 
2228
  # Add a simplified info message about examples
 
2232
  - Handwritten letters and documents
2233
  - Printed books and articles
2234
  - Multi-page PDFs
 
 
2235
  """)
2236
 
2237
+ # Add CSS to make the dropdown match the column width
2238
+ st.markdown("""
2239
+ <style>
2240
+ /* Make the selectbox container match the full column width */
2241
+ .main .block-container .element-container:has([data-testid="stSelectbox"]) {
2242
+ width: 100% !important;
2243
+ max-width: 100% !important;
2244
+ }
2245
+
2246
+ /* Make the actual selectbox control take the full width */
2247
+ .stSelectbox > div > div {
2248
+ width: 100% !important;
2249
+ max-width: 100% !important;
2250
+ }
2251
+ </style>
2252
+ """, unsafe_allow_html=True)
2253
+
2254
+ # Sample document URLs dropdown with clearer label
2255
+ sample_urls = [
2256
+ "Select a sample document",
2257
+ "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/a-la-carte.pdf",
2258
+ "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magician-or-bottle-cungerer.jpg",
2259
+ "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/handwritten-letter.jpg",
2260
+ "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magellan-travels.jpg",
2261
+ "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/milgram-flier.png",
2262
+ "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/baldwin-15st-north.jpg"
2263
+ ]
2264
+
2265
+ sample_names = [
2266
+ "Select a sample document",
2267
+ "Restaurant Menu (PDF)",
2268
+ "The Magician (Image)",
2269
+ "Handwritten Letter (Image)",
2270
+ "Magellan Travels (Image)",
2271
+ "Milgram Flier (Image)",
2272
+ "Baldwin Street (Image)"
2273
+ ]
2274
+
2275
+ # Initialize sample_document in session state if it doesn't exist
2276
+ if 'sample_document' not in st.session_state:
2277
+ st.session_state.sample_document = None
2278
+
2279
+ selected_sample = st.selectbox("Select a sample document from `~/input`", options=range(len(sample_urls)), format_func=lambda i: sample_names[i])
2280
+
2281
+ if selected_sample > 0:
2282
+ selected_url = sample_urls[selected_sample]
2283
+
2284
+ # Add process button for the sample document
2285
+ if st.button("Load Sample Document"):
2286
+ try:
2287
+ import requests
2288
+ from io import BytesIO
2289
+
2290
+ with st.spinner(f"Downloading {sample_names[selected_sample]}..."):
2291
+ response = requests.get(selected_url)
2292
+ response.raise_for_status()
2293
+
2294
+ # Extract filename from URL
2295
+ file_name = selected_url.split("/")[-1]
2296
+
2297
+ # Create a BytesIO object from the downloaded content
2298
+ file_content = BytesIO(response.content)
2299
+
2300
+ # Store as an UploadedFile-like object in session state
2301
class SampleDocument:
    """In-memory stand-in for an uploaded file, built from downloaded bytes.

    Exposes the attributes and methods the surrounding script uses on an
    uploaded file (``name``, ``type``, ``size``, ``getvalue()``) and a working
    file cursor (``read``/``seek``/``tell``), so code that follows the normal
    binary-file protocol, such as ``fp.read(n)`` after ``fp.seek(0)``, also works.

    Args:
        name: File name reported to consumers (e.g. ``"letter.jpg"``).
        content: Raw bytes of the document.
        content_type: MIME type string (e.g. ``"image/jpeg"``).
    """

    def __init__(self, name, content, content_type):
        # Local import keeps the class usable regardless of what the enclosing
        # scope has imported.
        from io import BytesIO

        self.name = name
        self._content = content
        self.type = content_type
        self.size = len(content)
        # A real buffer provides correct cursor semantics; the previous
        # implementation ignored seek() and always reported position 0.
        self._buffer = BytesIO(content)

    def getvalue(self):
        """Return the full document bytes, independent of the cursor position."""
        return self._content

    def read(self, size=-1):
        """Read up to ``size`` bytes from the current position.

        With the default ``size=-1`` the remainder of the document is returned.
        Call ``seek(0)`` first to re-read from the start.
        """
        return self._buffer.read(size)

    def seek(self, position, whence=0):
        """Move the cursor and return the new absolute position."""
        return self._buffer.seek(position, whence)

    def tell(self):
        """Return the current cursor position in bytes."""
        return self._buffer.tell()
2321
+
2322
+ # Determine content type based on file extension
2323
+ if file_name.lower().endswith('.pdf'):
2324
+ content_type = 'application/pdf'
2325
+ elif file_name.lower().endswith(('.jpg', '.jpeg')):
2326
+ content_type = 'image/jpeg'
2327
+ elif file_name.lower().endswith('.png'):
2328
+ content_type = 'image/png'
2329
+ else:
2330
+ content_type = 'application/octet-stream'
2331
+
2332
+ # Save download info in session state for more reliable handling
2333
+ st.session_state.sample_document = SampleDocument(
2334
+ name=file_name,
2335
+ content=response.content,
2336
+ content_type=content_type
2337
+ )
2338
+
2339
+ # Set a flag to indicate this is a newly loaded sample
2340
+ st.session_state.sample_just_loaded = True
2341
+
2342
+ # Force rerun to load the document
2343
+ st.rerun()
2344
+ except Exception as e:
2345
+ st.error(f"Error downloading sample document: {str(e)}")
2346
+ st.info("Please try uploading your own document instead.")