# docling/src/streamlit_app.py
# Set environment variables IMMEDIATELY to prevent root filesystem access
# This must happen before any other imports or operations
import os
import tempfile
# Get a writable temp directory first
try:
TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
os.makedirs(TEMP_DIR, exist_ok=True)
except Exception:
try:
TEMP_DIR = "/tmp/docling_temp"
os.makedirs(TEMP_DIR, exist_ok=True)
except Exception:
TEMP_DIR = os.getcwd()
# Set all environment variables that libraries might use
os.environ.update({
# Streamlit configuration
'STREAMLIT_SERVER_FILE_WATCHER_TYPE': 'none',
'STREAMLIT_SERVER_HEADLESS': 'true',
'STREAMLIT_BROWSER_GATHER_USAGE_STATS': 'false',
'STREAMLIT_SERVER_ENABLE_CORS': 'false',
'STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION': 'false',
# EasyOCR configuration
'EASYOCR_MODULE_PATH': os.path.join(TEMP_DIR, 'easyocr_models'),
'HOME': TEMP_DIR,
'USERPROFILE': TEMP_DIR,
'XDG_CACHE_HOME': os.path.join(TEMP_DIR, 'cache'),
'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),
# Hugging Face Hub configuration - CRITICAL for preventing /.cache access
'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),
# Additional Hugging Face specific variables
'HF_HUB_DISABLE_TELEMETRY': '1',
'HF_HUB_DISABLE_IMPLICIT_TOKEN': '1',
'HF_HUB_OFFLINE': '0',
# Other ML libraries
'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
'TENSORFLOW_HOME': os.path.join(TEMP_DIR, 'tensorflow'),
'KERAS_HOME': os.path.join(TEMP_DIR, 'keras'),
'MLFLOW_TRACKING_URI': f'file:{os.path.join(TEMP_DIR, "mlruns")}',
# Additional cache directories
'CACHE_DIR': os.path.join(TEMP_DIR, 'cache'),
'MODEL_CACHE_DIR': os.path.join(TEMP_DIR, 'models'),
# Additional environment variables to prevent root access
'PYTHONPATH': TEMP_DIR,
'TMPDIR': TEMP_DIR,
'TEMP': TEMP_DIR,
'TMP': TEMP_DIR,
'CACHE': os.path.join(TEMP_DIR, 'cache'),
'MODELS': os.path.join(TEMP_DIR, 'models'),
'DATA': os.path.join(TEMP_DIR, 'data'),
'CONFIG': os.path.join(TEMP_DIR, 'config'),
})
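# Note: several of these variables are read once at import time (for example,
# huggingface_hub resolves HF_HOME when it is first imported), which is why this
# block has to run before streamlit, docling, or any ML library is imported.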
# Create all necessary directories
directories_to_create = [
os.environ['EASYOCR_MODULE_PATH'],
os.environ['XDG_CACHE_HOME'],
os.environ['XDG_CONFIG_HOME'],
os.environ['XDG_DATA_HOME'],
os.environ['HF_HOME'],
os.environ['HF_CACHE_HOME'],
os.environ['TRANSFORMERS_CACHE'],
os.environ['HF_DATASETS_CACHE'],
os.environ['TORCH_HOME'],
os.environ['TENSORFLOW_HOME'],
os.environ['KERAS_HOME'],
os.environ['CACHE_DIR'],
os.environ['MODEL_CACHE_DIR'],
os.environ['CACHE'],
os.environ['MODELS'],
os.environ['DATA'],
os.environ['CONFIG'],
os.environ['HF_HUB_CACHE'],
os.environ['DIFFUSERS_CACHE'],
os.environ['ACCELERATE_CACHE'],
]
# Monkey patch os.makedirs to prevent root directory access
original_makedirs = os.makedirs
def safe_makedirs(name, mode=0o777, exist_ok=False):
    """Version of os.makedirs that redirects root-filesystem paths into TEMP_DIR."""
    # Allow /tmp, /app, and anything already under TEMP_DIR; redirect other
    # absolute paths (e.g. "/.cache/...") into the writable temp directory.
    if (name.startswith('/')
            and not name.startswith(('/tmp', '/app'))
            and not name.startswith(TEMP_DIR)):
        basename = os.path.basename(name.rstrip('/')) or 'redirected'
        safe_name = os.path.join(TEMP_DIR, basename)
        print(f"Redirecting root directory creation from {name} to {safe_name}")
        return original_makedirs(safe_name, mode, exist_ok=exist_ok)
    return original_makedirs(name, mode, exist_ok=exist_ok)
# Apply the monkey patch
os.makedirs = safe_makedirs
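# Illustration (hypothetical paths): with TEMP_DIR = "/tmp/docling_temp", a call
# such as os.makedirs("/.cache/easyocr") is redirected to
# "/tmp/docling_temp/easyocr" instead of failing with PermissionError, while
# os.makedirs("/tmp/foo") and os.makedirs("/app/foo") pass through unchanged.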
for directory in directories_to_create:
try:
os.makedirs(directory, exist_ok=True)
except Exception as e:
print(f"Warning: Could not create directory {directory}: {e}")
# Now import the rest of the modules
import streamlit as st
import logging
import shutil
from processing.document_processor import DocumentProcessor
from processing.sections import ReasoningSectionExtractor
from utils.logging_utils import get_log_handler
from dotenv import load_dotenv
import sys
import html
import difflib
import re
import time
# Configure logging early; force=True replaces any handlers that are already installed, avoiding duplicate log lines on Streamlit reruns
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(name)s: %(message)s",
stream=sys.stdout,
force=True
)
# Load environment variables from .env
load_dotenv()
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
# Log startup information
logging.info("=" * 50)
logging.info("Docling Streamlit App Starting")
logging.info(f"Temp directory: {TEMP_DIR}")
logging.info(f"EasyOCR model directory: {os.environ.get('EASYOCR_MODULE_PATH', 'NOT_SET')}")
logging.info(f"Hugging Face cache: {os.environ.get('HF_CACHE_HOME', 'NOT_SET')}")
logging.info(f"Current working directory: {os.getcwd()}")
logging.info(f"Python version: {sys.version}")
logging.info("=" * 50)
def cleanup_temp_files():
    """Remove top-level files in the temp directory (subdirectories such as model caches are left in place)."""
try:
if os.path.exists(TEMP_DIR):
for filename in os.listdir(TEMP_DIR):
file_path = os.path.join(TEMP_DIR, filename)
if os.path.isfile(file_path):
try:
os.remove(file_path)
logging.info(f"Removed temp file: {filename}")
except PermissionError as e:
logging.warning(f"Permission error removing {filename}: {e}")
except Exception as e:
logging.warning(f"Error removing {filename}: {e}")
logging.info(f"Cleaned up temporary files in {TEMP_DIR}")
else:
logging.info(f"Temp directory {TEMP_DIR} does not exist")
except PermissionError as e:
logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
except Exception as e:
logging.warning(f"Error cleaning up temp files: {e}")
def clear_all_data():
    """Clear all temporary files and session state data."""
    try:
        # Clean up temp files
        cleanup_temp_files()
        # Clear session state (pop with a default is a no-op for missing keys)
        for key in ("processed_results", "logs", "original_structures",
                    "show_original", "show_processed", "temp_cleaned",
                    "last_cleanup_time"):
            st.session_state.pop(key, None)
        logging.info("Cleared all session state and temporary files")
        return True
    except Exception as e:
        logging.error(f"Error clearing all data: {e}")
        return False
def get_temp_files_info():
"""Get information about temporary files (count and total size)."""
try:
if not os.path.exists(TEMP_DIR):
return 0, 0
files = os.listdir(TEMP_DIR)
total_size = 0
for filename in files:
try:
file_path = os.path.join(TEMP_DIR, filename)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
except (PermissionError, OSError) as e:
logging.warning(f"Error accessing file {filename}: {e}")
continue
return len(files), total_size
except PermissionError as e:
logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
return 0, 0
except Exception as e:
logging.warning(f"Error getting temp files info: {e}")
return 0, 0
def format_file_size(size_bytes):
"""Format file size in human readable format."""
if size_bytes == 0:
return "0 B"
size_names = ["B", "KB", "MB", "GB"]
i = 0
while size_bytes >= 1024 and i < len(size_names) - 1:
size_bytes /= 1024.0
i += 1
return f"{size_bytes:.1f} {size_names[i]}"
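# Example values (quick sanity check): format_file_size(0) -> "0 B",
# format_file_size(1536) -> "1.5 KB", format_file_size(5 * 1024**2) -> "5.0 MB".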
def save_uploaded_file(uploaded_file, filename):
"""Save uploaded file to temp directory and return the path."""
temp_path = os.path.join(TEMP_DIR, f"temp_{filename}")
try:
uploaded_file.seek(0) # Reset file pointer to beginning
file_bytes = uploaded_file.read()
with open(temp_path, "wb") as f:
f.write(file_bytes)
logging.info(f"Saved uploaded file to {temp_path}")
return temp_path
except PermissionError as e:
logging.error(f"Permission error saving uploaded file to {temp_path}: {e}")
raise PermissionError("Cannot save file due to permission restrictions. Please try clearing data or contact support.") from e
except Exception as e:
logging.error(f"Error saving uploaded file: {e}")
raise
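# Example (hypothetical upload name): a file called "report.pdf" is written to
# os.path.join(TEMP_DIR, "temp_report.pdf") and that path is returned.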
# Configure page layout to use wide mode
st.set_page_config(
page_title="Medical Document Parser & Redactor",
page_icon="📄",
layout="wide",
initial_sidebar_state="collapsed"
)
# Add custom CSS for better styling
st.markdown("""
<style>
/* Custom styling for text areas */
.stTextArea textarea {
font-family: 'Courier New', monospace !important;
font-size: 12px !important;
line-height: 1.4 !important;
border: 2px solid #e0e0e0 !important;
border-radius: 8px !important;
}
/* Hover effect for text areas */
.stTextArea textarea:hover {
border-color: #1f77b4 !important;
}
/* Custom styling for download buttons */
.stDownloadButton > button {
border-radius: 8px !important;
font-weight: 600 !important;
}
/* Custom styling for the comparison section */
.comparison-container {
background-color: #f8f9fa;
padding: 20px;
border-radius: 10px;
border: 1px solid #e9ecef;
}
/* Synchronized scrolling styles */
.sync-scroll-container {
display: flex;
gap: 20px;
height: 600px;
font-family: 'Courier New', monospace;
font-size: 12px;
}
.sync-scroll-panel {
flex: 1;
border: 1px solid #ddd;
border-radius: 5px;
overflow: hidden;
display: flex;
flex-direction: column;
}
.sync-scroll-header {
background-color: #f8f9fa;
padding: 10px;
border-bottom: 1px solid #ddd;
font-weight: bold;
}
.sync-scroll-content {
flex: 1;
overflow-y: auto;
padding: 10px;
background-color: #fff;
scroll-behavior: smooth;
}
/* Custom scrollbar styling */
.sync-scroll-content::-webkit-scrollbar {
width: 8px;
}
.sync-scroll-content::-webkit-scrollbar-track {
background: #f1f1f1;
}
.sync-scroll-content::-webkit-scrollbar-thumb {
background: #888;
border-radius: 4px;
}
.sync-scroll-content::-webkit-scrollbar-thumb:hover {
background: #555;
}
</style>
<script>
// Improved synchronized scrolling implementation with better debugging
console.log('Starting sync scroll setup...');
function setupSyncScroll() {
console.log('setupSyncScroll called');
// Wait for elements to be available
setTimeout(function() {
console.log('Looking for scroll elements...');
const originalContent = document.getElementById('original-content');
const redactedContent = document.getElementById('redacted-content');
console.log('Original content element:', originalContent);
console.log('Redacted content element:', redactedContent);
if (originalContent && redactedContent) {
console.log('Both elements found, setting up sync...');
let isScrolling = false;
let scrollTimeout;
function syncScroll(source, target) {
if (!isScrolling) {
isScrolling = true;
console.log('Syncing scroll from', source.id, 'to', target.id, 'scrollTop:', source.scrollTop);
target.scrollTop = source.scrollTop;
// Clear existing timeout
if (scrollTimeout) {
clearTimeout(scrollTimeout);
}
// Reset flag after a short delay
scrollTimeout = setTimeout(() => {
isScrolling = false;
console.log('Scroll sync completed');
}, 100);
}
}
// Remove existing listeners to prevent duplicates
if (originalContent._syncScrollHandler) {
originalContent.removeEventListener('scroll', originalContent._syncScrollHandler);
}
if (redactedContent._syncScrollHandler) {
redactedContent.removeEventListener('scroll', redactedContent._syncScrollHandler);
}
// Create new handlers
originalContent._syncScrollHandler = function(e) {
console.log('Original content scrolled:', e.target.scrollTop);
syncScroll(originalContent, redactedContent);
};
redactedContent._syncScrollHandler = function(e) {
console.log('Redacted content scrolled:', e.target.scrollTop);
syncScroll(redactedContent, originalContent);
};
// Add event listeners
originalContent.addEventListener('scroll', originalContent._syncScrollHandler, { passive: true });
redactedContent.addEventListener('scroll', redactedContent._syncScrollHandler, { passive: true });
console.log('Event listeners added successfully');
// Show status indicator
const statusElement = document.getElementById('sync-status');
if (statusElement) {
statusElement.style.display = 'block';
console.log('Status indicator shown');
}
// Test the synchronization
setTimeout(() => {
console.log('Testing scroll sync...');
console.log('Original scrollTop:', originalContent.scrollTop);
console.log('Redacted scrollTop:', redactedContent.scrollTop);
// Try a small scroll to test
originalContent.scrollTop = 10;
setTimeout(() => {
console.log('After test scroll - Original:', originalContent.scrollTop, 'Redacted:', redactedContent.scrollTop);
}, 50);
}, 200);
} else {
console.log('Elements not found, will retry...');
// Retry after a short fixed delay
setTimeout(setupSyncScroll, 300);
}
}, 200);
}
// Multiple initialization strategies
function initializeSyncScroll() {
console.log('Initializing sync scroll...');
// Strategy 1: Immediate setup
setupSyncScroll();
// Strategy 2: Setup after DOM ready
if (document.readyState === 'loading') {
document.addEventListener('DOMContentLoaded', function() {
console.log('DOM loaded, setting up sync scroll...');
setupSyncScroll();
});
}
// Strategy 3: Setup after window load
window.addEventListener('load', function() {
console.log('Window loaded, setting up sync scroll...');
setupSyncScroll();
});
// Strategy 4: Periodic retry for first 10 seconds
let attempts = 0;
const maxAttempts = 20;
const retryInterval = setInterval(function() {
attempts++;
console.log('Retry attempt', attempts);
const originalContent = document.getElementById('original-content');
const redactedContent = document.getElementById('redacted-content');
if (originalContent && redactedContent) {
console.log('Elements found on retry, setting up...');
setupSyncScroll();
clearInterval(retryInterval);
} else if (attempts >= maxAttempts) {
console.log('Max retry attempts reached, giving up');
clearInterval(retryInterval);
}
}, 500);
}
// Start initialization
initializeSyncScroll();
// Listen for Streamlit-specific events
if (window.parent && window.parent.postMessage) {
console.log('Streamlit environment detected');
// Listen for any messages that might indicate a rerun
window.addEventListener('message', function(event) {
console.log('Received message:', event.data);
if (event.data && (event.data.type === 'streamlit:rerun' || event.data.type === 'streamlit:setComponentValue')) {
console.log('Streamlit rerun detected, reinitializing sync scroll...');
setTimeout(setupSyncScroll, 1000);
}
});
}
console.log('Sync scroll script loaded');
</script>
""", unsafe_allow_html=True)
# Safeguard for reruns: basicConfig(force=True) above already installed a handler, so this block is normally a no-op
if len(logging.getLogger().handlers) == 0:
logging.getLogger().setLevel(logging.INFO)
# (We will attach custom handlers during processing as needed)
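# Note on the CSS/JS block above (assumption about the Streamlit runtime):
# <script> tags injected through st.markdown(..., unsafe_allow_html=True) are
# typically inserted via innerHTML and may never execute. If the sync-scroll
# handlers do not fire, a common workaround is streamlit.components.v1.html(),
# which runs scripts inside an iframe:
#   import streamlit.components.v1 as components
#   components.html("<script>/* sync-scroll setup */</script>", height=0)
# (an iframe cannot reach the parent page's DOM directly, so the two panels
# would need to be rendered inside the same component for this to work).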
# Title and description
st.title("Medical Document Parser & Redactor")
st.write("""
Upload PDF medical documents to parse their content using **Docling** (structure-aware parser)
and automatically **redact specific sections** (e.g., initial and final medication lists).
Use the buttons below to view the original structure or process with redaction.
**💡 Tip:** This is a Hugging Face Space with limited storage. Use the "Clear All Data" button to remove temporary files when you're done processing documents.
""")
# Add clear all data button at the top
if st.button("🧹 Clear All Data", type="secondary", help="Remove all temporary files and reset the application"):
if clear_all_data():
st.success("✅ All data cleared successfully! The application has been reset.")
st.rerun()
else:
st.error("❌ Error clearing data. Please try again.")
# File uploader (accept multiple PDF files)
uploaded_files = st.file_uploader("Upload PDF medical documents", type=["pdf"], accept_multiple_files=True)
# Clean up temp files on app start (but keep the directory)
if "temp_cleaned" not in st.session_state:
cleanup_temp_files()
st.session_state.temp_cleaned = True
# Initialize session state storage for results and logs
if "processed_results" not in st.session_state:
st.session_state.processed_results = {} # {filename: {"structured_json": ..., "redacted_md": ..., "redacted_json": ...}}
if "logs" not in st.session_state:
st.session_state.logs = {} # {filename: log_text}
if "original_structures" not in st.session_state:
st.session_state.original_structures = {} # {filename: structured_json}
# Show temp directory status and cleanup button
temp_file_count, total_size = get_temp_files_info()
# Automatic cleanup: if temp files are too old or too large, clean them up
if "last_cleanup_time" not in st.session_state:
st.session_state.last_cleanup_time = time.time()
# Check if we should do automatic cleanup (every 30 minutes or if files are too large)
current_time = time.time()
time_since_cleanup = current_time - st.session_state.last_cleanup_time
if (time_since_cleanup > 1800 or # 30 minutes
total_size > 100 * 1024 * 1024): # 100MB
if temp_file_count > 0:
cleanup_temp_files()
st.session_state.last_cleanup_time = current_time
st.info("🧹 Automatic cleanup: Removed old temporary files")
# Recalculate after cleanup
temp_file_count, total_size = get_temp_files_info()
# Create a row with temp file status and delete button
col1, col2 = st.columns([3, 1])
with col1:
if temp_file_count > 0:
st.caption(f"📁 {temp_file_count} temporary file(s) - Total size: {format_file_size(total_size)}")
# Show warning if total size is large
if total_size > 50 * 1024 * 1024: # 50MB
st.warning("⚠️ Large temporary files detected. Consider clearing data to free up space.")
else:
st.caption("📁 No temporary files")
with col2:
if temp_file_count > 0:
if st.button("🗑️ Delete Temp Files", type="secondary", help="Remove all temporary files from the server"):
try:
cleanup_temp_files()
st.success(f"✅ Successfully deleted {temp_file_count} temporary file(s)")
st.rerun() # Refresh the page to update the file count
except Exception as e:
st.error(f"❌ Error deleting temporary files: {e}")
else:
st.caption("No files to delete")
def create_diff_content(original_text: str, redacted_text: str, view_type: str) -> str:
"""Create HTML content for diff view with highlighting."""
# Normalize the text to reduce formatting differences
def normalize_text(text):
# Remove extra whitespace and normalize line endings
lines = text.split('\n')
normalized_lines = []
for line in lines:
# Strip whitespace but preserve content
stripped = line.strip()
if stripped:
# Normalize header formatting differences: collapse level-2 markers
# ("## ") to level-1 ("# ") so header-level-only changes don't show as diffs
if re.match(r'^##\s+', stripped):
stripped = re.sub(r'^##\s+', '# ', stripped)
# Normalize quote formatting
if stripped.startswith('&gt; '):
stripped = stripped.replace('&gt; ', '> ')
elif stripped.startswith('+ > '):
stripped = stripped.replace('+ > ', '> ')
normalized_lines.append(stripped)
return normalized_lines
original_lines = normalize_text(original_text)
redacted_lines = normalize_text(redacted_text)
# Use difflib to get a more sophisticated diff
differ = difflib.Differ()
diff = list(differ.compare(original_lines, redacted_lines))
html_lines = []
if view_type == 'original':
# Show original with removed content highlighted
for line in diff:
if line.startswith(' '): # Unchanged line
escaped_line = html.escape(line[2:])
html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
elif line.startswith('- '): # Removed line
escaped_line = html.escape(line[2:])
html_lines.append(f'<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-weight: bold;">- {escaped_line}</div>')
elif line.startswith('+ '): # Added line (show as empty space in original view)
html_lines.append(f'<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-style: italic; opacity: 0.7;">+ (added in redacted version)</div>')
elif line.startswith('? '): # Ignore difflib hints
continue
elif view_type == 'redacted':
# Show redacted content with added content highlighted
for line in diff:
if line.startswith(' '): # Unchanged line
escaped_line = html.escape(line[2:])
html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
elif line.startswith('- '): # Removed line (show as empty space in redacted view)
html_lines.append(f'<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-style: italic; opacity: 0.7;">- (removed from original)</div>')
elif line.startswith('+ '): # Added line
escaped_line = html.escape(line[2:])
html_lines.append(f'<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-weight: bold;">+ {escaped_line}</div>')
elif line.startswith('? '): # Ignore difflib hints
continue
return '\n'.join(html_lines)
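# For reference, difflib.Differ.compare() yields lines prefixed with two
# characters: "  " for unchanged lines, "- " for lines only in the first
# sequence, "+ " for lines only in the second, and "? " for intra-line hint
# rows, which the loops above skip.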
if uploaded_files:
# UI to select which file to work with (if multiple files uploaded)
file_names = [f.name for f in uploaded_files]
selected_file = st.selectbox("Select a file to work with", options=file_names)
if selected_file:
# Find the selected uploaded file
uploaded_file = next(f for f in uploaded_files if f.name == selected_file)
# Create buttons for different actions
col1, col2, col3 = st.columns(3)
with col1:
if st.button("📄 Show Original", type="primary"):
# Process the document to get original structure (without redaction)
if selected_file not in st.session_state.original_structures:
# Save uploaded file to a temporary location
temp_path = save_uploaded_file(uploaded_file, selected_file)
# Create a DocumentProcessor without section extraction (for original structure)
processor = DocumentProcessor(section_extractor=None)
# Process the document to get original structure
result = processor.process(temp_path)
st.session_state.original_structures[selected_file] = result.structured_json
# Also store the original markdown for comparison
st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
# Display the original structure
st.session_state.show_original = True
st.session_state.show_processed = False
with col2:
if st.button("🔒 Process with Redaction"):
# Process the document with redaction
if selected_file not in st.session_state.processed_results:
# Save uploaded file to a temporary location
temp_path = save_uploaded_file(uploaded_file, selected_file)
# Create a DocumentProcessor with a SectionExtractor for our target sections
section_extractor = ReasoningSectionExtractor(
endpoint=AZURE_OPENAI_ENDPOINT,
api_key=AZURE_OPENAI_KEY,
api_version=AZURE_OPENAI_VERSION,
deployment=AZURE_OPENAI_DEPLOYMENT,
)
processor = DocumentProcessor(section_extractor=section_extractor)
# Attach an in-memory log handler to capture logs for this file
log_handler, log_buffer = get_log_handler()
root_logger = logging.getLogger()
root_logger.addHandler(log_handler)
try:
# Process the document (Docling parse + section redaction)
result = processor.process(temp_path)
finally:
# Remove handler and stop capturing logs
root_logger.removeHandler(log_handler)
# Save results in session state
st.session_state.processed_results[selected_file] = {
"structured_json": result.structured_json,
"redacted_md": result.redacted_markdown,
"redacted_json": result.redacted_json
}
# Combine log records into a single text
log_text = "\n".join(log_buffer)
st.session_state.logs[selected_file] = log_text
st.session_state.show_original = False
st.session_state.show_processed = True
with col3:
if st.button("🔄 Switch View"):
# Toggle between views
if st.session_state.get("show_original", False):
st.session_state.show_original = False
st.session_state.show_processed = True
else:
st.session_state.show_original = True
st.session_state.show_processed = False
# Show current view status
if st.session_state.get("show_original", False):
st.info("📄 Currently viewing: **Original Document Structure**")
elif st.session_state.get("show_processed", False):
st.success("🔒 Currently viewing: **Processed Document with Redaction**")
else:
st.info("ℹ️ Select an action above to view document content")
# Display results based on button clicked
if st.session_state.get("show_original", False):
st.markdown("---")
st.subheader(f"Original Document Structure - {selected_file}")
# Get the original structure
original_json = st.session_state.original_structures[selected_file]
original_markdown = st.session_state.original_structures.get(f"{selected_file}_markdown", "")
# Display PDF viewer and original markdown side by side
col1, col2 = st.columns([1, 1])
with col1:
st.subheader("📄 Original PDF")
# Reset file pointer to beginning
uploaded_file.seek(0)
# Display PDF using base64 encoding for inline display
import base64
pdf_bytes = uploaded_file.getvalue()
b64_pdf = base64.b64encode(pdf_bytes).decode()
pdf_display = f'<iframe src="data:application/pdf;base64,{b64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
st.markdown(pdf_display, unsafe_allow_html=True)
with col2:
st.subheader("📋 Original Document (Markdown)")
st.caption("Docling-generated markdown from the PDF")
# Use a text area for better readability and scrolling
st.text_area(
label="Original markdown content",
value=original_markdown,
height=600,
key="original_markdown_display",
label_visibility="collapsed"
)
# Add a download button for the original markdown
st.markdown("---")
col1, col2 = st.columns(2)
with col1:
st.download_button(
label="📥 Download Original Markdown",
data=original_markdown,
file_name=f"{selected_file}_original.md",
mime="text/markdown"
)
with col2:
st.subheader("📊 JSON Structure")
st.json(original_json)
elif st.session_state.get("show_processed", False):
st.markdown("---")
st.subheader(f"Processed Document - {selected_file}")
# Retrieve stored results
data = st.session_state.processed_results[selected_file]
structured_json = data["structured_json"]
redacted_md = data["redacted_md"]
redacted_json = data["redacted_json"]
# The comparison view needs the original (un-redacted) markdown. If it was not
# captured during redaction, re-parse the document once without section
# extraction and cache the result in session state.
if "original_markdown" not in st.session_state.processed_results[selected_file]:
# Save uploaded file to a temporary location
temp_path = save_uploaded_file(uploaded_file, selected_file)
# Create a DocumentProcessor without section extraction to get original markdown
processor = DocumentProcessor(section_extractor=None)
result = processor.process(temp_path)
# Store the original markdown
st.session_state.processed_results[selected_file]["original_markdown"] = result.structured_markdown
original_md = st.session_state.processed_results[selected_file]["original_markdown"]
# Show processing summary
original_texts = structured_json.get("texts", [])
redacted_texts = redacted_json.get("texts", [])
removed_count = len(original_texts) - len(redacted_texts)
if removed_count > 0:
st.success(f"✅ Successfully removed {removed_count} text elements containing medication information")
else:
st.info("ℹ️ No medication sections were identified for removal")
# Create tabs for different views
tab1, tab2, tab3 = st.tabs(["📄 Side-by-Side Comparison", "🔍 JSON Structure", "📊 Processing Details"])
with tab1:
st.subheader("Original vs Redacted Content")
st.caption("Compare the original document content with the redacted version")
# Add status indicator
st.markdown("""
<div id="sync-status" style="padding: 8px; background-color: #e8f5e8; border: 1px solid #4caf50; border-radius: 4px; margin-bottom: 10px; display: none;">
✅ <strong>Synchronized scrolling is active</strong> - Scroll either panel to sync both views
</div>
""", unsafe_allow_html=True)
# Create a diff-like interface with synchronized scrolling and highlighting
diff_html = f"""
<div class="sync-scroll-container">
<div class="sync-scroll-panel">
<div class="sync-scroll-header">
📋 Original Document
</div>
<div id="original-content" class="sync-scroll-content">
{create_diff_content(original_md, redacted_md, 'original')}
</div>
</div>
<div class="sync-scroll-panel">
<div class="sync-scroll-header">
🔒 Redacted Document
</div>
<div id="redacted-content" class="sync-scroll-content">
{create_diff_content(original_md, redacted_md, 'redacted')}
</div>
</div>
</div>
"""
st.markdown(diff_html, unsafe_allow_html=True)
# Add a hidden component to trigger JavaScript setup after Streamlit reruns
st.markdown("""
<script>
// Trigger setup after Streamlit rerun
if (window.parent && window.parent.postMessage) {
// Wait for Streamlit to finish rendering
setTimeout(function() {
setupSyncScroll();
}, 500);
}
</script>
""", unsafe_allow_html=True)
# Add legend for the diff highlighting
st.markdown("---")
col1, col2 = st.columns(2)
with col1:
st.markdown("**🎨 Diff Legend:**")
st.markdown("🔴 **Red background** = Removed content")
st.markdown("🟢 **Green background** = Added content")
st.markdown("⚪ **White background** = Unchanged content")
with col2:
st.markdown("**💡 Tips:**")
st.markdown("Look for red-highlighted sections")
st.markdown("These show what was redacted")
st.markdown("Use scroll to navigate long documents")
with tab2:
st.subheader("Document Structure Analysis")
# Show JSON structure comparison
col1, col2 = st.columns(2)
with col1:
st.markdown("**📊 Original Structure (JSON)**")
st.json(structured_json)
with col2:
st.markdown("**🔒 Redacted Structure (JSON)**")
st.json(redacted_json)
with tab3:
st.subheader("Processing Details")
# Show what was removed
if removed_count > 0:
st.info(f"**Removed {removed_count} text elements from the document structure.**")
# Show the removed text elements
st.subheader("Removed Text Elements:")
redacted_text_values = {t.get("text", "") for t in redacted_texts}
removed_texts = []
for i, text_elem in enumerate(original_texts):
    text = text_elem.get("text", "")
    # Compare by content, not index: after the first removal an index-based
    # comparison would mislabel every subsequent element as removed.
    if text not in redacted_text_values:
        removed_texts.append((i, text[:100] + "..." if len(text) > 100 else text))
for idx, text in removed_texts:
st.text(f"Text {idx}: {text}")
else:
st.info("No text elements were removed during processing.")
# Show processing logs
st.subheader("Processing Logs")
st.text_area(
label="Processing logs",
value=st.session_state.logs.get(selected_file, ""),
height=300,
label_visibility="collapsed"
)