# Set environment variables IMMEDIATELY to prevent root filesystem access.
# This must happen before any other imports or operations.
import os
import tempfile

# Get a writable temp directory first
try:
    TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
    os.makedirs(TEMP_DIR, exist_ok=True)
except Exception:
    try:
        TEMP_DIR = "/tmp/docling_temp"
        os.makedirs(TEMP_DIR, exist_ok=True)
    except Exception:
        TEMP_DIR = os.getcwd()
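# Note: tempfile.gettempdir() honours TMPDIR/TEMP/TMP, so in most containers
# this already resolves to a path under /tmp; the /tmp/docling_temp and cwd
# branches are fallbacks for the unlikely case that it is not writable.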
# Set all environment variables that libraries might use
os.environ.update({
    # Streamlit configuration
    'STREAMLIT_SERVER_FILE_WATCHER_TYPE': 'none',
    'STREAMLIT_SERVER_HEADLESS': 'true',
    'STREAMLIT_BROWSER_GATHER_USAGE_STATS': 'false',
    'STREAMLIT_SERVER_ENABLE_CORS': 'false',
    'STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION': 'false',
    # EasyOCR configuration
    'EASYOCR_MODULE_PATH': os.path.join(TEMP_DIR, 'easyocr_models'),
    'HOME': TEMP_DIR,
    'USERPROFILE': TEMP_DIR,
    'XDG_CACHE_HOME': os.path.join(TEMP_DIR, 'cache'),
    'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
    'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),
    # Hugging Face Hub configuration - CRITICAL for preventing /.cache access
    'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
    'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
    'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
    'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
    'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),
    # Additional Hugging Face specific variables
    'HF_HUB_DISABLE_TELEMETRY': '1',
    'HF_HUB_DISABLE_IMPLICIT_TOKEN': '1',
    'HF_HUB_OFFLINE': '0',
    # Other ML libraries
    'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
    'TENSORFLOW_HOME': os.path.join(TEMP_DIR, 'tensorflow'),
    'KERAS_HOME': os.path.join(TEMP_DIR, 'keras'),
    'MLFLOW_TRACKING_URI': f'file:{os.path.join(TEMP_DIR, "mlruns")}',
    # Additional cache directories
    'CACHE_DIR': os.path.join(TEMP_DIR, 'cache'),
    'MODEL_CACHE_DIR': os.path.join(TEMP_DIR, 'models'),
    # Additional environment variables to prevent root access
    'PYTHONPATH': TEMP_DIR,  # note: only affects subprocesses, not this interpreter
    'TMPDIR': TEMP_DIR,
    'TEMP': TEMP_DIR,
    'TMP': TEMP_DIR,
    'CACHE': os.path.join(TEMP_DIR, 'cache'),
    'MODELS': os.path.join(TEMP_DIR, 'models'),
    'DATA': os.path.join(TEMP_DIR, 'data'),
    'CONFIG': os.path.join(TEMP_DIR, 'config'),
})
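# These variables have to be in place before huggingface_hub / transformers are
# imported: both libraries generally read HF_HOME and the cache paths into
# module-level constants at import time, so changing os.environ afterwards has
# no effect on them.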
# Create all necessary directories
directories_to_create = [
    os.environ[key]
    for key in (
        'EASYOCR_MODULE_PATH',
        'XDG_CACHE_HOME', 'XDG_CONFIG_HOME', 'XDG_DATA_HOME',
        'HF_HOME', 'HF_CACHE_HOME', 'HF_HUB_CACHE',
        'TRANSFORMERS_CACHE', 'HF_DATASETS_CACHE',
        'DIFFUSERS_CACHE', 'ACCELERATE_CACHE',
        'TORCH_HOME', 'TENSORFLOW_HOME', 'KERAS_HOME',
        'CACHE_DIR', 'MODEL_CACHE_DIR',
        'CACHE', 'MODELS', 'DATA', 'CONFIG',
    )
]
# Monkey patch os.makedirs to prevent root directory access
original_makedirs = os.makedirs

def safe_makedirs(name, mode=0o777, exist_ok=False):
    """Safe version of makedirs that redirects root-filesystem paths."""
    # Redirect absolute paths that fall outside the writable locations
    # (/tmp, /app, and TEMP_DIR itself, which may differ from /tmp when
    # TMPDIR is customised).
    if (name.startswith('/')
            and not name.startswith('/tmp')
            and not name.startswith('/app')
            and not name.startswith(TEMP_DIR)):
        basename = os.path.basename(name)
        safe_name = os.path.join(TEMP_DIR, basename)
        print(f"Redirecting root directory creation from {name} to {safe_name}")
        return original_makedirs(safe_name, mode, exist_ok)
    return original_makedirs(name, mode, exist_ok)

# Apply the monkey patch
os.makedirs = safe_makedirs
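# Illustrative example: with the patch in place, a library call such as
# os.makedirs("/.cache/huggingface") no longer fails on a Space where / is not
# writable; it is silently redirected to os.path.join(TEMP_DIR, "huggingface").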
for directory in directories_to_create:
    try:
        os.makedirs(directory, exist_ok=True)
    except Exception as e:
        print(f"Warning: Could not create directory {directory}: {e}")
# Now import the rest of the modules
import logging
import sys
import html
import difflib
import re
import time

import streamlit as st
from dotenv import load_dotenv

from processing.document_processor import DocumentProcessor
from processing.sections import ReasoningSectionExtractor
from utils.logging_utils import get_log_handler
# Configure logging early to avoid issues
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    stream=sys.stdout,
    force=True,
)
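# force=True removes any handlers that Streamlit (or an earlier rerun) already
# attached to the root logger, so this configuration wins deterministically.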
# Load environment variables from .env
load_dotenv()
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
# Log startup information
logging.info("=" * 50)
logging.info("Docling Streamlit App Starting")
logging.info(f"Temp directory: {TEMP_DIR}")
logging.info(f"EasyOCR model directory: {os.environ.get('EASYOCR_MODULE_PATH', 'NOT_SET')}")
logging.info(f"Hugging Face cache: {os.environ.get('HF_CACHE_HOME', 'NOT_SET')}")
logging.info(f"Current working directory: {os.getcwd()}")
logging.info(f"Python version: {sys.version}")
logging.info("=" * 50)

def cleanup_temp_files():
    """Clean up temporary files in the temp directory.

    Only top-level files are removed; subdirectories (EasyOCR and
    Hugging Face model caches, etc.) are deliberately left in place.
    """
    try:
        if os.path.exists(TEMP_DIR):
            for filename in os.listdir(TEMP_DIR):
                file_path = os.path.join(TEMP_DIR, filename)
                if os.path.isfile(file_path):
                    try:
                        os.remove(file_path)
                        logging.info(f"Removed temp file: {filename}")
                    except PermissionError as e:
                        logging.warning(f"Permission error removing {filename}: {e}")
                    except Exception as e:
                        logging.warning(f"Error removing {filename}: {e}")
            logging.info(f"Cleaned up temporary files in {TEMP_DIR}")
        else:
            logging.info(f"Temp directory {TEMP_DIR} does not exist")
    except PermissionError as e:
        logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
    except Exception as e:
        logging.warning(f"Error cleaning up temp files: {e}")

def clear_all_data():
    """Clear all temporary files and session state data."""
    try:
        # Clean up temp files
        cleanup_temp_files()
        # Clear session state (pop is a no-op for keys that are absent)
        for key in (
            "processed_results",
            "logs",
            "original_structures",
            "show_original",
            "show_processed",
            "temp_cleaned",
            "last_cleanup_time",
        ):
            st.session_state.pop(key, None)
        logging.info("Cleared all session state and temporary files")
        return True
    except Exception as e:
        logging.error(f"Error clearing all data: {e}")
        return False

def get_temp_files_info():
    """Get information about temporary files (count and total size in bytes)."""
    try:
        if not os.path.exists(TEMP_DIR):
            return 0, 0
        # Count only regular files so the count matches the size total
        file_count = 0
        total_size = 0
        for filename in os.listdir(TEMP_DIR):
            try:
                file_path = os.path.join(TEMP_DIR, filename)
                if os.path.isfile(file_path):
                    file_count += 1
                    total_size += os.path.getsize(file_path)
            except (PermissionError, OSError) as e:
                logging.warning(f"Error accessing file {filename}: {e}")
                continue
        return file_count, total_size
    except PermissionError as e:
        logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
        return 0, 0
    except Exception as e:
        logging.warning(f"Error getting temp files info: {e}")
        return 0, 0

def format_file_size(size_bytes):
    """Format file size in human readable format."""
    if size_bytes == 0:
        return "0 B"
    size_names = ["B", "KB", "MB", "GB"]
    i = 0
    while size_bytes >= 1024 and i < len(size_names) - 1:
        size_bytes /= 1024.0
        i += 1
    return f"{size_bytes:.1f} {size_names[i]}"

def save_uploaded_file(uploaded_file, filename):
    """Save uploaded file to temp directory and return the path."""
    temp_path = os.path.join(TEMP_DIR, f"temp_{filename}")
    try:
        uploaded_file.seek(0)  # Reset file pointer to beginning
        file_bytes = uploaded_file.read()
        with open(temp_path, "wb") as f:
            f.write(file_bytes)
        logging.info(f"Saved uploaded file to {temp_path}")
        return temp_path
    except PermissionError as e:
        logging.error(f"Permission error saving uploaded file to {temp_path}: {e}")
        raise PermissionError("Cannot save file due to permission restrictions. Please try clearing data or contact support.")
    except Exception as e:
        logging.error(f"Error saving uploaded file: {e}")
        raise
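# Streamlit's UploadedFile also exposes getvalue(), which returns the full
# payload without the explicit seek/read pair; the PDF preview further down
# uses that form.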
# Configure page layout to use wide mode
st.set_page_config(
    page_title="Medical Document Parser & Redactor",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="collapsed",
)
# Add custom CSS for better styling
st.markdown("""
<style>
/* Custom styling for text areas */
.stTextArea textarea {
    font-family: 'Courier New', monospace !important;
    font-size: 12px !important;
    line-height: 1.4 !important;
    border: 2px solid #e0e0e0 !important;
    border-radius: 8px !important;
}
/* Hover effect for text areas */
.stTextArea textarea:hover {
    border-color: #1f77b4 !important;
}
/* Custom styling for download buttons */
.stDownloadButton > button {
    border-radius: 8px !important;
    font-weight: 600 !important;
}
/* Custom styling for the comparison section */
.comparison-container {
    background-color: #f8f9fa;
    padding: 20px;
    border-radius: 10px;
    border: 1px solid #e9ecef;
}
/* Synchronized scrolling styles */
.sync-scroll-container {
    display: flex;
    gap: 20px;
    height: 600px;
    font-family: 'Courier New', monospace;
    font-size: 12px;
}
.sync-scroll-panel {
    flex: 1;
    border: 1px solid #ddd;
    border-radius: 5px;
    overflow: hidden;
    display: flex;
    flex-direction: column;
}
.sync-scroll-header {
    background-color: #f8f9fa;
    padding: 10px;
    border-bottom: 1px solid #ddd;
    font-weight: bold;
}
.sync-scroll-content {
    flex: 1;
    overflow-y: auto;
    padding: 10px;
    background-color: #fff;
    scroll-behavior: smooth;
}
/* Custom scrollbar styling */
.sync-scroll-content::-webkit-scrollbar {
    width: 8px;
}
.sync-scroll-content::-webkit-scrollbar-track {
    background: #f1f1f1;
}
.sync-scroll-content::-webkit-scrollbar-thumb {
    background: #888;
    border-radius: 4px;
}
.sync-scroll-content::-webkit-scrollbar-thumb:hover {
    background: #555;
}
</style>
<script>
// Improved synchronized scrolling implementation with better debugging
console.log('Starting sync scroll setup...');

function setupSyncScroll() {
    console.log('setupSyncScroll called');
    // Wait for elements to be available
    setTimeout(function() {
        console.log('Looking for scroll elements...');
        const originalContent = document.getElementById('original-content');
        const redactedContent = document.getElementById('redacted-content');
        console.log('Original content element:', originalContent);
        console.log('Redacted content element:', redactedContent);
        if (originalContent && redactedContent) {
            console.log('Both elements found, setting up sync...');
            let isScrolling = false;
            let scrollTimeout;

            function syncScroll(source, target) {
                if (!isScrolling) {
                    isScrolling = true;
                    console.log('Syncing scroll from', source.id, 'to', target.id, 'scrollTop:', source.scrollTop);
                    target.scrollTop = source.scrollTop;
                    // Clear existing timeout
                    if (scrollTimeout) {
                        clearTimeout(scrollTimeout);
                    }
                    // Reset flag after a short delay
                    scrollTimeout = setTimeout(() => {
                        isScrolling = false;
                        console.log('Scroll sync completed');
                    }, 100);
                }
            }

            // Remove existing listeners to prevent duplicates
            if (originalContent._syncScrollHandler) {
                originalContent.removeEventListener('scroll', originalContent._syncScrollHandler);
            }
            if (redactedContent._syncScrollHandler) {
                redactedContent.removeEventListener('scroll', redactedContent._syncScrollHandler);
            }
            // Create new handlers
            originalContent._syncScrollHandler = function(e) {
                console.log('Original content scrolled:', e.target.scrollTop);
                syncScroll(originalContent, redactedContent);
            };
            redactedContent._syncScrollHandler = function(e) {
                console.log('Redacted content scrolled:', e.target.scrollTop);
                syncScroll(redactedContent, originalContent);
            };
            // Add event listeners
            originalContent.addEventListener('scroll', originalContent._syncScrollHandler, { passive: true });
            redactedContent.addEventListener('scroll', redactedContent._syncScrollHandler, { passive: true });
            console.log('Event listeners added successfully');
            // Show status indicator
            const statusElement = document.getElementById('sync-status');
            if (statusElement) {
                statusElement.style.display = 'block';
                console.log('Status indicator shown');
            }
            // Test the synchronization
            setTimeout(() => {
                console.log('Testing scroll sync...');
                console.log('Original scrollTop:', originalContent.scrollTop);
                console.log('Redacted scrollTop:', redactedContent.scrollTop);
                // Try a small scroll to test
                originalContent.scrollTop = 10;
                setTimeout(() => {
                    console.log('After test scroll - Original:', originalContent.scrollTop, 'Redacted:', redactedContent.scrollTop);
                }, 50);
            }, 200);
        } else {
            console.log('Elements not found, will retry...');
            // Retry after a short fixed delay
            setTimeout(setupSyncScroll, 300);
        }
    }, 200);
}

// Multiple initialization strategies
function initializeSyncScroll() {
    console.log('Initializing sync scroll...');
    // Strategy 1: Immediate setup
    setupSyncScroll();
    // Strategy 2: Setup after DOM ready
    if (document.readyState === 'loading') {
        document.addEventListener('DOMContentLoaded', function() {
            console.log('DOM loaded, setting up sync scroll...');
            setupSyncScroll();
        });
    }
    // Strategy 3: Setup after window load
    window.addEventListener('load', function() {
        console.log('Window loaded, setting up sync scroll...');
        setupSyncScroll();
    });
    // Strategy 4: Periodic retry for the first 10 seconds (20 attempts x 500 ms)
    let attempts = 0;
    const maxAttempts = 20;
    const retryInterval = setInterval(function() {
        attempts++;
        console.log('Retry attempt', attempts);
        const originalContent = document.getElementById('original-content');
        const redactedContent = document.getElementById('redacted-content');
        if (originalContent && redactedContent) {
            console.log('Elements found on retry, setting up...');
            setupSyncScroll();
            clearInterval(retryInterval);
        } else if (attempts >= maxAttempts) {
            console.log('Max retry attempts reached, giving up');
            clearInterval(retryInterval);
        }
    }, 500);
}

// Start initialization
initializeSyncScroll();

// Listen for Streamlit-specific events
if (window.parent && window.parent.postMessage) {
    console.log('Streamlit environment detected');
    // Listen for any messages that might indicate a rerun
    window.addEventListener('message', function(event) {
        console.log('Received message:', event.data);
        if (event.data && (event.data.type === 'streamlit:rerun' || event.data.type === 'streamlit:setComponentValue')) {
            console.log('Streamlit rerun detected, reinitializing sync scroll...');
            setTimeout(setupSyncScroll, 1000);
        }
    });
}
console.log('Sync scroll script loaded');
</script>
""", unsafe_allow_html=True)
# Configure root logger only once (avoid duplicate handlers on reruns)
if len(logging.getLogger().handlers) == 0:
    logging.getLogger().setLevel(logging.INFO)
    # (We will attach custom handlers during processing as needed)
# Title and description
st.title("Medical Document Parser & Redactor")
st.write("""
Upload PDF medical documents to parse their content using **Docling** (structure-aware parser)
and automatically **redact specific sections** (e.g., initial and final medication lists).
Use the buttons below to view the original structure or process with redaction.

**💡 Tip:** This is a Hugging Face Space with limited storage. Use the "Clear All Data" button to remove temporary files when you're done processing documents.
""")

# Add a clear-all-data button at the top
if st.button("🧹 Clear All Data", type="secondary", help="Remove all temporary files and reset the application"):
    if clear_all_data():
        st.success("✅ All data cleared successfully! The application has been reset.")
        st.rerun()
    else:
        st.error("❌ Error clearing data. Please try again.")
# File uploader (accept multiple PDF files)
uploaded_files = st.file_uploader("Upload PDF medical documents", type=["pdf"], accept_multiple_files=True)

# Clean up temp files on app start (but keep the directory)
if "temp_cleaned" not in st.session_state:
    cleanup_temp_files()
    st.session_state.temp_cleaned = True

# Initialize session state storage for results and logs
if "processed_results" not in st.session_state:
    st.session_state.processed_results = {}  # {filename: {"structured_json": ..., "redacted_md": ..., "redacted_json": ...}}
if "logs" not in st.session_state:
    st.session_state.logs = {}  # {filename: log_text}
if "original_structures" not in st.session_state:
    st.session_state.original_structures = {}  # {filename: structured_json}
# Show temp directory status and cleanup button
temp_file_count, total_size = get_temp_files_info()

# Automatic cleanup: if temp files are too old or too large, clean them up
if "last_cleanup_time" not in st.session_state:
    st.session_state.last_cleanup_time = time.time()

# Check whether we should run automatic cleanup (every 30 minutes, or sooner if files grow too large)
current_time = time.time()
time_since_cleanup = current_time - st.session_state.last_cleanup_time
if (time_since_cleanup > 1800 or  # 30 minutes
        total_size > 100 * 1024 * 1024):  # 100 MB
    if temp_file_count > 0:
        cleanup_temp_files()
        st.session_state.last_cleanup_time = current_time
        st.info("🧹 Automatic cleanup: Removed old temporary files")
        # Recalculate after cleanup
        temp_file_count, total_size = get_temp_files_info()
# Create a row with temp file status and delete button
col1, col2 = st.columns([3, 1])
with col1:
    if temp_file_count > 0:
        st.caption(f"📁 {temp_file_count} temporary file(s) - Total size: {format_file_size(total_size)}")
        # Show warning if total size is large
        if total_size > 50 * 1024 * 1024:  # 50 MB
            st.warning("⚠️ Large temporary files detected. Consider clearing data to free up space.")
    else:
        st.caption("📁 No temporary files")
with col2:
    if temp_file_count > 0:
        if st.button("🗑️ Delete Temp Files", type="secondary", help="Remove all temporary files from the server"):
            try:
                cleanup_temp_files()
                st.success(f"✅ Successfully deleted {temp_file_count} temporary file(s)")
                st.rerun()  # Refresh the page to update the file count
            except Exception as e:
                st.error(f"❌ Error deleting temporary files: {e}")
    else:
        st.caption("No files to delete")

def create_diff_content(original_text: str, redacted_text: str, view_type: str) -> str:
    """Create HTML content for diff view with highlighting."""
    # Normalize the text to reduce cosmetic formatting differences
    def normalize_text(text):
        lines = text.split('\n')
        normalized_lines = []
        for line in lines:
            # Strip whitespace but preserve content
            stripped = line.strip()
            if stripped:
                # Collapse level-2 headers to level 1 so heading-depth
                # differences alone do not show up as diffs
                if re.match(r'^##\s+', stripped):
                    stripped = re.sub(r'^##\s+', '# ', stripped)
                # Normalize quote formatting
                if stripped.startswith('+ > '):
                    stripped = stripped.replace('+ > ', '> ')
                normalized_lines.append(stripped)
        return normalized_lines

    original_lines = normalize_text(original_text)
    redacted_lines = normalize_text(redacted_text)
    # Use difflib to get a more sophisticated diff
    differ = difflib.Differ()
    diff = list(differ.compare(original_lines, redacted_lines))
    html_lines = []
    if view_type == 'original':
        # Show original with removed content highlighted
        for line in diff:
            if line.startswith('  '):  # Unchanged line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
            elif line.startswith('- '):  # Removed line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-weight: bold;">- {escaped_line}</div>')
            elif line.startswith('+ '):  # Added line (shown as a placeholder in the original view)
                html_lines.append('<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-style: italic; opacity: 0.7;">+ (added in redacted version)</div>')
            elif line.startswith('? '):  # Ignore difflib hint lines
                continue
    elif view_type == 'redacted':
        # Show redacted content with added content highlighted
        for line in diff:
            if line.startswith('  '):  # Unchanged line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
            elif line.startswith('- '):  # Removed line (shown as a placeholder in the redacted view)
                html_lines.append('<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-style: italic; opacity: 0.7;">- (removed from original)</div>')
            elif line.startswith('+ '):  # Added line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-weight: bold;">+ {escaped_line}</div>')
            elif line.startswith('? '):  # Ignore difflib hint lines
                continue
    return '\n'.join(html_lines)
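# Behaviour sketch (hypothetical inputs): for original="a\nb\nc" and
# redacted="a\nc", the 'original' view renders "b" as a red removed row, while
# the 'redacted' view renders a "(removed from original)" placeholder at the
# same position. Every non-'?' diff line yields exactly one row in each view,
# which keeps the two panels line-aligned for the synchronized scrolling below.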
if uploaded_files:
    # UI to select which file to work with (if multiple files uploaded)
    file_names = [f.name for f in uploaded_files]
    selected_file = st.selectbox("Select a file to work with", options=file_names)
    if selected_file:
        # Find the selected uploaded file
        uploaded_file = next(f for f in uploaded_files if f.name == selected_file)
        # Create buttons for different actions
        col1, col2, col3 = st.columns(3)
        with col1:
            if st.button("📄 Show Original", type="primary"):
                # Process the document to get the original structure (without redaction)
                if selected_file not in st.session_state.original_structures:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor without section extraction (for original structure)
                    processor = DocumentProcessor(section_extractor=None)
                    # Process the document to get the original structure
                    result = processor.process(temp_path)
                    st.session_state.original_structures[selected_file] = result.structured_json
                    # Also store the original markdown for comparison
                    st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
                # Display the original structure
                st.session_state.show_original = True
                st.session_state.show_processed = False
        with col2:
            if st.button("🔒 Process with Redaction"):
                # Process the document with redaction
                if selected_file not in st.session_state.processed_results:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor with a SectionExtractor for our target sections
                    section_extractor = ReasoningSectionExtractor(
                        endpoint=AZURE_OPENAI_ENDPOINT,
                        api_key=AZURE_OPENAI_KEY,
                        api_version=AZURE_OPENAI_VERSION,
                        deployment=AZURE_OPENAI_DEPLOYMENT,
                    )
                    processor = DocumentProcessor(section_extractor=section_extractor)
                    # Attach an in-memory log handler to capture logs for this file
                    log_handler, log_buffer = get_log_handler()
                    root_logger = logging.getLogger()
                    root_logger.addHandler(log_handler)
                    try:
                        # Process the document (Docling parse + section redaction)
                        result = processor.process(temp_path)
                    finally:
                        # Remove handler and stop capturing logs
                        root_logger.removeHandler(log_handler)
                    # Save results in session state
                    st.session_state.processed_results[selected_file] = {
                        "structured_json": result.structured_json,
                        "redacted_md": result.redacted_markdown,
                        "redacted_json": result.redacted_json,
                    }
                    # Combine log records into a single text
                    log_text = "\n".join(log_buffer)
                    st.session_state.logs[selected_file] = log_text
                st.session_state.show_original = False
                st.session_state.show_processed = True
        with col3:
            if st.button("🔄 Switch View"):
                # Toggle between views
                if st.session_state.get("show_original", False):
                    st.session_state.show_original = False
                    st.session_state.show_processed = True
                else:
                    st.session_state.show_original = True
                    st.session_state.show_processed = False

        # Show current view status
        if st.session_state.get("show_original", False):
            st.info("📄 Currently viewing: **Original Document Structure**")
        elif st.session_state.get("show_processed", False):
            st.success("🔒 Currently viewing: **Processed Document with Redaction**")
        else:
            st.info("ℹ️ Select an action above to view document content")
        # Display results based on button clicked
        if st.session_state.get("show_original", False):
            st.markdown("---")
            st.subheader(f"Original Document Structure - {selected_file}")
            # Get the original structure
            original_json = st.session_state.original_structures[selected_file]
            original_markdown = st.session_state.original_structures.get(f"{selected_file}_markdown", "")
            # Display PDF viewer and original markdown side by side
            col1, col2 = st.columns([1, 1])
            with col1:
                st.subheader("📄 Original PDF")
                # Reset file pointer to beginning
                uploaded_file.seek(0)
                # Display PDF using base64 encoding for inline display
                import base64
                pdf_bytes = uploaded_file.getvalue()
                b64_pdf = base64.b64encode(pdf_bytes).decode()
                pdf_display = f'<iframe src="data:application/pdf;base64,{b64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
                st.markdown(pdf_display, unsafe_allow_html=True)
            with col2:
                st.subheader("📝 Original Document (Markdown)")
                st.caption("Docling-generated markdown from the PDF")
                # Use a text area for better readability and scrolling
                st.text_area(
                    label="Original markdown content",
                    value=original_markdown,
                    height=600,
                    key="original_markdown_display",
                    label_visibility="collapsed",
                )
            # Add a download button for the original markdown
            st.markdown("---")
            col1, col2 = st.columns(2)
            with col1:
                st.download_button(
                    label="📥 Download Original Markdown",
                    data=original_markdown,
                    file_name=f"{selected_file}_original.md",
                    mime="text/markdown",
                )
            with col2:
                st.subheader("📋 JSON Structure")
                st.json(original_json)
elif st.session_state.get("show_processed", False): | |
st.markdown("---") | |
st.subheader(f"Processed Document - {selected_file}") | |
# Retrieve stored results | |
data = st.session_state.processed_results[selected_file] | |
structured_json = data["structured_json"] | |
redacted_md = data["redacted_md"] | |
redacted_json = data["redacted_json"] | |
# Get the original markdown from the structured JSON | |
# We need to reconstruct the original markdown from the structured JSON | |
# For now, we'll use the structured_markdown from the DocumentResult | |
# But we need to store this in the session state | |
# Create a DocumentProcessor to get the original markdown | |
if "original_markdown" not in st.session_state.processed_results[selected_file]: | |
# Save uploaded file to a temporary location | |
temp_path = save_uploaded_file(uploaded_file, selected_file) | |
# Create a DocumentProcessor without section extraction to get original markdown | |
processor = DocumentProcessor(section_extractor=None) | |
result = processor.process(temp_path) | |
# Store the original markdown | |
st.session_state.processed_results[selected_file]["original_markdown"] = result.structured_markdown | |
original_md = st.session_state.processed_results[selected_file]["original_markdown"] | |
# Show processing summary | |
original_texts = structured_json.get("texts", []) | |
redacted_texts = redacted_json.get("texts", []) | |
removed_count = len(original_texts) - len(redacted_texts) | |
if removed_count > 0: | |
st.success(f"β Successfully removed {removed_count} text elements containing medication information") | |
else: | |
st.info("βΉοΈ No medication sections were identified for removal") | |
            # Create tabs for different views
            tab1, tab2, tab3 = st.tabs(["🔍 Side-by-Side Comparison", "📋 JSON Structure", "📝 Processing Details"])
            with tab1:
                st.subheader("Original vs Redacted Content")
                st.caption("Compare the original document content with the redacted version")
                # Add status indicator
                st.markdown("""
                <div id="sync-status" style="padding: 8px; background-color: #e8f5e8; border: 1px solid #4caf50; border-radius: 4px; margin-bottom: 10px; display: none;">
                    ✅ <strong>Synchronized scrolling is active</strong> - Scroll either panel to sync both views
                </div>
                """, unsafe_allow_html=True)
                # Create a diff-like interface with synchronized scrolling and highlighting
                diff_html = f"""
                <div class="sync-scroll-container">
                    <div class="sync-scroll-panel">
                        <div class="sync-scroll-header">
                            📄 Original Document
                        </div>
                        <div id="original-content" class="sync-scroll-content">
                            {create_diff_content(original_md, redacted_md, 'original')}
                        </div>
                    </div>
                    <div class="sync-scroll-panel">
                        <div class="sync-scroll-header">
                            🔒 Redacted Document
                        </div>
                        <div id="redacted-content" class="sync-scroll-content">
                            {create_diff_content(original_md, redacted_md, 'redacted')}
                        </div>
                    </div>
                </div>
                """
                st.markdown(diff_html, unsafe_allow_html=True)
                # Add a hidden component to retrigger the JavaScript setup after Streamlit reruns
                st.markdown("""
                <script>
                // Trigger setup after Streamlit rerun
                if (window.parent && window.parent.postMessage) {
                    // Wait for Streamlit to finish rendering
                    setTimeout(function() {
                        setupSyncScroll();
                    }, 500);
                }
                </script>
                """, unsafe_allow_html=True)
                # Add legend for the diff highlighting
                st.markdown("---")
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**🎨 Diff Legend:**")
                    st.markdown("🔴 **Red background** = Removed content")
                    st.markdown("💚 **Green background** = Added content")
                    st.markdown("⚪ **White background** = Unchanged content")
                with col2:
                    st.markdown("**💡 Tips:**")
                    st.markdown("Look for red-highlighted sections")
                    st.markdown("These show what was redacted")
                    st.markdown("Use scroll to navigate long documents")
            with tab2:
                st.subheader("Document Structure Analysis")
                # Show JSON structure comparison
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**📄 Original Structure (JSON)**")
                    st.json(structured_json)
                with col2:
                    st.markdown("**🔒 Redacted Structure (JSON)**")
                    st.json(redacted_json)
            with tab3:
                st.subheader("Processing Details")
                # Show what was removed
                if removed_count > 0:
                    st.info(f"**Removed {removed_count} text elements from the document structure.**")
                    # Show the removed text elements. Note this is an
                    # index-based heuristic: once the two lists diverge, every
                    # later element also compares unequal, so it approximates
                    # rather than exactly reconstructs the removed set.
                    st.subheader("Removed Text Elements:")
                    removed_texts = []
                    for i, text_elem in enumerate(original_texts):
                        if i >= len(redacted_texts) or text_elem.get("text", "") != redacted_texts[i].get("text", ""):
                            text = text_elem.get("text", "")
                            preview = text[:100] + "..." if len(text) > 100 else text
                            removed_texts.append((i, preview))
                    for idx, text in removed_texts:
                        st.text(f"Text {idx}: {text}")
                else:
                    st.info("No text elements were removed during processing.")
                # Show processing logs
                st.subheader("Processing Logs")
                st.text_area(
                    label="Processing logs",
                    value=st.session_state.logs.get(selected_file, ""),
                    height=300,
                    label_visibility="collapsed",
                )