# Set environment variables IMMEDIATELY to prevent root filesystem access.
# This must happen before any other imports or operations.
import os
import tempfile

# Get a writable temp directory first
try:
    TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
    os.makedirs(TEMP_DIR, exist_ok=True)
except Exception:
    try:
        TEMP_DIR = "/tmp/docling_temp"
        os.makedirs(TEMP_DIR, exist_ok=True)
    except Exception:
        TEMP_DIR = os.getcwd()
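# Note: tempfile.gettempdir() honours TMPDIR/TEMP/TMP, so in most containers
# this already resolves to a path under /tmp; the /tmp/docling_temp and cwd
# branches are fallbacks for the unlikely case that it is not writable.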
# Set all environment variables that libraries might use
os.environ.update({
    # Streamlit configuration
    'STREAMLIT_SERVER_FILE_WATCHER_TYPE': 'none',
    'STREAMLIT_SERVER_HEADLESS': 'true',
    'STREAMLIT_BROWSER_GATHER_USAGE_STATS': 'false',
    'STREAMLIT_SERVER_ENABLE_CORS': 'false',
    'STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION': 'false',
    # EasyOCR configuration
    'EASYOCR_MODULE_PATH': os.path.join(TEMP_DIR, 'easyocr_models'),
    'HOME': TEMP_DIR,
    'USERPROFILE': TEMP_DIR,
    'XDG_CACHE_HOME': os.path.join(TEMP_DIR, 'cache'),
    'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
    'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),
    # Hugging Face Hub configuration - CRITICAL for preventing /.cache access
    'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
    'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
    'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
    'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
    'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),
    # Additional Hugging Face specific variables
    'HF_HUB_DISABLE_TELEMETRY': '1',
    'HF_HUB_DISABLE_IMPLICIT_TOKEN': '1',
    'HF_HUB_OFFLINE': '0',
    # Other ML libraries
    'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
    'TENSORFLOW_HOME': os.path.join(TEMP_DIR, 'tensorflow'),
    'KERAS_HOME': os.path.join(TEMP_DIR, 'keras'),
    'MLFLOW_TRACKING_URI': f'file:{os.path.join(TEMP_DIR, "mlruns")}',
    # Additional cache directories
    'CACHE_DIR': os.path.join(TEMP_DIR, 'cache'),
    'MODEL_CACHE_DIR': os.path.join(TEMP_DIR, 'models'),
    # Additional environment variables to prevent root access
    'PYTHONPATH': TEMP_DIR,  # note: only affects subprocesses, not this interpreter
    'TMPDIR': TEMP_DIR,
    'TEMP': TEMP_DIR,
    'TMP': TEMP_DIR,
    'CACHE': os.path.join(TEMP_DIR, 'cache'),
    'MODELS': os.path.join(TEMP_DIR, 'models'),
    'DATA': os.path.join(TEMP_DIR, 'data'),
    'CONFIG': os.path.join(TEMP_DIR, 'config'),
})
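# These variables have to be in place before huggingface_hub / transformers are
# imported: both libraries generally read HF_HOME and the cache paths into
# module-level constants at import time, so changing os.environ afterwards has
# no effect on them.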
# Create all necessary directories
directories_to_create = [
    os.environ[key]
    for key in (
        'EASYOCR_MODULE_PATH',
        'XDG_CACHE_HOME', 'XDG_CONFIG_HOME', 'XDG_DATA_HOME',
        'HF_HOME', 'HF_CACHE_HOME', 'HF_HUB_CACHE',
        'TRANSFORMERS_CACHE', 'HF_DATASETS_CACHE',
        'DIFFUSERS_CACHE', 'ACCELERATE_CACHE',
        'TORCH_HOME', 'TENSORFLOW_HOME', 'KERAS_HOME',
        'CACHE_DIR', 'MODEL_CACHE_DIR',
        'CACHE', 'MODELS', 'DATA', 'CONFIG',
    )
]
# Monkey patch os.makedirs to prevent root directory access
original_makedirs = os.makedirs

def safe_makedirs(name, mode=0o777, exist_ok=False):
    """Safe version of makedirs that redirects root-filesystem paths."""
    # Redirect absolute paths that fall outside the writable locations
    # (/tmp, /app, and TEMP_DIR itself, which may differ from /tmp when
    # TMPDIR is customised).
    if (name.startswith('/')
            and not name.startswith('/tmp')
            and not name.startswith('/app')
            and not name.startswith(TEMP_DIR)):
        basename = os.path.basename(name)
        safe_name = os.path.join(TEMP_DIR, basename)
        print(f"Redirecting root directory creation from {name} to {safe_name}")
        return original_makedirs(safe_name, mode, exist_ok)
    return original_makedirs(name, mode, exist_ok)

# Apply the monkey patch
os.makedirs = safe_makedirs
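# Illustrative example: with the patch in place, a library call such as
# os.makedirs("/.cache/huggingface") no longer fails on a Space where / is not
# writable; it is silently redirected to os.path.join(TEMP_DIR, "huggingface").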
for directory in directories_to_create:
    try:
        os.makedirs(directory, exist_ok=True)
    except Exception as e:
        print(f"Warning: Could not create directory {directory}: {e}")
# Now import the rest of the modules
import logging
import sys
import html
import difflib
import re
import time

import streamlit as st
from dotenv import load_dotenv

from processing.document_processor import DocumentProcessor
from processing.sections import ReasoningSectionExtractor
from utils.logging_utils import get_log_handler
# Configure logging early to avoid issues
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    stream=sys.stdout,
    force=True,
)
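# force=True removes any handlers that Streamlit (or an earlier rerun) already
# attached to the root logger, so this configuration wins deterministically.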
# Load environment variables from .env
load_dotenv()
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
# Log startup information
logging.info("=" * 50)
logging.info("Docling Streamlit App Starting")
logging.info(f"Temp directory: {TEMP_DIR}")
logging.info(f"EasyOCR model directory: {os.environ.get('EASYOCR_MODULE_PATH', 'NOT_SET')}")
logging.info(f"Hugging Face cache: {os.environ.get('HF_CACHE_HOME', 'NOT_SET')}")
logging.info(f"Current working directory: {os.getcwd()}")
logging.info(f"Python version: {sys.version}")
logging.info("=" * 50)

def cleanup_temp_files():
    """Clean up temporary files in the temp directory.

    Only top-level files are removed; subdirectories (EasyOCR and
    Hugging Face model caches, etc.) are deliberately left in place.
    """
    try:
        if os.path.exists(TEMP_DIR):
            for filename in os.listdir(TEMP_DIR):
                file_path = os.path.join(TEMP_DIR, filename)
                if os.path.isfile(file_path):
                    try:
                        os.remove(file_path)
                        logging.info(f"Removed temp file: {filename}")
                    except PermissionError as e:
                        logging.warning(f"Permission error removing {filename}: {e}")
                    except Exception as e:
                        logging.warning(f"Error removing {filename}: {e}")
            logging.info(f"Cleaned up temporary files in {TEMP_DIR}")
        else:
            logging.info(f"Temp directory {TEMP_DIR} does not exist")
    except PermissionError as e:
        logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
    except Exception as e:
        logging.warning(f"Error cleaning up temp files: {e}")

def clear_all_data():
    """Clear all temporary files and session state data."""
    try:
        # Clean up temp files
        cleanup_temp_files()
        # Clear session state (pop is a no-op for keys that are absent)
        for key in (
            "processed_results",
            "logs",
            "original_structures",
            "show_original",
            "show_processed",
            "temp_cleaned",
            "last_cleanup_time",
        ):
            st.session_state.pop(key, None)
        logging.info("Cleared all session state and temporary files")
        return True
    except Exception as e:
        logging.error(f"Error clearing all data: {e}")
        return False

def get_temp_files_info():
    """Get information about temporary files (count and total size in bytes)."""
    try:
        if not os.path.exists(TEMP_DIR):
            return 0, 0
        # Count only regular files so the count matches the size total
        file_count = 0
        total_size = 0
        for filename in os.listdir(TEMP_DIR):
            try:
                file_path = os.path.join(TEMP_DIR, filename)
                if os.path.isfile(file_path):
                    file_count += 1
                    total_size += os.path.getsize(file_path)
            except (PermissionError, OSError) as e:
                logging.warning(f"Error accessing file {filename}: {e}")
                continue
        return file_count, total_size
    except PermissionError as e:
        logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
        return 0, 0
    except Exception as e:
        logging.warning(f"Error getting temp files info: {e}")
        return 0, 0

def format_file_size(size_bytes):
    """Format file size in human readable format."""
    if size_bytes == 0:
        return "0 B"
    size_names = ["B", "KB", "MB", "GB"]
    i = 0
    while size_bytes >= 1024 and i < len(size_names) - 1:
        size_bytes /= 1024.0
        i += 1
    return f"{size_bytes:.1f} {size_names[i]}"

def save_uploaded_file(uploaded_file, filename):
    """Save uploaded file to temp directory and return the path."""
    temp_path = os.path.join(TEMP_DIR, f"temp_{filename}")
    try:
        uploaded_file.seek(0)  # Reset file pointer to beginning
        file_bytes = uploaded_file.read()
        with open(temp_path, "wb") as f:
            f.write(file_bytes)
        logging.info(f"Saved uploaded file to {temp_path}")
        return temp_path
    except PermissionError as e:
        logging.error(f"Permission error saving uploaded file to {temp_path}: {e}")
        raise PermissionError("Cannot save file due to permission restrictions. Please try clearing data or contact support.")
    except Exception as e:
        logging.error(f"Error saving uploaded file: {e}")
        raise
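# Streamlit's UploadedFile also exposes getvalue(), which returns the full
# payload without the explicit seek/read pair; the PDF preview further down
# uses that form.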
# Configure page layout to use wide mode
st.set_page_config(
    page_title="Medical Document Parser & Redactor",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="collapsed",
)
# Add custom CSS for better styling
st.markdown("""
<style>
/* Custom styling for text areas */
.stTextArea textarea {
    font-family: 'Courier New', monospace !important;
    font-size: 12px !important;
    line-height: 1.4 !important;
    border: 2px solid #e0e0e0 !important;
    border-radius: 8px !important;
}
/* Hover effect for text areas */
.stTextArea textarea:hover {
    border-color: #1f77b4 !important;
}
/* Custom styling for download buttons */
.stDownloadButton > button {
    border-radius: 8px !important;
    font-weight: 600 !important;
}
/* Custom styling for the comparison section */
.comparison-container {
    background-color: #f8f9fa;
    padding: 20px;
    border-radius: 10px;
    border: 1px solid #e9ecef;
}
/* Synchronized scrolling styles */
.sync-scroll-container {
    display: flex;
    gap: 20px;
    height: 600px;
    font-family: 'Courier New', monospace;
    font-size: 12px;
}
.sync-scroll-panel {
    flex: 1;
    border: 1px solid #ddd;
    border-radius: 5px;
    overflow: hidden;
    display: flex;
    flex-direction: column;
}
.sync-scroll-header {
    background-color: #f8f9fa;
    padding: 10px;
    border-bottom: 1px solid #ddd;
    font-weight: bold;
}
.sync-scroll-content {
    flex: 1;
    overflow-y: auto;
    padding: 10px;
    background-color: #fff;
    scroll-behavior: smooth;
}
/* Custom scrollbar styling */
.sync-scroll-content::-webkit-scrollbar {
    width: 8px;
}
.sync-scroll-content::-webkit-scrollbar-track {
    background: #f1f1f1;
}
.sync-scroll-content::-webkit-scrollbar-thumb {
    background: #888;
    border-radius: 4px;
}
.sync-scroll-content::-webkit-scrollbar-thumb:hover {
    background: #555;
}
</style>
<script>
// Improved synchronized scrolling implementation with better debugging
console.log('Starting sync scroll setup...');

function setupSyncScroll() {
    console.log('setupSyncScroll called');
    // Wait for elements to be available
    setTimeout(function() {
        console.log('Looking for scroll elements...');
        const originalContent = document.getElementById('original-content');
        const redactedContent = document.getElementById('redacted-content');
        console.log('Original content element:', originalContent);
        console.log('Redacted content element:', redactedContent);
        if (originalContent && redactedContent) {
            console.log('Both elements found, setting up sync...');
            let isScrolling = false;
            let scrollTimeout;

            function syncScroll(source, target) {
                if (!isScrolling) {
                    isScrolling = true;
                    console.log('Syncing scroll from', source.id, 'to', target.id, 'scrollTop:', source.scrollTop);
                    target.scrollTop = source.scrollTop;
                    // Clear existing timeout
                    if (scrollTimeout) {
                        clearTimeout(scrollTimeout);
                    }
                    // Reset flag after a short delay
                    scrollTimeout = setTimeout(() => {
                        isScrolling = false;
                        console.log('Scroll sync completed');
                    }, 100);
                }
            }

            // Remove existing listeners to prevent duplicates
            if (originalContent._syncScrollHandler) {
                originalContent.removeEventListener('scroll', originalContent._syncScrollHandler);
            }
            if (redactedContent._syncScrollHandler) {
                redactedContent.removeEventListener('scroll', redactedContent._syncScrollHandler);
            }
            // Create new handlers
            originalContent._syncScrollHandler = function(e) {
                console.log('Original content scrolled:', e.target.scrollTop);
                syncScroll(originalContent, redactedContent);
            };
            redactedContent._syncScrollHandler = function(e) {
                console.log('Redacted content scrolled:', e.target.scrollTop);
                syncScroll(redactedContent, originalContent);
            };
            // Add event listeners
            originalContent.addEventListener('scroll', originalContent._syncScrollHandler, { passive: true });
            redactedContent.addEventListener('scroll', redactedContent._syncScrollHandler, { passive: true });
            console.log('Event listeners added successfully');
            // Show status indicator
            const statusElement = document.getElementById('sync-status');
            if (statusElement) {
                statusElement.style.display = 'block';
                console.log('Status indicator shown');
            }
            // Test the synchronization
            setTimeout(() => {
                console.log('Testing scroll sync...');
                console.log('Original scrollTop:', originalContent.scrollTop);
                console.log('Redacted scrollTop:', redactedContent.scrollTop);
                // Try a small scroll to test
                originalContent.scrollTop = 10;
                setTimeout(() => {
                    console.log('After test scroll - Original:', originalContent.scrollTop, 'Redacted:', redactedContent.scrollTop);
                }, 50);
            }, 200);
        } else {
            console.log('Elements not found, will retry...');
            // Retry after a short fixed delay
            setTimeout(setupSyncScroll, 300);
        }
    }, 200);
}

// Multiple initialization strategies
function initializeSyncScroll() {
    console.log('Initializing sync scroll...');
    // Strategy 1: Immediate setup
    setupSyncScroll();
    // Strategy 2: Setup after DOM ready
    if (document.readyState === 'loading') {
        document.addEventListener('DOMContentLoaded', function() {
            console.log('DOM loaded, setting up sync scroll...');
            setupSyncScroll();
        });
    }
    // Strategy 3: Setup after window load
    window.addEventListener('load', function() {
        console.log('Window loaded, setting up sync scroll...');
        setupSyncScroll();
    });
    // Strategy 4: Periodic retry for the first 10 seconds (20 attempts x 500 ms)
    let attempts = 0;
    const maxAttempts = 20;
    const retryInterval = setInterval(function() {
        attempts++;
        console.log('Retry attempt', attempts);
        const originalContent = document.getElementById('original-content');
        const redactedContent = document.getElementById('redacted-content');
        if (originalContent && redactedContent) {
            console.log('Elements found on retry, setting up...');
            setupSyncScroll();
            clearInterval(retryInterval);
        } else if (attempts >= maxAttempts) {
            console.log('Max retry attempts reached, giving up');
            clearInterval(retryInterval);
        }
    }, 500);
}

// Start initialization
initializeSyncScroll();

// Listen for Streamlit-specific events
if (window.parent && window.parent.postMessage) {
    console.log('Streamlit environment detected');
    // Listen for any messages that might indicate a rerun
    window.addEventListener('message', function(event) {
        console.log('Received message:', event.data);
        if (event.data && (event.data.type === 'streamlit:rerun' || event.data.type === 'streamlit:setComponentValue')) {
            console.log('Streamlit rerun detected, reinitializing sync scroll...');
            setTimeout(setupSyncScroll, 1000);
        }
    });
}
console.log('Sync scroll script loaded');
</script>
""", unsafe_allow_html=True)
# Configure root logger only once (avoid duplicate handlers on reruns)
if len(logging.getLogger().handlers) == 0:
    logging.getLogger().setLevel(logging.INFO)
    # (We will attach custom handlers during processing as needed)
# Title and description
st.title("Medical Document Parser & Redactor")
st.write("""
Upload PDF medical documents to parse their content using **Docling** (structure-aware parser)
and automatically **redact specific sections** (e.g., initial and final medication lists).
Use the buttons below to view the original structure or process with redaction.

**💡 Tip:** This is a Hugging Face Space with limited storage. Use the "Clear All Data" button to remove temporary files when you're done processing documents.
""")

# Add a clear-all-data button at the top
if st.button("🧹 Clear All Data", type="secondary", help="Remove all temporary files and reset the application"):
    if clear_all_data():
        st.success("✅ All data cleared successfully! The application has been reset.")
        st.rerun()
    else:
        st.error("❌ Error clearing data. Please try again.")
# File uploader (accept multiple PDF files)
uploaded_files = st.file_uploader("Upload PDF medical documents", type=["pdf"], accept_multiple_files=True)

# Clean up temp files on app start (but keep the directory)
if "temp_cleaned" not in st.session_state:
    cleanup_temp_files()
    st.session_state.temp_cleaned = True

# Initialize session state storage for results and logs
if "processed_results" not in st.session_state:
    st.session_state.processed_results = {}  # {filename: {"structured_json": ..., "redacted_md": ..., "redacted_json": ...}}
if "logs" not in st.session_state:
    st.session_state.logs = {}  # {filename: log_text}
if "original_structures" not in st.session_state:
    st.session_state.original_structures = {}  # {filename: structured_json}
# Show temp directory status and cleanup button
temp_file_count, total_size = get_temp_files_info()

# Automatic cleanup: if temp files are too old or too large, clean them up
if "last_cleanup_time" not in st.session_state:
    st.session_state.last_cleanup_time = time.time()

# Check whether we should run automatic cleanup (every 30 minutes, or sooner if files grow too large)
current_time = time.time()
time_since_cleanup = current_time - st.session_state.last_cleanup_time
if (time_since_cleanup > 1800 or  # 30 minutes
        total_size > 100 * 1024 * 1024):  # 100 MB
    if temp_file_count > 0:
        cleanup_temp_files()
        st.session_state.last_cleanup_time = current_time
        st.info("🧹 Automatic cleanup: Removed old temporary files")
        # Recalculate after cleanup
        temp_file_count, total_size = get_temp_files_info()
# Create a row with temp file status and delete button
col1, col2 = st.columns([3, 1])
with col1:
    if temp_file_count > 0:
        st.caption(f"📁 {temp_file_count} temporary file(s) - Total size: {format_file_size(total_size)}")
        # Show warning if total size is large
        if total_size > 50 * 1024 * 1024:  # 50 MB
            st.warning("⚠️ Large temporary files detected. Consider clearing data to free up space.")
    else:
        st.caption("📁 No temporary files")
with col2:
    if temp_file_count > 0:
        if st.button("🗑️ Delete Temp Files", type="secondary", help="Remove all temporary files from the server"):
            try:
                cleanup_temp_files()
                st.success(f"✅ Successfully deleted {temp_file_count} temporary file(s)")
                st.rerun()  # Refresh the page to update the file count
            except Exception as e:
                st.error(f"❌ Error deleting temporary files: {e}")
    else:
        st.caption("No files to delete")

def create_diff_content(original_text: str, redacted_text: str, view_type: str) -> str:
    """Create HTML content for diff view with highlighting."""
    # Normalize the text to reduce cosmetic formatting differences
    def normalize_text(text):
        lines = text.split('\n')
        normalized_lines = []
        for line in lines:
            # Strip whitespace but preserve content
            stripped = line.strip()
            if stripped:
                # Collapse level-2 headers to level 1 so heading-depth
                # differences alone do not show up as diffs
                if re.match(r'^##\s+', stripped):
                    stripped = re.sub(r'^##\s+', '# ', stripped)
                # Normalize quote formatting
                if stripped.startswith('+ > '):
                    stripped = stripped.replace('+ > ', '> ')
                normalized_lines.append(stripped)
        return normalized_lines

    original_lines = normalize_text(original_text)
    redacted_lines = normalize_text(redacted_text)
    # Use difflib to get a more sophisticated diff
    differ = difflib.Differ()
    diff = list(differ.compare(original_lines, redacted_lines))
    html_lines = []
    if view_type == 'original':
        # Show original with removed content highlighted
        for line in diff:
            if line.startswith('  '):  # Unchanged line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
            elif line.startswith('- '):  # Removed line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-weight: bold;">- {escaped_line}</div>')
            elif line.startswith('+ '):  # Added line (shown as a placeholder in the original view)
                html_lines.append('<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-style: italic; opacity: 0.7;">+ (added in redacted version)</div>')
            elif line.startswith('? '):  # Ignore difflib hint lines
                continue
    elif view_type == 'redacted':
        # Show redacted content with added content highlighted
        for line in diff:
            if line.startswith('  '):  # Unchanged line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
            elif line.startswith('- '):  # Removed line (shown as a placeholder in the redacted view)
                html_lines.append('<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-style: italic; opacity: 0.7;">- (removed from original)</div>')
            elif line.startswith('+ '):  # Added line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-weight: bold;">+ {escaped_line}</div>')
            elif line.startswith('? '):  # Ignore difflib hint lines
                continue
    return '\n'.join(html_lines)
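# Behaviour sketch (hypothetical inputs): for original="a\nb\nc" and
# redacted="a\nc", the 'original' view renders "b" as a red removed row, while
# the 'redacted' view renders a "(removed from original)" placeholder at the
# same position. Every non-'?' diff line yields exactly one row in each view,
# which keeps the two panels line-aligned for the synchronized scrolling below.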
if uploaded_files:
    # UI to select which file to work with (if multiple files uploaded)
    file_names = [f.name for f in uploaded_files]
    selected_file = st.selectbox("Select a file to work with", options=file_names)
    if selected_file:
        # Find the selected uploaded file
        uploaded_file = next(f for f in uploaded_files if f.name == selected_file)
        # Create buttons for different actions
        col1, col2, col3 = st.columns(3)
        with col1:
            if st.button("📄 Show Original", type="primary"):
                # Process the document to get the original structure (without redaction)
                if selected_file not in st.session_state.original_structures:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor without section extraction (for original structure)
                    processor = DocumentProcessor(section_extractor=None)
                    # Process the document to get the original structure
                    result = processor.process(temp_path)
                    st.session_state.original_structures[selected_file] = result.structured_json
                    # Also store the original markdown for comparison
                    st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
                # Display the original structure
                st.session_state.show_original = True
                st.session_state.show_processed = False
        with col2:
            if st.button("🔒 Process with Redaction"):
                # Process the document with redaction
                if selected_file not in st.session_state.processed_results:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    # Create a DocumentProcessor with a SectionExtractor for our target sections
                    section_extractor = ReasoningSectionExtractor(
                        endpoint=AZURE_OPENAI_ENDPOINT,
                        api_key=AZURE_OPENAI_KEY,
                        api_version=AZURE_OPENAI_VERSION,
                        deployment=AZURE_OPENAI_DEPLOYMENT,
                    )
                    processor = DocumentProcessor(section_extractor=section_extractor)
                    # Attach an in-memory log handler to capture logs for this file
                    log_handler, log_buffer = get_log_handler()
                    root_logger = logging.getLogger()
                    root_logger.addHandler(log_handler)
                    try:
                        # Process the document (Docling parse + section redaction)
                        result = processor.process(temp_path)
                    finally:
                        # Remove handler and stop capturing logs
                        root_logger.removeHandler(log_handler)
                    # Save results in session state
                    st.session_state.processed_results[selected_file] = {
                        "structured_json": result.structured_json,
                        "redacted_md": result.redacted_markdown,
                        "redacted_json": result.redacted_json,
                    }
                    # Combine log records into a single text
                    log_text = "\n".join(log_buffer)
                    st.session_state.logs[selected_file] = log_text
                st.session_state.show_original = False
                st.session_state.show_processed = True
        with col3:
            if st.button("🔄 Switch View"):
                # Toggle between views
                if st.session_state.get("show_original", False):
                    st.session_state.show_original = False
                    st.session_state.show_processed = True
                else:
                    st.session_state.show_original = True
                    st.session_state.show_processed = False

        # Show current view status
        if st.session_state.get("show_original", False):
            st.info("📄 Currently viewing: **Original Document Structure**")
        elif st.session_state.get("show_processed", False):
            st.success("🔒 Currently viewing: **Processed Document with Redaction**")
        else:
            st.info("ℹ️ Select an action above to view document content")
        # Display results based on button clicked
        if st.session_state.get("show_original", False):
            st.markdown("---")
            st.subheader(f"Original Document Structure - {selected_file}")
            # Get the original structure
            original_json = st.session_state.original_structures[selected_file]
            original_markdown = st.session_state.original_structures.get(f"{selected_file}_markdown", "")
            # Display PDF viewer and original markdown side by side
            col1, col2 = st.columns([1, 1])
            with col1:
                st.subheader("📄 Original PDF")
                # Reset file pointer to beginning
                uploaded_file.seek(0)
                # Display PDF using base64 encoding for inline display
                import base64
                pdf_bytes = uploaded_file.getvalue()
                b64_pdf = base64.b64encode(pdf_bytes).decode()
                pdf_display = f'<iframe src="data:application/pdf;base64,{b64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
                st.markdown(pdf_display, unsafe_allow_html=True)
            with col2:
                st.subheader("📝 Original Document (Markdown)")
                st.caption("Docling-generated markdown from the PDF")
                # Use a text area for better readability and scrolling
                st.text_area(
                    label="Original markdown content",
                    value=original_markdown,
                    height=600,
                    key="original_markdown_display",
                    label_visibility="collapsed",
                )
            # Add a download button for the original markdown
            st.markdown("---")
            col1, col2 = st.columns(2)
            with col1:
                st.download_button(
                    label="📥 Download Original Markdown",
                    data=original_markdown,
                    file_name=f"{selected_file}_original.md",
                    mime="text/markdown",
                )
            with col2:
                st.subheader("📋 JSON Structure")
                st.json(original_json)
elif st.session_state.get("show_processed", False): | |
st.markdown("---") | |
st.subheader(f"Processed Document - {selected_file}") | |
# Retrieve stored results | |
data = st.session_state.processed_results[selected_file] | |
structured_json = data["structured_json"] | |
redacted_md = data["redacted_md"] | |
redacted_json = data["redacted_json"] | |
# Get the original markdown from the structured JSON | |
# We need to reconstruct the original markdown from the structured JSON | |
# For now, we'll use the structured_markdown from the DocumentResult | |
# But we need to store this in the session state | |
# Create a DocumentProcessor to get the original markdown | |
if "original_markdown" not in st.session_state.processed_results[selected_file]: | |
# Save uploaded file to a temporary location | |
temp_path = save_uploaded_file(uploaded_file, selected_file) | |
# Create a DocumentProcessor without section extraction to get original markdown | |
processor = DocumentProcessor(section_extractor=None) | |
result = processor.process(temp_path) | |
# Store the original markdown | |
st.session_state.processed_results[selected_file]["original_markdown"] = result.structured_markdown | |
original_md = st.session_state.processed_results[selected_file]["original_markdown"] | |
# Show processing summary | |
original_texts = structured_json.get("texts", []) | |
redacted_texts = redacted_json.get("texts", []) | |
removed_count = len(original_texts) - len(redacted_texts) | |
if removed_count > 0: | |
st.success(f"β Successfully removed {removed_count} text elements containing medication information") | |
else: | |
st.info("βΉοΈ No medication sections were identified for removal") | |
            # Create tabs for different views
            tab1, tab2, tab3 = st.tabs(["🔍 Side-by-Side Comparison", "📋 JSON Structure", "📝 Processing Details"])
            with tab1:
                st.subheader("Original vs Redacted Content")
                st.caption("Compare the original document content with the redacted version")
                # Add status indicator
                st.markdown("""
                <div id="sync-status" style="padding: 8px; background-color: #e8f5e8; border: 1px solid #4caf50; border-radius: 4px; margin-bottom: 10px; display: none;">
                    ✅ <strong>Synchronized scrolling is active</strong> - Scroll either panel to sync both views
                </div>
                """, unsafe_allow_html=True)
                # Create a diff-like interface with synchronized scrolling and highlighting
                diff_html = f"""
                <div class="sync-scroll-container">
                    <div class="sync-scroll-panel">
                        <div class="sync-scroll-header">
                            📄 Original Document
                        </div>
                        <div id="original-content" class="sync-scroll-content">
                            {create_diff_content(original_md, redacted_md, 'original')}
                        </div>
                    </div>
                    <div class="sync-scroll-panel">
                        <div class="sync-scroll-header">
                            🔒 Redacted Document
                        </div>
                        <div id="redacted-content" class="sync-scroll-content">
                            {create_diff_content(original_md, redacted_md, 'redacted')}
                        </div>
                    </div>
                </div>
                """
                st.markdown(diff_html, unsafe_allow_html=True)
                # Add a hidden component to retrigger the JavaScript setup after Streamlit reruns
                st.markdown("""
                <script>
                // Trigger setup after Streamlit rerun
                if (window.parent && window.parent.postMessage) {
                    // Wait for Streamlit to finish rendering
                    setTimeout(function() {
                        setupSyncScroll();
                    }, 500);
                }
                </script>
                """, unsafe_allow_html=True)
                # Add legend for the diff highlighting
                st.markdown("---")
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**🎨 Diff Legend:**")
                    st.markdown("🔴 **Red background** = Removed content")
                    st.markdown("💚 **Green background** = Added content")
                    st.markdown("⚪ **White background** = Unchanged content")
                with col2:
                    st.markdown("**💡 Tips:**")
                    st.markdown("Look for red-highlighted sections")
                    st.markdown("These show what was redacted")
                    st.markdown("Use scroll to navigate long documents")
            with tab2:
                st.subheader("Document Structure Analysis")
                # Show JSON structure comparison
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**📄 Original Structure (JSON)**")
                    st.json(structured_json)
                with col2:
                    st.markdown("**🔒 Redacted Structure (JSON)**")
                    st.json(redacted_json)
            with tab3:
                st.subheader("Processing Details")
                # Show what was removed
                if removed_count > 0:
                    st.info(f"**Removed {removed_count} text elements from the document structure.**")
                    # Show the removed text elements. Note this is an
                    # index-based heuristic: once the two lists diverge, every
                    # later element also compares unequal, so it approximates
                    # rather than exactly reconstructs the removed set.
                    st.subheader("Removed Text Elements:")
                    removed_texts = []
                    for i, text_elem in enumerate(original_texts):
                        if i >= len(redacted_texts) or text_elem.get("text", "") != redacted_texts[i].get("text", ""):
                            text = text_elem.get("text", "")
                            preview = text[:100] + "..." if len(text) > 100 else text
                            removed_texts.append((i, preview))
                    for idx, text in removed_texts:
                        st.text(f"Text {idx}: {text}")
                else:
                    st.info("No text elements were removed during processing.")
                # Show processing logs
                st.subheader("Processing Logs")
                st.text_area(
                    label="Processing logs",
                    value=st.session_state.logs.get(selected_file, ""),
                    height=300,
                    label_visibility="collapsed",
                )