Spaces:

levalencia
/

docling

Sleeping

File size: 39,433 Bytes

# Set environment variables IMMEDIATELY to prevent root filesystem access
# This must happen before any other imports or operations

import os
import tempfile

# Get a writable temp directory first
try:
    TEMP_DIR = os.path.join(tempfile.gettempdir(), "docling_temp")
    os.makedirs(TEMP_DIR, exist_ok=True)
except Exception:
    try:
        TEMP_DIR = "/tmp/docling_temp"
        os.makedirs(TEMP_DIR, exist_ok=True)
    except Exception:
        TEMP_DIR = os.getcwd()

# Set all environment variables that libraries might use
os.environ.update({
    # Streamlit configuration
    'STREAMLIT_SERVER_FILE_WATCHER_TYPE': 'none',
    'STREAMLIT_SERVER_HEADLESS': 'true',
    'STREAMLIT_BROWSER_GATHER_USAGE_STATS': 'false',
    'STREAMLIT_SERVER_ENABLE_CORS': 'false',
    'STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION': 'false',
    
    # EasyOCR configuration
    'EASYOCR_MODULE_PATH': os.path.join(TEMP_DIR, 'easyocr_models'),
    'HOME': TEMP_DIR,
    'USERPROFILE': TEMP_DIR,
    'XDG_CACHE_HOME': os.path.join(TEMP_DIR, 'cache'),
    'XDG_CONFIG_HOME': os.path.join(TEMP_DIR, 'config'),
    'XDG_DATA_HOME': os.path.join(TEMP_DIR, 'data'),
    
    # Hugging Face Hub configuration - CRITICAL for preventing /.cache access
    'HF_HOME': os.path.join(TEMP_DIR, 'huggingface'),
    'HF_CACHE_HOME': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'HF_HUB_CACHE': os.path.join(TEMP_DIR, 'huggingface_cache'),
    'TRANSFORMERS_CACHE': os.path.join(TEMP_DIR, 'transformers_cache'),
    'HF_DATASETS_CACHE': os.path.join(TEMP_DIR, 'datasets_cache'),
    'DIFFUSERS_CACHE': os.path.join(TEMP_DIR, 'diffusers_cache'),
    'ACCELERATE_CACHE': os.path.join(TEMP_DIR, 'accelerate_cache'),
    
    # Additional Hugging Face specific variables
    'HF_HUB_DISABLE_TELEMETRY': '1',
    'HF_HUB_DISABLE_IMPLICIT_TOKEN': '1',
    'HF_HUB_OFFLINE': '0',
    
    # Other ML libraries
    'TORCH_HOME': os.path.join(TEMP_DIR, 'torch'),
    'TENSORFLOW_HOME': os.path.join(TEMP_DIR, 'tensorflow'),
    'KERAS_HOME': os.path.join(TEMP_DIR, 'keras'),
    'MLFLOW_TRACKING_URI': f'file:{os.path.join(TEMP_DIR, "mlruns")}',
    
    # Additional cache directories
    'CACHE_DIR': os.path.join(TEMP_DIR, 'cache'),
    'MODEL_CACHE_DIR': os.path.join(TEMP_DIR, 'models'),
    
    # Additional environment variables to prevent root access
    'PYTHONPATH': TEMP_DIR,
    'TMPDIR': TEMP_DIR,
    'TEMP': TEMP_DIR,
    'TMP': TEMP_DIR,
    'CACHE': os.path.join(TEMP_DIR, 'cache'),
    'MODELS': os.path.join(TEMP_DIR, 'models'),
    'DATA': os.path.join(TEMP_DIR, 'data'),
    'CONFIG': os.path.join(TEMP_DIR, 'config'),
})

# Create all necessary directories
directories_to_create = [
    os.environ['EASYOCR_MODULE_PATH'],
    os.environ['XDG_CACHE_HOME'],
    os.environ['XDG_CONFIG_HOME'],
    os.environ['XDG_DATA_HOME'],
    os.environ['HF_HOME'],
    os.environ['HF_CACHE_HOME'],
    os.environ['TRANSFORMERS_CACHE'],
    os.environ['HF_DATASETS_CACHE'],
    os.environ['TORCH_HOME'],
    os.environ['TENSORFLOW_HOME'],
    os.environ['KERAS_HOME'],
    os.environ['CACHE_DIR'],
    os.environ['MODEL_CACHE_DIR'],
    os.environ['CACHE'],
    os.environ['MODELS'],
    os.environ['DATA'],
    os.environ['CONFIG'],
    os.environ['HF_HUB_CACHE'],
    os.environ['DIFFUSERS_CACHE'],
    os.environ['ACCELERATE_CACHE'],
]

# Monkey patch os.makedirs to prevent root directory access
original_makedirs = os.makedirs

def safe_makedirs(name, mode=0o777, exist_ok=False):
    """Safe version of makedirs that prevents root directory access."""
    # Check if trying to create directory in root filesystem
    if name.startswith('/') and not name.startswith('/tmp') and not name.startswith('/app'):
        # Redirect to temp directory
        basename = os.path.basename(name)
        safe_name = os.path.join(TEMP_DIR, basename)
        print(f"Redirecting root directory creation from {name} to {safe_name}")
        return original_makedirs(safe_name, mode, exist_ok)
    return original_makedirs(name, mode, exist_ok)

# Apply the monkey patch
os.makedirs = safe_makedirs

for directory in directories_to_create:
    try:
        os.makedirs(directory, exist_ok=True)
    except Exception as e:
        print(f"Warning: Could not create directory {directory}: {e}")

# Now import the rest of the modules
import streamlit as st
import logging
import shutil
from processing.document_processor import DocumentProcessor
from processing.sections import ReasoningSectionExtractor
from utils.logging_utils import get_log_handler
from dotenv import load_dotenv
import sys
import html
import difflib
import re
import time

# Configure logging early to avoid issues
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    stream=sys.stdout,
    force=True
)

# Load environment variables from .env
load_dotenv()

AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")

# Log startup information
logging.info("=" * 50)
logging.info("Docling Streamlit App Starting")
logging.info(f"Temp directory: {TEMP_DIR}")
logging.info(f"EasyOCR model directory: {os.environ.get('EASYOCR_MODULE_PATH', 'NOT_SET')}")
logging.info(f"Hugging Face cache: {os.environ.get('HF_CACHE_HOME', 'NOT_SET')}")
logging.info(f"Current working directory: {os.getcwd()}")
logging.info(f"Python version: {sys.version}")
logging.info("=" * 50)

def cleanup_temp_files():
    """Clean up temporary files in the temp directory."""
    try:
        if os.path.exists(TEMP_DIR):
            for filename in os.listdir(TEMP_DIR):
                file_path = os.path.join(TEMP_DIR, filename)
                if os.path.isfile(file_path):
                    try:
                        os.remove(file_path)
                        logging.info(f"Removed temp file: {filename}")
                    except PermissionError as e:
                        logging.warning(f"Permission error removing {filename}: {e}")
                    except Exception as e:
                        logging.warning(f"Error removing {filename}: {e}")
            logging.info(f"Cleaned up temporary files in {TEMP_DIR}")
        else:
            logging.info(f"Temp directory {TEMP_DIR} does not exist")
    except PermissionError as e:
        logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
    except Exception as e:
        logging.warning(f"Error cleaning up temp files: {e}")

def clear_all_data():
    """Clear all temporary files and session state data."""
    try:
        # Clean up temp files
        cleanup_temp_files()
        
        # Clear session state
        if "processed_results" in st.session_state:
            del st.session_state.processed_results
        if "logs" in st.session_state:
            del st.session_state.logs
        if "original_structures" in st.session_state:
            del st.session_state.original_structures
        if "show_original" in st.session_state:
            del st.session_state.show_original
        if "show_processed" in st.session_state:
            del st.session_state.show_processed
        if "temp_cleaned" in st.session_state:
            del st.session_state.temp_cleaned
        if "last_cleanup_time" in st.session_state:
            del st.session_state.last_cleanup_time
            
        logging.info("Cleared all session state and temporary files")
        return True
    except Exception as e:
        logging.error(f"Error clearing all data: {e}")
        return False

def get_temp_files_info():
    """Get information about temporary files (count and total size)."""
    try:
        if not os.path.exists(TEMP_DIR):
            return 0, 0
        
        files = os.listdir(TEMP_DIR)
        total_size = 0
        
        for filename in files:
            try:
                file_path = os.path.join(TEMP_DIR, filename)
                if os.path.isfile(file_path):
                    total_size += os.path.getsize(file_path)
            except (PermissionError, OSError) as e:
                logging.warning(f"Error accessing file {filename}: {e}")
                continue
        
        return len(files), total_size
    except PermissionError as e:
        logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
        return 0, 0
    except Exception as e:
        logging.warning(f"Error getting temp files info: {e}")
        return 0, 0

def format_file_size(size_bytes):
    """Format file size in human readable format."""
    if size_bytes == 0:
        return "0 B"
    
    size_names = ["B", "KB", "MB", "GB"]
    i = 0
    while size_bytes >= 1024 and i < len(size_names) - 1:
        size_bytes /= 1024.0
        i += 1
    
    return f"{size_bytes:.1f} {size_names[i]}"

def save_uploaded_file(uploaded_file, filename):
    """Save uploaded file to temp directory and return the path."""
    temp_path = os.path.join(TEMP_DIR, f"temp_{filename}")
    try:
        uploaded_file.seek(0)  # Reset file pointer to beginning
        file_bytes = uploaded_file.read()
        with open(temp_path, "wb") as f:
            f.write(file_bytes)
        logging.info(f"Saved uploaded file to {temp_path}")
        return temp_path
    except PermissionError as e:
        logging.error(f"Permission error saving uploaded file to {temp_path}: {e}")
        raise PermissionError(f"Cannot save file due to permission restrictions. Please try clearing data or contact support.")
    except Exception as e:
        logging.error(f"Error saving uploaded file: {e}")
        raise

# Configure page layout to use wide mode
st.set_page_config(
    page_title="Medical Document Parser & Redactor",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Add custom CSS for better styling
st.markdown("""
<style>
/* Custom styling for text areas */
.stTextArea textarea {
    font-family: 'Courier New', monospace !important;
    font-size: 12px !important;
    line-height: 1.4 !important;
    border: 2px solid #e0e0e0 !important;
    border-radius: 8px !important;
}

/* Hover effect for text areas */
.stTextArea textarea:hover {
    border-color: #1f77b4 !important;
}

/* Custom styling for download buttons */
.stDownloadButton > button {
    border-radius: 8px !important;
    font-weight: 600 !important;
}

/* Custom styling for the comparison section */
.comparison-container {
    background-color: #f8f9fa;
    padding: 20px;
    border-radius: 10px;
    border: 1px solid #e9ecef;
}

/* Synchronized scrolling styles */
.sync-scroll-container {
    display: flex;
    gap: 20px;
    height: 600px;
    font-family: 'Courier New', monospace;
    font-size: 12px;
}

.sync-scroll-panel {
    flex: 1;
    border: 1px solid #ddd;
    border-radius: 5px;
    overflow: hidden;
    display: flex;
    flex-direction: column;
}

.sync-scroll-header {
    background-color: #f8f9fa;
    padding: 10px;
    border-bottom: 1px solid #ddd;
    font-weight: bold;
}

.sync-scroll-content {
    flex: 1;
    overflow-y: auto;
    padding: 10px;
    background-color: #fff;
    scroll-behavior: smooth;
    transition: scroll-top 0.1s ease-out;
}

/* Prevent scroll chaining */
.sync-scroll-content::-webkit-scrollbar {
    width: 8px;
}

.sync-scroll-content::-webkit-scrollbar-track {
    background: #f1f1f1;
}

.sync-scroll-content::-webkit-scrollbar-thumb {
    background: #888;
    border-radius: 4px;
}

.sync-scroll-content::-webkit-scrollbar-thumb:hover {
    background: #555;
}
</style>

<script>
// Improved synchronized scrolling implementation with better debugging
console.log('Starting sync scroll setup...');

function setupSyncScroll() {
    console.log('setupSyncScroll called');
    
    // Wait for elements to be available
    setTimeout(function() {
        console.log('Looking for scroll elements...');
        const originalContent = document.getElementById('original-content');
        const redactedContent = document.getElementById('redacted-content');
        
        console.log('Original content element:', originalContent);
        console.log('Redacted content element:', redactedContent);
        
        if (originalContent && redactedContent) {
            console.log('Both elements found, setting up sync...');
            
            let isScrolling = false;
            let scrollTimeout;
            
            function syncScroll(source, target) {
                if (!isScrolling) {
                    isScrolling = true;
                    console.log('Syncing scroll from', source.id, 'to', target.id, 'scrollTop:', source.scrollTop);
                    target.scrollTop = source.scrollTop;
                    
                    // Clear existing timeout
                    if (scrollTimeout) {
                        clearTimeout(scrollTimeout);
                    }
                    
                    // Reset flag after a short delay
                    scrollTimeout = setTimeout(() => { 
                        isScrolling = false; 
                        console.log('Scroll sync completed');
                    }, 100);
                }
            }
            
            // Remove existing listeners to prevent duplicates
            if (originalContent._syncScrollHandler) {
                originalContent.removeEventListener('scroll', originalContent._syncScrollHandler);
            }
            if (redactedContent._syncScrollHandler) {
                redactedContent.removeEventListener('scroll', redactedContent._syncScrollHandler);
            }
            
            // Create new handlers
            originalContent._syncScrollHandler = function(e) {
                console.log('Original content scrolled:', e.target.scrollTop);
                syncScroll(originalContent, redactedContent);
            };
            
            redactedContent._syncScrollHandler = function(e) {
                console.log('Redacted content scrolled:', e.target.scrollTop);
                syncScroll(redactedContent, originalContent);
            };
            
            // Add event listeners
            originalContent.addEventListener('scroll', originalContent._syncScrollHandler, { passive: true });
            redactedContent.addEventListener('scroll', redactedContent._syncScrollHandler, { passive: true });
            
            console.log('Event listeners added successfully');
            
            // Show status indicator
            const statusElement = document.getElementById('sync-status');
            if (statusElement) {
                statusElement.style.display = 'block';
                console.log('Status indicator shown');
            }
            
            // Test the synchronization
            setTimeout(() => {
                console.log('Testing scroll sync...');
                console.log('Original scrollTop:', originalContent.scrollTop);
                console.log('Redacted scrollTop:', redactedContent.scrollTop);
                
                // Try a small scroll to test
                originalContent.scrollTop = 10;
                setTimeout(() => {
                    console.log('After test scroll - Original:', originalContent.scrollTop, 'Redacted:', redactedContent.scrollTop);
                }, 50);
            }, 200);
            
        } else {
            console.log('Elements not found, will retry...');
            // Retry with exponential backoff
            setTimeout(setupSyncScroll, 300);
        }
    }, 200);
}

// Multiple initialization strategies
function initializeSyncScroll() {
    console.log('Initializing sync scroll...');
    
    // Strategy 1: Immediate setup
    setupSyncScroll();
    
    // Strategy 2: Setup after DOM ready
    if (document.readyState === 'loading') {
        document.addEventListener('DOMContentLoaded', function() {
            console.log('DOM loaded, setting up sync scroll...');
            setupSyncScroll();
        });
    }
    
    // Strategy 3: Setup after window load
    window.addEventListener('load', function() {
        console.log('Window loaded, setting up sync scroll...');
        setupSyncScroll();
    });
    
    // Strategy 4: Periodic retry for first 10 seconds
    let attempts = 0;
    const maxAttempts = 20;
    const retryInterval = setInterval(function() {
        attempts++;
        console.log('Retry attempt', attempts);
        
        const originalContent = document.getElementById('original-content');
        const redactedContent = document.getElementById('redacted-content');
        
        if (originalContent && redactedContent) {
            console.log('Elements found on retry, setting up...');
            setupSyncScroll();
            clearInterval(retryInterval);
        } else if (attempts >= maxAttempts) {
            console.log('Max retry attempts reached, giving up');
            clearInterval(retryInterval);
        }
    }, 500);
}

// Start initialization
initializeSyncScroll();

// Listen for Streamlit-specific events
if (window.parent && window.parent.postMessage) {
    console.log('Streamlit environment detected');
    
    // Listen for any messages that might indicate a rerun
    window.addEventListener('message', function(event) {
        console.log('Received message:', event.data);
        if (event.data && (event.data.type === 'streamlit:rerun' || event.data.type === 'streamlit:setComponentValue')) {
            console.log('Streamlit rerun detected, reinitializing sync scroll...');
            setTimeout(setupSyncScroll, 1000);
        }
    });
}

console.log('Sync scroll script loaded');
</script>
""", unsafe_allow_html=True)

# Configure root logger only once (avoid duplicate handlers on reruns)
if len(logging.getLogger().handlers) == 0:
    logging.getLogger().setLevel(logging.INFO)
    # (We will attach custom handlers during processing as needed)

# Title and description
st.title("Medical Document Parser & Redactor")
st.write("""
Upload PDF medical documents to parse their content using **Docling** (structure-aware parser) 
and automatically **redact specific sections** (e.g., initial and final medication lists). 
Use the buttons below to view the original structure or process with redaction.

**💡 Tip:** This is a Hugging Face Space with limited storage. Use the "Clear All Data" button to remove temporary files when you're done processing documents.
""")

# Add clear all data button at the top
if st.button("🧹 Clear All Data", type="secondary", help="Remove all temporary files and reset the application"):
    if clear_all_data():
        st.success("✅ All data cleared successfully! The application has been reset.")
        st.rerun()
    else:
        st.error("❌ Error clearing data. Please try again.")

# File uploader (accept multiple PDF files)
uploaded_files = st.file_uploader("Upload PDF medical documents", type=["pdf"], accept_multiple_files=True)

# Clean up temp files on app start (but keep the directory)
if "temp_cleaned" not in st.session_state:
    cleanup_temp_files()
    st.session_state.temp_cleaned = True

# Initialize session state storage for results and logs
if "processed_results" not in st.session_state:
    st.session_state.processed_results = {}  # {filename: {"structured_json": ..., "redacted_md": ..., "redacted_json": ...}}
if "logs" not in st.session_state:
    st.session_state.logs = {}  # {filename: log_text}
if "original_structures" not in st.session_state:
    st.session_state.original_structures = {}  # {filename: structured_json}

# Show temp directory status and cleanup button
temp_file_count, total_size = get_temp_files_info()

# Automatic cleanup: if temp files are too old or too large, clean them up
if "last_cleanup_time" not in st.session_state:
    st.session_state.last_cleanup_time = time.time()

# Check if we should do automatic cleanup (every 30 minutes or if files are too large)
current_time = time.time()
time_since_cleanup = current_time - st.session_state.last_cleanup_time

if (time_since_cleanup > 1800 or  # 30 minutes
    total_size > 100 * 1024 * 1024):  # 100MB
    if temp_file_count > 0:
        cleanup_temp_files()
        st.session_state.last_cleanup_time = current_time
        st.info("🧹 Automatic cleanup: Removed old temporary files")
        # Recalculate after cleanup
        temp_file_count, total_size = get_temp_files_info()

# Create a row with temp file status and delete button
col1, col2 = st.columns([3, 1])

with col1:
    if temp_file_count > 0:
        st.caption(f"📁 {temp_file_count} temporary file(s) - Total size: {format_file_size(total_size)}")
        
        # Show warning if total size is large
        if total_size > 50 * 1024 * 1024:  # 50MB
            st.warning("⚠️ Large temporary files detected. Consider clearing data to free up space.")
    else:
        st.caption("📁 No temporary files")

with col2:
    if temp_file_count > 0:
        if st.button("🗑️ Delete Temp Files", type="secondary", help="Remove all temporary files from the server"):
            try:
                cleanup_temp_files()
                st.success(f"✅ Successfully deleted {temp_file_count} temporary file(s)")
                st.rerun()  # Refresh the page to update the file count
            except Exception as e:
                st.error(f"❌ Error deleting temporary files: {e}")
    else:
        st.caption("No files to delete")

def create_diff_content(original_text: str, redacted_text: str, view_type: str) -> str:
    """Create HTML content for diff view with highlighting."""
    import difflib
    import re
    
    # Normalize the text to reduce formatting differences
    def normalize_text(text):
        # Remove extra whitespace and normalize line endings
        lines = text.split('\n')
        normalized_lines = []
        for line in lines:
            # Strip whitespace but preserve content
            stripped = line.strip()
            if stripped:
                # Normalize header formatting differences
                # Convert ## to # for level 1 headers
                if re.match(r'^##\s+', stripped):
                    stripped = re.sub(r'^##\s+', '# ', stripped)
                # Normalize quote formatting
                if stripped.startswith('&gt; '):
                    stripped = stripped.replace('&gt; ', '> ')
                elif stripped.startswith('+ > '):
                    stripped = stripped.replace('+ > ', '> ')
                
                normalized_lines.append(stripped)
        return normalized_lines
    
    original_lines = normalize_text(original_text)
    redacted_lines = normalize_text(redacted_text)
    
    # Use difflib to get a more sophisticated diff
    differ = difflib.Differ()
    diff = list(differ.compare(original_lines, redacted_lines))
    
    html_lines = []
    
    if view_type == 'original':
        # Show original with removed content highlighted
        for line in diff:
            if line.startswith('  '):  # Unchanged line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
            elif line.startswith('- '):  # Removed line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-weight: bold;">- {escaped_line}</div>')
            elif line.startswith('+ '):  # Added line (show as empty space in original view)
                html_lines.append(f'<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-style: italic; opacity: 0.7;">+ (added in redacted version)</div>')
            elif line.startswith('? '):  # Ignore difflib hints
                continue
    
    elif view_type == 'redacted':
        # Show redacted content with added content highlighted
        for line in diff:
            if line.startswith('  '):  # Unchanged line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
            elif line.startswith('- '):  # Removed line (show as empty space in redacted view)
                html_lines.append(f'<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-style: italic; opacity: 0.7;">- (removed from original)</div>')
            elif line.startswith('+ '):  # Added line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-weight: bold;">+ {escaped_line}</div>')
            elif line.startswith('? '):  # Ignore difflib hints
                continue
    
    return '\n'.join(html_lines)

if uploaded_files:
    # UI to select which file to work with (if multiple files uploaded)
    file_names = [f.name for f in uploaded_files]
    selected_file = st.selectbox("Select a file to work with", options=file_names)
    
    if selected_file:
        # Find the selected uploaded file
        uploaded_file = next(f for f in uploaded_files if f.name == selected_file)
        
        # Create buttons for different actions
        col1, col2, col3 = st.columns(3)
        
        with col1:
            if st.button("📄 Show Original", type="primary"):
                # Process the document to get original structure (without redaction)
                if selected_file not in st.session_state.original_structures:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    
                    # Create a DocumentProcessor without section extraction (for original structure)
                    processor = DocumentProcessor(section_extractor=None)
                    
                    # Process the document to get original structure
                    result = processor.process(temp_path)
                    st.session_state.original_structures[selected_file] = result.structured_json
                    # Also store the original markdown for comparison
                    st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
                
                # Display the original structure
                st.session_state.show_original = True
                st.session_state.show_processed = False
        
        with col2:
            if st.button("🔒 Process with Redaction"):
                # Process the document with redaction
                if selected_file not in st.session_state.processed_results:
                    # Save uploaded file to a temporary location
                    temp_path = save_uploaded_file(uploaded_file, selected_file)
                    
                    # Create a DocumentProcessor with a SectionExtractor for our target sections
                    section_extractor = ReasoningSectionExtractor(
                        endpoint=AZURE_OPENAI_ENDPOINT,
                        api_key=AZURE_OPENAI_KEY,
                        api_version=AZURE_OPENAI_VERSION,
                        deployment=AZURE_OPENAI_DEPLOYMENT,
                    )
                    processor = DocumentProcessor(section_extractor=section_extractor)
                    
                    # Attach an in-memory log handler to capture logs for this file
                    log_handler, log_buffer = get_log_handler()
                    root_logger = logging.getLogger()
                    root_logger.addHandler(log_handler)
                    try:
                        # Process the document (Docling parse + section redaction)
                        result = processor.process(temp_path)
                    finally:
                        # Remove handler and stop capturing logs
                        root_logger.removeHandler(log_handler)
                    
                    # Save results in session state
                    st.session_state.processed_results[selected_file] = {
                        "structured_json": result.structured_json,
                        "redacted_md": result.redacted_markdown,
                        "redacted_json": result.redacted_json
                    }
                    # Combine log records into a single text
                    log_text = "\n".join(log_buffer)
                    st.session_state.logs[selected_file] = log_text
                
                st.session_state.show_original = False
                st.session_state.show_processed = True
        
        with col3:
            if st.button("🔄 Switch View"):
                # Toggle between views
                if st.session_state.get("show_original", False):
                    st.session_state.show_original = False
                    st.session_state.show_processed = True
                else:
                    st.session_state.show_original = True
                    st.session_state.show_processed = False
        
        # Show current view status
        if st.session_state.get("show_original", False):
            st.info("📄 Currently viewing: **Original Document Structure**")
        elif st.session_state.get("show_processed", False):
            st.success("🔒 Currently viewing: **Processed Document with Redaction**")
        else:
            st.info("ℹ️ Select an action above to view document content")
        
        # Display results based on button clicked
        if st.session_state.get("show_original", False):
            st.markdown("---")
            st.subheader(f"Original Document Structure - {selected_file}")
            
            # Get the original structure
            original_json = st.session_state.original_structures[selected_file]
            original_markdown = st.session_state.original_structures.get(f"{selected_file}_markdown", "")
            
            # Display PDF viewer and original markdown side by side
            col1, col2 = st.columns([1, 1])
            
            with col1:
                st.subheader("📄 Original PDF")
                # Reset file pointer to beginning
                uploaded_file.seek(0)
                # Display PDF using base64 encoding for inline display
                import base64
                pdf_bytes = uploaded_file.getvalue()
                b64_pdf = base64.b64encode(pdf_bytes).decode()
                pdf_display = f'<iframe src="data:application/pdf;base64,{b64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
                st.markdown(pdf_display, unsafe_allow_html=True)
            
            with col2:
                st.subheader("📋 Original Document (Markdown)")
                st.caption("Docling-generated markdown from the PDF")
                # Use a text area for better readability and scrolling
                st.text_area(
                    label="Original markdown content",
                    value=original_markdown,
                    height=600,
                    key="original_markdown_display",
                    label_visibility="collapsed"
                )
            
            # Add a download button for the original markdown
            st.markdown("---")
            col1, col2 = st.columns(2)
            with col1:
                st.download_button(
                    label="📥 Download Original Markdown",
                    data=original_markdown,
                    file_name=f"{selected_file}_original.md",
                    mime="text/markdown"
                )
            with col2:
                st.subheader("📊 JSON Structure")
                st.json(original_json)
        
        elif st.session_state.get("show_processed", False):
            st.markdown("---")
            st.subheader(f"Processed Document - {selected_file}")
            
            # Retrieve stored results
            data = st.session_state.processed_results[selected_file]
            structured_json = data["structured_json"]
            redacted_md = data["redacted_md"]
            redacted_json = data["redacted_json"]
            
            # Get the original markdown from the structured JSON
            # We need to reconstruct the original markdown from the structured JSON
            # For now, we'll use the structured_markdown from the DocumentResult
            # But we need to store this in the session state
            
            # Create a DocumentProcessor to get the original markdown
            if "original_markdown" not in st.session_state.processed_results[selected_file]:
                # Save uploaded file to a temporary location
                temp_path = save_uploaded_file(uploaded_file, selected_file)
                
                # Create a DocumentProcessor without section extraction to get original markdown
                processor = DocumentProcessor(section_extractor=None)
                result = processor.process(temp_path)
                
                # Store the original markdown
                st.session_state.processed_results[selected_file]["original_markdown"] = result.structured_markdown
            
            original_md = st.session_state.processed_results[selected_file]["original_markdown"]
            
            # Show processing summary
            original_texts = structured_json.get("texts", [])
            redacted_texts = redacted_json.get("texts", [])
            removed_count = len(original_texts) - len(redacted_texts)
            
            if removed_count > 0:
                st.success(f"✅ Successfully removed {removed_count} text elements containing medication information")
            else:
                st.info("ℹ️ No medication sections were identified for removal")
            
            # Create tabs for different views
            tab1, tab2, tab3 = st.tabs(["📄 Side-by-Side Comparison", "🔍 JSON Structure", "📊 Processing Details"])
            
            with tab1:
                st.subheader("Original vs Redacted Content")
                st.caption("Compare the original document content with the redacted version")
                
                # Add status indicator
                st.markdown("""
                <div id="sync-status" style="padding: 8px; background-color: #e8f5e8; border: 1px solid #4caf50; border-radius: 4px; margin-bottom: 10px; display: none;">
                    ✅ <strong>Synchronized scrolling is active</strong> - Scroll either panel to sync both views
                </div>
                """, unsafe_allow_html=True)
                
                # Create a diff-like interface with synchronized scrolling and highlighting
                diff_html = f"""
                <div class="sync-scroll-container">
                    <div class="sync-scroll-panel">
                        <div class="sync-scroll-header">
                            📋 Original Document
                        </div>
                        <div id="original-content" class="sync-scroll-content">
                            {create_diff_content(original_md, redacted_md, 'original')}
                        </div>
                    </div>
                    <div class="sync-scroll-panel">
                        <div class="sync-scroll-header">
                            🔒 Redacted Document
                        </div>
                        <div id="redacted-content" class="sync-scroll-content">
                            {create_diff_content(original_md, redacted_md, 'redacted')}
                        </div>
                    </div>
                </div>
                """
                
                st.markdown(diff_html, unsafe_allow_html=True)
                
                # Add a hidden component to trigger JavaScript setup after Streamlit reruns
                st.markdown("""
                <script>
                // Trigger setup after Streamlit rerun
                if (window.parent && window.parent.postMessage) {
                    // Wait for Streamlit to finish rendering
                    setTimeout(function() {
                        setupSyncScroll();
                    }, 500);
                }
                </script>
                """, unsafe_allow_html=True)
                
                  
                # Add legend for the diff highlighting
                st.markdown("---")
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**🎨 Diff Legend:**")
                    st.markdown("🔴 **Red background** = Removed content")
                    st.markdown("🟢 **Green background** = Added content")
                    st.markdown("⚪ **White background** = Unchanged content")                
                
                with col2:
                    st.markdown("**💡 Tips:**")
                    st.markdown("Look for red-highlighted sections")
                    st.markdown("These show what was redacted")
                    st.markdown("Use scroll to navigate long documents")
                
             
            
            with tab2:
                st.subheader("Document Structure Analysis")                
                # Show JSON structure comparison
                col1, col2 = st.columns(2)
                
                with col1:
                    st.markdown("**📊 Original Structure (JSON)**")
                    st.json(structured_json)
                
                with col2:
                    st.markdown("**🔒 Redacted Structure (JSON)**")
                    st.json(redacted_json)
            
            with tab3:
                st.subheader("Processing Details")
                
                # Show what was removed
                if removed_count > 0:
                    st.info(f"**Removed {removed_count} text elements from the document structure.**")
                    
                    # Show the removed text elements
                    st.subheader("Removed Text Elements:")
                    removed_texts = []
                    for i, text_elem in enumerate(original_texts):
                        if i >= len(redacted_texts) or text_elem.get("text", "") != redacted_texts[i].get("text", ""):
                            removed_texts.append((i, text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")))
                    
                    for idx, text in removed_texts:
                        st.text(f"Text {idx}: {text}")
                else:
                    st.info("No text elements were removed during processing.")
                
                # Show processing logs
                st.subheader("Processing Logs")
                st.text_area(
                    label="Processing logs",
                    value=st.session_state.logs.get(selected_file, ""),
                    height=300,
                    label_visibility="collapsed"
                )