Spaces:
Sleeping
Sleeping
Commit
·
5d3ebd9
1
Parent(s):
442515d
Add reportlab dependency for PDF generation and enhance document processing
Browse files- Added reportlab to pyproject.toml and requirements.txt for PDF generation capabilities.
- Updated document processing logic to include a new function for generating redacted PDFs from the processed document structure.
- Enhanced logging and error handling during PDF generation to improve user feedback and debugging.
- Refactored the document processor to return detailed processing results, including removed indices and cost metrics.
- JUPYTER_USAGE.md +143 -0
- pyproject.toml +1 -0
- requirements.txt +2 -1
- src/processing/document_processor.py +261 -7
- src/processing/llm_extractor.py +103 -72
- src/processing/sections.py +45 -5
- src/streamlit_app.py +426 -387
- src/utils/cost_tracker.py +241 -0
- uv.lock +15 -0
JUPYTER_USAGE.md
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Jupyter Notebook Usage
|
| 2 |
+
|
| 3 |
+
This document shows how to use the document processing function in Jupyter notebooks for integration into larger processing pipelines.
|
| 4 |
+
|
| 5 |
+
## Simple Usage
|
| 6 |
+
|
| 7 |
+
```python
|
| 8 |
+
from processing.document_processor import process_document_with_redaction
|
| 9 |
+
|
| 10 |
+
# Process a single document
|
| 11 |
+
result = process_document_with_redaction(
|
| 12 |
+
file_path="path/to/your/document.pdf",
|
| 13 |
+
endpoint="your-azure-openai-endpoint",
|
| 14 |
+
api_key="your-azure-openai-key",
|
| 15 |
+
api_version="2024-02-15-preview",
|
| 16 |
+
deployment="o3-mini" # or "o4-mini", "o3", "o4"
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
# Access the results
|
| 20 |
+
original_md = result.original_document_md
|
| 21 |
+
redacted_md = result.redacted_document_md
|
| 22 |
+
input_tokens = result.input_tokens
|
| 23 |
+
output_tokens = result.output_tokens
|
| 24 |
+
cost = result.cost
|
| 25 |
+
|
| 26 |
+
print(f"Processing complete!")
|
| 27 |
+
print(f"Input tokens: {input_tokens:,}")
|
| 28 |
+
print(f"Output tokens: {output_tokens:,}")
|
| 29 |
+
print(f"Total cost: ${cost:.4f}")
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
## Batch Processing
|
| 33 |
+
|
| 34 |
+
```python
|
| 35 |
+
import os
|
| 36 |
+
from processing.document_processor import process_document_with_redaction
|
| 37 |
+
|
| 38 |
+
# Configuration
|
| 39 |
+
AZURE_OPENAI_ENDPOINT = "your-azure-openai-endpoint"
|
| 40 |
+
AZURE_OPENAI_KEY = "your-azure-openai-key"
|
| 41 |
+
AZURE_OPENAI_VERSION = "2024-02-15-preview"
|
| 42 |
+
AZURE_OPENAI_DEPLOYMENT = "o3-mini"
|
| 43 |
+
|
| 44 |
+
# Process multiple documents
|
| 45 |
+
pdf_directory = "path/to/pdf/files"
|
| 46 |
+
results = []
|
| 47 |
+
|
| 48 |
+
for filename in os.listdir(pdf_directory):
|
| 49 |
+
if filename.endswith('.pdf'):
|
| 50 |
+
file_path = os.path.join(pdf_directory, filename)
|
| 51 |
+
|
| 52 |
+
print(f"Processing {filename}...")
|
| 53 |
+
|
| 54 |
+
try:
|
| 55 |
+
result = process_document_with_redaction(
|
| 56 |
+
file_path=file_path,
|
| 57 |
+
endpoint=AZURE_OPENAI_ENDPOINT,
|
| 58 |
+
api_key=AZURE_OPENAI_KEY,
|
| 59 |
+
api_version=AZURE_OPENAI_VERSION,
|
| 60 |
+
deployment=AZURE_OPENAI_DEPLOYMENT
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
results.append({
|
| 64 |
+
'filename': filename,
|
| 65 |
+
'original_md': result.original_document_md,
|
| 66 |
+
'redacted_md': result.redacted_document_md,
|
| 67 |
+
'input_tokens': result.input_tokens,
|
| 68 |
+
'output_tokens': result.output_tokens,
|
| 69 |
+
'cost': result.cost
|
| 70 |
+
})
|
| 71 |
+
|
| 72 |
+
print(f" ✓ Completed - Cost: ${result.cost:.4f}")
|
| 73 |
+
|
| 74 |
+
except Exception as e:
|
| 75 |
+
print(f" ✗ Error processing {filename}: {e}")
|
| 76 |
+
|
| 77 |
+
# Summary
|
| 78 |
+
total_cost = sum(r['cost'] for r in results)
|
| 79 |
+
total_input_tokens = sum(r['input_tokens'] for r in results)
|
| 80 |
+
total_output_tokens = sum(r['output_tokens'] for r in results)
|
| 81 |
+
|
| 82 |
+
print(f"\nBatch processing complete!")
|
| 83 |
+
print(f"Documents processed: {len(results)}")
|
| 84 |
+
print(f"Total input tokens: {total_input_tokens:,}")
|
| 85 |
+
print(f"Total output tokens: {total_output_tokens:,}")
|
| 86 |
+
print(f"Total cost: ${total_cost:.4f}")
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
## Environment Variables
|
| 90 |
+
|
| 91 |
+
You can also use environment variables for configuration:
|
| 92 |
+
|
| 93 |
+
```python
|
| 94 |
+
import os
|
| 95 |
+
from dotenv import load_dotenv
|
| 96 |
+
from processing.document_processor import process_document_with_redaction
|
| 97 |
+
|
| 98 |
+
# Load environment variables
|
| 99 |
+
load_dotenv()
|
| 100 |
+
|
| 101 |
+
# Get configuration from environment
|
| 102 |
+
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
| 103 |
+
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
|
| 104 |
+
AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
|
| 105 |
+
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
|
| 106 |
+
|
| 107 |
+
# Process document
|
| 108 |
+
result = process_document_with_redaction(
|
| 109 |
+
file_path="document.pdf",
|
| 110 |
+
endpoint=AZURE_OPENAI_ENDPOINT,
|
| 111 |
+
api_key=AZURE_OPENAI_KEY,
|
| 112 |
+
api_version=AZURE_OPENAI_VERSION,
|
| 113 |
+
deployment=AZURE_OPENAI_DEPLOYMENT
|
| 114 |
+
)
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
## Return Value
|
| 118 |
+
|
| 119 |
+
The function returns a `ProcessingResult` object. In addition to `original_document_json`, `redacted_document_json`, and `removed_indices`, it has the following attributes:
|
| 120 |
+
|
| 121 |
+
- `original_document_md`: Markdown version of the original document
|
| 122 |
+
- `redacted_document_md`: Markdown version with medication sections removed
|
| 123 |
+
- `input_tokens`: Number of input tokens used
|
| 124 |
+
- `output_tokens`: Number of output tokens generated
|
| 125 |
+
- `cost`: Total cost in USD
|
| 126 |
+
|
| 127 |
+
## Supported Models
|
| 128 |
+
|
| 129 |
+
The function supports the following Azure OpenAI deployment names:
|
| 130 |
+
- `o3-mini` (OpenAI o3-mini reasoning model) - Cheapest option
|
| 131 |
+
- `o4-mini` - Same as o3-mini
|
| 132 |
+
- `o3` (OpenAI o3 reasoning model) - Medium cost
|
| 133 |
+
- `o4` (GPT-4o) - Most expensive but most capable
|
| 134 |
+
|
| 135 |
+
## Error Handling
|
| 136 |
+
|
| 137 |
+
The function will raise exceptions for:
|
| 138 |
+
- File not found
|
| 139 |
+
- Invalid Azure OpenAI credentials
|
| 140 |
+
- API rate limits
|
| 141 |
+
- Network errors
|
| 142 |
+
|
| 143 |
+
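Make sure to handle these appropriately in your pipeline. Note that the extractor appears to catch errors raised during the LLM call itself, log them, and continue with no indices to remove, so also verify that redaction actually took place.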
|
pyproject.toml
CHANGED
|
@@ -10,4 +10,5 @@ dependencies = [
|
|
| 10 |
"pyyaml>=6.0",
|
| 11 |
"python-dotenv>=1.1.1",
|
| 12 |
"openai>=1.91.0",
|
|
|
|
| 13 |
]
|
|
|
|
| 10 |
"pyyaml>=6.0",
|
| 11 |
"python-dotenv>=1.1.1",
|
| 12 |
"openai>=1.91.0",
|
| 13 |
+
"reportlab>=4.4.2",
|
| 14 |
]
|
requirements.txt
CHANGED
|
@@ -4,4 +4,5 @@ docling
|
|
| 4 |
streamlit
|
| 5 |
pyyaml
|
| 6 |
python-dotenv
|
| 7 |
-
openai
|
|
|
|
|
|
| 4 |
streamlit
|
| 5 |
pyyaml
|
| 6 |
python-dotenv
|
| 7 |
+
openai
|
| 8 |
+
reportlab
|
src/processing/document_processor.py
CHANGED
|
@@ -3,11 +3,12 @@ import time
|
|
| 3 |
import logging
|
| 4 |
import json
|
| 5 |
from dataclasses import dataclass
|
| 6 |
-
from typing import Optional
|
| 7 |
|
| 8 |
# Don't import DocumentConverter at module level to prevent early initialization
|
| 9 |
# from docling.document_converter import DocumentConverter
|
| 10 |
from processing.sections import SectionExtractor
|
|
|
|
| 11 |
|
| 12 |
# Remove global converter initialization - will be done lazily
|
| 13 |
# _docling_converter = DocumentConverter()
|
|
@@ -23,6 +24,92 @@ class DocumentResult:
|
|
| 23 |
redacted_markdown: str
|
| 24 |
redacted_json: dict
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
class DocumentProcessor:
|
| 27 |
"""Handles parsing of documents with Docling and redacting specified sections."""
|
| 28 |
def __init__(self, section_extractor: Optional[SectionExtractor] = None):
|
|
@@ -80,12 +167,12 @@ class DocumentProcessor:
|
|
| 80 |
|
| 81 |
# Persist outputs to files (JSON and redacted text) for auditing
|
| 82 |
base_name = os.path.splitext(os.path.basename(file_path))[0]
|
| 83 |
-
# Use
|
| 84 |
-
temp_dir =
|
| 85 |
try:
|
| 86 |
os.makedirs(temp_dir, exist_ok=True)
|
| 87 |
except PermissionError:
|
| 88 |
-
# Fallback to system temp directory if we can't create in
|
| 89 |
import tempfile
|
| 90 |
temp_dir = os.path.join(tempfile.gettempdir(), "docling_temp_files")
|
| 91 |
os.makedirs(temp_dir, exist_ok=True)
|
|
@@ -139,10 +226,177 @@ class DocumentProcessor:
|
|
| 139 |
logger.warning(f"Could not create cache directory {cache_dir}: {e}")
|
| 140 |
|
| 141 |
def _export_redacted_markdown(self, document, redacted_json):
|
| 142 |
-
"""Export redacted markdown using
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
def _json_to_markdown(self, json_data: dict) -> str:
|
| 147 |
"""Convert JSON document structure back to markdown format using Docling's structure."""
|
| 148 |
markdown_lines = []
|
|
|
|
| 3 |
import logging
|
| 4 |
import json
|
| 5 |
from dataclasses import dataclass
|
| 6 |
+
from typing import Optional, Tuple
|
| 7 |
|
| 8 |
# Don't import DocumentConverter at module level to prevent early initialization
|
| 9 |
# from docling.document_converter import DocumentConverter
|
| 10 |
from processing.sections import SectionExtractor
|
| 11 |
+
from utils.cost_tracker import cost_tracker
|
| 12 |
|
| 13 |
# Remove global converter initialization - will be done lazily
|
| 14 |
# _docling_converter = DocumentConverter()
|
|
|
|
| 24 |
redacted_markdown: str
|
| 25 |
redacted_json: dict
|
| 26 |
|
| 27 |
+
@dataclass
class ProcessingResult:
    """Simple result for Jupyter notebook usage.

    Returned by ``process_document_with_redaction``; bundles the original and
    redacted document (markdown and JSON), the removed indices, and the
    token/cost figures taken from the cost tracker's session summary.
    """
    original_document_md: str  # structured markdown of the unredacted document
    redacted_document_md: str  # markdown after redaction
    original_document_json: dict  # structured JSON of the unredacted document
    redacted_document_json: dict  # JSON after redaction
    removed_indices: list  # indices reported under "indices_to_remove" by the LLM extractor
    input_tokens: int  # session total tokens minus output tokens
    output_tokens: int  # sum of "output_tokens" over the per-model breakdown
    cost: float  # "total_cost" from the cost tracker's session summary
|
| 38 |
+
|
| 39 |
+
def _collect_removed_indices(section_extractor, structured_json: dict) -> list:
    """Return the indices the extractor's LLM helper selects for removal.

    Returns an empty list when the extractor exposes no ``llm_extractor``
    attribute (for example a caller-supplied custom extractor), so that an
    already processed document is not lost to an AttributeError.
    """
    llm_extractor = getattr(section_extractor, "llm_extractor", None)
    if llm_extractor is None:
        logger.warning(
            "Section extractor has no 'llm_extractor'; removed indices are unavailable"
        )
        return []

    # NOTE(review): this presumably repeats the LLM request already issued
    # during processor.process(); its tokens end up in the cost summary and
    # its answer is not guaranteed to match the indices actually removed.
    # Consider exposing the indices from the processor instead.
    extraction_result = llm_extractor.extract_medication_sections(structured_json)
    return extraction_result.get("indices_to_remove", []) or []


def _split_token_usage(cost_summary: dict) -> tuple:
    """Split a cost-tracker session summary into (input_tokens, output_tokens)."""
    total_tokens = cost_summary.get("total_tokens", 0)
    output_tokens = sum(
        model_stats.get("output_tokens", 0)
        for model_stats in cost_summary.get("model_breakdown", {}).values()
    )
    # Input tokens are derived as total - output; never report a negative
    # count if the summary carries a breakdown but no total.
    return max(total_tokens - output_tokens, 0), output_tokens


def process_document_with_redaction(
    file_path: str,
    endpoint: str,
    api_key: str,
    api_version: str,
    deployment: str,
    section_extractor: Optional[SectionExtractor] = None
) -> ProcessingResult:
    """
    Process a document and return a ``ProcessingResult``.

    Args:
        file_path: Path to the PDF file to process
        endpoint: Azure OpenAI endpoint
        api_key: Azure OpenAI API key
        api_version: Azure OpenAI API version
        deployment: Azure OpenAI deployment name
        section_extractor: Optional custom section extractor. When omitted, a
            ``ReasoningSectionExtractor`` is built from the Azure settings.

    Returns:
        ProcessingResult with original_document_md, redacted_document_md,
        original_document_json, redacted_document_json, removed_indices,
        input_tokens, output_tokens and cost.
    """
    logger.info(f"Processing document: {file_path}")

    # Reset cost tracker for this processing session
    cost_tracker.reset_session()

    # Create section extractor if not provided
    if section_extractor is None:
        # Imported lazily, mirroring the module's deferred-import approach.
        from processing.sections import ReasoningSectionExtractor
        section_extractor = ReasoningSectionExtractor(
            endpoint=endpoint,
            api_key=api_key,
            api_version=api_version,
            deployment=deployment,
        )

    # Process the document
    processor = DocumentProcessor(section_extractor=section_extractor)
    result = processor.process(file_path)

    # Get the removed indices from the section extractor (empty if unavailable)
    removed_indices = _collect_removed_indices(section_extractor, result.structured_json)

    # Get cost summary and derive the token split from it
    cost_summary = cost_tracker.get_session_summary()
    total_input_tokens, total_output_tokens = _split_token_usage(cost_summary)
    total_cost = cost_summary.get("total_cost", 0.0)

    logger.info(f"Processing complete - Input: {total_input_tokens}, Output: {total_output_tokens}, Cost: ${total_cost:.4f}")

    return ProcessingResult(
        original_document_md=result.structured_markdown,
        redacted_document_md=result.redacted_markdown,
        original_document_json=result.structured_json,
        redacted_document_json=result.redacted_json,
        removed_indices=removed_indices,
        input_tokens=total_input_tokens,
        output_tokens=total_output_tokens,
        cost=total_cost
    )
|
| 112 |
+
|
| 113 |
class DocumentProcessor:
|
| 114 |
"""Handles parsing of documents with Docling and redacting specified sections."""
|
| 115 |
def __init__(self, section_extractor: Optional[SectionExtractor] = None):
|
|
|
|
| 167 |
|
| 168 |
# Persist outputs to files (JSON and redacted text) for auditing
|
| 169 |
base_name = os.path.splitext(os.path.basename(file_path))[0]
|
| 170 |
+
# Use the same temp directory as the main application
|
| 171 |
+
temp_dir = os.environ.get('TEMP_DIR', '/tmp/docling_temp')
|
| 172 |
try:
|
| 173 |
os.makedirs(temp_dir, exist_ok=True)
|
| 174 |
except PermissionError:
|
| 175 |
+
# Fallback to system temp directory if we can't create in the main temp dir
|
| 176 |
import tempfile
|
| 177 |
temp_dir = os.path.join(tempfile.gettempdir(), "docling_temp_files")
|
| 178 |
os.makedirs(temp_dir, exist_ok=True)
|
|
|
|
| 226 |
logger.warning(f"Could not create cache directory {cache_dir}: {e}")
|
| 227 |
|
| 228 |
def _export_redacted_markdown(self, document, redacted_json):
    """Render the redacted JSON as markdown, preferring Docling's own exporter.

    A Docling ``Document`` class is looked up under several candidate import
    paths, rebuilt from ``redacted_json`` and exported to markdown. Any
    failure along the way (import, construction or export) is logged and the
    manual ``_json_to_markdown`` conversion is used instead.

    Args:
        document: Not used by this method.
        redacted_json: The redacted document JSON structure.

    Returns:
        The redacted document as a markdown string.
    """
    def _locate_document_class():
        # Candidate locations are tried in order; the first that imports wins.
        # NOTE(review): confirm these paths exist in the installed docling
        # release; if none does, the fallback below is always taken.
        try:
            from docling.document import Document
            return Document
        except ImportError:
            pass
        try:
            from docling import Document
            return Document
        except ImportError:
            pass
        try:
            from docling.core import Document
            return Document
        except ImportError:
            logger.warning("Could not import Docling Document class from any known location")
            raise ImportError("Docling Document class not found")

    try:
        document_cls = _locate_document_class()
        # Rebuild a document from the redacted JSON and let Docling format it.
        rebuilt_document = document_cls.from_dict(redacted_json)
        markdown_text = rebuilt_document.export_to_markdown()
        logger.info("Successfully generated redacted markdown using Docling Document class")
        return markdown_text
    except Exception as e:
        logger.warning(f"Failed to create Docling Document from redacted JSON: {e}")
        logger.info("Falling back to manual JSON-to-markdown conversion")
        return self._json_to_markdown(redacted_json)
|
| 258 |
|
| 259 |
+
def generate_redacted_pdf(self, redacted_json: dict, output_path: str) -> bool:
    """
    Generate a redacted PDF from the redacted JSON structure.

    Each entry of ``redacted_json["texts"]`` is rendered according to its
    ``label``: section headers become headings, list items get their marker,
    runs of consecutive pipe-delimited lines become one table, and anything
    else becomes a normal paragraph.

    Args:
        redacted_json: The redacted document JSON structure
        output_path: Path where the PDF should be saved

    Returns:
        bool: True if PDF generation was successful, False otherwise
    """
    try:
        # Imported lazily so a missing reportlab is reported instead of
        # breaking module import.
        from xml.sax.saxutils import escape
        from reportlab.lib import colors
        from reportlab.lib.pagesizes import A4
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle

        def element_text(elem: dict) -> str:
            """Return the stripped text of an element, tolerating a null value."""
            return (elem.get("text") or "").strip()

        def is_table_row(text: str) -> bool:
            """A line counts as table-like when it holds at least two pipes."""
            return text.count('|') > 1

        def pipe_cells(text: str) -> list:
            """Return the non-empty, stripped cells of a pipe-delimited line."""
            return [cell.strip() for cell in text.split('|') if cell.strip()]

        def paragraph(text: str, style):
            # Paragraph parses XML-like markup, so literal '<', '>' and '&'
            # in document text must be escaped or the build can fail.
            return Paragraph(escape(text), style)

        logger.info(f"Generating redacted PDF: {output_path}")

        # Create PDF document
        doc = SimpleDocTemplate(output_path, pagesize=A4)
        story = []

        # Get styles
        styles = getSampleStyleSheet()
        normal_style = styles['Normal']
        heading_style = styles['Heading1']

        # Monospace style for pipe-delimited lines that yield no cells
        table_style = ParagraphStyle(
            'TableStyle',
            parent=normal_style,
            fontName='Courier',
            fontSize=9,
            spaceAfter=6
        )

        # Process text elements from JSON
        texts = redacted_json.get("texts", [])

        # Index loop (not for-each) because a table consumes several elements
        i = 0
        while i < len(texts):
            text_elem = texts[i]
            text_content = element_text(text_elem)
            label = text_elem.get("label", "")
            level = text_elem.get("level", 0) or 0

            if not text_content:
                i += 1
                continue

            # Handle different content types
            if label == "section_header":
                if level == 1:
                    story.append(paragraph(text_content, heading_style))
                else:
                    # Deeper levels get smaller headings; keep the size readable
                    sub_heading_style = ParagraphStyle(
                        f'Heading{min(level, 3)}',
                        parent=normal_style,
                        fontSize=max(14 - level, 8),
                        spaceAfter=12,
                        spaceBefore=12,
                        textColor=colors.darkblue
                    )
                    story.append(paragraph(text_content, sub_heading_style))

            elif label == "list_item":
                marker = text_elem.get("marker", "•")
                story.append(paragraph(f"{marker} {text_content}", normal_style))

            elif is_table_row(text_content):
                # Collect this row plus any directly following table-like rows
                table_rows = []
                cells = pipe_cells(text_content)
                if cells:
                    table_rows.append(cells)

                j = i + 1
                while j < len(texts):
                    next_text = element_text(texts[j])
                    if not is_table_row(next_text):
                        break
                    next_cells = pipe_cells(next_text)
                    if next_cells:
                        table_rows.append(next_cells)
                    j += 1

                if table_rows:
                    # Table rejects rows of unequal length, so pad short rows
                    width = max(len(row) for row in table_rows)
                    padded_rows = [row + [''] * (width - len(row)) for row in table_rows]
                    table = Table(padded_rows)
                    table.setStyle(TableStyle([
                        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                        ('FONTNAME', (0, 0), (-1, -1), 'Courier'),
                        ('FONTSIZE', (0, 0), (-1, -1), 9),
                        ('BOTTOMPADDING', (0, 0), (-1, -1), 3),
                        ('TOPPADDING', (0, 0), (-1, -1), 3),
                        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                        ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),  # Header row
                    ]))
                    story.append(table)
                    story.append(Spacer(1, 6))

                    # Skip the rows already consumed (the += 1 below lands on j)
                    i = j - 1
                else:
                    # Pipes but no usable cells: render the raw line
                    story.append(paragraph(text_content, table_style))

            else:
                # Regular text content
                story.append(paragraph(text_content, normal_style))

            # Add small spacing between elements
            story.append(Spacer(1, 3))
            i += 1

        # Build PDF
        doc.build(story)
        logger.info(f"Successfully generated redacted PDF: {output_path}")
        return True

    except ImportError as e:
        logger.error(f"Required PDF generation libraries not available: {e}")
        logger.info("Install reportlab with: pip install reportlab")
        return False
    except Exception as e:
        # Keep the traceback; a bare message makes layout errors hard to trace
        logger.error(f"Error generating redacted PDF: {e}", exc_info=True)
        return False
|
| 399 |
+
|
| 400 |
def _json_to_markdown(self, json_data: dict) -> str:
|
| 401 |
"""Convert JSON document structure back to markdown format using Docling's structure."""
|
| 402 |
markdown_lines = []
|
src/processing/llm_extractor.py
CHANGED
|
@@ -6,6 +6,7 @@ import logging
|
|
| 6 |
from typing import Dict, Any
|
| 7 |
|
| 8 |
from openai import AzureOpenAI
|
|
|
|
| 9 |
|
| 10 |
logger = logging.getLogger(__name__)
|
| 11 |
|
|
@@ -39,107 +40,137 @@ class AzureO1MedicationExtractor:
|
|
| 39 |
})
|
| 40 |
|
| 41 |
prompt = f"""
|
| 42 |
-
You are a medical document analysis expert specializing in discharge letters. Your task is to identify ONLY the
|
| 43 |
-
|
| 44 |
-
**CRITICAL: You should ONLY remove
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
**
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
-
|
| 52 |
-
-
|
| 53 |
-
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
-
|
| 58 |
-
-
|
| 59 |
-
- Any
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
-
|
| 65 |
-
-
|
| 66 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
Document structure:
|
| 69 |
{text_analysis}
|
| 70 |
|
| 71 |
**Analysis Instructions:**
|
| 72 |
-
1. Look for formal medication sections with clear headers (e.g., "Thuismedicatie", "
|
| 73 |
-
2. Identify
|
| 74 |
-
3. **
|
| 75 |
-
4.
|
| 76 |
-
5. Be conservative - if in doubt, do NOT remove
|
| 77 |
-
6.
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
- Complete lists under "Thuismedicatie" header
|
| 81 |
-
- Formal medication lists under "Als verdere behandeling stellen wij voor"
|
| 82 |
-
- Standalone medication sections with multiple drugs
|
| 83 |
-
- Lists that appear at the beginning or end of the document
|
| 84 |
-
|
| 85 |
-
**Examples of what to KEEP:**
|
| 86 |
-
- "Patient was treated with Eliquis 2x 2.5mg" (clinical discussion)
|
| 87 |
-
- "Stop Clopidogrel bij opname" (clinical instruction)
|
| 88 |
-
- "Jardiance 10mg & Burinex 5mg" (if mentioned in clinical context)
|
| 89 |
-
- Any medication mentioned in the context of treatment discussion
|
| 90 |
-
|
| 91 |
-
Return your analysis as JSON:
|
| 92 |
{{
|
| 93 |
"indices_to_remove": [list of integer indices - ONLY formal medication lists],
|
| 94 |
"reasoning": {{
|
| 95 |
-
"
|
| 96 |
-
"clinical_medication_mentions": [list of clinical mentions that were correctly preserved],
|
| 97 |
-
"justification": "explanation of why only formal lists were selected for removal",
|
| 98 |
"confidence": "high/medium/low"
|
| 99 |
}}
|
| 100 |
}}
|
| 101 |
"""
|
|
|
|
| 102 |
logger.info(f"Prompt length: {len(prompt)}")
|
| 103 |
logger.info(f"Number of text elements: {len(text_analysis)}")
|
|
|
|
| 104 |
try:
|
| 105 |
response = self.client.chat.completions.create(
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
| 118 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
except Exception as e:
|
| 120 |
logger.error(f"Exception during LLM call: {e}", exc_info=True)
|
| 121 |
return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}
|
| 122 |
|
| 123 |
try:
|
| 124 |
-
logger.
|
|
|
|
|
|
|
| 125 |
result = json.loads(response.choices[0].message.content)
|
| 126 |
|
| 127 |
-
#
|
| 128 |
indices_to_remove = result.get("indices_to_remove", [])
|
| 129 |
|
| 130 |
-
#
|
| 131 |
-
|
| 132 |
-
logger.warning(f"LLM suggested removing {len(indices_to_remove)} elements, limiting to 10 most likely formal medication lists")
|
| 133 |
-
# Keep only the first 10 (assuming they're ordered by importance)
|
| 134 |
-
indices_to_remove = indices_to_remove[:10]
|
| 135 |
-
result["indices_to_remove"] = indices_to_remove
|
| 136 |
-
result["reasoning"]["justification"] += " [LIMITED: Only top 10 elements selected to prevent over-removal]"
|
| 137 |
|
| 138 |
-
# Log
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
return result
|
|
|
|
| 143 |
except Exception as e:
|
| 144 |
logger.error(f"Failed to parse LLM response: {e}")
|
| 145 |
return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}
|
|
|
|
| 6 |
from typing import Dict, Any
|
| 7 |
|
| 8 |
from openai import AzureOpenAI
|
| 9 |
+
from utils.cost_tracker import cost_tracker
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
|
|
|
| 40 |
})
|
| 41 |
|
| 42 |
prompt = f"""
|
| 43 |
+
You are a medical document analysis expert specializing in discharge letters. Your task is to identify ONLY the formal medication lists that should be redacted, while preserving ALL other content including medical history tables.
|
| 44 |
+
|
| 45 |
+
**CRITICAL: You should ONLY remove formal medication lists with explicit medication names, dosages, and frequencies.**
|
| 46 |
+
|
| 47 |
+
**What to REMOVE (medication lists only):**
|
| 48 |
+
1. **Current medication list** - sections with headers like "Huidige thuismedicatie", "Current medications", "Medicatie"
|
| 49 |
+
2. **Discharge medication list** - sections with headers like "Als verdere behandeling stellen wij voor", "Thuismedicatie", "Discharge medications"
|
| 50 |
+
|
| 51 |
+
**What medication lists look like:**
|
| 52 |
+
- Header: "Huidige thuismedicatie" or similar
|
| 53 |
+
- Followed by multiple lines with medication names, dosages, frequencies
|
| 54 |
+
- Example: "Pantomed 20mg Tablet Oral - 1 tablet - 2 maal daags"
|
| 55 |
+
- Example: "Forlax 10g Zakje Oral - 2 zakje - 1 maal daags (zo nodig)"
|
| 56 |
+
|
| 57 |
+
**What to ABSOLUTELY NEVER REMOVE:**
|
| 58 |
+
1. **Medical history tables** - Tables with "Datum" and "Bespreking" columns containing dates and medical events
|
| 59 |
+
2. **Treatment history** - Narrative descriptions of medical procedures, treatments, or events
|
| 60 |
+
3. **Clinical discussions** - Any text discussing medical conditions, procedures, or clinical decisions
|
| 61 |
+
4. **Tables with dates and procedures** - Any table format showing timeline of medical events
|
| 62 |
+
5. **Individual medication mentions in clinical text** - References to medications within clinical narratives
|
| 63 |
+
|
| 64 |
+
**EXAMPLES OF CONTENT TO NEVER REMOVE:**
|
| 65 |
+
- Tables like: "| Datum | Bespreking |" followed by medical events
|
| 66 |
+
- "| 07/07/2017 | Niertransplantatie met donornier..."
|
| 67 |
+
- "| 15/8/2017 | Uitgestelde transplantfunctie..."
|
| 68 |
+
- "| 26/03/2018 | plaatsing peritoneaal dialysekatheter..."
|
| 69 |
+
- Any text describing medical procedures, surgeries, or treatments
|
| 70 |
+
- Clinical narratives mentioning medications in context (e.g., "behandeling met Sotrovimab")
|
| 71 |
+
|
| 72 |
+
**KEY DISTINGUISHING FEATURES:**
|
| 73 |
+
- **Medication lists**: Standalone sections with drug names + dosages + frequencies
|
| 74 |
+
- **Medical history**: Tables or narratives describing medical events, procedures, surgeries
|
| 75 |
+
- **Clinical text**: Discussions of treatment decisions, medical events, or conditions
|
| 76 |
+
|
| 77 |
+
**If you see a table with dates and medical procedures, it is MEDICAL HISTORY, not a medication list.**
|
| 78 |
+
**If you see clinical text discussing treatments or procedures, it is CLINICAL DISCUSSION, not a medication list.**
|
| 79 |
|
| 80 |
Document structure:
|
| 81 |
{text_analysis}
|
| 82 |
|
| 83 |
**Analysis Instructions:**
|
| 84 |
+
1. Look ONLY for formal medication sections with clear headers (e.g., "Thuismedicatie", "Huidige thuismedicatie")
|
| 85 |
+
2. Identify sections that contain LISTS of medications with dosages and frequencies
|
| 86 |
+
3. **NEVER identify medical history tables as medication lists**
|
| 87 |
+
4. **NEVER identify clinical discussions as medication lists**
|
| 88 |
+
5. Be extremely conservative - if in doubt, do NOT remove
|
| 89 |
+
6. Focus ONLY on standalone medication documentation sections
|
| 90 |
+
|
| 91 |
+
Return your analysis as a JSON object with this exact structure:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
{{
|
| 93 |
"indices_to_remove": [list of integer indices - ONLY formal medication lists],
|
| 94 |
"reasoning": {{
|
| 95 |
+
"justification": "explanation of why only formal medication lists were selected for removal",
|
|
|
|
|
|
|
| 96 |
"confidence": "high/medium/low"
|
| 97 |
}}
|
| 98 |
}}
|
| 99 |
"""
|
| 100 |
+
|
| 101 |
logger.info(f"Prompt length: {len(prompt)}")
|
| 102 |
logger.info(f"Number of text elements: {len(text_analysis)}")
|
| 103 |
+
|
| 104 |
try:
|
| 105 |
response = self.client.chat.completions.create(
|
| 106 |
+
messages=[
|
| 107 |
+
{
|
| 108 |
+
"role": "system",
|
| 109 |
+
"content": "You are a helpful assistant that analyzes medical documents and identifies formal medication lists for redaction.",
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
"role": "user",
|
| 113 |
+
"content": prompt,
|
| 114 |
+
}
|
| 115 |
+
],
|
| 116 |
+
max_completion_tokens=100000,
|
| 117 |
+
model=self.deployment,
|
| 118 |
+
response_format={"type": "json_object"}
|
| 119 |
)
|
| 120 |
+
|
| 121 |
+
# Record token usage and cost
|
| 122 |
+
if hasattr(response, 'usage') and response.usage:
|
| 123 |
+
cost_tracker.record_usage(
|
| 124 |
+
prompt_tokens=response.usage.prompt_tokens,
|
| 125 |
+
completion_tokens=response.usage.completion_tokens,
|
| 126 |
+
model=self.model_name
|
| 127 |
+
)
|
| 128 |
+
logger.info(f"API call completed - Input: {response.usage.prompt_tokens}, "
|
| 129 |
+
f"Output: {response.usage.completion_tokens}, "
|
| 130 |
+
f"Total: {response.usage.total_tokens} tokens")
|
| 131 |
+
|
| 132 |
except Exception as e:
|
| 133 |
logger.error(f"Exception during LLM call: {e}", exc_info=True)
|
| 134 |
return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}
|
| 135 |
|
| 136 |
try:
|
| 137 |
+
logger.info(f"Raw LLM response: {response.choices[0].message.content!r}")
|
| 138 |
+
|
| 139 |
+
# Parse the structured JSON response
|
| 140 |
result = json.loads(response.choices[0].message.content)
|
| 141 |
|
| 142 |
+
# Get the indices to remove
|
| 143 |
indices_to_remove = result.get("indices_to_remove", [])
|
| 144 |
|
| 145 |
+
# Log what the LLM suggested
|
| 146 |
+
logger.info(f"LLM suggested removing {len(indices_to_remove)} elements: {indices_to_remove}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
+
# Log detailed information about what's being removed
|
| 149 |
+
if indices_to_remove:
|
| 150 |
+
logger.info("DETAILED ANALYSIS OF LLM SUGGESTIONS:")
|
| 151 |
+
logger.info("=" * 60)
|
| 152 |
+
|
| 153 |
+
for idx in indices_to_remove:
|
| 154 |
+
if idx < len(text_analysis):
|
| 155 |
+
text_content = text_analysis[idx].get("text", "")
|
| 156 |
+
text_label = text_analysis[idx].get("label", "")
|
| 157 |
+
logger.info(f"Index {idx} ({text_label}): '{text_content}'")
|
| 158 |
+
else:
|
| 159 |
+
logger.error(f"Index {idx} is out of bounds (max: {len(text_analysis)-1})")
|
| 160 |
+
|
| 161 |
+
logger.info("=" * 60)
|
| 162 |
+
|
| 163 |
+
# Log the reasoning if provided
|
| 164 |
+
reasoning = result.get("reasoning", {})
|
| 165 |
+
if reasoning:
|
| 166 |
+
logger.info(f"LLM reasoning: {reasoning}")
|
| 167 |
+
|
| 168 |
+
logger.info(f"Final removal list: {len(indices_to_remove)} elements will be removed")
|
| 169 |
+
else:
|
| 170 |
+
logger.info("No elements will be removed")
|
| 171 |
|
| 172 |
return result
|
| 173 |
+
|
| 174 |
except Exception as e:
|
| 175 |
logger.error(f"Failed to parse LLM response: {e}")
|
| 176 |
return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}
|
src/processing/sections.py
CHANGED
|
@@ -21,18 +21,57 @@ class ReasoningSectionExtractor:
|
|
| 21 |
reasoning = extraction_result.get("reasoning", {})
|
| 22 |
|
| 23 |
# Log detailed reasoning for transparency
|
| 24 |
-
logger.info(f"
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# Provide specific feedback about what was removed
|
| 27 |
if indices_to_remove:
|
| 28 |
logger.info(f"Removing {len(indices_to_remove)} text elements: {indices_to_remove}")
|
| 29 |
|
| 30 |
-
#
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
| 32 |
for idx in indices_to_remove:
|
| 33 |
if idx < len(texts):
|
| 34 |
-
text_content = texts[idx].get("text", "")
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
else:
|
| 37 |
logger.info("No formal medication lists identified for removal")
|
| 38 |
|
|
@@ -46,6 +85,7 @@ class ReasoningSectionExtractor:
|
|
| 46 |
# Log the result
|
| 47 |
removed_count = len(texts) - len(redacted_texts)
|
| 48 |
logger.info(f"Successfully removed {removed_count} text elements from document structure")
|
|
|
|
| 49 |
|
| 50 |
return redacted_json
|
| 51 |
|
|
|
|
| 21 |
reasoning = extraction_result.get("reasoning", {})
|
| 22 |
|
| 23 |
# Log detailed reasoning for transparency
|
| 24 |
+
logger.info(f"LLM reasoning summary: {reasoning}")
|
| 25 |
+
|
| 26 |
+
# Get the texts for detailed logging
|
| 27 |
+
texts = doc_json.get("texts", [])
|
| 28 |
|
| 29 |
# Provide specific feedback about what was removed
|
| 30 |
if indices_to_remove:
|
| 31 |
logger.info(f"Removing {len(indices_to_remove)} text elements: {indices_to_remove}")
|
| 32 |
|
| 33 |
+
# Categorize and show what specific content is being removed
|
| 34 |
+
medication_headers = []
|
| 35 |
+
medication_items = []
|
| 36 |
+
other_content = []
|
| 37 |
+
|
| 38 |
for idx in indices_to_remove:
|
| 39 |
if idx < len(texts):
|
| 40 |
+
text_content = texts[idx].get("text", "")
|
| 41 |
+
text_label = texts[idx].get("label", "")
|
| 42 |
+
|
| 43 |
+
# Categorize the content
|
| 44 |
+
if any(keyword in text_content.lower() for keyword in ['medicatie', 'thuismedicatie', 'medication', 'drugs']):
|
| 45 |
+
medication_headers.append((idx, text_content))
|
| 46 |
+
elif any(keyword in text_content.lower() for keyword in ['tablet', 'capsule', 'mg', 'ml', 'zakje', 'oral', 'maal daags']):
|
| 47 |
+
medication_items.append((idx, text_content))
|
| 48 |
+
else:
|
| 49 |
+
other_content.append((idx, text_content))
|
| 50 |
+
|
| 51 |
+
# Log with more detail
|
| 52 |
+
logger.info(f" → Removing index {idx} ({text_label}): '{text_content[:150]}{'...' if len(text_content) > 150 else ''}'")
|
| 53 |
+
else:
|
| 54 |
+
logger.warning(f" → Invalid index {idx}: exceeds document length ({len(texts)})")
|
| 55 |
+
|
| 56 |
+
# Summary of what was categorized
|
| 57 |
+
if medication_headers:
|
| 58 |
+
logger.info(f"Medication headers removed: {len(medication_headers)} items")
|
| 59 |
+
for idx, content in medication_headers:
|
| 60 |
+
logger.info(f" Header {idx}: {content}")
|
| 61 |
+
|
| 62 |
+
if medication_items:
|
| 63 |
+
logger.info(f"Medication items removed: {len(medication_items)} items")
|
| 64 |
+
for idx, content in medication_items[:5]: # Show first 5 to avoid spam
|
| 65 |
+
logger.info(f" Item {idx}: {content[:100]}...")
|
| 66 |
+
if len(medication_items) > 5:
|
| 67 |
+
logger.info(f" ... and {len(medication_items) - 5} more medication items")
|
| 68 |
+
|
| 69 |
+
if other_content:
|
| 70 |
+
logger.warning(f"⚠️ NON-MEDICATION content removed: {len(other_content)} items")
|
| 71 |
+
for idx, content in other_content:
|
| 72 |
+
logger.warning(f" ⚠️ Index {idx}: {content[:200]}...")
|
| 73 |
+
logger.warning("⚠️ Please review: non-medication content was removed - this may indicate an issue with the LLM detection")
|
| 74 |
+
|
| 75 |
else:
|
| 76 |
logger.info("No formal medication lists identified for removal")
|
| 77 |
|
|
|
|
| 85 |
# Log the result
|
| 86 |
removed_count = len(texts) - len(redacted_texts)
|
| 87 |
logger.info(f"Successfully removed {removed_count} text elements from document structure")
|
| 88 |
+
logger.info(f"Document structure: {len(texts)} → {len(redacted_texts)} text elements")
|
| 89 |
|
| 90 |
return redacted_json
|
| 91 |
|
src/streamlit_app.py
CHANGED
|
@@ -3,6 +3,8 @@
|
|
| 3 |
|
| 4 |
import os
|
| 5 |
import tempfile
|
|
|
|
|
|
|
| 6 |
|
| 7 |
# Get a writable temp directory first
|
| 8 |
try:
|
|
@@ -91,26 +93,12 @@ directories_to_create = [
|
|
| 91 |
os.environ['ACCELERATE_CACHE'],
|
| 92 |
]
|
| 93 |
|
| 94 |
-
# Monkey patch os.makedirs to prevent root directory access
|
| 95 |
-
original_makedirs = os.makedirs
|
| 96 |
-
|
| 97 |
-
def safe_makedirs(name, mode=0o777, exist_ok=False):
|
| 98 |
-
"""Safe version of makedirs that prevents root directory access."""
|
| 99 |
-
# Check if trying to create directory in root filesystem
|
| 100 |
-
if name.startswith('/') and not name.startswith('/tmp') and not name.startswith('/app'):
|
| 101 |
-
# Redirect to temp directory
|
| 102 |
-
basename = os.path.basename(name)
|
| 103 |
-
safe_name = os.path.join(TEMP_DIR, basename)
|
| 104 |
-
print(f"Redirecting root directory creation from {name} to {safe_name}")
|
| 105 |
-
return original_makedirs(safe_name, mode, exist_ok)
|
| 106 |
-
return original_makedirs(name, mode, exist_ok)
|
| 107 |
-
|
| 108 |
-
# Apply the monkey patch
|
| 109 |
-
os.makedirs = safe_makedirs
|
| 110 |
-
|
| 111 |
for directory in directories_to_create:
|
| 112 |
try:
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
| 114 |
except Exception as e:
|
| 115 |
print(f"Warning: Could not create directory {directory}: {e}")
|
| 116 |
|
|
@@ -121,11 +109,10 @@ import shutil
|
|
| 121 |
from processing.document_processor import DocumentProcessor
|
| 122 |
from processing.sections import ReasoningSectionExtractor
|
| 123 |
from utils.logging_utils import get_log_handler
|
|
|
|
| 124 |
from dotenv import load_dotenv
|
| 125 |
import sys
|
| 126 |
-
import html
|
| 127 |
import difflib
|
| 128 |
-
import re
|
| 129 |
import time
|
| 130 |
|
| 131 |
# Configure logging early to avoid issues
|
|
@@ -212,16 +199,40 @@ def get_temp_files_info():
|
|
| 212 |
|
| 213 |
files = os.listdir(TEMP_DIR)
|
| 214 |
total_size = 0
|
|
|
|
| 215 |
|
| 216 |
for filename in files:
|
| 217 |
try:
|
| 218 |
file_path = os.path.join(TEMP_DIR, filename)
|
| 219 |
if os.path.isfile(file_path):
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
except (PermissionError, OSError) as e:
|
| 222 |
logging.warning(f"Error accessing file {filename}: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
continue
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
return len(files), total_size
|
| 226 |
except PermissionError as e:
|
| 227 |
logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
|
|
@@ -298,215 +309,7 @@ st.markdown("""
|
|
| 298 |
border-radius: 10px;
|
| 299 |
border: 1px solid #e9ecef;
|
| 300 |
}
|
| 301 |
-
|
| 302 |
-
/* Synchronized scrolling styles */
|
| 303 |
-
.sync-scroll-container {
|
| 304 |
-
display: flex;
|
| 305 |
-
gap: 20px;
|
| 306 |
-
height: 600px;
|
| 307 |
-
font-family: 'Courier New', monospace;
|
| 308 |
-
font-size: 12px;
|
| 309 |
-
}
|
| 310 |
-
|
| 311 |
-
.sync-scroll-panel {
|
| 312 |
-
flex: 1;
|
| 313 |
-
border: 1px solid #ddd;
|
| 314 |
-
border-radius: 5px;
|
| 315 |
-
overflow: hidden;
|
| 316 |
-
display: flex;
|
| 317 |
-
flex-direction: column;
|
| 318 |
-
}
|
| 319 |
-
|
| 320 |
-
.sync-scroll-header {
|
| 321 |
-
background-color: #f8f9fa;
|
| 322 |
-
padding: 10px;
|
| 323 |
-
border-bottom: 1px solid #ddd;
|
| 324 |
-
font-weight: bold;
|
| 325 |
-
}
|
| 326 |
-
|
| 327 |
-
.sync-scroll-content {
|
| 328 |
-
flex: 1;
|
| 329 |
-
overflow-y: auto;
|
| 330 |
-
padding: 10px;
|
| 331 |
-
background-color: #fff;
|
| 332 |
-
scroll-behavior: smooth;
|
| 333 |
-
transition: scroll-top 0.1s ease-out;
|
| 334 |
-
}
|
| 335 |
-
|
| 336 |
-
/* Prevent scroll chaining */
|
| 337 |
-
.sync-scroll-content::-webkit-scrollbar {
|
| 338 |
-
width: 8px;
|
| 339 |
-
}
|
| 340 |
-
|
| 341 |
-
.sync-scroll-content::-webkit-scrollbar-track {
|
| 342 |
-
background: #f1f1f1;
|
| 343 |
-
}
|
| 344 |
-
|
| 345 |
-
.sync-scroll-content::-webkit-scrollbar-thumb {
|
| 346 |
-
background: #888;
|
| 347 |
-
border-radius: 4px;
|
| 348 |
-
}
|
| 349 |
-
|
| 350 |
-
.sync-scroll-content::-webkit-scrollbar-thumb:hover {
|
| 351 |
-
background: #555;
|
| 352 |
-
}
|
| 353 |
</style>
|
| 354 |
-
|
| 355 |
-
<script>
|
| 356 |
-
// Improved synchronized scrolling implementation with better debugging
|
| 357 |
-
console.log('Starting sync scroll setup...');
|
| 358 |
-
|
| 359 |
-
function setupSyncScroll() {
|
| 360 |
-
console.log('setupSyncScroll called');
|
| 361 |
-
|
| 362 |
-
// Wait for elements to be available
|
| 363 |
-
setTimeout(function() {
|
| 364 |
-
console.log('Looking for scroll elements...');
|
| 365 |
-
const originalContent = document.getElementById('original-content');
|
| 366 |
-
const redactedContent = document.getElementById('redacted-content');
|
| 367 |
-
|
| 368 |
-
console.log('Original content element:', originalContent);
|
| 369 |
-
console.log('Redacted content element:', redactedContent);
|
| 370 |
-
|
| 371 |
-
if (originalContent && redactedContent) {
|
| 372 |
-
console.log('Both elements found, setting up sync...');
|
| 373 |
-
|
| 374 |
-
let isScrolling = false;
|
| 375 |
-
let scrollTimeout;
|
| 376 |
-
|
| 377 |
-
function syncScroll(source, target) {
|
| 378 |
-
if (!isScrolling) {
|
| 379 |
-
isScrolling = true;
|
| 380 |
-
console.log('Syncing scroll from', source.id, 'to', target.id, 'scrollTop:', source.scrollTop);
|
| 381 |
-
target.scrollTop = source.scrollTop;
|
| 382 |
-
|
| 383 |
-
// Clear existing timeout
|
| 384 |
-
if (scrollTimeout) {
|
| 385 |
-
clearTimeout(scrollTimeout);
|
| 386 |
-
}
|
| 387 |
-
|
| 388 |
-
// Reset flag after a short delay
|
| 389 |
-
scrollTimeout = setTimeout(() => {
|
| 390 |
-
isScrolling = false;
|
| 391 |
-
console.log('Scroll sync completed');
|
| 392 |
-
}, 100);
|
| 393 |
-
}
|
| 394 |
-
}
|
| 395 |
-
|
| 396 |
-
// Remove existing listeners to prevent duplicates
|
| 397 |
-
if (originalContent._syncScrollHandler) {
|
| 398 |
-
originalContent.removeEventListener('scroll', originalContent._syncScrollHandler);
|
| 399 |
-
}
|
| 400 |
-
if (redactedContent._syncScrollHandler) {
|
| 401 |
-
redactedContent.removeEventListener('scroll', redactedContent._syncScrollHandler);
|
| 402 |
-
}
|
| 403 |
-
|
| 404 |
-
// Create new handlers
|
| 405 |
-
originalContent._syncScrollHandler = function(e) {
|
| 406 |
-
console.log('Original content scrolled:', e.target.scrollTop);
|
| 407 |
-
syncScroll(originalContent, redactedContent);
|
| 408 |
-
};
|
| 409 |
-
|
| 410 |
-
redactedContent._syncScrollHandler = function(e) {
|
| 411 |
-
console.log('Redacted content scrolled:', e.target.scrollTop);
|
| 412 |
-
syncScroll(redactedContent, originalContent);
|
| 413 |
-
};
|
| 414 |
-
|
| 415 |
-
// Add event listeners
|
| 416 |
-
originalContent.addEventListener('scroll', originalContent._syncScrollHandler, { passive: true });
|
| 417 |
-
redactedContent.addEventListener('scroll', redactedContent._syncScrollHandler, { passive: true });
|
| 418 |
-
|
| 419 |
-
console.log('Event listeners added successfully');
|
| 420 |
-
|
| 421 |
-
// Show status indicator
|
| 422 |
-
const statusElement = document.getElementById('sync-status');
|
| 423 |
-
if (statusElement) {
|
| 424 |
-
statusElement.style.display = 'block';
|
| 425 |
-
console.log('Status indicator shown');
|
| 426 |
-
}
|
| 427 |
-
|
| 428 |
-
// Test the synchronization
|
| 429 |
-
setTimeout(() => {
|
| 430 |
-
console.log('Testing scroll sync...');
|
| 431 |
-
console.log('Original scrollTop:', originalContent.scrollTop);
|
| 432 |
-
console.log('Redacted scrollTop:', redactedContent.scrollTop);
|
| 433 |
-
|
| 434 |
-
// Try a small scroll to test
|
| 435 |
-
originalContent.scrollTop = 10;
|
| 436 |
-
setTimeout(() => {
|
| 437 |
-
console.log('After test scroll - Original:', originalContent.scrollTop, 'Redacted:', redactedContent.scrollTop);
|
| 438 |
-
}, 50);
|
| 439 |
-
}, 200);
|
| 440 |
-
|
| 441 |
-
} else {
|
| 442 |
-
console.log('Elements not found, will retry...');
|
| 443 |
-
// Retry with exponential backoff
|
| 444 |
-
setTimeout(setupSyncScroll, 300);
|
| 445 |
-
}
|
| 446 |
-
}, 200);
|
| 447 |
-
}
|
| 448 |
-
|
| 449 |
-
// Multiple initialization strategies
|
| 450 |
-
function initializeSyncScroll() {
|
| 451 |
-
console.log('Initializing sync scroll...');
|
| 452 |
-
|
| 453 |
-
// Strategy 1: Immediate setup
|
| 454 |
-
setupSyncScroll();
|
| 455 |
-
|
| 456 |
-
// Strategy 2: Setup after DOM ready
|
| 457 |
-
if (document.readyState === 'loading') {
|
| 458 |
-
document.addEventListener('DOMContentLoaded', function() {
|
| 459 |
-
console.log('DOM loaded, setting up sync scroll...');
|
| 460 |
-
setupSyncScroll();
|
| 461 |
-
});
|
| 462 |
-
}
|
| 463 |
-
|
| 464 |
-
// Strategy 3: Setup after window load
|
| 465 |
-
window.addEventListener('load', function() {
|
| 466 |
-
console.log('Window loaded, setting up sync scroll...');
|
| 467 |
-
setupSyncScroll();
|
| 468 |
-
});
|
| 469 |
-
|
| 470 |
-
// Strategy 4: Periodic retry for first 10 seconds
|
| 471 |
-
let attempts = 0;
|
| 472 |
-
const maxAttempts = 20;
|
| 473 |
-
const retryInterval = setInterval(function() {
|
| 474 |
-
attempts++;
|
| 475 |
-
console.log('Retry attempt', attempts);
|
| 476 |
-
|
| 477 |
-
const originalContent = document.getElementById('original-content');
|
| 478 |
-
const redactedContent = document.getElementById('redacted-content');
|
| 479 |
-
|
| 480 |
-
if (originalContent && redactedContent) {
|
| 481 |
-
console.log('Elements found on retry, setting up...');
|
| 482 |
-
setupSyncScroll();
|
| 483 |
-
clearInterval(retryInterval);
|
| 484 |
-
} else if (attempts >= maxAttempts) {
|
| 485 |
-
console.log('Max retry attempts reached, giving up');
|
| 486 |
-
clearInterval(retryInterval);
|
| 487 |
-
}
|
| 488 |
-
}, 500);
|
| 489 |
-
}
|
| 490 |
-
|
| 491 |
-
// Start initialization
|
| 492 |
-
initializeSyncScroll();
|
| 493 |
-
|
| 494 |
-
// Listen for Streamlit-specific events
|
| 495 |
-
if (window.parent && window.parent.postMessage) {
|
| 496 |
-
console.log('Streamlit environment detected');
|
| 497 |
-
|
| 498 |
-
// Listen for any messages that might indicate a rerun
|
| 499 |
-
window.addEventListener('message', function(event) {
|
| 500 |
-
console.log('Received message:', event.data);
|
| 501 |
-
if (event.data && (event.data.type === 'streamlit:rerun' || event.data.type === 'streamlit:setComponentValue')) {
|
| 502 |
-
console.log('Streamlit rerun detected, reinitializing sync scroll...');
|
| 503 |
-
setTimeout(setupSyncScroll, 1000);
|
| 504 |
-
}
|
| 505 |
-
});
|
| 506 |
-
}
|
| 507 |
-
|
| 508 |
-
console.log('Sync scroll script loaded');
|
| 509 |
-
</script>
|
| 510 |
""", unsafe_allow_html=True)
|
| 511 |
|
| 512 |
# Configure root logger only once (avoid duplicate handlers on reruns)
|
|
@@ -528,6 +331,7 @@ Use the buttons below to view the original structure or process with redaction.
|
|
| 528 |
if st.button("🧹 Clear All Data", type="secondary", help="Remove all temporary files and reset the application"):
|
| 529 |
if clear_all_data():
|
| 530 |
st.success("✅ All data cleared successfully! The application has been reset.")
|
|
|
|
| 531 |
st.rerun()
|
| 532 |
else:
|
| 533 |
st.error("❌ Error clearing data. Please try again.")
|
|
@@ -578,6 +382,32 @@ with col1:
|
|
| 578 |
# Show warning if total size is large
|
| 579 |
if total_size > 50 * 1024 * 1024: # 50MB
|
| 580 |
st.warning("⚠️ Large temporary files detected. Consider clearing data to free up space.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
else:
|
| 582 |
st.caption("📁 No temporary files")
|
| 583 |
|
|
@@ -593,72 +423,6 @@ with col2:
|
|
| 593 |
else:
|
| 594 |
st.caption("No files to delete")
|
| 595 |
|
| 596 |
-
def create_diff_content(original_text: str, redacted_text: str, view_type: str) -> str:
|
| 597 |
-
"""Create HTML content for diff view with highlighting."""
|
| 598 |
-
import difflib
|
| 599 |
-
import re
|
| 600 |
-
|
| 601 |
-
# Normalize the text to reduce formatting differences
|
| 602 |
-
def normalize_text(text):
|
| 603 |
-
# Remove extra whitespace and normalize line endings
|
| 604 |
-
lines = text.split('\n')
|
| 605 |
-
normalized_lines = []
|
| 606 |
-
for line in lines:
|
| 607 |
-
# Strip whitespace but preserve content
|
| 608 |
-
stripped = line.strip()
|
| 609 |
-
if stripped:
|
| 610 |
-
# Normalize header formatting differences
|
| 611 |
-
# Convert ## to # for level 1 headers
|
| 612 |
-
if re.match(r'^##\s+', stripped):
|
| 613 |
-
stripped = re.sub(r'^##\s+', '# ', stripped)
|
| 614 |
-
# Normalize quote formatting
|
| 615 |
-
if stripped.startswith('> '):
|
| 616 |
-
stripped = stripped.replace('> ', '> ')
|
| 617 |
-
elif stripped.startswith('+ > '):
|
| 618 |
-
stripped = stripped.replace('+ > ', '> ')
|
| 619 |
-
|
| 620 |
-
normalized_lines.append(stripped)
|
| 621 |
-
return normalized_lines
|
| 622 |
-
|
| 623 |
-
original_lines = normalize_text(original_text)
|
| 624 |
-
redacted_lines = normalize_text(redacted_text)
|
| 625 |
-
|
| 626 |
-
# Use difflib to get a more sophisticated diff
|
| 627 |
-
differ = difflib.Differ()
|
| 628 |
-
diff = list(differ.compare(original_lines, redacted_lines))
|
| 629 |
-
|
| 630 |
-
html_lines = []
|
| 631 |
-
|
| 632 |
-
if view_type == 'original':
|
| 633 |
-
# Show original with removed content highlighted
|
| 634 |
-
for line in diff:
|
| 635 |
-
if line.startswith(' '): # Unchanged line
|
| 636 |
-
escaped_line = html.escape(line[2:])
|
| 637 |
-
html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
|
| 638 |
-
elif line.startswith('- '): # Removed line
|
| 639 |
-
escaped_line = html.escape(line[2:])
|
| 640 |
-
html_lines.append(f'<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-weight: bold;">- {escaped_line}</div>')
|
| 641 |
-
elif line.startswith('+ '): # Added line (show as empty space in original view)
|
| 642 |
-
html_lines.append(f'<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-style: italic; opacity: 0.7;">+ (added in redacted version)</div>')
|
| 643 |
-
elif line.startswith('? '): # Ignore difflib hints
|
| 644 |
-
continue
|
| 645 |
-
|
| 646 |
-
elif view_type == 'redacted':
|
| 647 |
-
# Show redacted content with added content highlighted
|
| 648 |
-
for line in diff:
|
| 649 |
-
if line.startswith(' '): # Unchanged line
|
| 650 |
-
escaped_line = html.escape(line[2:])
|
| 651 |
-
html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
|
| 652 |
-
elif line.startswith('- '): # Removed line (show as empty space in redacted view)
|
| 653 |
-
html_lines.append(f'<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-style: italic; opacity: 0.7;">- (removed from original)</div>')
|
| 654 |
-
elif line.startswith('+ '): # Added line
|
| 655 |
-
escaped_line = html.escape(line[2:])
|
| 656 |
-
html_lines.append(f'<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-weight: bold;">+ {escaped_line}</div>')
|
| 657 |
-
elif line.startswith('? '): # Ignore difflib hints
|
| 658 |
-
continue
|
| 659 |
-
|
| 660 |
-
return '\n'.join(html_lines)
|
| 661 |
-
|
| 662 |
if uploaded_files:
|
| 663 |
# UI to select which file to work with (if multiple files uploaded)
|
| 664 |
file_names = [f.name for f in uploaded_files]
|
|
@@ -698,32 +462,42 @@ if uploaded_files:
|
|
| 698 |
# Save uploaded file to a temporary location
|
| 699 |
temp_path = save_uploaded_file(uploaded_file, selected_file)
|
| 700 |
|
| 701 |
-
#
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
processor = DocumentProcessor(section_extractor=section_extractor)
|
| 709 |
|
| 710 |
# Attach an in-memory log handler to capture logs for this file
|
| 711 |
log_handler, log_buffer = get_log_handler()
|
| 712 |
root_logger = logging.getLogger()
|
| 713 |
root_logger.addHandler(log_handler)
|
|
|
|
| 714 |
try:
|
| 715 |
-
# Process the document
|
| 716 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 717 |
finally:
|
| 718 |
# Remove handler and stop capturing logs
|
| 719 |
root_logger.removeHandler(log_handler)
|
| 720 |
|
| 721 |
-
# Save results in session state
|
| 722 |
-
st.session_state.processed_results[selected_file] = {
|
| 723 |
-
"structured_json": result.structured_json,
|
| 724 |
-
"redacted_md": result.redacted_markdown,
|
| 725 |
-
"redacted_json": result.redacted_json
|
| 726 |
-
}
|
| 727 |
# Combine log records into a single text
|
| 728 |
log_text = "\n".join(log_buffer)
|
| 729 |
st.session_state.logs[selected_file] = log_text
|
|
@@ -807,25 +581,7 @@ if uploaded_files:
|
|
| 807 |
structured_json = data["structured_json"]
|
| 808 |
redacted_md = data["redacted_md"]
|
| 809 |
redacted_json = data["redacted_json"]
|
| 810 |
-
|
| 811 |
-
# Get the original markdown from the structured JSON
|
| 812 |
-
# We need to reconstruct the original markdown from the structured JSON
|
| 813 |
-
# For now, we'll use the structured_markdown from the DocumentResult
|
| 814 |
-
# But we need to store this in the session state
|
| 815 |
-
|
| 816 |
-
# Create a DocumentProcessor to get the original markdown
|
| 817 |
-
if "original_markdown" not in st.session_state.processed_results[selected_file]:
|
| 818 |
-
# Save uploaded file to a temporary location
|
| 819 |
-
temp_path = save_uploaded_file(uploaded_file, selected_file)
|
| 820 |
-
|
| 821 |
-
# Create a DocumentProcessor without section extraction to get original markdown
|
| 822 |
-
processor = DocumentProcessor(section_extractor=None)
|
| 823 |
-
result = processor.process(temp_path)
|
| 824 |
-
|
| 825 |
-
# Store the original markdown
|
| 826 |
-
st.session_state.processed_results[selected_file]["original_markdown"] = result.structured_markdown
|
| 827 |
-
|
| 828 |
-
original_md = st.session_state.processed_results[selected_file]["original_markdown"]
|
| 829 |
|
| 830 |
# Show processing summary
|
| 831 |
original_texts = structured_json.get("texts", [])
|
|
@@ -844,67 +600,230 @@ if uploaded_files:
|
|
| 844 |
st.subheader("Original vs Redacted Content")
|
| 845 |
st.caption("Compare the original document content with the redacted version")
|
| 846 |
|
| 847 |
-
#
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
""", unsafe_allow_html=True)
|
| 853 |
|
| 854 |
-
# Create a
|
| 855 |
-
|
| 856 |
-
<div class="sync-scroll-container">
|
| 857 |
-
<div class="sync-scroll-panel">
|
| 858 |
-
<div class="sync-scroll-header">
|
| 859 |
-
📋 Original Document
|
| 860 |
-
</div>
|
| 861 |
-
<div id="original-content" class="sync-scroll-content">
|
| 862 |
-
{create_diff_content(original_md, redacted_md, 'original')}
|
| 863 |
-
</div>
|
| 864 |
-
</div>
|
| 865 |
-
<div class="sync-scroll-panel">
|
| 866 |
-
<div class="sync-scroll-header">
|
| 867 |
-
🔒 Redacted Document
|
| 868 |
-
</div>
|
| 869 |
-
<div id="redacted-content" class="sync-scroll-content">
|
| 870 |
-
{create_diff_content(original_md, redacted_md, 'redacted')}
|
| 871 |
-
</div>
|
| 872 |
-
</div>
|
| 873 |
-
</div>
|
| 874 |
-
"""
|
| 875 |
|
| 876 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 877 |
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 890 |
|
| 891 |
-
|
| 892 |
-
# Add legend for the diff highlighting
|
| 893 |
st.markdown("---")
|
| 894 |
col1, col2 = st.columns(2)
|
| 895 |
with col1:
|
| 896 |
-
st.markdown("**🎨
|
| 897 |
st.markdown("🔴 **Red background** = Removed content")
|
| 898 |
-
st.markdown("
|
| 899 |
-
st.markdown("
|
| 900 |
|
| 901 |
with col2:
|
| 902 |
-
st.markdown("**💡
|
| 903 |
-
st.markdown("
|
| 904 |
-
st.markdown("
|
| 905 |
-
st.markdown("
|
| 906 |
|
| 907 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 908 |
|
| 909 |
with tab2:
|
| 910 |
st.subheader("Document Structure Analysis")
|
|
@@ -922,19 +841,139 @@ if uploaded_files:
|
|
| 922 |
with tab3:
|
| 923 |
st.subheader("Processing Details")
|
| 924 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 925 |
# Show what was removed
|
| 926 |
if removed_count > 0:
|
| 927 |
st.info(f"**Removed {removed_count} text elements from the document structure.**")
|
| 928 |
|
| 929 |
-
# Show the removed text elements
|
| 930 |
st.subheader("Removed Text Elements:")
|
| 931 |
-
removed_texts = []
|
| 932 |
-
for i, text_elem in enumerate(original_texts):
|
| 933 |
-
if i >= len(redacted_texts) or text_elem.get("text", "") != redacted_texts[i].get("text", ""):
|
| 934 |
-
removed_texts.append((i, text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")))
|
| 935 |
|
| 936 |
-
|
| 937 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 938 |
else:
|
| 939 |
st.info("No text elements were removed during processing.")
|
| 940 |
|
|
|
|
| 3 |
|
| 4 |
import os
|
| 5 |
import tempfile
|
| 6 |
+
import json
|
| 7 |
+
from datetime import datetime
|
| 8 |
|
| 9 |
# Get a writable temp directory first
|
| 10 |
try:
|
|
|
|
| 93 |
os.environ['ACCELERATE_CACHE'],
|
| 94 |
]
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
for directory in directories_to_create:
|
| 97 |
try:
|
| 98 |
+
# Create directory and all parent directories
|
| 99 |
+
os.makedirs(directory, mode=0o777, exist_ok=True)
|
| 100 |
+
# Ensure the directory has write permissions
|
| 101 |
+
os.chmod(directory, 0o777)
|
| 102 |
except Exception as e:
|
| 103 |
print(f"Warning: Could not create directory {directory}: {e}")
|
| 104 |
|
|
|
|
| 109 |
from processing.document_processor import DocumentProcessor
|
| 110 |
from processing.sections import ReasoningSectionExtractor
|
| 111 |
from utils.logging_utils import get_log_handler
|
| 112 |
+
from utils.cost_tracker import cost_tracker
|
| 113 |
from dotenv import load_dotenv
|
| 114 |
import sys
|
|
|
|
| 115 |
import difflib
|
|
|
|
| 116 |
import time
|
| 117 |
|
| 118 |
# Configure logging early to avoid issues
|
|
|
|
| 199 |
|
| 200 |
files = os.listdir(TEMP_DIR)
|
| 201 |
total_size = 0
|
| 202 |
+
file_details = []
|
| 203 |
|
| 204 |
for filename in files:
|
| 205 |
try:
|
| 206 |
file_path = os.path.join(TEMP_DIR, filename)
|
| 207 |
if os.path.isfile(file_path):
|
| 208 |
+
file_size = os.path.getsize(file_path)
|
| 209 |
+
total_size += file_size
|
| 210 |
+
file_details.append({
|
| 211 |
+
'name': filename,
|
| 212 |
+
'size': file_size,
|
| 213 |
+
'type': 'file'
|
| 214 |
+
})
|
| 215 |
+
elif os.path.isdir(file_path):
|
| 216 |
+
file_details.append({
|
| 217 |
+
'name': filename,
|
| 218 |
+
'size': 0,
|
| 219 |
+
'type': 'directory'
|
| 220 |
+
})
|
| 221 |
except (PermissionError, OSError) as e:
|
| 222 |
logging.warning(f"Error accessing file {filename}: {e}")
|
| 223 |
+
file_details.append({
|
| 224 |
+
'name': filename,
|
| 225 |
+
'size': 0,
|
| 226 |
+
'type': 'error'
|
| 227 |
+
})
|
| 228 |
continue
|
| 229 |
|
| 230 |
+
# Log detailed information for debugging
|
| 231 |
+
if file_details:
|
| 232 |
+
logging.info(f"Temp directory contents ({TEMP_DIR}):")
|
| 233 |
+
for detail in file_details:
|
| 234 |
+
logging.info(f" - {detail['name']} ({detail['type']}): {detail['size']} bytes")
|
| 235 |
+
|
| 236 |
return len(files), total_size
|
| 237 |
except PermissionError as e:
|
| 238 |
logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
|
|
|
|
| 309 |
border-radius: 10px;
|
| 310 |
border: 1px solid #e9ecef;
|
| 311 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
</style>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
""", unsafe_allow_html=True)
|
| 314 |
|
| 315 |
# Configure root logger only once (avoid duplicate handlers on reruns)
|
|
|
|
| 331 |
if st.button("🧹 Clear All Data", type="secondary", help="Remove all temporary files and reset the application"):
|
| 332 |
if clear_all_data():
|
| 333 |
st.success("✅ All data cleared successfully! The application has been reset.")
|
| 334 |
+
cost_tracker.reset_session() # Reset cost tracking when clearing data
|
| 335 |
st.rerun()
|
| 336 |
else:
|
| 337 |
st.error("❌ Error clearing data. Please try again.")
|
|
|
|
| 382 |
# Show warning if total size is large
|
| 383 |
if total_size > 50 * 1024 * 1024: # 50MB
|
| 384 |
st.warning("⚠️ Large temporary files detected. Consider clearing data to free up space.")
|
| 385 |
+
|
| 386 |
+
# Debug: Show temp files (expandable)
|
| 387 |
+
with st.expander("🔍 Debug: View temporary files"):
|
| 388 |
+
try:
|
| 389 |
+
if os.path.exists(TEMP_DIR):
|
| 390 |
+
files = os.listdir(TEMP_DIR)
|
| 391 |
+
if files:
|
| 392 |
+
st.write("**Temporary files in directory:**")
|
| 393 |
+
for filename in files:
|
| 394 |
+
file_path = os.path.join(TEMP_DIR, filename)
|
| 395 |
+
try:
|
| 396 |
+
if os.path.isfile(file_path):
|
| 397 |
+
size = os.path.getsize(file_path)
|
| 398 |
+
st.write(f"📄 {filename} ({format_file_size(size)})")
|
| 399 |
+
elif os.path.isdir(file_path):
|
| 400 |
+
st.write(f"📁 {filename} (directory)")
|
| 401 |
+
else:
|
| 402 |
+
st.write(f"❓ {filename} (unknown)")
|
| 403 |
+
except Exception as e:
|
| 404 |
+
st.write(f"❌ {filename} (error: {e})")
|
| 405 |
+
else:
|
| 406 |
+
st.write("No files found in temp directory")
|
| 407 |
+
else:
|
| 408 |
+
st.write("Temp directory does not exist")
|
| 409 |
+
except Exception as e:
|
| 410 |
+
st.write(f"Error accessing temp directory: {e}")
|
| 411 |
else:
|
| 412 |
st.caption("📁 No temporary files")
|
| 413 |
|
|
|
|
| 423 |
else:
|
| 424 |
st.caption("No files to delete")
|
| 425 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 426 |
if uploaded_files:
|
| 427 |
# UI to select which file to work with (if multiple files uploaded)
|
| 428 |
file_names = [f.name for f in uploaded_files]
|
|
|
|
| 462 |
# Save uploaded file to a temporary location
|
| 463 |
temp_path = save_uploaded_file(uploaded_file, selected_file)
|
| 464 |
|
| 465 |
+
# Ensure the deployment name is in the cost tracker
|
| 466 |
+
if AZURE_OPENAI_DEPLOYMENT and AZURE_OPENAI_DEPLOYMENT not in cost_tracker.get_available_models():
|
| 467 |
+
model_type = cost_tracker.guess_model_type(AZURE_OPENAI_DEPLOYMENT)
|
| 468 |
+
cost_tracker.add_deployment_pricing(AZURE_OPENAI_DEPLOYMENT, model_type)
|
| 469 |
+
|
| 470 |
+
# Use the new processing function
|
| 471 |
+
from processing.document_processor import process_document_with_redaction
|
|
|
|
| 472 |
|
| 473 |
# Attach an in-memory log handler to capture logs for this file
|
| 474 |
log_handler, log_buffer = get_log_handler()
|
| 475 |
root_logger = logging.getLogger()
|
| 476 |
root_logger.addHandler(log_handler)
|
| 477 |
+
|
| 478 |
try:
|
| 479 |
+
# Process the document using the new function
|
| 480 |
+
processing_result = process_document_with_redaction(
|
| 481 |
+
file_path=temp_path,
|
| 482 |
+
endpoint=AZURE_OPENAI_ENDPOINT,
|
| 483 |
+
api_key=AZURE_OPENAI_KEY,
|
| 484 |
+
api_version=AZURE_OPENAI_VERSION,
|
| 485 |
+
deployment=AZURE_OPENAI_DEPLOYMENT,
|
| 486 |
+
)
|
| 487 |
+
|
| 488 |
+
# Save results in session state (maintaining compatibility with existing UI)
|
| 489 |
+
st.session_state.processed_results[selected_file] = {
|
| 490 |
+
"structured_json": processing_result.original_document_json,
|
| 491 |
+
"redacted_md": processing_result.redacted_document_md,
|
| 492 |
+
"redacted_json": processing_result.redacted_document_json, # Now this is actually redacted!
|
| 493 |
+
"original_markdown": processing_result.original_document_md,
|
| 494 |
+
"processing_result": processing_result # Store the new result
|
| 495 |
+
}
|
| 496 |
+
|
| 497 |
finally:
|
| 498 |
# Remove handler and stop capturing logs
|
| 499 |
root_logger.removeHandler(log_handler)
|
| 500 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
# Combine log records into a single text
|
| 502 |
log_text = "\n".join(log_buffer)
|
| 503 |
st.session_state.logs[selected_file] = log_text
|
|
|
|
| 581 |
structured_json = data["structured_json"]
|
| 582 |
redacted_md = data["redacted_md"]
|
| 583 |
redacted_json = data["redacted_json"]
|
| 584 |
+
original_md = data["original_markdown"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 585 |
|
| 586 |
# Show processing summary
|
| 587 |
original_texts = structured_json.get("texts", [])
|
|
|
|
| 600 |
st.subheader("Original vs Redacted Content")
|
| 601 |
st.caption("Compare the original document content with the redacted version")
|
| 602 |
|
| 603 |
+
# Get the actual removed indices from the processing result
|
| 604 |
+
actual_removed_indices = []
|
| 605 |
+
if "processing_result" in st.session_state.processed_results[selected_file]:
|
| 606 |
+
processing_result = st.session_state.processed_results[selected_file]["processing_result"]
|
| 607 |
+
actual_removed_indices = processing_result.removed_indices
|
|
|
|
| 608 |
|
| 609 |
+
# Create a more intelligent side-by-side comparison based on JSON structure
|
| 610 |
+
col1, col2 = st.columns(2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
|
| 612 |
+
with col1:
|
| 613 |
+
st.markdown("**📋 Original Document**")
|
| 614 |
+
|
| 615 |
+
# Display original content with removed sections highlighted
|
| 616 |
+
for i, text_elem in enumerate(original_texts):
|
| 617 |
+
text_content = text_elem.get("text", "")
|
| 618 |
+
label = text_elem.get("label", "")
|
| 619 |
+
|
| 620 |
+
# Check if this element was removed
|
| 621 |
+
is_removed = i in actual_removed_indices
|
| 622 |
+
|
| 623 |
+
if is_removed:
|
| 624 |
+
# Highlight removed content in red
|
| 625 |
+
st.markdown(f"""
|
| 626 |
+
<div style="background-color: #ffebee; color: #c62828; padding: 8px; margin: 4px 0; border-left: 4px solid #f44336; border-radius: 4px;">
|
| 627 |
+
<strong>Text {i} ({label}) - REMOVED:</strong><br>
|
| 628 |
+
{text_content}
|
| 629 |
+
</div>
|
| 630 |
+
""", unsafe_allow_html=True)
|
| 631 |
+
else:
|
| 632 |
+
# Show normal content
|
| 633 |
+
content_preview = text_content[:150] + "..." if len(text_content) > 150 else text_content
|
| 634 |
+
st.markdown(f"""
|
| 635 |
+
<div style="padding: 4px; margin: 2px 0; border-radius: 4px;">
|
| 636 |
+
<strong>Text {i} ({label}) - {len(text_content)} chars:</strong><br>
|
| 637 |
+
<code style="background-color: #f5f5f5; padding: 2px; border-radius: 2px;">{content_preview}</code>
|
| 638 |
+
</div>
|
| 639 |
+
""", unsafe_allow_html=True)
|
| 640 |
|
| 641 |
+
with col2:
|
| 642 |
+
st.markdown("**🔒 Redacted Document**")
|
| 643 |
+
|
| 644 |
+
# Display redacted content (only non-removed elements)
|
| 645 |
+
redacted_index = 0
|
| 646 |
+
for i, text_elem in enumerate(original_texts):
|
| 647 |
+
text_content = text_elem.get("text", "")
|
| 648 |
+
label = text_elem.get("label", "")
|
| 649 |
+
|
| 650 |
+
# Check if this element was removed
|
| 651 |
+
is_removed = i in actual_removed_indices
|
| 652 |
+
|
| 653 |
+
if is_removed:
|
| 654 |
+
# Show placeholder for removed content
|
| 655 |
+
st.markdown(f"""
|
| 656 |
+
<div style="background-color: #ffebee; color: #c62828; padding: 8px; margin: 4px 0; border-left: 4px solid #f44336; border-radius: 4px; font-style: italic; opacity: 0.7;">
|
| 657 |
+
<strong>Text {i} ({label}) - REMOVED</strong><br>
|
| 658 |
+
[Content removed by redaction]
|
| 659 |
+
</div>
|
| 660 |
+
""", unsafe_allow_html=True)
|
| 661 |
+
else:
|
| 662 |
+
# Show the actual content from redacted texts
|
| 663 |
+
if redacted_index < len(redacted_texts):
|
| 664 |
+
redacted_content = redacted_texts[redacted_index].get("text", "")
|
| 665 |
+
content_preview = redacted_content[:150] + "..." if len(redacted_content) > 150 else redacted_content
|
| 666 |
+
st.markdown(f"""
|
| 667 |
+
<div style="padding: 4px; margin: 2px 0; border-radius: 4px;">
|
| 668 |
+
<strong>Text {i} ({label}) - {len(redacted_content)} chars:</strong><br>
|
| 669 |
+
<code style="background-color: #f5f5f5; padding: 2px; border-radius: 2px;">{content_preview}</code>
|
| 670 |
+
</div>
|
| 671 |
+
""", unsafe_allow_html=True)
|
| 672 |
+
redacted_index += 1
|
| 673 |
+
else:
|
| 674 |
+
st.markdown(f"""
|
| 675 |
+
<div style="padding: 4px; margin: 2px 0; border-radius: 4px; background-color: #f5f5f5;">
|
| 676 |
+
<strong>Text {i} ({label}):</strong><br>
|
| 677 |
+
[Content preserved]
|
| 678 |
+
</div>
|
| 679 |
+
""", unsafe_allow_html=True)
|
| 680 |
|
| 681 |
+
# Add legend
|
|
|
|
| 682 |
st.markdown("---")
|
| 683 |
col1, col2 = st.columns(2)
|
| 684 |
with col1:
|
| 685 |
+
st.markdown("**🎨 Comparison Legend:**")
|
| 686 |
st.markdown("🔴 **Red background** = Removed content")
|
| 687 |
+
st.markdown("⚪ **White background** = Preserved content")
|
| 688 |
+
st.markdown("📝 **Italic text** = Placeholder for removed content")
|
| 689 |
|
| 690 |
with col2:
|
| 691 |
+
st.markdown("**💡 How to read:**")
|
| 692 |
+
st.markdown("Left panel shows original with removed sections highlighted")
|
| 693 |
+
st.markdown("Right panel shows redacted version with placeholders")
|
| 694 |
+
st.markdown("Compare corresponding text indices to see changes")
|
| 695 |
|
| 696 |
+
# Add debug information to help identify missing content
|
| 697 |
+
with st.expander("🔍 Debug: Content Analysis"):
|
| 698 |
+
st.write("**Searching for table content...**")
|
| 699 |
+
|
| 700 |
+
# Search for table-related content in original texts
|
| 701 |
+
table_elements = []
|
| 702 |
+
for i, text_elem in enumerate(original_texts):
|
| 703 |
+
text_content = text_elem.get("text", "")
|
| 704 |
+
label = text_elem.get("label", "")
|
| 705 |
+
|
| 706 |
+
if "Bespreking" in text_content or "|" in text_content or "table" in label.lower():
|
| 707 |
+
table_elements.append({
|
| 708 |
+
"index": i,
|
| 709 |
+
"label": label,
|
| 710 |
+
"content": text_content[:200] + "..." if len(text_content) > 200 else text_content,
|
| 711 |
+
"is_removed": i in actual_removed_indices
|
| 712 |
+
})
|
| 713 |
+
|
| 714 |
+
if table_elements:
|
| 715 |
+
st.write(f"**Found {len(table_elements)} table-related elements:**")
|
| 716 |
+
for elem in table_elements:
|
| 717 |
+
status = "🔴 REMOVED" if elem["is_removed"] else "✅ PRESERVED"
|
| 718 |
+
st.write(f"**Text {elem['index']} ({elem['label']}) - {status}:**")
|
| 719 |
+
st.write(f"`{elem['content']}`")
|
| 720 |
+
st.write("---")
|
| 721 |
+
else:
|
| 722 |
+
st.write("**No table-related content found in original texts**")
|
| 723 |
+
|
| 724 |
+
# Also check redacted texts
|
| 725 |
+
st.write("**Table content in redacted texts:**")
|
| 726 |
+
table_elements_redacted = []
|
| 727 |
+
for i, text_elem in enumerate(redacted_texts):
|
| 728 |
+
text_content = text_elem.get("text", "")
|
| 729 |
+
label = text_elem.get("label", "")
|
| 730 |
+
|
| 731 |
+
if "Bespreking" in text_content or "|" in text_content or "table" in label.lower():
|
| 732 |
+
table_elements_redacted.append({
|
| 733 |
+
"index": i,
|
| 734 |
+
"label": label,
|
| 735 |
+
"content": text_content[:200] + "..." if len(text_content) > 200 else text_content
|
| 736 |
+
})
|
| 737 |
+
|
| 738 |
+
if table_elements_redacted:
|
| 739 |
+
st.write(f"**Found {len(table_elements_redacted)} table-related elements in redacted content:**")
|
| 740 |
+
for elem in table_elements_redacted:
|
| 741 |
+
st.write(f"**Text {elem['index']} ({elem['label']}):**")
|
| 742 |
+
st.write(f"`{elem['content']}`")
|
| 743 |
+
st.write("---")
|
| 744 |
+
else:
|
| 745 |
+
st.write("**No table-related content found in redacted texts**")
|
| 746 |
+
|
| 747 |
+
# Add download buttons for redacted content
|
| 748 |
+
st.markdown("---")
|
| 749 |
+
st.subheader("📥 Download Redacted Content")
|
| 750 |
+
|
| 751 |
+
col1, col2, col3 = st.columns(3)
|
| 752 |
+
|
| 753 |
+
with col1:
|
| 754 |
+
# Download redacted markdown
|
| 755 |
+
st.download_button(
|
| 756 |
+
label="📄 Download Redacted Markdown",
|
| 757 |
+
data=redacted_md,
|
| 758 |
+
file_name=f"{selected_file}_redacted.md",
|
| 759 |
+
mime="text/markdown",
|
| 760 |
+
help="Download the redacted document as Markdown format"
|
| 761 |
+
)
|
| 762 |
+
|
| 763 |
+
with col2:
|
| 764 |
+
# Generate and download redacted PDF
|
| 765 |
+
pdf_generated = False
|
| 766 |
+
pdf_bytes = None
|
| 767 |
+
|
| 768 |
+
if st.button("📋 Generate Redacted PDF", help="Generate a PDF version of the redacted document"):
|
| 769 |
+
with st.spinner("Generating redacted PDF..."):
|
| 770 |
+
try:
|
| 771 |
+
# Create a DocumentProcessor to access PDF generation
|
| 772 |
+
temp_path = save_uploaded_file(uploaded_file, selected_file)
|
| 773 |
+
processor = DocumentProcessor(section_extractor=None)
|
| 774 |
+
|
| 775 |
+
# Generate PDF path
|
| 776 |
+
base_name = os.path.splitext(selected_file)[0]
|
| 777 |
+
pdf_path = os.path.join(TEMP_DIR, f"{base_name}_redacted.pdf")
|
| 778 |
+
|
| 779 |
+
# Generate the PDF
|
| 780 |
+
success = processor.generate_redacted_pdf(redacted_json, pdf_path)
|
| 781 |
+
|
| 782 |
+
if success:
|
| 783 |
+
# Read the generated PDF and store for download
|
| 784 |
+
with open(pdf_path, "rb") as pdf_file:
|
| 785 |
+
pdf_bytes = pdf_file.read()
|
| 786 |
+
pdf_generated = True
|
| 787 |
+
st.success("✅ PDF generated successfully!")
|
| 788 |
+
else:
|
| 789 |
+
st.error("❌ Failed to generate PDF. Check logs for details.")
|
| 790 |
+
|
| 791 |
+
except Exception as e:
|
| 792 |
+
st.error(f"❌ Error generating PDF: {e}")
|
| 793 |
+
st.info("💡 Make sure reportlab is installed: `pip install reportlab`")
|
| 794 |
+
|
| 795 |
+
# Show download button if PDF was generated
|
| 796 |
+
if pdf_generated and pdf_bytes:
|
| 797 |
+
st.download_button(
|
| 798 |
+
label="📥 Download Redacted PDF",
|
| 799 |
+
data=pdf_bytes,
|
| 800 |
+
file_name=f"{os.path.splitext(selected_file)[0]}_redacted.pdf",
|
| 801 |
+
mime="application/pdf",
|
| 802 |
+
help="Download the redacted document as PDF"
|
| 803 |
+
)
|
| 804 |
+
|
| 805 |
+
# Show debug information about what's in the PDF
|
| 806 |
+
with st.expander("🔍 Debug: PDF Content Analysis"):
|
| 807 |
+
st.write("**Content that will be included in the PDF:**")
|
| 808 |
+
texts_in_pdf = redacted_json.get("texts", [])
|
| 809 |
+
st.write(f"Total text elements: {len(texts_in_pdf)}")
|
| 810 |
+
|
| 811 |
+
for i, text_elem in enumerate(texts_in_pdf):
|
| 812 |
+
text_content = text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")
|
| 813 |
+
label = text_elem.get("label", "")
|
| 814 |
+
st.write(f"**Text {i} ({label}):** {text_content}")
|
| 815 |
+
elif not pdf_generated:
|
| 816 |
+
st.info("💡 Click 'Generate Redacted PDF' to create a PDF version")
|
| 817 |
+
|
| 818 |
+
with col3:
|
| 819 |
+
# Download redacted JSON structure
|
| 820 |
+
st.download_button(
|
| 821 |
+
label="🔧 Download Redacted JSON",
|
| 822 |
+
data=json.dumps(redacted_json, indent=2, ensure_ascii=False),
|
| 823 |
+
file_name=f"{selected_file}_redacted.json",
|
| 824 |
+
mime="application/json",
|
| 825 |
+
help="Download the redacted document structure as JSON"
|
| 826 |
+
)
|
| 827 |
|
| 828 |
with tab2:
|
| 829 |
st.subheader("Document Structure Analysis")
|
|
|
|
| 841 |
with tab3:
|
| 842 |
st.subheader("Processing Details")
|
| 843 |
|
| 844 |
+
# Show cost analysis for this processing session
|
| 845 |
+
st.subheader("💰 Cost Analysis")
|
| 846 |
+
|
| 847 |
+
# Get cost data from the processing result
|
| 848 |
+
if "processing_result" in st.session_state.processed_results[selected_file]:
|
| 849 |
+
processing_result = st.session_state.processed_results[selected_file]["processing_result"]
|
| 850 |
+
|
| 851 |
+
col1, col2, col3 = st.columns(3)
|
| 852 |
+
with col1:
|
| 853 |
+
st.metric("Total Cost", f"${processing_result.cost:.4f}")
|
| 854 |
+
with col2:
|
| 855 |
+
st.metric("Input Tokens", f"{processing_result.input_tokens:,}")
|
| 856 |
+
with col3:
|
| 857 |
+
st.metric("Output Tokens", f"{processing_result.output_tokens:,}")
|
| 858 |
+
|
| 859 |
+
# Add download button for cost report
|
| 860 |
+
cost_report = {
|
| 861 |
+
"timestamp": datetime.now().isoformat(),
|
| 862 |
+
"total_cost": processing_result.cost,
|
| 863 |
+
"input_tokens": processing_result.input_tokens,
|
| 864 |
+
"output_tokens": processing_result.output_tokens,
|
| 865 |
+
"total_tokens": processing_result.input_tokens + processing_result.output_tokens,
|
| 866 |
+
"document_processed": selected_file,
|
| 867 |
+
"model_used": AZURE_OPENAI_DEPLOYMENT
|
| 868 |
+
}
|
| 869 |
+
|
| 870 |
+
st.download_button(
|
| 871 |
+
label="📥 Download Cost Report (JSON)",
|
| 872 |
+
data=json.dumps(cost_report, indent=2),
|
| 873 |
+
file_name=f"cost_report_{selected_file}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
|
| 874 |
+
mime="application/json"
|
| 875 |
+
)
|
| 876 |
+
|
| 877 |
+
# Show model information
|
| 878 |
+
model_info = cost_tracker.get_model_info(AZURE_OPENAI_DEPLOYMENT)
|
| 879 |
+
if model_info:
|
| 880 |
+
st.subheader("Model Information")
|
| 881 |
+
st.write(f"**Model:** {model_info.description}")
|
| 882 |
+
st.write(f"**Input cost:** ${model_info.input_cost_per_1k_tokens:.4f}/1K tokens")
|
| 883 |
+
st.write(f"**Output cost:** ${model_info.output_cost_per_1k_tokens:.4f}/1K tokens")
|
| 884 |
+
|
| 885 |
+
# Calculate cost breakdown
|
| 886 |
+
input_cost = (processing_result.input_tokens / 1000) * model_info.input_cost_per_1k_tokens
|
| 887 |
+
output_cost = (processing_result.output_tokens / 1000) * model_info.output_cost_per_1k_tokens
|
| 888 |
+
st.write(f"**Cost breakdown:** Input: ${input_cost:.4f}, Output: ${output_cost:.4f}")
|
| 889 |
+
else:
|
| 890 |
+
# Fallback to old cost summary method
|
| 891 |
+
cost_summary = cost_tracker.get_session_summary()
|
| 892 |
+
|
| 893 |
+
if cost_summary["usage_count"] > 0:
|
| 894 |
+
col1, col2, col3 = st.columns(3)
|
| 895 |
+
with col1:
|
| 896 |
+
st.metric("Total Cost", f"${cost_summary['total_cost']:.4f}")
|
| 897 |
+
with col2:
|
| 898 |
+
st.metric("Total Tokens", f"{cost_summary['total_tokens']:,}")
|
| 899 |
+
with col3:
|
| 900 |
+
st.metric("API Calls", cost_summary["usage_count"])
|
| 901 |
+
|
| 902 |
+
# Add download button for cost report
|
| 903 |
+
cost_report = {
|
| 904 |
+
"timestamp": datetime.now().isoformat(),
|
| 905 |
+
"total_cost": cost_summary["total_cost"],
|
| 906 |
+
"total_tokens": cost_summary["total_tokens"],
|
| 907 |
+
"api_calls": cost_summary["usage_count"],
|
| 908 |
+
"model_breakdown": cost_summary["model_breakdown"],
|
| 909 |
+
"document_processed": selected_file
|
| 910 |
+
}
|
| 911 |
+
|
| 912 |
+
st.download_button(
|
| 913 |
+
label="📥 Download Cost Report (JSON)",
|
| 914 |
+
data=json.dumps(cost_report, indent=2),
|
| 915 |
+
file_name=f"cost_report_{selected_file}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
|
| 916 |
+
mime="application/json"
|
| 917 |
+
)
|
| 918 |
+
|
| 919 |
+
# Show detailed model breakdown
|
| 920 |
+
if cost_summary["model_breakdown"]:
|
| 921 |
+
st.subheader("Model Usage Breakdown")
|
| 922 |
+
for model, stats in cost_summary["model_breakdown"].items():
|
| 923 |
+
model_info = cost_tracker.get_model_info(model)
|
| 924 |
+
model_display_name = model_info.description if model_info else model
|
| 925 |
+
|
| 926 |
+
with st.expander(f"{model_display_name} - ${stats['cost']:.4f}"):
|
| 927 |
+
col1, col2 = st.columns(2)
|
| 928 |
+
with col1:
|
| 929 |
+
st.write(f"**Input tokens:** {stats['input_tokens']:,}")
|
| 930 |
+
st.write(f"**Output tokens:** {stats['output_tokens']:,}")
|
| 931 |
+
with col2:
|
| 932 |
+
st.write(f"**Total tokens:** {stats['total_tokens']:,}")
|
| 933 |
+
st.write(f"**API calls:** {stats['usage_count']}")
|
| 934 |
+
|
| 935 |
+
# Show cost breakdown
|
| 936 |
+
if model_info:
|
| 937 |
+
input_cost = (stats['input_tokens'] / 1000) * model_info.input_cost_per_1k_tokens
|
| 938 |
+
output_cost = (stats['output_tokens'] / 1000) * model_info.output_cost_per_1k_tokens
|
| 939 |
+
st.write(f"**Cost breakdown:** Input: ${input_cost:.4f}, Output: ${output_cost:.4f}")
|
| 940 |
+
else:
|
| 941 |
+
st.info("No API calls recorded for this session")
|
| 942 |
+
|
| 943 |
# Show what was removed
|
| 944 |
if removed_count > 0:
|
| 945 |
st.info(f"**Removed {removed_count} text elements from the document structure.**")
|
| 946 |
|
| 947 |
+
# Show the removed text elements - use the actual indices from the processing result
|
| 948 |
st.subheader("Removed Text Elements:")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 949 |
|
| 950 |
+
# Get the actual indices that were removed from the processing result
|
| 951 |
+
if "processing_result" in st.session_state.processed_results[selected_file]:
|
| 952 |
+
# Get the actual removed indices from the LLM response
|
| 953 |
+
processing_result = st.session_state.processed_results[selected_file]["processing_result"]
|
| 954 |
+
actual_removed_indices = processing_result.removed_indices
|
| 955 |
+
|
| 956 |
+
if actual_removed_indices:
|
| 957 |
+
st.info(f"**Elements removed by LLM analysis ({len(actual_removed_indices)} elements):**")
|
| 958 |
+
|
| 959 |
+
for idx in actual_removed_indices:
|
| 960 |
+
if idx < len(original_texts):
|
| 961 |
+
text_content = original_texts[idx].get("text", "")
|
| 962 |
+
st.text(f"Text {idx}: {text_content[:100]}{'...' if len(text_content) > 100 else ''}")
|
| 963 |
+
else:
|
| 964 |
+
st.text(f"Text {idx}: [Index out of bounds]")
|
| 965 |
+
else:
|
| 966 |
+
st.info("**No elements were identified for removal by the LLM.**")
|
| 967 |
+
else:
|
| 968 |
+
# Fallback to the old method if processing result not available
|
| 969 |
+
st.warning("**Note: Using fallback calculation method**")
|
| 970 |
+
removed_texts = []
|
| 971 |
+
for i, text_elem in enumerate(original_texts):
|
| 972 |
+
if i >= len(redacted_texts) or text_elem.get("text", "") != redacted_texts[i].get("text", ""):
|
| 973 |
+
removed_texts.append((i, text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")))
|
| 974 |
+
|
| 975 |
+
for idx, text in removed_texts:
|
| 976 |
+
st.text(f"Text {idx}: {text}")
|
| 977 |
else:
|
| 978 |
st.info("No text elements were removed during processing.")
|
| 979 |
|
src/utils/cost_tracker.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, Optional
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
@dataclass
class ModelPricing:
    """Price-sheet entry describing what one Azure OpenAI model costs.

    Attributes:
        model_name: Key under which the model or deployment is known.
        input_cost_per_1k_tokens: Cost charged per 1000 input tokens.
        output_cost_per_1k_tokens: Cost charged per 1000 output tokens.
        description: Human-readable label for the model.
    """

    model_name: str
    input_cost_per_1k_tokens: float
    output_cost_per_1k_tokens: float
    description: str
|
| 15 |
+
|
| 16 |
+
@dataclass
class TokenUsage:
    """Token counts recorded for one API call.

    Attributes:
        prompt_tokens: Number of input (prompt) tokens.
        completion_tokens: Number of output (completion) tokens.
        total_tokens: Combined token count stored alongside the two above.
        model: Name of the model the call was made against.
        timestamp: When the usage record was created.
    """

    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    model: str
    timestamp: datetime
|
| 24 |
+
|
| 25 |
+
@dataclass
class CostAnalysis:
    """Aggregated cost figures for a document-processing run.

    Attributes:
        total_input_tokens: Input tokens summed over the run.
        total_output_tokens: Output tokens summed over the run.
        total_cost: Overall cost of the run.
        model_breakdown: Per-model costs, shaped as
            ``{model: {"input_cost": x, "output_cost": y, "total_cost": z}}``.
        processing_time: Duration of the run.
        timestamp: When the analysis was produced.
    """

    total_input_tokens: int
    total_output_tokens: int
    total_cost: float
    model_breakdown: Dict[str, Dict[str, float]]
    processing_time: float
    timestamp: datetime
|
| 34 |
+
|
| 35 |
+
class CostTracker:
    """Tracks token usage and calculates costs for Azure OpenAI API calls.

    Every call to :meth:`record_usage` appends a ``TokenUsage`` entry to
    ``usage_history`` and adds to the running session token and cost totals.
    Costs are looked up in ``MODEL_PRICING``; names that are not in the table
    are priced via the name heuristic in :meth:`guess_model_type`.
    """

    # Hardcoded pricing for Azure OpenAI models (current as of 2024)
    # Source: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
    # NOTE(review): the "o3-mini" / "o4-mini" / "o3" / "o4" rows carry exactly
    # the same prices as the gpt-4o-mini / gpt-35-turbo / gpt-4o rows. Confirm
    # against the Azure price sheet that this aliasing is intended.
    MODEL_PRICING: Dict[str, ModelPricing] = {
        # Standard model names
        "gpt-4o-mini": ModelPricing(
            model_name="gpt-4o-mini",
            input_cost_per_1k_tokens=0.00015,  # $0.00015 per 1K input tokens
            output_cost_per_1k_tokens=0.0006,  # $0.0006 per 1K output tokens
            description="GPT-4o Mini (O3 Mini)",
        ),
        "gpt-4o": ModelPricing(
            model_name="gpt-4o",
            input_cost_per_1k_tokens=0.0025,  # $0.0025 per 1K input tokens
            output_cost_per_1k_tokens=0.01,  # $0.01 per 1K output tokens
            description="GPT-4o (O4)",
        ),
        "gpt-35-turbo": ModelPricing(
            model_name="gpt-35-turbo",
            input_cost_per_1k_tokens=0.0005,  # $0.0005 per 1K input tokens
            output_cost_per_1k_tokens=0.0015,  # $0.0015 per 1K output tokens
            description="GPT-3.5 Turbo (O3)",
        ),
        # Azure deployment names (custom names set in Azure)
        "o3-mini": ModelPricing(
            model_name="o3-mini",
            input_cost_per_1k_tokens=0.00015,  # $0.00015 per 1K input tokens
            output_cost_per_1k_tokens=0.0006,  # $0.0006 per 1K output tokens
            description="O3 Mini (GPT-4o Mini)",
        ),
        "o4-mini": ModelPricing(
            model_name="o4-mini",
            input_cost_per_1k_tokens=0.00015,  # $0.00015 per 1K input tokens
            output_cost_per_1k_tokens=0.0006,  # $0.0006 per 1K output tokens
            description="O4 Mini (GPT-4o Mini)",
        ),
        "o3": ModelPricing(
            model_name="o3",
            input_cost_per_1k_tokens=0.0005,  # $0.0005 per 1K input tokens
            output_cost_per_1k_tokens=0.0015,  # $0.0015 per 1K output tokens
            description="O3 (GPT-3.5 Turbo)",
        ),
        "o4": ModelPricing(
            model_name="o4",
            input_cost_per_1k_tokens=0.0025,  # $0.0025 per 1K input tokens
            output_cost_per_1k_tokens=0.01,  # $0.01 per 1K output tokens
            description="O4 (GPT-4o)",
        ),
        # Alternative model names that might be used in Azure deployments
        "gpt-4o-mini-2024-07-18": ModelPricing(
            model_name="gpt-4o-mini-2024-07-18",
            input_cost_per_1k_tokens=0.00015,  # $0.00015 per 1K input tokens
            output_cost_per_1k_tokens=0.0006,  # $0.0006 per 1K output tokens
            description="GPT-4o Mini (O3 Mini) - Latest",
        ),
        "gpt-4o-2024-05-13": ModelPricing(
            model_name="gpt-4o-2024-05-13",
            input_cost_per_1k_tokens=0.0025,  # $0.0025 per 1K input tokens
            output_cost_per_1k_tokens=0.01,  # $0.01 per 1K output tokens
            description="GPT-4o (O4) - Latest",
        ),
        "gpt-35-turbo-0125": ModelPricing(
            model_name="gpt-35-turbo-0125",
            input_cost_per_1k_tokens=0.0005,  # $0.0005 per 1K input tokens
            output_cost_per_1k_tokens=0.0015,  # $0.0015 per 1K output tokens
            description="GPT-3.5 Turbo (O3) - Latest",
        ),
    }

    def __init__(self):
        """Start with an empty usage history and zeroed session totals."""
        self.usage_history: list[TokenUsage] = []
        self.current_session_tokens = 0
        self.current_session_cost = 0.0

    def record_usage(self, prompt_tokens: int, completion_tokens: int, model: str) -> TokenUsage:
        """Record token usage from an API call.

        Args:
            prompt_tokens: Input tokens consumed by the call.
            completion_tokens: Output tokens produced by the call.
            model: Model or deployment name the call was made against.

        Returns:
            The ``TokenUsage`` entry that was appended to ``usage_history``.
        """
        total_tokens = prompt_tokens + completion_tokens
        usage = TokenUsage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
            model=model,
            timestamp=datetime.now(),
        )

        self.usage_history.append(usage)
        self.current_session_tokens += total_tokens

        # Calculate cost for this usage
        cost = self._calculate_cost(prompt_tokens, completion_tokens, model)
        self.current_session_cost += cost

        logger.info(
            "Recorded usage: %s input + %s output = %s total tokens "
            "for model %s, cost: $%.6f",
            prompt_tokens,
            completion_tokens,
            total_tokens,
            model,
            cost,
        )

        return usage

    def _calculate_cost(self, input_tokens: int, output_tokens: int, model: str) -> float:
        """Calculate cost for given token usage and model.

        A model name missing from ``MODEL_PRICING`` is logged as a warning and
        priced with whatever table entry :meth:`guess_model_type` selects, so
        the lookup and the public heuristic can never disagree.
        """
        pricing = self.MODEL_PRICING.get(model)
        if pricing is None:
            logger.warning("Unknown model pricing for %s, using default pricing", model)
            pricing = self.MODEL_PRICING[self.guess_model_type(model)]

        input_cost = (input_tokens / 1000) * pricing.input_cost_per_1k_tokens
        output_cost = (output_tokens / 1000) * pricing.output_cost_per_1k_tokens

        return input_cost + output_cost

    def get_session_summary(self) -> Dict[str, Any]:
        """Get summary of current session usage.

        Returns:
            A dict with ``total_tokens``, ``total_cost``, ``usage_count`` and a
            ``model_breakdown`` mapping each recorded model name to its
            ``input_tokens``, ``output_tokens``, ``total_tokens``, ``cost`` and
            ``usage_count``.
        """
        if not self.usage_history:
            return {
                "total_tokens": 0,
                "total_cost": 0.0,
                "model_breakdown": {},
                "usage_count": 0,
            }

        model_breakdown: Dict[str, Dict[str, Any]] = {}
        for usage in self.usage_history:
            stats = model_breakdown.setdefault(
                usage.model,
                {
                    "input_tokens": 0,
                    "output_tokens": 0,
                    "total_tokens": 0,
                    "cost": 0.0,
                    "usage_count": 0,
                },
            )
            stats["input_tokens"] += usage.prompt_tokens
            stats["output_tokens"] += usage.completion_tokens
            stats["total_tokens"] += usage.total_tokens
            stats["usage_count"] += 1
            # Cost is not stored on TokenUsage, so it is re-derived per entry.
            stats["cost"] += self._calculate_cost(
                usage.prompt_tokens, usage.completion_tokens, usage.model
            )

        return {
            "total_tokens": self.current_session_tokens,
            "total_cost": self.current_session_cost,
            "model_breakdown": model_breakdown,
            "usage_count": len(self.usage_history),
        }

    def reset_session(self):
        """Reset current session statistics."""
        self.usage_history = []
        self.current_session_tokens = 0
        self.current_session_cost = 0.0
        logger.info("Cost tracker session reset")

    def get_available_models(self) -> list[str]:
        """Get list of available models with pricing."""
        return list(self.MODEL_PRICING)

    def get_model_info(self, model: str) -> Optional[ModelPricing]:
        """Get pricing information for a specific model, or ``None`` if unknown."""
        return self.MODEL_PRICING.get(model)

    def add_deployment_pricing(self, deployment_name: str, model_type: str = "o3-mini"):
        """Add pricing for a custom deployment name by mapping it to an existing model type.

        Does nothing when ``deployment_name`` already has an entry, and only
        logs a warning when ``model_type`` is not in the pricing table.
        """
        if deployment_name in self.MODEL_PRICING:
            return  # Already exists

        # Map deployment name to existing model pricing
        base_pricing = self.MODEL_PRICING.get(model_type)
        if base_pricing is None:
            logger.warning(
                "Unknown model type %s for deployment %s", model_type, deployment_name
            )
            return

        # NOTE: MODEL_PRICING is a class attribute, so this entry is written to
        # the dict shared by every CostTracker instance.
        self.MODEL_PRICING[deployment_name] = ModelPricing(
            model_name=deployment_name,
            input_cost_per_1k_tokens=base_pricing.input_cost_per_1k_tokens,
            output_cost_per_1k_tokens=base_pricing.output_cost_per_1k_tokens,
            description=f"{deployment_name} ({base_pricing.description})",
        )
        logger.info(
            "Added pricing for deployment %s based on %s", deployment_name, model_type
        )

    def guess_model_type(self, deployment_name: str) -> str:
        """Guess the model type based on deployment name.

        Matching is case-insensitive substring matching, checked in this
        order: ``mini`` -> ``"o3-mini"``; ``o4`` or ``4o`` -> ``"o4"``;
        ``o3``, ``35-turbo`` or ``3.5`` -> ``"o3"``. Anything else, including
        an empty or ``None`` name, falls back to ``"o3-mini"``.
        """
        # Tolerate a missing name instead of raising on ``None.lower()``.
        name = (deployment_name or "").lower()
        if "mini" in name:
            return "o3-mini"
        # "gpt-4o-..." contains "4o" but not "o4"; both spellings share a price row.
        if "o4" in name or "4o" in name:
            return "o4"
        # Likewise "gpt-35-turbo-..." / "gpt-3.5-..." never contain "o3".
        if "o3" in name or "35-turbo" in name or "3.5" in name:
            return "o3"
        return "o3-mini"  # Default to cheapest
|
| 239 |
+
|
| 240 |
+
# Global cost tracker instance, constructed once when this module is imported.
cost_tracker = CostTracker()
|
uv.lock
CHANGED
|
@@ -200,6 +200,7 @@ dependencies = [
|
|
| 200 |
{ name = "openai" },
|
| 201 |
{ name = "python-dotenv" },
|
| 202 |
{ name = "pyyaml" },
|
|
|
|
| 203 |
{ name = "streamlit" },
|
| 204 |
]
|
| 205 |
|
|
@@ -209,6 +210,7 @@ requires-dist = [
|
|
| 209 |
{ name = "openai", specifier = ">=1.91.0" },
|
| 210 |
{ name = "python-dotenv", specifier = ">=1.1.1" },
|
| 211 |
{ name = "pyyaml", specifier = ">=6.0" },
|
|
|
|
| 212 |
{ name = "streamlit", specifier = ">=1.46.0" },
|
| 213 |
]
|
| 214 |
|
|
@@ -1333,6 +1335,19 @@ wheels = [
|
|
| 1333 |
{ url = "https://files.pythonhosted.org/packages/45/94/bc295babb3062a731f52621cdc992d123111282e291abaf23faa413443ea/regex-2024.11.6-cp313-cp313-win_amd64.whl", hash = "sha256:2b3361af3198667e99927da8b84c1b010752fa4b1115ee30beaa332cabc3ef1a", size = 273545, upload-time = "2024-11-06T20:11:15Z" },
|
| 1334 |
]
|
| 1335 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1336 |
[[package]]
|
| 1337 |
name = "requests"
|
| 1338 |
version = "2.32.4"
|
|
|
|
| 200 |
{ name = "openai" },
|
| 201 |
{ name = "python-dotenv" },
|
| 202 |
{ name = "pyyaml" },
|
| 203 |
+
{ name = "reportlab" },
|
| 204 |
{ name = "streamlit" },
|
| 205 |
]
|
| 206 |
|
|
|
|
| 210 |
{ name = "openai", specifier = ">=1.91.0" },
|
| 211 |
{ name = "python-dotenv", specifier = ">=1.1.1" },
|
| 212 |
{ name = "pyyaml", specifier = ">=6.0" },
|
| 213 |
+
{ name = "reportlab", specifier = ">=4.4.2" },
|
| 214 |
{ name = "streamlit", specifier = ">=1.46.0" },
|
| 215 |
]
|
| 216 |
|
|
|
|
| 1335 |
{ url = "https://files.pythonhosted.org/packages/45/94/bc295babb3062a731f52621cdc992d123111282e291abaf23faa413443ea/regex-2024.11.6-cp313-cp313-win_amd64.whl", hash = "sha256:2b3361af3198667e99927da8b84c1b010752fa4b1115ee30beaa332cabc3ef1a", size = 273545, upload-time = "2024-11-06T20:11:15Z" },
|
| 1336 |
]
|
| 1337 |
|
| 1338 |
+
[[package]]
|
| 1339 |
+
name = "reportlab"
|
| 1340 |
+
version = "4.4.2"
|
| 1341 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1342 |
+
dependencies = [
|
| 1343 |
+
{ name = "charset-normalizer" },
|
| 1344 |
+
{ name = "pillow" },
|
| 1345 |
+
]
|
| 1346 |
+
sdist = { url = "https://files.pythonhosted.org/packages/ec/9b/3483c7e4ad33d15f22d528872439e5bc92485814d7e7d10dbc3130368a83/reportlab-4.4.2.tar.gz", hash = "sha256:fc6283048ddd0781a9db1d671715990e6aa059c8d40ec9baf34294c4bd583a36", size = 3509063, upload-time = "2025-06-18T12:20:19.526Z" }
|
| 1347 |
+
wheels = [
|
| 1348 |
+
{ url = "https://files.pythonhosted.org/packages/9f/74/ed990bc9586605d4e46f6b0e0b978a5b8e757aa599e39664bee26d6dc666/reportlab-4.4.2-py3-none-any.whl", hash = "sha256:58e11be387457928707c12153b7e41e52533a5da3f587b15ba8f8fd0805c6ee2", size = 1953624, upload-time = "2025-06-18T12:20:16.152Z" },
|
| 1349 |
+
]
|
| 1350 |
+
|
| 1351 |
[[package]]
|
| 1352 |
name = "requests"
|
| 1353 |
version = "2.32.4"
|