Spaces:
Running
Running
Commit
Β·
bb68eb6
1
Parent(s):
e3a2ecc
Enhance README and Streamlit app for Medical Document Parser & Redactor
Browse files- Added detailed project overview and structure in README.md.
- Implemented file upload, processing, and redaction features in streamlit_app.py.
- Integrated logging and temporary file management.
- Enhanced UI with custom styling and synchronized scrolling for document comparison.
- Added functionality for viewing original and redacted documents side by side, including download options for markdown and JSON structures.
- .gitignore +63 -0
- .python-version +1 -0
- README.md +334 -0
- main.py +6 -0
- pyproject.toml +13 -0
- src/processing/__init__.py +0 -0
- src/processing/document_processor.py +140 -0
- src/processing/llm_extractor.py +145 -0
- src/processing/sections.py +222 -0
- src/streamlit_app.py +547 -38
- src/utils/__init__.py +0 -0
- src/utils/logging_utils.py +25 -0
- uv.lock +0 -0
.gitignore
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Temporary files
|
2 |
+
temp_files/
|
3 |
+
temp_*.pdf
|
4 |
+
temp_*.json
|
5 |
+
temp_*.txt
|
6 |
+
|
7 |
+
# Python
|
8 |
+
__pycache__/
|
9 |
+
*.py[cod]
|
10 |
+
*$py.class
|
11 |
+
*.so
|
12 |
+
.Python
|
13 |
+
build/
|
14 |
+
develop-eggs/
|
15 |
+
dist/
|
16 |
+
downloads/
|
17 |
+
eggs/
|
18 |
+
.eggs/
|
19 |
+
lib/
|
20 |
+
lib64/
|
21 |
+
parts/
|
22 |
+
sdist/
|
23 |
+
var/
|
24 |
+
wheels/
|
25 |
+
*.egg-info/
|
26 |
+
.installed.cfg
|
27 |
+
*.egg
|
28 |
+
MANIFEST
|
29 |
+
|
30 |
+
# Virtual environments
|
31 |
+
.env
|
32 |
+
.venv
|
33 |
+
env/
|
34 |
+
venv/
|
35 |
+
ENV/
|
36 |
+
env.bak/
|
37 |
+
venv.bak/
|
38 |
+
|
39 |
+
# IDE
|
40 |
+
.vscode/
|
41 |
+
.idea/
|
42 |
+
*.swp
|
43 |
+
*.swo
|
44 |
+
*~
|
45 |
+
|
46 |
+
# OS
|
47 |
+
.DS_Store
|
48 |
+
.DS_Store?
|
49 |
+
._*
|
50 |
+
.Spotlight-V100
|
51 |
+
.Trashes
|
52 |
+
ehthumbs.db
|
53 |
+
Thumbs.db
|
54 |
+
|
55 |
+
# Logs
|
56 |
+
*.log
|
57 |
+
|
58 |
+
# Streamlit
|
59 |
+
.streamlit/
|
60 |
+
|
61 |
+
# Azure credentials (if any)
|
62 |
+
*.pem
|
63 |
+
*.key
|
.python-version
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
3.13
|
README.md
CHANGED
@@ -17,3 +17,337 @@ Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :hear
|
|
17 |
|
18 |
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
19 |
forums](https://discuss.streamlit.io).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
19 |
forums](https://discuss.streamlit.io).
|
20 |
+
|
21 |
+
# Medical Document Parser & Redactor
|
22 |
+
|
23 |
+
A sophisticated medical document processing application that uses **Docling** (structure-aware parser) to parse PDF medical documents and automatically redact medication information using AI-powered analysis.
|
24 |
+
|
25 |
+
## π― Overview
|
26 |
+
|
27 |
+
This application provides a Streamlit-based interface for uploading medical PDF documents, parsing them with Docling to extract structured content, and using Azure OpenAI to intelligently identify and redact formal medication lists while preserving clinical context.
|
28 |
+
|
29 |
+
## ποΈ Project Structure
|
30 |
+
|
31 |
+
```
|
32 |
+
docling/
|
33 |
+
βββ src/ # Main source code
|
34 |
+
β βββ processing/ # Core processing logic
|
35 |
+
β β βββ __init__.py
|
36 |
+
β β βββ document_processor.py # Main document processing pipeline
|
37 |
+
β β βββ llm_extractor.py # Azure OpenAI integration for medication detection
|
38 |
+
β β βββ sections.py # Section extraction and redaction logic
|
39 |
+
β βββ utils/ # Utility functions
|
40 |
+
β β βββ __init__.py
|
41 |
+
β β βββ logging_utils.py # Logging configuration and handlers
|
42 |
+
β βββ streamlit_app.py # Main Streamlit application interface
|
43 |
+
βββ temp_files/ # Temporary file storage (auto-created)
|
44 |
+
βββ .env # Environment variables (Azure OpenAI credentials)
|
45 |
+
βββ requirements.txt # Python dependencies
|
46 |
+
βββ pyproject.toml # Project configuration
|
47 |
+
βββ Dockerfile # Container configuration
|
48 |
+
βββ README.md # This file
|
49 |
+
```
|
50 |
+
|
51 |
+
## π File Responsibilities
|
52 |
+
|
53 |
+
### Core Processing Files
|
54 |
+
|
55 |
+
#### `src/processing/document_processor.py`
|
56 |
+
**Purpose**: Main document processing pipeline that orchestrates the entire workflow.
|
57 |
+
|
58 |
+
**Key Classes**:
|
59 |
+
- `DocumentResult`: Data class holding processed results
|
60 |
+
- `DocumentProcessor`: Main processing class
|
61 |
+
|
62 |
+
**Key Functions**:
|
63 |
+
- `process(file_path)`: Main processing method
|
64 |
+
- `_export_redacted_markdown()`: Generates redacted markdown
|
65 |
+
- `_reconstruct_markdown_from_filtered_texts()`: Reconstructs markdown from filtered content
|
66 |
+
|
67 |
+
**Responsibilities**:
|
68 |
+
- Document conversion using Docling
|
69 |
+
- Section redaction coordination
|
70 |
+
- Markdown generation and reconstruction
|
71 |
+
- File persistence and logging
|
72 |
+
|
73 |
+
#### `src/processing/llm_extractor.py`
|
74 |
+
**Purpose**: Azure OpenAI integration for intelligent medication detection.
|
75 |
+
|
76 |
+
**Key Classes**:
|
77 |
+
- `AzureO1MedicationExtractor`: LLM-based medication extractor
|
78 |
+
|
79 |
+
**Key Functions**:
|
80 |
+
- `extract_medication_sections(doc_json)`: Main extraction method
|
81 |
+
- `__init__()`: Azure OpenAI client initialization
|
82 |
+
|
83 |
+
**Responsibilities**:
|
84 |
+
- Azure OpenAI API communication
|
85 |
+
- Medication section identification
|
86 |
+
- Structured JSON response generation
|
87 |
+
- Error handling and logging
|
88 |
+
|
89 |
+
#### `src/processing/sections.py`
|
90 |
+
**Purpose**: Section extraction and redaction logic.
|
91 |
+
|
92 |
+
**Key Classes**:
|
93 |
+
- `ReasoningSectionExtractor`: AI-powered section extractor
|
94 |
+
- `SectionDefinition`: Section definition data class
|
95 |
+
- `SectionExtractor`: Traditional regex-based extractor
|
96 |
+
|
97 |
+
**Key Functions**:
|
98 |
+
- `remove_sections_from_json()`: JSON-based section removal
|
99 |
+
- `remove_sections()`: Text-based section removal (fallback)
|
100 |
+
|
101 |
+
**Responsibilities**:
|
102 |
+
- Section identification and removal
|
103 |
+
- JSON structure manipulation
|
104 |
+
- Text processing and redaction
|
105 |
+
- Reasoning logging and transparency
|
106 |
+
|
107 |
+
### Interface Files
|
108 |
+
|
109 |
+
#### `src/streamlit_app.py`
|
110 |
+
**Purpose**: Main Streamlit web application interface.
|
111 |
+
|
112 |
+
**Key Functions**:
|
113 |
+
- `save_uploaded_file()`: File upload handling
|
114 |
+
- `cleanup_temp_files()`: Temporary file management
|
115 |
+
- `create_diff_content()`: Diff view generation
|
116 |
+
|
117 |
+
**Responsibilities**:
|
118 |
+
- User interface and interaction
|
119 |
+
- File upload and management
|
120 |
+
- Visualization and diff display
|
121 |
+
- Session state management
|
122 |
+
- Download functionality
|
123 |
+
|
124 |
+
### Utility Files
|
125 |
+
|
126 |
+
#### `src/utils/logging_utils.py`
|
127 |
+
**Purpose**: Logging configuration and management.
|
128 |
+
|
129 |
+
**Key Functions**:
|
130 |
+
- `get_log_handler()`: Creates in-memory log handlers
|
131 |
+
- Log buffer management for UI display
|
132 |
+
|
133 |
+
**Responsibilities**:
|
134 |
+
- Logging setup and configuration
|
135 |
+
- In-memory log capture
|
136 |
+
- Log display in UI
|
137 |
+
|
138 |
+
## π§ Detailed Function Documentation
|
139 |
+
|
140 |
+
### Document Processing Pipeline
|
141 |
+
|
142 |
+
#### `DocumentProcessor.process(file_path: str) -> DocumentResult`
|
143 |
+
**Purpose**: Main entry point for document processing.
|
144 |
+
|
145 |
+
**Parameters**:
|
146 |
+
- `file_path`: Path to the PDF file to process
|
147 |
+
|
148 |
+
**Returns**:
|
149 |
+
- `DocumentResult`: Object containing all processing results
|
150 |
+
|
151 |
+
**Process Flow**:
|
152 |
+
1. Converts PDF using Docling
|
153 |
+
2. Exports structured markdown and JSON
|
154 |
+
3. Applies section redaction if extractor is provided
|
155 |
+
4. Persists results to temporary files
|
156 |
+
5. Returns comprehensive result object
|
157 |
+
|
158 |
+
**Example Usage**:
|
159 |
+
```python
|
160 |
+
processor = DocumentProcessor(section_extractor=extractor)
|
161 |
+
result = processor.process("document.pdf")
|
162 |
+
print(f"Original: {len(result.structured_markdown)} chars")
|
163 |
+
print(f"Redacted: {len(result.redacted_markdown)} chars")
|
164 |
+
```
|
165 |
+
|
166 |
+
#### `AzureO1MedicationExtractor.extract_medication_sections(doc_json: Dict) -> Dict`
|
167 |
+
**Purpose**: Uses Azure OpenAI to identify medication sections for redaction.
|
168 |
+
|
169 |
+
**Parameters**:
|
170 |
+
- `doc_json`: Docling-generated JSON structure
|
171 |
+
|
172 |
+
**Returns**:
|
173 |
+
- Dictionary with indices to remove and reasoning
|
174 |
+
|
175 |
+
**Process Flow**:
|
176 |
+
1. Analyzes document structure
|
177 |
+
2. Sends structured prompt to Azure OpenAI
|
178 |
+
3. Parses JSON response
|
179 |
+
4. Validates and limits results
|
180 |
+
5. Returns structured analysis
|
181 |
+
|
182 |
+
**Example Usage**:
|
183 |
+
```python
|
184 |
+
extractor = AzureO1MedicationExtractor(endpoint, api_key, version, deployment)
|
185 |
+
result = extractor.extract_medication_sections(doc_json)
|
186 |
+
print(f"Removing {len(result['indices_to_remove'])} elements")
|
187 |
+
```
|
188 |
+
|
189 |
+
#### `ReasoningSectionExtractor.remove_sections_from_json(doc_json: Dict) -> Dict`
|
190 |
+
**Purpose**: Removes identified sections from JSON structure.
|
191 |
+
|
192 |
+
**Parameters**:
|
193 |
+
- `doc_json`: Original document JSON structure
|
194 |
+
|
195 |
+
**Returns**:
|
196 |
+
- Redacted JSON structure
|
197 |
+
|
198 |
+
**Process Flow**:
|
199 |
+
1. Calls LLM extractor for analysis
|
200 |
+
2. Logs detailed reasoning
|
201 |
+
3. Removes identified text elements
|
202 |
+
4. Updates document structure
|
203 |
+
5. Returns redacted JSON
|
204 |
+
|
205 |
+
### UI and Visualization Functions
|
206 |
+
|
207 |
+
#### `create_diff_content(original_text: str, redacted_text: str, view_type: str) -> str`
|
208 |
+
**Purpose**: Generates HTML content for side-by-side diff view.
|
209 |
+
|
210 |
+
**Parameters**:
|
211 |
+
- `original_text`: Original document content
|
212 |
+
- `redacted_text`: Redacted document content
|
213 |
+
- `view_type`: 'original' or 'redacted'
|
214 |
+
|
215 |
+
**Returns**:
|
216 |
+
- HTML string for diff display
|
217 |
+
|
218 |
+
**Features**:
|
219 |
+
- Text normalization (headers, quotes)
|
220 |
+
- Synchronized scrolling
|
221 |
+
- Color-coded highlighting
|
222 |
+
- Git-style diff visualization
|
223 |
+
|
224 |
+
#### `save_uploaded_file(uploaded_file, filename) -> str`
|
225 |
+
**Purpose**: Safely saves uploaded files to temporary directory.
|
226 |
+
|
227 |
+
**Parameters**:
|
228 |
+
- `uploaded_file`: Streamlit uploaded file object
|
229 |
+
- `filename`: Target filename
|
230 |
+
|
231 |
+
**Returns**:
|
232 |
+
- Path to saved temporary file
|
233 |
+
|
234 |
+
**Features**:
|
235 |
+
- File pointer reset handling
|
236 |
+
- Temporary directory management
|
237 |
+
- Error handling and logging
|
238 |
+
|
239 |
+
## π Sequence Diagram
|
240 |
+
|
241 |
+
```
|
242 |
+
User Uploads PDF
|
243 |
+
β
|
244 |
+
βΌ
|
245 |
+
βββββββββββββββββββ
|
246 |
+
β Streamlit App β
|
247 |
+
β - File Upload β
|
248 |
+
β - Validation β
|
249 |
+
βββββββββββββββββββ
|
250 |
+
β
|
251 |
+
βΌ
|
252 |
+
βββββββββββββββββββ
|
253 |
+
β save_uploaded_ β
|
254 |
+
β file() β
|
255 |
+
β - Reset pointer β
|
256 |
+
β - Save to temp β
|
257 |
+
βββββββββββββββββββ
|
258 |
+
β
|
259 |
+
βΌ
|
260 |
+
βββββββββββββββββββ
|
261 |
+
β DocumentProcessorβ
|
262 |
+
β .process() β
|
263 |
+
β - Docling conv β
|
264 |
+
β - Export JSON β
|
265 |
+
β - Export MD β
|
266 |
+
βββββββββββββββββββ
|
267 |
+
β
|
268 |
+
βΌ
|
269 |
+
βββββββββββββββββββ
|
270 |
+
β ReasoningSectionβ
|
271 |
+
β Extractor β
|
272 |
+
β .remove_sectionsβ
|
273 |
+
β _from_json() β
|
274 |
+
βββββββββββββββββββ
|
275 |
+
β
|
276 |
+
βΌ
|
277 |
+
βββββββββββββββββββ
|
278 |
+
β AzureO1Medicationβ
|
279 |
+
β Extractor β
|
280 |
+
β .extract_medicatβ
|
281 |
+
β ion_sections() β
|
282 |
+
β - API call β
|
283 |
+
β - JSON parsing β
|
284 |
+
βββββββββββββββββββ
|
285 |
+
β
|
286 |
+
βΌ
|
287 |
+
βββββββββββββββββββ
|
288 |
+
β DocumentProcessorβ
|
289 |
+
β _export_redactedβ
|
290 |
+
β _markdown() β
|
291 |
+
β - Filter texts β
|
292 |
+
β - Reconstruct MDβ
|
293 |
+
βββββββββββββββββββ
|
294 |
+
β
|
295 |
+
βΌ
|
296 |
+
βββββββββββββββββββ
|
297 |
+
β Streamlit App β
|
298 |
+
β - Store results β
|
299 |
+
β - Update UI β
|
300 |
+
β - Show diff β
|
301 |
+
βββββββββββββββββββ
|
302 |
+
β
|
303 |
+
βΌ
|
304 |
+
βββββββββββββββββββ
|
305 |
+
β create_diff_ β
|
306 |
+
β content() β
|
307 |
+
β - Normalize textβ
|
308 |
+
β - Generate HTML β
|
309 |
+
β - Sync scrollingβ
|
310 |
+
βββββββββββββββββββ
|
311 |
+
```
|
312 |
+
|
313 |
+
## π Setup and Installation
|
314 |
+
|
315 |
+
### Prerequisites
|
316 |
+
- Python 3.11+
|
317 |
+
- Azure OpenAI account with API access
|
318 |
+
- Docling library
|
319 |
+
|
320 |
+
### Environment Variables
|
321 |
+
Create a `.env` file with:
|
322 |
+
```env
|
323 |
+
AZURE_OPENAI_ENDPOINT=your_endpoint
|
324 |
+
AZURE_OPENAI_KEY=your_api_key
|
325 |
+
AZURE_OPENAI_VERSION=2024-12-01-preview
|
326 |
+
AZURE_OPENAI_DEPLOYMENT=your_deployment_name
|
327 |
+
```
|
328 |
+
|
329 |
+
### Installation
|
330 |
+
```bash
|
331 |
+
# Clone repository
|
332 |
+
git clone <repository-url>
|
333 |
+
cd docling
|
334 |
+
|
335 |
+
# Install dependencies
|
336 |
+
pip install -r requirements.txt
|
337 |
+
|
338 |
+
# Run application
|
339 |
+
streamlit run src/streamlit_app.py
|
340 |
+
```
|
341 |
+
|
342 |
+
## π§ Configuration
|
343 |
+
|
344 |
+
### Azure OpenAI Settings
|
345 |
+
- **Model**: O1-mini (recommended for medication extraction)
|
346 |
+
- **Max Tokens**: 100,000 (for large documents)
|
347 |
+
- **Temperature**: 0 (for consistent results)
|
348 |
+
|
349 |
+
### Processing Settings
|
350 |
+
- **Max Elements to Remove**: 10 (safety limit)
|
351 |
+
- **Temp Directory**: `temp_files/` (auto-created)
|
352 |
+
- **Log Level**: INFO (configurable)
|
353 |
+
|
main.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def main():
|
2 |
+
print("Hello from docling!")
|
3 |
+
|
4 |
+
|
5 |
+
if __name__ == "__main__":
|
6 |
+
main()
|
pyproject.toml
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[project]
|
2 |
+
name = "docling-app"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = "Add your description here"
|
5 |
+
readme = "README.md"
|
6 |
+
requires-python = ">=3.13"
|
7 |
+
dependencies = [
|
8 |
+
"streamlit>=1.46.0",
|
9 |
+
"docling>=2.38.0",
|
10 |
+
"pyyaml>=6.0",
|
11 |
+
"python-dotenv>=1.1.1",
|
12 |
+
"openai>=1.91.0",
|
13 |
+
]
|
src/processing/__init__.py
ADDED
File without changes
|
src/processing/document_processor.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import time
|
3 |
+
import logging
|
4 |
+
import json
|
5 |
+
from dataclasses import dataclass
|
6 |
+
from typing import Optional
|
7 |
+
from docling.document_converter import DocumentConverter
|
8 |
+
from processing.sections import SectionExtractor
|
9 |
+
|
10 |
+
# Initialize a Docling converter globally (can be reused for multiple docs)
|
11 |
+
_docling_converter = DocumentConverter()
|
12 |
+
|
13 |
+
logger = logging.getLogger(__name__) # Logger for this module
|
14 |
+
|
15 |
+
@dataclass
|
16 |
+
class DocumentResult:
|
17 |
+
"""Holds processed results for a document."""
|
18 |
+
file_path: str
|
19 |
+
structured_markdown: str
|
20 |
+
structured_json: dict
|
21 |
+
redacted_markdown: str
|
22 |
+
redacted_json: dict
|
23 |
+
|
24 |
+
class DocumentProcessor:
|
25 |
+
"""Handles parsing of documents with Docling and redacting specified sections."""
|
26 |
+
def __init__(self, section_extractor: Optional[SectionExtractor] = None):
|
27 |
+
"""
|
28 |
+
Initialize with an optional SectionExtractor for removing specific sections.
|
29 |
+
If None, no redaction will be performed (original structure only).
|
30 |
+
The Docling DocumentConverter is taken as a dependency (global or injected).
|
31 |
+
"""
|
32 |
+
self.section_extractor = section_extractor
|
33 |
+
# Allow dependency injection of converter if needed (use global by default)
|
34 |
+
self.converter = _docling_converter
|
35 |
+
|
36 |
+
def process(self, file_path: str) -> DocumentResult:
|
37 |
+
"""Parse the document and optionally remove specified sections. Returns a DocumentResult."""
|
38 |
+
logger.info(f"Starting processing for file: {file_path}")
|
39 |
+
start_time = time.time()
|
40 |
+
# Convert the document using Docling
|
41 |
+
conv_result = self.converter.convert(file_path)
|
42 |
+
elapsed = time.time() - start_time
|
43 |
+
logger.info(f"Docling conversion completed in {elapsed:.2f} seconds")
|
44 |
+
|
45 |
+
# Export results from Docling
|
46 |
+
structured_md = conv_result.document.export_to_markdown()
|
47 |
+
structured_text = conv_result.document.export_to_text()
|
48 |
+
doc_json = conv_result.document.export_to_dict()
|
49 |
+
logger.info(f"Extracted document content (text length {len(structured_text)} characters)")
|
50 |
+
|
51 |
+
# Use SectionExtractor to remove target sections if provided
|
52 |
+
if self.section_extractor:
|
53 |
+
# Use the new JSON-based approach for better section removal
|
54 |
+
redacted_json = self.section_extractor.remove_sections_from_json(doc_json)
|
55 |
+
|
56 |
+
# Convert the redacted JSON back to markdown using Docling's export method
|
57 |
+
# Create a modified document structure for proper markdown export
|
58 |
+
redacted_md = self._export_redacted_markdown(conv_result.document, redacted_json)
|
59 |
+
logger.info("Applied section redaction to remove specified sections")
|
60 |
+
else:
|
61 |
+
redacted_md = structured_md # No redaction, use original
|
62 |
+
redacted_json = doc_json # No redaction, use original
|
63 |
+
logger.info("No section redaction applied (showing original structure)")
|
64 |
+
|
65 |
+
# Persist outputs to files (JSON and redacted text) for auditing
|
66 |
+
base_name = os.path.splitext(os.path.basename(file_path))[0]
|
67 |
+
# Use temp directory for output files
|
68 |
+
temp_dir = "temp_files"
|
69 |
+
os.makedirs(temp_dir, exist_ok=True)
|
70 |
+
|
71 |
+
json_path = os.path.join(temp_dir, f"{base_name}_structured.json")
|
72 |
+
redacted_path = os.path.join(temp_dir, f"{base_name}_redacted.txt")
|
73 |
+
redacted_json_path = os.path.join(temp_dir, f"{base_name}_redacted.json")
|
74 |
+
|
75 |
+
try:
|
76 |
+
with open(json_path, "w", encoding="utf-8") as jf:
|
77 |
+
json.dump(doc_json, jf, ensure_ascii=False, indent=2)
|
78 |
+
with open(redacted_path, "w", encoding="utf-8") as tf:
|
79 |
+
tf.write(redacted_md)
|
80 |
+
with open(redacted_json_path, "w", encoding="utf-8") as jf:
|
81 |
+
json.dump(redacted_json, jf, ensure_ascii=False, indent=2)
|
82 |
+
logger.info(f"Saved structured JSON to {json_path}, redacted text to {redacted_path}, and redacted JSON to {redacted_json_path}")
|
83 |
+
except Exception as e:
|
84 |
+
logger.error(f"Error saving outputs to files: {e}")
|
85 |
+
|
86 |
+
# Prepare result object
|
87 |
+
result = DocumentResult(
|
88 |
+
file_path=file_path,
|
89 |
+
structured_markdown=structured_md,
|
90 |
+
structured_json=doc_json,
|
91 |
+
redacted_markdown=redacted_md,
|
92 |
+
redacted_json=redacted_json
|
93 |
+
)
|
94 |
+
logger.info(f"Finished processing for file: {file_path}")
|
95 |
+
return result
|
96 |
+
|
97 |
+
def _export_redacted_markdown(self, document, redacted_json):
|
98 |
+
"""Export redacted markdown using the redacted JSON structure."""
|
99 |
+
# Simply convert the redacted JSON back to markdown
|
100 |
+
return self._json_to_markdown(redacted_json)
|
101 |
+
|
102 |
+
def _json_to_markdown(self, json_data: dict) -> str:
|
103 |
+
"""Convert JSON document structure back to markdown format using Docling's structure."""
|
104 |
+
markdown_lines = []
|
105 |
+
|
106 |
+
# Get all text elements from the JSON
|
107 |
+
texts = json_data.get("texts", [])
|
108 |
+
|
109 |
+
for text_elem in texts:
|
110 |
+
text_content = text_elem.get("text", "")
|
111 |
+
label = text_elem.get("label", "")
|
112 |
+
level = text_elem.get("level", 0)
|
113 |
+
|
114 |
+
if not text_content.strip():
|
115 |
+
continue
|
116 |
+
|
117 |
+
# Format based on the label and level (following Docling's structure)
|
118 |
+
if label == "section_header":
|
119 |
+
# Add appropriate markdown headers
|
120 |
+
if level == 1:
|
121 |
+
markdown_lines.append(f"# {text_content}")
|
122 |
+
elif level == 2:
|
123 |
+
markdown_lines.append(f"## {text_content}")
|
124 |
+
elif level == 3:
|
125 |
+
markdown_lines.append(f"### {text_content}")
|
126 |
+
else:
|
127 |
+
markdown_lines.append(f"#### {text_content}")
|
128 |
+
elif label == "list_item":
|
129 |
+
# Handle list items - preserve the original marker
|
130 |
+
marker = text_elem.get("marker", "-")
|
131 |
+
markdown_lines.append(f"{marker} {text_content}")
|
132 |
+
elif label == "text":
|
133 |
+
# Regular text content - preserve as-is
|
134 |
+
markdown_lines.append(text_content)
|
135 |
+
else:
|
136 |
+
# Default to regular text
|
137 |
+
markdown_lines.append(text_content)
|
138 |
+
|
139 |
+
# Join without extra spacing to match Docling's formatting
|
140 |
+
return "\n".join(markdown_lines)
|
src/processing/llm_extractor.py
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# src/processing/llm_extractor.py
|
2 |
+
|
3 |
+
|
4 |
+
import json
|
5 |
+
import logging
|
6 |
+
from typing import Dict, Any
|
7 |
+
|
8 |
+
from openai import AzureOpenAI
|
9 |
+
|
10 |
+
logger = logging.getLogger(__name__)
|
11 |
+
|
12 |
+
class AzureO1MedicationExtractor:
|
13 |
+
def __init__(
|
14 |
+
self,
|
15 |
+
endpoint: str,
|
16 |
+
api_key: str,
|
17 |
+
api_version: str,
|
18 |
+
deployment: str,
|
19 |
+
model_name: str = None,
|
20 |
+
):
|
21 |
+
self.client = AzureOpenAI(
|
22 |
+
api_version=api_version,
|
23 |
+
azure_endpoint=endpoint,
|
24 |
+
api_key=api_key,
|
25 |
+
)
|
26 |
+
self.deployment = deployment
|
27 |
+
self.model_name = model_name or deployment
|
28 |
+
|
29 |
+
def extract_medication_sections(self, doc_json: Dict[str, Any]) -> Dict[str, Any]:
|
30 |
+
texts = doc_json.get("texts", [])
|
31 |
+
text_analysis = []
|
32 |
+
for i, text_elem in enumerate(texts):
|
33 |
+
text_analysis.append({
|
34 |
+
"index": i,
|
35 |
+
"text": text_elem.get("text", ""),
|
36 |
+
"label": text_elem.get("label", ""),
|
37 |
+
"level": text_elem.get("level", 0),
|
38 |
+
"parent": text_elem.get("parent", {}),
|
39 |
+
})
|
40 |
+
|
41 |
+
prompt = f"""
|
42 |
+
You are a medical document analysis expert specializing in discharge letters. Your task is to identify ONLY the two formal medication lists that should be redacted, while preserving all medication mentions in clinical discussion.
|
43 |
+
|
44 |
+
**CRITICAL: You should ONLY remove the two formal medication lists:**
|
45 |
+
1. **Current medication list** (usually at the beginning of the document)
|
46 |
+
2. **Discharge medication list** (usually at the end of the document, often under headers like "Als verdere behandeling stellen wij voor" or "Thuismedicatie")
|
47 |
+
|
48 |
+
**Typical discharge letter structure:**
|
49 |
+
- Patient information and admission details
|
50 |
+
- Clinical discussion and treatment narrative (KEEP medication mentions here)
|
51 |
+
- Current medication list (REMOVE this formal list)
|
52 |
+
- Discharge instructions and follow-up
|
53 |
+
- Discharge medication list (REMOVE this formal list)
|
54 |
+
|
55 |
+
**DO NOT remove:**
|
56 |
+
- Medication mentions in clinical discussion (e.g., "patient was treated with Eliquis")
|
57 |
+
- Medication adjustments mentioned in the narrative
|
58 |
+
- Dosage information in clinical context
|
59 |
+
- Any medication information that appears in the main clinical text
|
60 |
+
- Treatment decisions and clinical reasoning
|
61 |
+
|
62 |
+
**ONLY remove:**
|
63 |
+
- Complete medication lists with multiple drugs
|
64 |
+
- Formal medication sections with headers
|
65 |
+
- Standalone medication lists that are clearly separated from clinical text
|
66 |
+
- Lists that appear to be formal medication documentation
|
67 |
+
|
68 |
+
Document structure:
|
69 |
+
{text_analysis}
|
70 |
+
|
71 |
+
**Analysis Instructions:**
|
72 |
+
1. Look for formal medication sections with clear headers (e.g., "Thuismedicatie", "Als verdere behandeling stellen wij voor")
|
73 |
+
2. Identify complete medication lists that contain multiple drugs with dosages
|
74 |
+
3. **IGNORE** any medication mentions that appear within clinical discussion or narrative text
|
75 |
+
4. Focus on structural elements that represent formal medication documentation
|
76 |
+
5. Be conservative - if in doubt, do NOT remove
|
77 |
+
6. Consider the position in the document (beginning/end vs. middle)
|
78 |
+
|
79 |
+
**Examples of what to REMOVE:**
|
80 |
+
- Complete lists under "Thuismedicatie" header
|
81 |
+
- Formal medication lists under "Als verdere behandeling stellen wij voor"
|
82 |
+
- Standalone medication sections with multiple drugs
|
83 |
+
- Lists that appear at the beginning or end of the document
|
84 |
+
|
85 |
+
**Examples of what to KEEP:**
|
86 |
+
- "Patient was treated with Eliquis 2x 2.5mg" (clinical discussion)
|
87 |
+
- "Stop Clopidogrel bij opname" (clinical instruction)
|
88 |
+
- "Jardiance 10mg & Burinex 5mg" (if mentioned in clinical context)
|
89 |
+
- Any medication mentioned in the context of treatment discussion
|
90 |
+
|
91 |
+
Return your analysis as JSON:
|
92 |
+
{{
|
93 |
+
"indices_to_remove": [list of integer indices - ONLY formal medication lists],
|
94 |
+
"reasoning": {{
|
95 |
+
"formal_medication_lists": [list of identified formal medication list indices with explanations],
|
96 |
+
"clinical_medication_mentions": [list of clinical mentions that were correctly preserved],
|
97 |
+
"justification": "explanation of why only formal lists were selected for removal",
|
98 |
+
"confidence": "high/medium/low"
|
99 |
+
}}
|
100 |
+
}}
|
101 |
+
"""
|
102 |
+
logger.info(f"Prompt length: {len(prompt)}")
|
103 |
+
logger.info(f"Number of text elements: {len(text_analysis)}")
|
104 |
+
try:
|
105 |
+
response = self.client.chat.completions.create(
|
106 |
+
messages=[
|
107 |
+
{
|
108 |
+
"role": "system",
|
109 |
+
"content": "You are a helpful assistant.",
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"role": "user",
|
113 |
+
"content": prompt,
|
114 |
+
}
|
115 |
+
],
|
116 |
+
max_completion_tokens=100000, # adjust as needed
|
117 |
+
model=self.deployment
|
118 |
+
)
|
119 |
+
except Exception as e:
|
120 |
+
logger.error(f"Exception during LLM call: {e}", exc_info=True)
|
121 |
+
return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}
|
122 |
+
|
123 |
+
try:
|
124 |
+
logger.error(f"Raw LLM response: {response.choices[0].message.content!r}")
|
125 |
+
result = json.loads(response.choices[0].message.content)
|
126 |
+
|
127 |
+
# Validate and limit the number of elements to remove
|
128 |
+
indices_to_remove = result.get("indices_to_remove", [])
|
129 |
+
|
130 |
+
# Be conservative - limit to maximum 10 elements to prevent over-removal
|
131 |
+
if len(indices_to_remove) > 10:
|
132 |
+
logger.warning(f"LLM suggested removing {len(indices_to_remove)} elements, limiting to 10 most likely formal medication lists")
|
133 |
+
# Keep only the first 10 (assuming they're ordered by importance)
|
134 |
+
indices_to_remove = indices_to_remove[:10]
|
135 |
+
result["indices_to_remove"] = indices_to_remove
|
136 |
+
result["reasoning"]["justification"] += " [LIMITED: Only top 10 elements selected to prevent over-removal]"
|
137 |
+
|
138 |
+
# Log the reasoning for transparency
|
139 |
+
reasoning = result.get("reasoning", {})
|
140 |
+
logger.info(f"LLM reasoning: {reasoning}")
|
141 |
+
|
142 |
+
return result
|
143 |
+
except Exception as e:
|
144 |
+
logger.error(f"Failed to parse LLM response: {e}")
|
145 |
+
return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}
|
src/processing/sections.py
ADDED
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import logging
|
3 |
+
from dataclasses import dataclass
|
4 |
+
from typing import List, Optional, Dict, Any
|
5 |
+
from .llm_extractor import AzureO1MedicationExtractor
|
6 |
+
|
7 |
+
logger = logging.getLogger(__name__)
|
8 |
+
|
9 |
+
class ReasoningSectionExtractor:
    """Removes formal medication-list sections from a Docling document JSON.

    Detection is delegated to an Azure OpenAI reasoning model (via
    ``AzureO1MedicationExtractor``), which returns indices into the
    document's ``texts`` array; those elements are dropped from a copy.
    """

    def __init__(self, endpoint, api_key, api_version, deployment):
        """Create the underlying Azure LLM extractor client.

        Args:
            endpoint: Azure OpenAI endpoint URL.
            api_key: API key for the Azure OpenAI resource.
            api_version: Azure OpenAI REST API version string.
            deployment: Name of the model deployment to call.
        """
        self.llm_extractor = AzureO1MedicationExtractor(
            endpoint=endpoint,
            api_key=api_key,
            api_version=api_version,
            deployment=deployment,
        )

    def remove_sections_from_json(self, doc_json: Dict[str, Any]) -> Dict[str, Any]:
        """Return a deep copy of *doc_json* with LLM-identified texts removed.

        The input document is never mutated. Indices returned by the model
        are validated against the ``texts`` array; out-of-range (including
        negative) indices are discarded rather than trusted.

        Args:
            doc_json: Docling structured document (must contain a ``texts`` list).

        Returns:
            A deep-copied document with the flagged ``texts`` entries removed.
        """
        import copy

        extraction_result = self.llm_extractor.extract_medication_sections(doc_json)
        texts = doc_json.get("texts", [])
        # Use a set for O(1) membership tests in the filter below, and drop
        # any out-of-range indices the model may hallucinate (the previous
        # guard `idx < len(texts)` let negative indices slip through and
        # log the wrong element via texts[-1]).
        indices_to_remove = {
            idx for idx in extraction_result["indices_to_remove"]
            if 0 <= idx < len(texts)
        }
        reasoning = extraction_result.get("reasoning", {})

        # Log detailed reasoning for transparency
        logger.info(f"O1-mini reasoning: {reasoning}")

        # Provide specific feedback about what was removed
        if indices_to_remove:
            logger.info(f"Removing {len(indices_to_remove)} text elements: {sorted(indices_to_remove)}")
            # Show a short preview of each piece of content being removed.
            for idx in sorted(indices_to_remove):
                text_content = texts[idx].get("text", "")[:100]
                logger.info(f"  Removing text {idx}: '{text_content}...'")
        else:
            logger.info("No formal medication lists identified for removal")

        # Remove the identified text elements from a deep copy.
        redacted_json = copy.deepcopy(doc_json)
        all_texts = redacted_json.get("texts", [])
        redacted_texts = [t for i, t in enumerate(all_texts) if i not in indices_to_remove]
        redacted_json["texts"] = redacted_texts

        # Log the result
        removed_count = len(all_texts) - len(redacted_texts)
        logger.info(f"Successfully removed {removed_count} text elements from document structure")

        return redacted_json

    def remove_sections(self, text: str) -> str:
        """
        Remove sections from markdown text. This is a fallback method for compatibility.
        Since ReasoningSectionExtractor works with JSON structure, this method
        returns the original text (no redaction) as the JSON-based approach is preferred.
        """
        logger.warning("ReasoningSectionExtractor.remove_sections() called - this method is not implemented for text-based redaction. Use remove_sections_from_json() instead.")
        return text
|
60 |
+
|
61 |
+
@dataclass
class SectionDefinition:
    """Defines a section to extract/remove by specifying its start (and optional end) regex."""
    # Human-readable section name; used in log messages only.
    name: str
    # Regex pattern to identify the section start (use multiline anchors as needed).
    # NOTE(review): SectionExtractor.__init__ rebuilds instances of this class
    # with *compiled* re.Pattern objects stored in these str-annotated fields,
    # so after construction they may hold Patterns rather than strings —
    # consider tightening the annotations to `str | re.Pattern`.
    start_pattern: str
    # Regex for section end, or None if the section runs until the next section or EOF.
    end_pattern: Optional[str] = None
|
67 |
+
|
68 |
+
class SectionExtractor:
    """Finds and removes specified sections from document content.

    Two entry points are offered: :meth:`remove_sections` works on flat
    text (Markdown), while :meth:`remove_sections_from_json` operates on
    the Docling structured-JSON representation.
    """

    def __init__(self, sections: List[SectionDefinition]):
        """Store the section definitions with their regexes pre-compiled.

        Note: compiled Patterns are stored back into SectionDefinition's
        str-annotated fields — works at runtime, but the types diverge
        from the dataclass annotations.
        """
        # Compile regex patterns for performance
        self.sections = [
            SectionDefinition(sec.name, re.compile(sec.start_pattern), re.compile(sec.end_pattern) if sec.end_pattern else None)
            for sec in sections
        ]

    def remove_sections(self, text: str) -> str:
        """
        Remove all defined sections from the given text. Returns the redacted text.
        The text is expected to be the full document content (in Markdown or plain text form).

        Only the *first* match of each section's start pattern is considered
        (``search``, not ``finditer``); a section with no end pattern — or
        whose end pattern never matches — is removed through end-of-text.
        Overlapping ranges are coalesced via ``current_idx`` tracking below.
        """
        logger.info("Removing sections from text...")
        if not self.sections:
            return text  # nothing to remove

        to_remove_ranges = []  # will hold (start_index, end_index, section_name) tuples for removal

        # Find all section start positions
        for sec in self.sections:
            match = sec.start_pattern.search(text)
            if match:
                start_idx = match.start()
                # Determine end of section
                if sec.end_pattern:
                    # Search for the end marker only from the section start onward.
                    end_match = sec.end_pattern.search(text, start_idx)
                    if end_match:
                        # End pattern found; end index is start of end_match
                        end_idx = end_match.start()
                    else:
                        end_idx = len(text)  # if no end pattern found, remove till end
                else:
                    end_idx = len(text)  # default end is end-of-text (will adjust later if there's another section)
                to_remove_ranges.append((start_idx, end_idx, sec.name))
                logger.info(f"Marked section '{sec.name}' for removal (positions {start_idx}-{end_idx})")
            else:
                logger.info(f"Section '{sec.name}' not found in text (pattern: {sec.start_pattern.pattern})")

        if not to_remove_ranges:
            logger.info("No sections to remove.")
            return text

        # Sort ranges by start index
        to_remove_ranges.sort(key=lambda x: x[0])
        # If sections overlap or touch, adjust ranges to avoid double-counting:
        # walk the ranges left-to-right, copying only the text between them.
        redacted_text = ""
        current_idx = 0
        for start_idx, end_idx, sec_name in to_remove_ranges:
            # Append text from current_idx up to start_idx (keeping content before section)
            if current_idx < start_idx:
                redacted_text += text[current_idx:start_idx]
            else:
                # Overlapping (or immediately consecutive) section - its prefix
                # was already consumed by the previous removal region.
                logger.warning(f"Section '{sec_name}' overlaps with a previous section removal region.")
            current_idx = max(current_idx, end_idx)
        # Append any remaining text after last removed section
        if current_idx < len(text):
            redacted_text += text[current_idx:]

        return redacted_text

    def remove_sections_from_json(self, doc_json: Dict[str, Any]) -> Dict[str, Any]:
        """
        Remove specified sections from the structured JSON document.
        This method works with the Docling JSON structure to identify and remove
        sections based on their semantic content rather than just text patterns.

        Returns a deep copy; the input document is not mutated. Besides
        dropping matching ``texts`` entries, the body's ``$ref`` children
        are filtered and re-indexed to stay consistent with the shrunken
        ``texts`` array.
        """
        logger.info("Removing sections from structured JSON...")
        if not self.sections:
            return doc_json  # nothing to remove

        # Create a deep copy to avoid modifying the original
        import copy
        redacted_json = copy.deepcopy(doc_json)

        # Get all text elements from the document
        texts = redacted_json.get("texts", [])
        if not texts:
            logger.warning("No texts found in document JSON")
            return redacted_json

        # Find text elements that match our section patterns
        text_indices_to_remove = set()

        for sec in self.sections:
            logger.info(f"Looking for section '{sec.name}' with pattern: {sec.start_pattern.pattern}")

            # Find text elements that match the section start pattern
            for i, text_elem in enumerate(texts):
                text_content = text_elem.get("text", "")
                if sec.start_pattern.search(text_content):
                    logger.info(f"Found section '{sec.name}' in text element {i}: '{text_content[:50]}...'")
                    text_indices_to_remove.add(i)

                    # If we have an end pattern, also remove subsequent text elements until we find the end.
                    # Note: the element matching the end pattern itself is kept
                    # (break happens before the add), i.e. the end marker
                    # delimits but does not belong to the removed section.
                    if sec.end_pattern:
                        for j in range(i + 1, len(texts)):
                            next_text_content = texts[j].get("text", "")
                            if sec.end_pattern.search(next_text_content):
                                logger.info(f"Found end of section '{sec.name}' in text element {j}")
                                break
                            text_indices_to_remove.add(j)
                    else:
                        # No end pattern - remove this text element only
                        # For medication lists, we might want to remove the next few elements too
                        # This is a heuristic that could be made more sophisticated
                        if "medication" in sec.name.lower():
                            # Remove up to 3 subsequent text elements for medication lists
                            for j in range(i + 1, min(i + 4, len(texts))):
                                text_indices_to_remove.add(j)

        # Remove the identified text elements
        if text_indices_to_remove:
            logger.info(f"Removing {len(text_indices_to_remove)} text elements: {sorted(text_indices_to_remove)}")

            # Remove from texts array
            redacted_texts = [texts[i] for i in range(len(texts)) if i not in text_indices_to_remove]
            redacted_json["texts"] = redacted_texts

            # Update body children to remove references to deleted texts
            body = redacted_json.get("body", {})
            if "children" in body:
                # Filter out references to removed text elements
                original_children = body["children"]
                redacted_children = []

                for child_ref in original_children:
                    if "$ref" in child_ref:
                        ref_path = child_ref["$ref"]
                        # Check if this reference points to a text element we're keeping
                        if ref_path.startswith("#/texts/"):
                            try:
                                text_index = int(ref_path.split("/")[-1])
                                if text_index not in text_indices_to_remove:
                                    # Adjust the reference index since we removed some texts:
                                    # shift down by the number of removed indices below it.
                                    new_index = text_index - sum(1 for x in text_indices_to_remove if x < text_index)
                                    child_ref["$ref"] = f"#/texts/{new_index}"
                                    redacted_children.append(child_ref)
                            except (ValueError, IndexError):
                                # Keep the reference if we can't parse it
                                redacted_children.append(child_ref)
                        else:
                            # Keep non-text references
                            redacted_children.append(child_ref)
                    else:
                        # Keep non-reference children
                        redacted_children.append(child_ref)

                body["children"] = redacted_children
        else:
            logger.info("No sections found to remove")

        return redacted_json
|
src/streamlit_app.py
CHANGED
@@ -1,40 +1,549 @@
|
|
1 |
-
import altair as alt
|
2 |
-
import numpy as np
|
3 |
-
import pandas as pd
|
4 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
import logging
|
3 |
+
import os
|
4 |
+
import tempfile
|
5 |
+
import shutil
|
6 |
+
from processing.document_processor import DocumentProcessor
|
7 |
+
from processing.sections import ReasoningSectionExtractor
|
8 |
+
from utils.logging_utils import get_log_handler
|
9 |
+
from dotenv import load_dotenv
|
10 |
+
import sys
|
11 |
+
import html
|
12 |
+
import difflib
|
13 |
+
import re
|
14 |
|
15 |
+
# Load environment variables from .env
load_dotenv()

# Azure OpenAI connection settings. All four must be present in the
# environment (or .env) for the redaction pipeline; os.getenv returns
# None for any that are missing — failures then surface at client creation.
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")

# Directory for transient copies of uploaded PDFs; created eagerly so
# save_uploaded_file/cleanup_temp_files can assume it exists.
TEMP_DIR = "temp_files"
os.makedirs(TEMP_DIR, exist_ok=True)
|
26 |
+
|
27 |
+
def cleanup_temp_files():
    """Best-effort removal of every regular file inside the temp directory.

    Subdirectories (if any) are left untouched. Failures are logged as
    warnings instead of raised, because cleanup is not critical to the app.
    """
    try:
        if os.path.exists(TEMP_DIR):
            candidate_paths = (
                os.path.join(TEMP_DIR, entry) for entry in os.listdir(TEMP_DIR)
            )
            for path in candidate_paths:
                if os.path.isfile(path):
                    os.remove(path)
        logging.info(f"Cleaned up temporary files in {TEMP_DIR}")
    except Exception as e:
        logging.warning(f"Error cleaning up temp files: {e}")
|
38 |
+
|
39 |
+
def save_uploaded_file(uploaded_file, filename):
    """Persist an uploaded file into the temp directory and return its path.

    Args:
        uploaded_file: Streamlit UploadedFile-like object (must support
            ``seek`` and ``read``).
        filename: Original file name; incorporated into the temp path so
            distinct uploads do not overwrite each other.

    Returns:
        Path of the written temp file.

    Raises:
        Exception: any I/O error is logged and re-raised.
    """
    # Bug fix: the path previously used a hard-coded literal instead of the
    # `filename` argument, so every upload collided on the same temp file.
    temp_path = os.path.join(TEMP_DIR, f"temp_{filename}")
    try:
        uploaded_file.seek(0)  # Reset file pointer to beginning
        file_bytes = uploaded_file.read()
        with open(temp_path, "wb") as f:
            f.write(file_bytes)
        logging.info(f"Saved uploaded file to {temp_path}")
        return temp_path
    except Exception as e:
        logging.error(f"Error saving uploaded file: {e}")
        raise
|
52 |
+
|
53 |
+
logging.basicConfig(
|
54 |
+
level=logging.INFO, # or DEBUG for more verbosity
|
55 |
+
format="%(asctime)s %(levelname)s %(name)s: %(message)s",
|
56 |
+
stream=sys.stdout,
|
57 |
+
force=True
|
58 |
+
)
|
59 |
+
|
60 |
+
# Configure page layout to use wide mode
|
61 |
+
st.set_page_config(
|
62 |
+
page_title="Medical Document Parser & Redactor",
|
63 |
+
page_icon="π",
|
64 |
+
layout="wide",
|
65 |
+
initial_sidebar_state="collapsed"
|
66 |
+
)
|
67 |
+
|
68 |
+
# Add custom CSS for better styling
|
69 |
+
st.markdown("""
|
70 |
+
<style>
|
71 |
+
/* Custom styling for text areas */
|
72 |
+
.stTextArea textarea {
|
73 |
+
font-family: 'Courier New', monospace !important;
|
74 |
+
font-size: 12px !important;
|
75 |
+
line-height: 1.4 !important;
|
76 |
+
border: 2px solid #e0e0e0 !important;
|
77 |
+
border-radius: 8px !important;
|
78 |
+
}
|
79 |
+
|
80 |
+
/* Hover effect for text areas */
|
81 |
+
.stTextArea textarea:hover {
|
82 |
+
border-color: #1f77b4 !important;
|
83 |
+
}
|
84 |
+
|
85 |
+
/* Custom styling for download buttons */
|
86 |
+
.stDownloadButton > button {
|
87 |
+
border-radius: 8px !important;
|
88 |
+
font-weight: 600 !important;
|
89 |
+
}
|
90 |
+
|
91 |
+
/* Custom styling for the comparison section */
|
92 |
+
.comparison-container {
|
93 |
+
background-color: #f8f9fa;
|
94 |
+
padding: 20px;
|
95 |
+
border-radius: 10px;
|
96 |
+
border: 1px solid #e9ecef;
|
97 |
+
}
|
98 |
+
</style>
|
99 |
+
""", unsafe_allow_html=True)
|
100 |
+
|
101 |
+
# Configure root logger only once (avoid duplicate handlers on reruns)
|
102 |
+
if len(logging.getLogger().handlers) == 0:
|
103 |
+
logging.getLogger().setLevel(logging.INFO)
|
104 |
+
# (We will attach custom handlers during processing as needed)
|
105 |
+
|
106 |
+
# Title and description
|
107 |
+
st.title("Medical Document Parser & Redactor")
|
108 |
+
st.write("""
|
109 |
+
Upload PDF medical documents to parse their content using **Docling** (structure-aware parser)
|
110 |
+
and automatically **redact specific sections** (e.g., initial and final medication lists).
|
111 |
+
Use the buttons below to view the original structure or process with redaction.
|
112 |
+
""")
|
113 |
+
|
114 |
+
# File uploader (accept multiple PDF files)
|
115 |
+
uploaded_files = st.file_uploader("Upload PDF medical documents", type=["pdf"], accept_multiple_files=True)
|
116 |
+
|
117 |
+
# Show temp directory status
|
118 |
+
temp_file_count = len(os.listdir(TEMP_DIR)) if os.path.exists(TEMP_DIR) else 0
|
119 |
+
if temp_file_count > 0:
|
120 |
+
st.caption(f"π {temp_file_count} temporary file(s) in temp_files/ directory")
|
121 |
+
else:
|
122 |
+
st.caption("π No temporary files")
|
123 |
+
|
124 |
+
# Initialize session state storage for results and logs
|
125 |
+
if "processed_results" not in st.session_state:
|
126 |
+
st.session_state.processed_results = {} # {filename: {"structured_json": ..., "redacted_md": ..., "redacted_json": ...}}
|
127 |
+
if "logs" not in st.session_state:
|
128 |
+
st.session_state.logs = {} # {filename: log_text}
|
129 |
+
if "original_structures" not in st.session_state:
|
130 |
+
st.session_state.original_structures = {} # {filename: structured_json}
|
131 |
+
|
132 |
+
# Clean up temp files on app start (but keep the directory)
|
133 |
+
if "temp_cleaned" not in st.session_state:
|
134 |
+
cleanup_temp_files()
|
135 |
+
st.session_state.temp_cleaned = True
|
136 |
+
|
137 |
+
def create_diff_content(original_text: str, redacted_text: str, view_type: str) -> str:
    """Render a line-level diff of two markdown documents as styled HTML.

    Args:
        original_text: Markdown of the original document.
        redacted_text: Markdown of the redacted document.
        view_type: ``'original'`` highlights removed lines in red and shows
            additions as placeholders; ``'redacted'`` highlights added lines
            in green and shows removals as placeholders. Any other value
            yields an empty string.

    Returns:
        One styled ``<div>`` per diff line, joined with newlines. All
        document text is HTML-escaped before embedding.
    """
    # Note: the redundant function-local `import difflib` / `import re`
    # were removed — both modules are imported at module top level.

    def _normalize(text):
        """Strip blank lines/outer whitespace and smooth over formatting-only
        differences so the diff highlights content changes, not markup noise."""
        normalized_lines = []
        for raw_line in text.split('\n'):
            stripped = raw_line.strip()
            if stripped:
                # Treat '## Title' and '# Title' as the same header level.
                if re.match(r'^##\s+', stripped):
                    stripped = re.sub(r'^##\s+', '# ', stripped)
                # Normalize quote markers emitted with a stray '+ ' prefix.
                # (A previous identity replace('> ', '> ') no-op was dropped.)
                if stripped.startswith('+ > '):
                    stripped = stripped.replace('+ > ', '> ')
                normalized_lines.append(stripped)
        return normalized_lines

    original_lines = _normalize(original_text)
    redacted_lines = _normalize(redacted_text)

    # difflib.Differ prefixes: '  ' unchanged, '- ' only-in-original,
    # '+ ' only-in-redacted, '? ' intraline hints (ignored below).
    differ = difflib.Differ()
    diff = list(differ.compare(original_lines, redacted_lines))

    html_lines = []

    if view_type == 'original':
        # Show original with removed content highlighted.
        for line in diff:
            if line.startswith('  '):  # Unchanged line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
            elif line.startswith('- '):  # Removed line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-weight: bold;">- {escaped_line}</div>')
            elif line.startswith('+ '):  # Added line: placeholder keeps panels aligned
                html_lines.append(f'<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-style: italic; opacity: 0.7;">+ (added in redacted version)</div>')
            elif line.startswith('? '):  # Ignore difflib hints
                continue

    elif view_type == 'redacted':
        # Show redacted content with added content highlighted.
        for line in diff:
            if line.startswith('  '):  # Unchanged line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
            elif line.startswith('- '):  # Removed line: placeholder keeps panels aligned
                html_lines.append(f'<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-style: italic; opacity: 0.7;">- (removed from original)</div>')
            elif line.startswith('+ '):  # Added line
                escaped_line = html.escape(line[2:])
                html_lines.append(f'<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-weight: bold;">+ {escaped_line}</div>')
            elif line.startswith('? '):  # Ignore difflib hints
                continue

    return '\n'.join(html_lines)
|
202 |
+
|
203 |
+
if uploaded_files:
|
204 |
+
# UI to select which file to work with (if multiple files uploaded)
|
205 |
+
file_names = [f.name for f in uploaded_files]
|
206 |
+
selected_file = st.selectbox("Select a file to work with", options=file_names)
|
207 |
+
|
208 |
+
if selected_file:
|
209 |
+
# Find the selected uploaded file
|
210 |
+
uploaded_file = next(f for f in uploaded_files if f.name == selected_file)
|
211 |
+
|
212 |
+
# Create buttons for different actions
|
213 |
+
col1, col2, col3 = st.columns(3)
|
214 |
+
|
215 |
+
with col1:
|
216 |
+
if st.button("π Show Original", type="primary"):
|
217 |
+
# Process the document to get original structure (without redaction)
|
218 |
+
if selected_file not in st.session_state.original_structures:
|
219 |
+
# Save uploaded file to a temporary location
|
220 |
+
temp_path = save_uploaded_file(uploaded_file, selected_file)
|
221 |
+
|
222 |
+
# Create a DocumentProcessor without section extraction (for original structure)
|
223 |
+
processor = DocumentProcessor(section_extractor=None)
|
224 |
+
|
225 |
+
# Process the document to get original structure
|
226 |
+
result = processor.process(temp_path)
|
227 |
+
st.session_state.original_structures[selected_file] = result.structured_json
|
228 |
+
# Also store the original markdown for comparison
|
229 |
+
st.session_state.original_structures[f"{selected_file}_markdown"] = result.structured_markdown
|
230 |
+
|
231 |
+
# Display the original structure
|
232 |
+
st.session_state.show_original = True
|
233 |
+
st.session_state.show_processed = False
|
234 |
+
|
235 |
+
with col2:
|
236 |
+
if st.button("π Process with Redaction"):
|
237 |
+
# Process the document with redaction
|
238 |
+
if selected_file not in st.session_state.processed_results:
|
239 |
+
# Save uploaded file to a temporary location
|
240 |
+
temp_path = save_uploaded_file(uploaded_file, selected_file)
|
241 |
+
|
242 |
+
# Create a DocumentProcessor with a SectionExtractor for our target sections
|
243 |
+
section_extractor = ReasoningSectionExtractor(
|
244 |
+
endpoint=AZURE_OPENAI_ENDPOINT,
|
245 |
+
api_key=AZURE_OPENAI_KEY,
|
246 |
+
api_version=AZURE_OPENAI_VERSION,
|
247 |
+
deployment=AZURE_OPENAI_DEPLOYMENT,
|
248 |
+
)
|
249 |
+
processor = DocumentProcessor(section_extractor=section_extractor)
|
250 |
+
|
251 |
+
# Attach an in-memory log handler to capture logs for this file
|
252 |
+
log_handler, log_buffer = get_log_handler()
|
253 |
+
root_logger = logging.getLogger()
|
254 |
+
root_logger.addHandler(log_handler)
|
255 |
+
try:
|
256 |
+
# Process the document (Docling parse + section redaction)
|
257 |
+
result = processor.process(temp_path)
|
258 |
+
finally:
|
259 |
+
# Remove handler and stop capturing logs
|
260 |
+
root_logger.removeHandler(log_handler)
|
261 |
+
|
262 |
+
# Save results in session state
|
263 |
+
st.session_state.processed_results[selected_file] = {
|
264 |
+
"structured_json": result.structured_json,
|
265 |
+
"redacted_md": result.redacted_markdown,
|
266 |
+
"redacted_json": result.redacted_json
|
267 |
+
}
|
268 |
+
# Combine log records into a single text
|
269 |
+
log_text = "\n".join(log_buffer)
|
270 |
+
st.session_state.logs[selected_file] = log_text
|
271 |
+
|
272 |
+
st.session_state.show_original = False
|
273 |
+
st.session_state.show_processed = True
|
274 |
+
|
275 |
+
with col3:
|
276 |
+
if st.button("π Switch View"):
|
277 |
+
# Toggle between views
|
278 |
+
if st.session_state.get("show_original", False):
|
279 |
+
st.session_state.show_original = False
|
280 |
+
st.session_state.show_processed = True
|
281 |
+
else:
|
282 |
+
st.session_state.show_original = True
|
283 |
+
st.session_state.show_processed = False
|
284 |
+
|
285 |
+
# Show current view status
|
286 |
+
if st.session_state.get("show_original", False):
|
287 |
+
st.info("π Currently viewing: **Original Document Structure**")
|
288 |
+
elif st.session_state.get("show_processed", False):
|
289 |
+
st.success("π Currently viewing: **Processed Document with Redaction**")
|
290 |
+
else:
|
291 |
+
st.info("βΉοΈ Select an action above to view document content")
|
292 |
+
|
293 |
+
# Display results based on button clicked
|
294 |
+
if st.session_state.get("show_original", False):
|
295 |
+
st.markdown("---")
|
296 |
+
st.subheader(f"Original Document Structure - {selected_file}")
|
297 |
+
|
298 |
+
# Get the original structure
|
299 |
+
original_json = st.session_state.original_structures[selected_file]
|
300 |
+
original_markdown = st.session_state.original_structures.get(f"{selected_file}_markdown", "")
|
301 |
+
|
302 |
+
# Display PDF viewer and original markdown side by side
|
303 |
+
col1, col2 = st.columns([1, 1])
|
304 |
+
|
305 |
+
with col1:
|
306 |
+
st.subheader("π Original PDF")
|
307 |
+
# Reset file pointer to beginning
|
308 |
+
uploaded_file.seek(0)
|
309 |
+
# Display PDF using base64 encoding for inline display
|
310 |
+
import base64
|
311 |
+
pdf_bytes = uploaded_file.getvalue()
|
312 |
+
b64_pdf = base64.b64encode(pdf_bytes).decode()
|
313 |
+
pdf_display = f'<iframe src="data:application/pdf;base64,{b64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
|
314 |
+
st.markdown(pdf_display, unsafe_allow_html=True)
|
315 |
+
|
316 |
+
with col2:
|
317 |
+
st.subheader("π Original Document (Markdown)")
|
318 |
+
st.caption("Docling-generated markdown from the PDF")
|
319 |
+
# Use a text area for better readability and scrolling
|
320 |
+
st.text_area(
|
321 |
+
label="Original markdown content",
|
322 |
+
value=original_markdown,
|
323 |
+
height=600,
|
324 |
+
key="original_markdown_display",
|
325 |
+
label_visibility="collapsed"
|
326 |
+
)
|
327 |
+
|
328 |
+
# Add a download button for the original markdown
|
329 |
+
st.markdown("---")
|
330 |
+
col1, col2 = st.columns(2)
|
331 |
+
with col1:
|
332 |
+
st.download_button(
|
333 |
+
label="π₯ Download Original Markdown",
|
334 |
+
data=original_markdown,
|
335 |
+
file_name=f"{selected_file}_original.md",
|
336 |
+
mime="text/markdown"
|
337 |
+
)
|
338 |
+
with col2:
|
339 |
+
st.subheader("π JSON Structure")
|
340 |
+
st.json(original_json)
|
341 |
+
|
342 |
+
elif st.session_state.get("show_processed", False):
|
343 |
+
st.markdown("---")
|
344 |
+
st.subheader(f"Processed Document - {selected_file}")
|
345 |
+
|
346 |
+
# Retrieve stored results
|
347 |
+
data = st.session_state.processed_results[selected_file]
|
348 |
+
structured_json = data["structured_json"]
|
349 |
+
redacted_md = data["redacted_md"]
|
350 |
+
redacted_json = data["redacted_json"]
|
351 |
+
|
352 |
+
# Get the original markdown from the structured JSON
|
353 |
+
# We need to reconstruct the original markdown from the structured JSON
|
354 |
+
# For now, we'll use the structured_markdown from the DocumentResult
|
355 |
+
# But we need to store this in the session state
|
356 |
+
|
357 |
+
# Create a DocumentProcessor to get the original markdown
|
358 |
+
if "original_markdown" not in st.session_state.processed_results[selected_file]:
|
359 |
+
# Save uploaded file to a temporary location
|
360 |
+
temp_path = save_uploaded_file(uploaded_file, selected_file)
|
361 |
+
|
362 |
+
# Create a DocumentProcessor without section extraction to get original markdown
|
363 |
+
processor = DocumentProcessor(section_extractor=None)
|
364 |
+
result = processor.process(temp_path)
|
365 |
+
|
366 |
+
# Store the original markdown
|
367 |
+
st.session_state.processed_results[selected_file]["original_markdown"] = result.structured_markdown
|
368 |
+
|
369 |
+
original_md = st.session_state.processed_results[selected_file]["original_markdown"]
|
370 |
+
|
371 |
+
# Show processing summary
|
372 |
+
original_texts = structured_json.get("texts", [])
|
373 |
+
redacted_texts = redacted_json.get("texts", [])
|
374 |
+
removed_count = len(original_texts) - len(redacted_texts)
|
375 |
+
|
376 |
+
if removed_count > 0:
|
377 |
+
st.success(f"β
Successfully removed {removed_count} text elements containing medication information")
|
378 |
+
else:
|
379 |
+
st.info("βΉοΈ No medication sections were identified for removal")
|
380 |
+
|
381 |
+
# Create tabs for different views
|
382 |
+
tab1, tab2, tab3 = st.tabs(["π Side-by-Side Comparison", "π JSON Structure", "π Processing Details"])
|
383 |
+
|
384 |
+
with tab1:
|
385 |
+
st.subheader("Original vs Redacted Content")
|
386 |
+
st.caption("Compare the original document content with the redacted version")
|
387 |
+
|
388 |
+
# Create a diff-like interface with synchronized scrolling and highlighting
|
389 |
+
diff_html = f"""
|
390 |
+
<div style="display: flex; gap: 20px; height: 600px; font-family: 'Courier New', monospace; font-size: 12px;">
|
391 |
+
<div style="flex: 1; border: 1px solid #ddd; border-radius: 5px; overflow: hidden;">
|
392 |
+
<div style="background-color: #f8f9fa; padding: 10px; border-bottom: 1px solid #ddd; font-weight: bold; color: #1f77b4;">
|
393 |
+
π Original Document
|
394 |
+
</div>
|
395 |
+
<div id="original-content" style="height: 540px; overflow-y: auto; padding: 10px; background-color: #fff;">
|
396 |
+
{create_diff_content(original_md, redacted_md, 'original')}
|
397 |
+
</div>
|
398 |
+
</div>
|
399 |
+
<div style="flex: 1; border: 1px solid #ddd; border-radius: 5px; overflow: hidden;">
|
400 |
+
<div style="background-color: #f8f9fa; padding: 10px; border-bottom: 1px solid #ddd; font-weight: bold; color: #ff7f0e;">
|
401 |
+
π Redacted Document
|
402 |
+
</div>
|
403 |
+
<div id="redacted-content" style="height: 540px; overflow-y: auto; padding: 10px; background-color: #fff;">
|
404 |
+
{create_diff_content(original_md, redacted_md, 'redacted')}
|
405 |
+
</div>
|
406 |
+
</div>
|
407 |
+
</div>
|
408 |
+
|
409 |
+
<script>
|
410 |
+
// Synchronized scrolling
|
411 |
+
function syncScroll(sourceId, targetId) {{
|
412 |
+
const source = document.getElementById(sourceId);
|
413 |
+
const target = document.getElementById(targetId);
|
414 |
+
if (source && target) {{
|
415 |
+
target.scrollTop = source.scrollTop;
|
416 |
+
}}
|
417 |
+
}}
|
418 |
+
|
419 |
+
// Add scroll event listeners
|
420 |
+
document.addEventListener('DOMContentLoaded', function() {{
|
421 |
+
const original = document.getElementById('original-content');
|
422 |
+
const redacted = document.getElementById('redacted-content');
|
423 |
+
|
424 |
+
if (original && redacted) {{
|
425 |
+
original.addEventListener('scroll', function() {{
|
426 |
+
syncScroll('original-content', 'redacted-content');
|
427 |
+
}});
|
428 |
+
|
429 |
+
redacted.addEventListener('scroll', function() {{
|
430 |
+
syncScroll('redacted-content', 'original-content');
|
431 |
+
}});
|
432 |
+
}}
|
433 |
+
}});
|
434 |
+
</script>
|
435 |
+
"""
|
436 |
+
|
437 |
+
st.markdown(diff_html, unsafe_allow_html=True)
|
438 |
+
|
439 |
+
# Add legend for the diff highlighting
|
440 |
+
st.markdown("---")
|
441 |
+
col1, col2, col3 = st.columns(3)
|
442 |
+
with col1:
|
443 |
+
st.markdown("**π¨ Diff Legend:**")
|
444 |
+
st.markdown("π΄ **Red background** = Removed content")
|
445 |
+
st.markdown("π’ **Green background** = Added content")
|
446 |
+
st.markdown("βͺ **White background** = Unchanged content")
|
447 |
+
|
448 |
+
with col2:
|
449 |
+
st.markdown("**π Scrolling:**")
|
450 |
+
st.markdown("Scroll either panel to sync both views")
|
451 |
+
st.markdown("Content is aligned for easy comparison")
|
452 |
+
|
453 |
+
with col3:
|
454 |
+
st.markdown("**π‘ Tips:**")
|
455 |
+
st.markdown("Look for red-highlighted sections")
|
456 |
+
st.markdown("These show what was redacted")
|
457 |
+
st.markdown("Use scroll to navigate long documents")
|
458 |
+
|
459 |
+
# Add download buttons and other options
|
460 |
+
st.markdown("---")
|
461 |
+
st.subheader("π₯ Download Options")
|
462 |
+
|
463 |
+
col1, col2, col3 = st.columns(3)
|
464 |
+
with col1:
|
465 |
+
st.download_button(
|
466 |
+
label="π₯ Download Original Markdown",
|
467 |
+
data=original_md,
|
468 |
+
file_name=f"{selected_file}_original.md",
|
469 |
+
mime="text/markdown",
|
470 |
+
use_container_width=True
|
471 |
+
)
|
472 |
+
with col2:
|
473 |
+
st.download_button(
|
474 |
+
label="π₯ Download Redacted Markdown",
|
475 |
+
data=redacted_md,
|
476 |
+
file_name=f"{selected_file}_redacted.md",
|
477 |
+
mime="text/markdown",
|
478 |
+
use_container_width=True
|
479 |
+
)
|
480 |
+
with col3:
|
481 |
+
# Create a detailed diff view
|
482 |
+
if st.button("π Show Detailed Differences", use_container_width=True):
|
483 |
+
st.session_state.show_diff = True
|
484 |
+
|
485 |
+
# Show detailed diff if requested
|
486 |
+
if st.session_state.get("show_diff", False):
|
487 |
+
st.markdown("---")
|
488 |
+
st.subheader("π Detailed Content Differences")
|
489 |
+
|
490 |
+
# Simple diff visualization
|
491 |
+
original_lines = original_md.split('\n')
|
492 |
+
redacted_lines = redacted_md.split('\n')
|
493 |
+
|
494 |
+
# Find removed lines
|
495 |
+
removed_lines = []
|
496 |
+
for line in original_lines:
|
497 |
+
if line.strip() and line not in redacted_lines:
|
498 |
+
removed_lines.append(line)
|
499 |
+
|
500 |
+
if removed_lines:
|
501 |
+
st.warning(f"**Removed {len(removed_lines)} lines containing medication information:**")
|
502 |
+
for i, line in enumerate(removed_lines[:10]): # Show first 10 removed lines
|
503 |
+
st.text(f"β {line[:100]}{'...' if len(line) > 100 else ''}")
|
504 |
+
if len(removed_lines) > 10:
|
505 |
+
st.text(f"... and {len(removed_lines) - 10} more lines")
|
506 |
+
else:
|
507 |
+
st.info("No significant differences detected in the text content")
|
508 |
+
|
509 |
+
with tab2:
|
510 |
+
st.subheader("Document Structure Analysis")
|
511 |
+
|
512 |
+
# Show JSON structure comparison
|
513 |
+
col1, col2 = st.columns(2)
|
514 |
+
|
515 |
+
with col1:
|
516 |
+
st.markdown("**π Original Structure (JSON)**")
|
517 |
+
st.json(structured_json)
|
518 |
+
|
519 |
+
with col2:
|
520 |
+
st.markdown("**π Redacted Structure (JSON)**")
|
521 |
+
st.json(redacted_json)
|
522 |
+
|
523 |
+
with tab3:
|
524 |
+
st.subheader("Processing Details")
|
525 |
+
|
526 |
+
# Show what was removed
|
527 |
+
if removed_count > 0:
|
528 |
+
st.info(f"**Removed {removed_count} text elements from the document structure.**")
|
529 |
+
|
530 |
+
# Show the removed text elements
|
531 |
+
st.subheader("Removed Text Elements:")
|
532 |
+
removed_texts = []
|
533 |
+
for i, text_elem in enumerate(original_texts):
|
534 |
+
if i >= len(redacted_texts) or text_elem.get("text", "") != redacted_texts[i].get("text", ""):
|
535 |
+
removed_texts.append((i, text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")))
|
536 |
+
|
537 |
+
for idx, text in removed_texts:
|
538 |
+
st.text(f"Text {idx}: {text}")
|
539 |
+
else:
|
540 |
+
st.info("No text elements were removed during processing.")
|
541 |
+
|
542 |
+
# Show processing logs
|
543 |
+
st.subheader("Processing Logs")
|
544 |
+
st.text_area(
|
545 |
+
label="Processing logs",
|
546 |
+
value=st.session_state.logs.get(selected_file, ""),
|
547 |
+
height=300,
|
548 |
+
label_visibility="collapsed"
|
549 |
+
)
|
src/utils/__init__.py
ADDED
File without changes
|
src/utils/logging_utils.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
|
3 |
+
class ListHandler(logging.Handler):
    """Logging handler that appends formatted log messages to an in-memory list.

    Useful for capturing the log output of a single operation so it can later
    be shown in a UI (here: the Streamlit "Processing Logs" panel) instead of
    being written to a stream or file.
    """

    def __init__(self, log_list):
        """
        Args:
            log_list: A mutable list that formatted log entries are appended
                to. The caller keeps a reference to it and reads the captured
                entries back after the operation completes.
        """
        super().__init__()
        self.log_list = log_list
        # Simple "HH:MM:SS - LEVEL - message" format for human-readable capture.
        formatter = logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(message)s",
                                      datefmt="%H:%M:%S")
        self.setFormatter(formatter)

    def emit(self, record):
        """Format *record* and append the result to the backing list.

        Follows the logging.Handler contract: any error raised while
        formatting/appending is routed to handleError() so a bad log record
        can never crash the code that emitted it.
        """
        try:
            self.log_list.append(self.format(record))
        except Exception:
            self.handleError(record)
|
16 |
+
|
17 |
+
def get_log_handler():
    """Build a fresh ListHandler wired to a new, empty list.

    Returns:
        A ``(handler, log_list)`` pair: attach ``handler`` to a logger to
        capture the log messages of one specific operation, then read the
        formatted entries back out of ``log_list``.
    """
    captured = []
    list_handler = ListHandler(captured)
    # DEBUG level so the handler records everything; the attached logger's
    # own level still decides what actually reaches it.
    list_handler.setLevel(logging.DEBUG)
    return list_handler, captured
|
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|