Commit 5d3ebd9 · levalencia committed · 1 Parent(s): 442515d

Add reportlab dependency for PDF generation and enhance document processing


- Added reportlab to pyproject.toml and requirements.txt for PDF generation capabilities.
- Updated document processing logic to include a new function for generating redacted PDFs from the processed document structure.
- Enhanced logging and error handling during PDF generation to improve user feedback and debugging.
- Refactored the document processor to return detailed processing results, including removed indices and cost metrics.

JUPYTER_USAGE.md ADDED
@@ -0,0 +1,143 @@
+ # Jupyter Notebook Usage
+
+ This document shows how to use the document processing function in Jupyter notebooks for integration into larger processing pipelines.
+
+ ## Simple Usage
+
+ ```python
+ from processing.document_processor import process_document_with_redaction
+
+ # Process a single document
+ result = process_document_with_redaction(
+     file_path="path/to/your/document.pdf",
+     endpoint="your-azure-openai-endpoint",
+     api_key="your-azure-openai-key",
+     api_version="2024-02-15-preview",
+     deployment="o3-mini"  # or "o4-mini", "o3", "o4"
+ )
+
+ # Access the results
+ original_md = result.original_document_md
+ redacted_md = result.redacted_document_md
+ input_tokens = result.input_tokens
+ output_tokens = result.output_tokens
+ cost = result.cost
+
+ print("Processing complete!")
+ print(f"Input tokens: {input_tokens:,}")
+ print(f"Output tokens: {output_tokens:,}")
+ print(f"Total cost: ${cost:.4f}")
+ ```
+
+ ## Batch Processing
+
+ ```python
+ import os
+ from processing.document_processor import process_document_with_redaction
+
+ # Configuration
+ AZURE_OPENAI_ENDPOINT = "your-azure-openai-endpoint"
+ AZURE_OPENAI_KEY = "your-azure-openai-key"
+ AZURE_OPENAI_VERSION = "2024-02-15-preview"
+ AZURE_OPENAI_DEPLOYMENT = "o3-mini"
+
+ # Process multiple documents
+ pdf_directory = "path/to/pdf/files"
+ results = []
+
+ for filename in os.listdir(pdf_directory):
+     if filename.endswith('.pdf'):
+         file_path = os.path.join(pdf_directory, filename)
+
+         print(f"Processing {filename}...")
+
+         try:
+             result = process_document_with_redaction(
+                 file_path=file_path,
+                 endpoint=AZURE_OPENAI_ENDPOINT,
+                 api_key=AZURE_OPENAI_KEY,
+                 api_version=AZURE_OPENAI_VERSION,
+                 deployment=AZURE_OPENAI_DEPLOYMENT
+             )
+
+             results.append({
+                 'filename': filename,
+                 'original_md': result.original_document_md,
+                 'redacted_md': result.redacted_document_md,
+                 'input_tokens': result.input_tokens,
+                 'output_tokens': result.output_tokens,
+                 'cost': result.cost
+             })
+
+             print(f"  ✓ Completed - Cost: ${result.cost:.4f}")
+
+         except Exception as e:
+             print(f"  ✗ Error processing {filename}: {e}")
+
+ # Summary
+ total_cost = sum(r['cost'] for r in results)
+ total_input_tokens = sum(r['input_tokens'] for r in results)
+ total_output_tokens = sum(r['output_tokens'] for r in results)
+
+ print("\nBatch processing complete!")
+ print(f"Documents processed: {len(results)}")
+ print(f"Total input tokens: {total_input_tokens:,}")
+ print(f"Total output tokens: {total_output_tokens:,}")
+ print(f"Total cost: ${total_cost:.4f}")
+ ```
+
+ ## Environment Variables
+
+ You can also use environment variables for configuration:
+
+ ```python
+ import os
+ from dotenv import load_dotenv
+ from processing.document_processor import process_document_with_redaction
+
+ # Load environment variables
+ load_dotenv()
+
+ # Get configuration from environment
+ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
+ AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
+ AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
+ AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
+
+ # Process document
+ result = process_document_with_redaction(
+     file_path="document.pdf",
+     endpoint=AZURE_OPENAI_ENDPOINT,
+     api_key=AZURE_OPENAI_KEY,
+     api_version=AZURE_OPENAI_VERSION,
+     deployment=AZURE_OPENAI_DEPLOYMENT
+ )
+ ```
+
+ ## Return Value
+
+ The function returns a `ProcessingResult` object with the following attributes:
+
+ - `original_document_md`: Markdown version of the original document
+ - `redacted_document_md`: Markdown version with medication sections removed
+ - `original_document_json`: Docling JSON structure of the original document
+ - `redacted_document_json`: Docling JSON structure with the medication sections removed
+ - `removed_indices`: Indices of the text elements that were removed
+ - `input_tokens`: Number of input tokens used
+ - `output_tokens`: Number of output tokens generated
+ - `cost`: Total cost in USD
+
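+ As one illustrative way to persist these outputs in a pipeline (the file names are placeholders, not part of the library):
+
+ ```python
+ from pathlib import Path
+
+ # Hypothetical output paths; adjust to your pipeline.
+ Path("original.md").write_text(result.original_document_md, encoding="utf-8")
+ Path("redacted.md").write_text(result.redacted_document_md, encoding="utf-8")
+ ```
+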
127
+ ## Supported Models
128
+
129
+ The function supports the following Azure OpenAI deployment names:
130
+ - `o3-mini` (GPT-4o Mini) - Cheapest option
131
+ - `o4-mini` (GPT-4o Mini) - Same as o3-mini
132
+ - `o3` (GPT-3.5 Turbo) - Medium cost
133
+ - `o4` (GPT-4o) - Most expensive but most capable
134
+
+ ## Error Handling
+
+ The function will raise exceptions for:
+ - File not found
+ - Invalid Azure OpenAI credentials
+ - API rate limits
+ - Network errors
+
+ Make sure to handle these appropriately in your pipeline.
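+
+ For example, one minimal pattern (adjust the exception types to what your Azure OpenAI client actually raises; the configuration names are assumed from the section above):
+
+ ```python
+ from processing.document_processor import process_document_with_redaction
+
+ try:
+     result = process_document_with_redaction(
+         file_path="document.pdf",
+         endpoint=AZURE_OPENAI_ENDPOINT,
+         api_key=AZURE_OPENAI_KEY,
+         api_version=AZURE_OPENAI_VERSION,
+         deployment=AZURE_OPENAI_DEPLOYMENT
+     )
+ except FileNotFoundError as e:
+     print(f"Input file missing: {e}")
+ except Exception as e:
+     # Credential, rate-limit, and network errors all surface here.
+     print(f"Processing failed: {e}")
+ ```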
pyproject.toml CHANGED
@@ -10,4 +10,5 @@ dependencies = [
     "pyyaml>=6.0",
     "python-dotenv>=1.1.1",
     "openai>=1.91.0",
+    "reportlab>=4.4.2",
 ]
requirements.txt CHANGED
@@ -4,4 +4,5 @@ docling
 streamlit
 pyyaml
 python-dotenv
-openai
+openai
+reportlab
src/processing/document_processor.py CHANGED
@@ -3,11 +3,12 @@ import time
 import logging
 import json
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, Tuple
 
 # Don't import DocumentConverter at module level to prevent early initialization
 # from docling.document_converter import DocumentConverter
 from processing.sections import SectionExtractor
+from utils.cost_tracker import cost_tracker
 
 # Remove global converter initialization - will be done lazily
 # _docling_converter = DocumentConverter()
@@ -23,6 +24,92 @@ class DocumentResult:
     redacted_markdown: str
     redacted_json: dict
 
+@dataclass
+class ProcessingResult:
+    """Simple result for Jupyter notebook usage."""
+    original_document_md: str
+    redacted_document_md: str
+    original_document_json: dict
+    redacted_document_json: dict
+    removed_indices: list  # The indices that were actually removed
+    input_tokens: int
+    output_tokens: int
+    cost: float
+
+def process_document_with_redaction(
+    file_path: str,
+    endpoint: str,
+    api_key: str,
+    api_version: str,
+    deployment: str,
+    section_extractor: Optional[SectionExtractor] = None
+) -> ProcessingResult:
+    """
+    Process a document and return the combined results.
+
+    Args:
+        file_path: Path to the PDF file to process
+        endpoint: Azure OpenAI endpoint
+        api_key: Azure OpenAI API key
+        api_version: Azure OpenAI API version
+        deployment: Azure OpenAI deployment name
+        section_extractor: Optional custom section extractor
+
+    Returns:
+        A ProcessingResult with the original and redacted documents (markdown
+        and JSON), the removed indices, and token/cost metrics.
+    """
+    logger.info(f"Processing document: {file_path}")
+
+    # Reset cost tracker for this processing session
+    cost_tracker.reset_session()
+
+    # Create section extractor if not provided
+    if section_extractor is None:
+        from processing.sections import ReasoningSectionExtractor
+        section_extractor = ReasoningSectionExtractor(
+            endpoint=endpoint,
+            api_key=api_key,
+            api_version=api_version,
+            deployment=deployment,
+        )
+
+    # Process the document
+    processor = DocumentProcessor(section_extractor=section_extractor)
+    result = processor.process(file_path)
+
+    # Get the actual removed indices from the section extractor
+    removed_indices = []
+    if section_extractor:
+        # Extract the removed indices from the LLM response
+        extraction_result = section_extractor.llm_extractor.extract_medication_sections(result.structured_json)
+        removed_indices = extraction_result.get("indices_to_remove", [])
+
+    # Get cost summary; total_tokens covers input and output combined
+    cost_summary = cost_tracker.get_session_summary()
+    total_input_tokens = cost_summary.get("total_tokens", 0)
+    total_output_tokens = 0  # Calculated from the per-model breakdown below
+    total_cost = cost_summary.get("total_cost", 0.0)
+
+    # Calculate output tokens from model breakdown
+    for model_stats in cost_summary.get("model_breakdown", {}).values():
+        total_output_tokens += model_stats.get("output_tokens", 0)
+
+    # Calculate input tokens (total - output)
+    total_input_tokens = total_input_tokens - total_output_tokens
+
+    logger.info(f"Processing complete - Input: {total_input_tokens}, Output: {total_output_tokens}, Cost: ${total_cost:.4f}")
+
+    return ProcessingResult(
+        original_document_md=result.structured_markdown,
+        redacted_document_md=result.redacted_markdown,
+        original_document_json=result.structured_json,
+        redacted_document_json=result.redacted_json,
+        removed_indices=removed_indices,
+        input_tokens=total_input_tokens,
+        output_tokens=total_output_tokens,
+        cost=total_cost
+    )
+
 class DocumentProcessor:
     """Handles parsing of documents with Docling and redacting specified sections."""
     def __init__(self, section_extractor: Optional[SectionExtractor] = None):
@@ -80,12 +167,12 @@
 
         # Persist outputs to files (JSON and redacted text) for auditing
         base_name = os.path.splitext(os.path.basename(file_path))[0]
-        # Use temp directory for output files - try to use the same temp dir as the main app
-        temp_dir = "temp_files"
+        # Use the same temp directory as the main application
+        temp_dir = os.environ.get('TEMP_DIR', '/tmp/docling_temp')
         try:
             os.makedirs(temp_dir, exist_ok=True)
         except PermissionError:
-            # Fallback to system temp directory if we can't create in current directory
+            # Fallback to system temp directory if we can't create in the main temp dir
            import tempfile
            temp_dir = os.path.join(tempfile.gettempdir(), "docling_temp_files")
            os.makedirs(temp_dir, exist_ok=True)
@@ -139,10 +226,177 @@
             logger.warning(f"Could not create cache directory {cache_dir}: {e}")
 
     def _export_redacted_markdown(self, document, redacted_json):
-        """Export redacted markdown using the redacted JSON structure."""
-        # Simply convert the redacted JSON back to markdown
-        return self._json_to_markdown(redacted_json)
+        """Export redacted markdown using Docling's Document class for proper formatting."""
+        try:
+            # Try different possible import paths for Docling Document class
+            try:
+                from docling.document import Document
+            except ImportError:
+                try:
+                    from docling import Document
+                except ImportError:
+                    try:
+                        from docling.core import Document
+                    except ImportError:
+                        # If all imports fail, use the fallback method
+                        logger.warning("Could not import Docling Document class from any known location")
+                        raise ImportError("Docling Document class not found")
+
+            # Create a new Document from the redacted JSON
+            redacted_document = Document.from_dict(redacted_json)
+
+            # Use Docling's export method for proper markdown formatting
+            redacted_md = redacted_document.export_to_markdown()
+            logger.info("Successfully generated redacted markdown using Docling Document class")
+            return redacted_md
+
+        except Exception as e:
+            logger.warning(f"Failed to create Docling Document from redacted JSON: {e}")
+            logger.info("Falling back to manual JSON-to-markdown conversion")
+            # Fallback to the old method if Docling Document creation fails
+            return self._json_to_markdown(redacted_json)
 
+    def generate_redacted_pdf(self, redacted_json: dict, output_path: str) -> bool:
+        """
+        Generate a redacted PDF from the redacted JSON structure.
+
+        Args:
+            redacted_json: The redacted document JSON structure
+            output_path: Path where the PDF should be saved
+
+        Returns:
+            bool: True if PDF generation was successful, False otherwise
+        """
+        try:
+            # Import required libraries
+            from reportlab.lib.pagesizes import letter, A4
+            from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
+            from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+            from reportlab.lib.units import inch
+            from reportlab.lib import colors
+            import io
+
+            logger.info(f"Generating redacted PDF: {output_path}")
+
+            # Create PDF document
+            doc = SimpleDocTemplate(output_path, pagesize=A4)
+            story = []
+
+            # Get styles
+            styles = getSampleStyleSheet()
+            normal_style = styles['Normal']
+            heading_style = styles['Heading1']
+
+            # Create custom styles for better formatting
+            table_style = ParagraphStyle(
+                'TableStyle',
+                parent=normal_style,
+                fontName='Courier',
+                fontSize=9,
+                spaceAfter=6
+            )
+
+            # Process text elements from JSON
+            texts = redacted_json.get("texts", [])
+
+            # Group consecutive table-like elements together
+            i = 0
+            while i < len(texts):
+                text_elem = texts[i]
+                text_content = text_elem.get("text", "").strip()
+                label = text_elem.get("label", "")
+                level = text_elem.get("level", 0)
+
+                if not text_content:
+                    i += 1
+                    continue
+
+                # Handle different content types
+                if label == "section_header":
+                    # Create header with appropriate level
+                    if level == 1:
+                        story.append(Paragraph(text_content, heading_style))
+                    else:
+                        # Create sub-heading style
+                        sub_heading_style = ParagraphStyle(
+                            f'Heading{min(level, 3)}',
+                            parent=normal_style,
+                            fontSize=14 - level,
+                            spaceAfter=12,
+                            spaceBefore=12,
+                            textColor=colors.darkblue
+                        )
+                        story.append(Paragraph(text_content, sub_heading_style))
+
+                elif label == "list_item":
+                    # Handle list items
+                    marker = text_elem.get("marker", "•")
+                    list_text = f"{marker} {text_content}"
+                    story.append(Paragraph(list_text, normal_style))
+
+                elif '|' in text_content and text_content.count('|') > 1:
+                    # Handle table-like content - collect consecutive table rows
+                    table_rows = []
+
+                    # Add the current row
+                    cells = [cell.strip() for cell in text_content.split('|') if cell.strip()]
+                    if cells:
+                        table_rows.append(cells)
+
+                    # Look ahead for consecutive table rows
+                    j = i + 1
+                    while j < len(texts):
+                        next_text = texts[j].get("text", "").strip()
+                        if '|' in next_text and next_text.count('|') > 1:
+                            next_cells = [cell.strip() for cell in next_text.split('|') if cell.strip()]
+                            if next_cells:
+                                table_rows.append(next_cells)
+                            j += 1
+                        else:
+                            break
+
+                    # Create table if we have rows
+                    if table_rows:
+                        table = Table(table_rows)
+                        table.setStyle(TableStyle([
+                            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
+                            ('FONTNAME', (0, 0), (-1, -1), 'Courier'),
+                            ('FONTSIZE', (0, 0), (-1, -1), 9),
+                            ('BOTTOMPADDING', (0, 0), (-1, -1), 3),
+                            ('TOPPADDING', (0, 0), (-1, -1), 3),
+                            ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
+                            ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),  # Header row
+                        ]))
+                        story.append(table)
+                        story.append(Spacer(1, 6))
+
+                        # Skip the rows we've already processed
+                        i = j - 1
+                    else:
+                        # Single row or no valid cells
+                        story.append(Paragraph(text_content, table_style))
+
+                else:
+                    # Regular text content
+                    story.append(Paragraph(text_content, normal_style))
+
+                # Add small spacing between elements
+                story.append(Spacer(1, 3))
+                i += 1
+
+            # Build PDF
+            doc.build(story)
+            logger.info(f"Successfully generated redacted PDF: {output_path}")
+            return True
+
+        except ImportError as e:
+            logger.error(f"Required PDF generation libraries not available: {e}")
+            logger.info("Install reportlab with: pip install reportlab")
+            return False
+        except Exception as e:
+            logger.error(f"Error generating redacted PDF: {e}")
+            return False
+
     def _json_to_markdown(self, json_data: dict) -> str:
         """Convert JSON document structure back to markdown format using Docling's structure."""
         markdown_lines = []
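
Taken together, the new pieces compose into a small end-to-end flow. A minimal sketch, assuming the package layout above and valid Azure OpenAI credentials (the endpoint, key, and file names here are placeholders):

```python
from processing.document_processor import (
    DocumentProcessor,
    process_document_with_redaction,
)

# Placeholder credentials and paths; substitute your own.
result = process_document_with_redaction(
    file_path="discharge_letter.pdf",
    endpoint="https://example.openai.azure.com",
    api_key="...",
    api_version="2024-02-15-preview",
    deployment="o3-mini",
)

# The redacted JSON can then be rendered to PDF with the new method.
processor = DocumentProcessor(section_extractor=None)
ok = processor.generate_redacted_pdf(result.redacted_document_json, "redacted.pdf")
print("PDF written" if ok else "PDF generation failed - see logs")
```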
src/processing/llm_extractor.py CHANGED
@@ -6,6 +6,7 @@ import logging
 from typing import Dict, Any
 
 from openai import AzureOpenAI
+from utils.cost_tracker import cost_tracker
 
 logger = logging.getLogger(__name__)
 
@@ -39,107 +40,137 @@ class AzureO1MedicationExtractor:
             })
 
         prompt = f"""
-You are a medical document analysis expert specializing in discharge letters. Your task is to identify ONLY the two formal medication lists that should be redacted, while preserving all medication mentions in clinical discussion.
-
-**CRITICAL: You should ONLY remove the two formal medication lists:**
-1. **Current medication list** (usually at the beginning of the document)
-2. **Discharge medication list** (usually at the end of the document, often under headers like "Als verdere behandeling stellen wij voor" or "Thuismedicatie")
-
-**Typical discharge letter structure:**
-- Patient information and admission details
-- Clinical discussion and treatment narrative (KEEP medication mentions here)
-- Current medication list (REMOVE this formal list)
-- Discharge instructions and follow-up
-- Discharge medication list (REMOVE this formal list)
-
-**DO NOT remove:**
-- Medication mentions in clinical discussion (e.g., "patient was treated with Eliquis")
-- Medication adjustments mentioned in the narrative
-- Dosage information in clinical context
-- Any medication information that appears in the main clinical text
-- Treatment decisions and clinical reasoning
-
-**ONLY remove:**
-- Complete medication lists with multiple drugs
-- Formal medication sections with headers
-- Standalone medication lists that are clearly separated from clinical text
-- Lists that appear to be formal medication documentation
+You are a medical document analysis expert specializing in discharge letters. Your task is to identify ONLY the formal medication lists that should be redacted, while preserving ALL other content including medical history tables.
+
+**CRITICAL: You should ONLY remove formal medication lists with explicit medication names, dosages, and frequencies.**
+
+**What to REMOVE (medication lists only):**
+1. **Current medication list** - sections with headers like "Huidige thuismedicatie", "Current medications", "Medicatie"
+2. **Discharge medication list** - sections with headers like "Als verdere behandeling stellen wij voor", "Thuismedicatie", "Discharge medications"
+
+**What medication lists look like:**
+- Header: "Huidige thuismedicatie" or similar
+- Followed by multiple lines with medication names, dosages, frequencies
+- Example: "Pantomed 20mg Tablet Oral - 1 tablet - 2 maal daags"
+- Example: "Forlax 10g Zakje Oral - 2 zakje - 1 maal daags (zo nodig)"
+
+**What to ABSOLUTELY NEVER REMOVE:**
+1. **Medical history tables** - Tables with "Datum" and "Bespreking" columns containing dates and medical events
+2. **Treatment history** - Narrative descriptions of medical procedures, treatments, or events
+3. **Clinical discussions** - Any text discussing medical conditions, procedures, or clinical decisions
+4. **Tables with dates and procedures** - Any table format showing timeline of medical events
+5. **Individual medication mentions in clinical text** - References to medications within clinical narratives
+
+**EXAMPLES OF CONTENT TO NEVER REMOVE:**
+- Tables like: "| Datum | Bespreking |" followed by medical events
+- "| 07/07/2017 | Niertransplantatie met donornier..."
+- "| 15/8/2017 | Uitgestelde transplantfunctie..."
+- "| 26/03/2018 | plaatsing peritoneaal dialysekatheter..."
+- Any text describing medical procedures, surgeries, or treatments
+- Clinical narratives mentioning medications in context (e.g., "behandeling met Sotrovimab")
+
+**KEY DISTINGUISHING FEATURES:**
+- **Medication lists**: Standalone sections with drug names + dosages + frequencies
+- **Medical history**: Tables or narratives describing medical events, procedures, surgeries
+- **Clinical text**: Discussions of treatment decisions, medical events, or conditions
+
+**If you see a table with dates and medical procedures, it is MEDICAL HISTORY, not a medication list.**
+**If you see clinical text discussing treatments or procedures, it is CLINICAL DISCUSSION, not a medication list.**
 
 Document structure:
 {text_analysis}
 
 **Analysis Instructions:**
-1. Look for formal medication sections with clear headers (e.g., "Thuismedicatie", "Als verdere behandeling stellen wij voor")
-2. Identify complete medication lists that contain multiple drugs with dosages
-3. **IGNORE** any medication mentions that appear within clinical discussion or narrative text
-4. Focus on structural elements that represent formal medication documentation
-5. Be conservative - if in doubt, do NOT remove
-6. Consider the position in the document (beginning/end vs. middle)
-
-**Examples of what to REMOVE:**
-- Complete lists under "Thuismedicatie" header
-- Formal medication lists under "Als verdere behandeling stellen wij voor"
-- Standalone medication sections with multiple drugs
-- Lists that appear at the beginning or end of the document
-
-**Examples of what to KEEP:**
-- "Patient was treated with Eliquis 2x 2.5mg" (clinical discussion)
-- "Stop Clopidogrel bij opname" (clinical instruction)
-- "Jardiance 10mg & Burinex 5mg" (if mentioned in clinical context)
-- Any medication mentioned in the context of treatment discussion
-
-Return your analysis as JSON:
+1. Look ONLY for formal medication sections with clear headers (e.g., "Thuismedicatie", "Huidige thuismedicatie")
+2. Identify sections that contain LISTS of medications with dosages and frequencies
+3. **NEVER identify medical history tables as medication lists**
+4. **NEVER identify clinical discussions as medication lists**
+5. Be extremely conservative - if in doubt, do NOT remove
+6. Focus ONLY on standalone medication documentation sections
+
+Return your analysis as a JSON object with this exact structure:
 {{
     "indices_to_remove": [list of integer indices - ONLY formal medication lists],
     "reasoning": {{
-        "formal_medication_lists": [list of identified formal medication list indices with explanations],
-        "clinical_medication_mentions": [list of clinical mentions that were correctly preserved],
-        "justification": "explanation of why only formal lists were selected for removal",
+        "justification": "explanation of why only formal medication lists were selected for removal",
         "confidence": "high/medium/low"
    }}
 }}
 """
+
         logger.info(f"Prompt length: {len(prompt)}")
         logger.info(f"Number of text elements: {len(text_analysis)}")
+
         try:
             response = self.client.chat.completions.create(
-                messages=[
-                    {
-                        "role": "system",
-                        "content": "You are a helpful assistant.",
-                    },
-                    {
-                        "role": "user",
-                        "content": prompt,
-                    }
-                ],
-                max_completion_tokens=100000,  # adjust as needed
-                model=self.deployment
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a helpful assistant that analyzes medical documents and identifies formal medication lists for redaction.",
+                    },
+                    {
+                        "role": "user",
+                        "content": prompt,
+                    }
+                ],
+                max_completion_tokens=100000,
+                model=self.deployment,
+                response_format={"type": "json_object"}
             )
+
+            # Record token usage and cost
+            if hasattr(response, 'usage') and response.usage:
+                cost_tracker.record_usage(
+                    prompt_tokens=response.usage.prompt_tokens,
+                    completion_tokens=response.usage.completion_tokens,
+                    model=self.model_name
+                )
+                logger.info(f"API call completed - Input: {response.usage.prompt_tokens}, "
+                            f"Output: {response.usage.completion_tokens}, "
+                            f"Total: {response.usage.total_tokens} tokens")
+
         except Exception as e:
             logger.error(f"Exception during LLM call: {e}", exc_info=True)
             return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}
 
         try:
-            logger.error(f"Raw LLM response: {response.choices[0].message.content!r}")
+            logger.info(f"Raw LLM response: {response.choices[0].message.content!r}")
+
+            # Parse the structured JSON response
             result = json.loads(response.choices[0].message.content)
 
-            # Validate and limit the number of elements to remove
+            # Get the indices to remove
             indices_to_remove = result.get("indices_to_remove", [])
 
-            # Be conservative - limit to maximum 10 elements to prevent over-removal
-            if len(indices_to_remove) > 10:
-                logger.warning(f"LLM suggested removing {len(indices_to_remove)} elements, limiting to 10 most likely formal medication lists")
-                # Keep only the first 10 (assuming they're ordered by importance)
-                indices_to_remove = indices_to_remove[:10]
-                result["indices_to_remove"] = indices_to_remove
-                result["reasoning"]["justification"] += " [LIMITED: Only top 10 elements selected to prevent over-removal]"
+            # Log what the LLM suggested
+            logger.info(f"LLM suggested removing {len(indices_to_remove)} elements: {indices_to_remove}")
 
-            # Log the reasoning for transparency
-            reasoning = result.get("reasoning", {})
-            logger.info(f"LLM reasoning: {reasoning}")
+            # Log detailed information about what's being removed
+            if indices_to_remove:
+                logger.info("DETAILED ANALYSIS OF LLM SUGGESTIONS:")
+                logger.info("=" * 60)
+
+                for idx in indices_to_remove:
+                    if idx < len(text_analysis):
+                        text_content = text_analysis[idx].get("text", "")
+                        text_label = text_analysis[idx].get("label", "")
+                        logger.info(f"Index {idx} ({text_label}): '{text_content}'")
+                    else:
+                        logger.error(f"Index {idx} is out of bounds (max: {len(text_analysis)-1})")
+
+                logger.info("=" * 60)
+
+                # Log the reasoning if provided
+                reasoning = result.get("reasoning", {})
+                if reasoning:
+                    logger.info(f"LLM reasoning: {reasoning}")
+
+                logger.info(f"Final removal list: {len(indices_to_remove)} elements will be removed")
+            else:
+                logger.info("No elements will be removed")
 
             return result
+
         except Exception as e:
             logger.error(f"Failed to parse LLM response: {e}")
             return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}
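
With `response_format={"type": "json_object"}`, the model is constrained to emit parseable JSON, so `json.loads` on the message content should yield the shape the prompt specifies. A hedged sketch of validating that shape (field names taken from the prompt above; the helper itself is hypothetical):

```python
import json

def parse_extraction(content: str) -> dict:
    """Parse and sanity-check the extractor's JSON reply; fall back to a safe default."""
    try:
        result = json.loads(content)
    except json.JSONDecodeError:
        return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}
    indices = result.get("indices_to_remove", [])
    # Indices must be integers to be usable against the texts array.
    if not all(isinstance(i, int) for i in indices):
        return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}
    return result
```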
src/processing/sections.py CHANGED
@@ -21,18 +21,57 @@ class ReasoningSectionExtractor:
         reasoning = extraction_result.get("reasoning", {})
 
         # Log detailed reasoning for transparency
-        logger.info(f"O1-mini reasoning: {reasoning}")
+        logger.info(f"LLM reasoning summary: {reasoning}")
+
+        # Get the texts for detailed logging
+        texts = doc_json.get("texts", [])
 
         # Provide specific feedback about what was removed
         if indices_to_remove:
             logger.info(f"Removing {len(indices_to_remove)} text elements: {indices_to_remove}")
 
-            # Show what specific content is being removed
-            texts = doc_json.get("texts", [])
+            # Categorize and show what specific content is being removed
+            medication_headers = []
+            medication_items = []
+            other_content = []
+
             for idx in indices_to_remove:
                 if idx < len(texts):
-                    text_content = texts[idx].get("text", "")[:100]
-                    logger.info(f"  Removing text {idx}: '{text_content}...'")
+                    text_content = texts[idx].get("text", "")
+                    text_label = texts[idx].get("label", "")
+
+                    # Categorize the content
+                    if any(keyword in text_content.lower() for keyword in ['medicatie', 'thuismedicatie', 'medication', 'drugs']):
+                        medication_headers.append((idx, text_content))
+                    elif any(keyword in text_content.lower() for keyword in ['tablet', 'capsule', 'mg', 'ml', 'zakje', 'oral', 'maal daags']):
+                        medication_items.append((idx, text_content))
+                    else:
+                        other_content.append((idx, text_content))
+
+                    # Log with more detail
+                    logger.info(f"  → Removing index {idx} ({text_label}): '{text_content[:150]}{'...' if len(text_content) > 150 else ''}'")
+                else:
+                    logger.warning(f"  → Invalid index {idx}: exceeds document length ({len(texts)})")
+
+            # Summary of what was categorized
+            if medication_headers:
+                logger.info(f"Medication headers removed: {len(medication_headers)} items")
+                for idx, content in medication_headers:
+                    logger.info(f"  Header {idx}: {content}")
+
+            if medication_items:
+                logger.info(f"Medication items removed: {len(medication_items)} items")
+                for idx, content in medication_items[:5]:  # Show first 5 to avoid spam
+                    logger.info(f"  Item {idx}: {content[:100]}...")
+                if len(medication_items) > 5:
+                    logger.info(f"  ... and {len(medication_items) - 5} more medication items")
+
+            if other_content:
+                logger.warning(f"⚠️ NON-MEDICATION content removed: {len(other_content)} items")
+                for idx, content in other_content:
+                    logger.warning(f"  ⚠️ Index {idx}: {content[:200]}...")
+                logger.warning("⚠️ Please review: non-medication content was removed - this may indicate an issue with the LLM detection")
+
         else:
             logger.info("No formal medication lists identified for removal")
 
@@ -46,6 +85,7 @@ class ReasoningSectionExtractor:
         # Log the result
         removed_count = len(texts) - len(redacted_texts)
         logger.info(f"Successfully removed {removed_count} text elements from document structure")
+        logger.info(f"Document structure: {len(texts)} → {len(redacted_texts)} text elements")
 
         return redacted_json
 
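Note that the keyword heuristics above only affect logging, not what gets removed. As a quick illustration of how a typical line would be categorized (keywords copied from the diff; the snippet itself is not part of the commit):

```python
line = "Pantomed 20mg Tablet Oral - 1 tablet - 2 maal daags"

header_keywords = ['medicatie', 'thuismedicatie', 'medication', 'drugs']
item_keywords = ['tablet', 'capsule', 'mg', 'ml', 'zakje', 'oral', 'maal daags']

if any(k in line.lower() for k in header_keywords):
    category = "medication header"
elif any(k in line.lower() for k in item_keywords):
    category = "medication item"
else:
    category = "other (flagged for review)"

print(category)  # -> "medication item"
```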
src/streamlit_app.py CHANGED
@@ -3,6 +3,8 @@
3
 
4
  import os
5
  import tempfile
 
 
6
 
7
  # Get a writable temp directory first
8
  try:
@@ -91,26 +93,12 @@ directories_to_create = [
91
  os.environ['ACCELERATE_CACHE'],
92
  ]
93
 
94
- # Monkey patch os.makedirs to prevent root directory access
95
- original_makedirs = os.makedirs
96
-
97
- def safe_makedirs(name, mode=0o777, exist_ok=False):
98
- """Safe version of makedirs that prevents root directory access."""
99
- # Check if trying to create directory in root filesystem
100
- if name.startswith('/') and not name.startswith('/tmp') and not name.startswith('/app'):
101
- # Redirect to temp directory
102
- basename = os.path.basename(name)
103
- safe_name = os.path.join(TEMP_DIR, basename)
104
- print(f"Redirecting root directory creation from {name} to {safe_name}")
105
- return original_makedirs(safe_name, mode, exist_ok)
106
- return original_makedirs(name, mode, exist_ok)
107
-
108
- # Apply the monkey patch
109
- os.makedirs = safe_makedirs
110
-
111
  for directory in directories_to_create:
112
  try:
113
- os.makedirs(directory, exist_ok=True)
 
 
 
114
  except Exception as e:
115
  print(f"Warning: Could not create directory {directory}: {e}")
116
 
@@ -121,11 +109,10 @@ import shutil
121
  from processing.document_processor import DocumentProcessor
122
  from processing.sections import ReasoningSectionExtractor
123
  from utils.logging_utils import get_log_handler
 
124
  from dotenv import load_dotenv
125
  import sys
126
- import html
127
  import difflib
128
- import re
129
  import time
130
 
131
  # Configure logging early to avoid issues
@@ -212,16 +199,40 @@ def get_temp_files_info():
212
 
213
  files = os.listdir(TEMP_DIR)
214
  total_size = 0
 
215
 
216
  for filename in files:
217
  try:
218
  file_path = os.path.join(TEMP_DIR, filename)
219
  if os.path.isfile(file_path):
220
- total_size += os.path.getsize(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
221
  except (PermissionError, OSError) as e:
222
  logging.warning(f"Error accessing file {filename}: {e}")
 
 
 
 
 
223
  continue
224
 
 
 
 
 
 
 
225
  return len(files), total_size
226
  except PermissionError as e:
227
  logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
@@ -298,215 +309,7 @@ st.markdown("""
298
  border-radius: 10px;
299
  border: 1px solid #e9ecef;
300
  }
301
-
302
- /* Synchronized scrolling styles */
303
- .sync-scroll-container {
304
- display: flex;
305
- gap: 20px;
306
- height: 600px;
307
- font-family: 'Courier New', monospace;
308
- font-size: 12px;
309
- }
310
-
311
- .sync-scroll-panel {
312
- flex: 1;
313
- border: 1px solid #ddd;
314
- border-radius: 5px;
315
- overflow: hidden;
316
- display: flex;
317
- flex-direction: column;
318
- }
319
-
320
- .sync-scroll-header {
321
- background-color: #f8f9fa;
322
- padding: 10px;
323
- border-bottom: 1px solid #ddd;
324
- font-weight: bold;
325
- }
326
-
327
- .sync-scroll-content {
328
- flex: 1;
329
- overflow-y: auto;
330
- padding: 10px;
331
- background-color: #fff;
332
- scroll-behavior: smooth;
333
- transition: scroll-top 0.1s ease-out;
334
- }
335
-
336
- /* Prevent scroll chaining */
337
- .sync-scroll-content::-webkit-scrollbar {
338
- width: 8px;
339
- }
340
-
341
- .sync-scroll-content::-webkit-scrollbar-track {
342
- background: #f1f1f1;
343
- }
344
-
345
- .sync-scroll-content::-webkit-scrollbar-thumb {
346
- background: #888;
347
- border-radius: 4px;
348
- }
349
-
350
- .sync-scroll-content::-webkit-scrollbar-thumb:hover {
351
- background: #555;
352
- }
353
  </style>
354
-
355
- <script>
356
- // Improved synchronized scrolling implementation with better debugging
357
- console.log('Starting sync scroll setup...');
358
-
359
- function setupSyncScroll() {
360
- console.log('setupSyncScroll called');
361
-
362
- // Wait for elements to be available
363
- setTimeout(function() {
364
- console.log('Looking for scroll elements...');
365
- const originalContent = document.getElementById('original-content');
366
- const redactedContent = document.getElementById('redacted-content');
367
-
368
- console.log('Original content element:', originalContent);
369
- console.log('Redacted content element:', redactedContent);
370
-
371
- if (originalContent && redactedContent) {
372
- console.log('Both elements found, setting up sync...');
373
-
374
- let isScrolling = false;
375
- let scrollTimeout;
376
-
377
- function syncScroll(source, target) {
378
- if (!isScrolling) {
379
- isScrolling = true;
380
- console.log('Syncing scroll from', source.id, 'to', target.id, 'scrollTop:', source.scrollTop);
381
- target.scrollTop = source.scrollTop;
382
-
383
- // Clear existing timeout
384
- if (scrollTimeout) {
385
- clearTimeout(scrollTimeout);
386
- }
387
-
388
- // Reset flag after a short delay
389
- scrollTimeout = setTimeout(() => {
390
- isScrolling = false;
391
- console.log('Scroll sync completed');
392
- }, 100);
393
- }
394
- }
395
-
396
- // Remove existing listeners to prevent duplicates
397
- if (originalContent._syncScrollHandler) {
398
- originalContent.removeEventListener('scroll', originalContent._syncScrollHandler);
399
- }
400
- if (redactedContent._syncScrollHandler) {
401
- redactedContent.removeEventListener('scroll', redactedContent._syncScrollHandler);
402
- }
403
-
404
- // Create new handlers
405
- originalContent._syncScrollHandler = function(e) {
406
- console.log('Original content scrolled:', e.target.scrollTop);
407
- syncScroll(originalContent, redactedContent);
408
- };
409
-
410
- redactedContent._syncScrollHandler = function(e) {
411
- console.log('Redacted content scrolled:', e.target.scrollTop);
412
- syncScroll(redactedContent, originalContent);
413
- };
414
-
415
- // Add event listeners
416
- originalContent.addEventListener('scroll', originalContent._syncScrollHandler, { passive: true });
417
- redactedContent.addEventListener('scroll', redactedContent._syncScrollHandler, { passive: true });
418
-
419
- console.log('Event listeners added successfully');
420
-
421
- // Show status indicator
422
- const statusElement = document.getElementById('sync-status');
423
- if (statusElement) {
424
- statusElement.style.display = 'block';
425
- console.log('Status indicator shown');
426
- }
427
-
428
- // Test the synchronization
429
- setTimeout(() => {
430
- console.log('Testing scroll sync...');
431
- console.log('Original scrollTop:', originalContent.scrollTop);
432
- console.log('Redacted scrollTop:', redactedContent.scrollTop);
433
-
434
- // Try a small scroll to test
435
- originalContent.scrollTop = 10;
436
- setTimeout(() => {
437
- console.log('After test scroll - Original:', originalContent.scrollTop, 'Redacted:', redactedContent.scrollTop);
438
- }, 50);
439
- }, 200);
440
-
441
- } else {
442
- console.log('Elements not found, will retry...');
443
- // Retry with exponential backoff
444
- setTimeout(setupSyncScroll, 300);
445
- }
446
- }, 200);
447
- }
448
-
449
- // Multiple initialization strategies
450
- function initializeSyncScroll() {
451
- console.log('Initializing sync scroll...');
452
-
453
- // Strategy 1: Immediate setup
454
- setupSyncScroll();
455
-
456
- // Strategy 2: Setup after DOM ready
457
- if (document.readyState === 'loading') {
458
- document.addEventListener('DOMContentLoaded', function() {
459
- console.log('DOM loaded, setting up sync scroll...');
460
- setupSyncScroll();
461
- });
462
- }
463
-
464
- // Strategy 3: Setup after window load
465
- window.addEventListener('load', function() {
466
- console.log('Window loaded, setting up sync scroll...');
467
- setupSyncScroll();
468
- });
469
-
470
- // Strategy 4: Periodic retry for first 10 seconds
471
- let attempts = 0;
472
- const maxAttempts = 20;
473
- const retryInterval = setInterval(function() {
474
- attempts++;
475
- console.log('Retry attempt', attempts);
476
-
477
- const originalContent = document.getElementById('original-content');
478
- const redactedContent = document.getElementById('redacted-content');
479
-
480
- if (originalContent && redactedContent) {
481
- console.log('Elements found on retry, setting up...');
482
- setupSyncScroll();
483
- clearInterval(retryInterval);
484
- } else if (attempts >= maxAttempts) {
485
- console.log('Max retry attempts reached, giving up');
486
- clearInterval(retryInterval);
487
- }
488
- }, 500);
489
- }
490
-
491
- // Start initialization
492
- initializeSyncScroll();
493
-
494
- // Listen for Streamlit-specific events
495
- if (window.parent && window.parent.postMessage) {
496
- console.log('Streamlit environment detected');
497
-
498
- // Listen for any messages that might indicate a rerun
499
- window.addEventListener('message', function(event) {
500
- console.log('Received message:', event.data);
501
- if (event.data && (event.data.type === 'streamlit:rerun' || event.data.type === 'streamlit:setComponentValue')) {
502
- console.log('Streamlit rerun detected, reinitializing sync scroll...');
503
- setTimeout(setupSyncScroll, 1000);
504
- }
505
- });
506
- }
507
-
508
- console.log('Sync scroll script loaded');
509
- </script>
510
  """, unsafe_allow_html=True)
511
 
512
  # Configure root logger only once (avoid duplicate handlers on reruns)
@@ -528,6 +331,7 @@ Use the buttons below to view the original structure or process with redaction.
528
  if st.button("🧹 Clear All Data", type="secondary", help="Remove all temporary files and reset the application"):
529
  if clear_all_data():
530
  st.success("✅ All data cleared successfully! The application has been reset.")
 
531
  st.rerun()
532
  else:
533
  st.error("❌ Error clearing data. Please try again.")
@@ -578,6 +382,32 @@ with col1:
578
  # Show warning if total size is large
579
  if total_size > 50 * 1024 * 1024: # 50MB
580
  st.warning("⚠️ Large temporary files detected. Consider clearing data to free up space.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
581
  else:
582
  st.caption("📁 No temporary files")
583
 
@@ -593,72 +423,6 @@ with col2:
593
  else:
594
  st.caption("No files to delete")
595
 
596
- def create_diff_content(original_text: str, redacted_text: str, view_type: str) -> str:
597
- """Create HTML content for diff view with highlighting."""
598
- import difflib
599
- import re
600
-
601
- # Normalize the text to reduce formatting differences
602
- def normalize_text(text):
603
- # Remove extra whitespace and normalize line endings
604
- lines = text.split('\n')
605
- normalized_lines = []
606
- for line in lines:
607
- # Strip whitespace but preserve content
608
- stripped = line.strip()
609
- if stripped:
610
- # Normalize header formatting differences
611
- # Convert ## to # for level 1 headers
612
- if re.match(r'^##\s+', stripped):
613
- stripped = re.sub(r'^##\s+', '# ', stripped)
614
- # Normalize quote formatting
615
- if stripped.startswith('&gt; '):
616
- stripped = stripped.replace('&gt; ', '> ')
617
- elif stripped.startswith('+ > '):
618
- stripped = stripped.replace('+ > ', '> ')
619
-
620
- normalized_lines.append(stripped)
621
- return normalized_lines
622
-
623
- original_lines = normalize_text(original_text)
624
- redacted_lines = normalize_text(redacted_text)
625
-
626
- # Use difflib to get a more sophisticated diff
627
- differ = difflib.Differ()
628
- diff = list(differ.compare(original_lines, redacted_lines))
629
-
630
- html_lines = []
631
-
632
- if view_type == 'original':
633
- # Show original with removed content highlighted
634
- for line in diff:
635
- if line.startswith(' '): # Unchanged line
636
- escaped_line = html.escape(line[2:])
637
- html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
638
- elif line.startswith('- '): # Removed line
639
- escaped_line = html.escape(line[2:])
640
- html_lines.append(f'<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-weight: bold;">- {escaped_line}</div>')
641
- elif line.startswith('+ '): # Added line (show as empty space in original view)
642
- html_lines.append(f'<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-style: italic; opacity: 0.7;">+ (added in redacted version)</div>')
643
- elif line.startswith('? '): # Ignore difflib hints
644
- continue
645
-
646
- elif view_type == 'redacted':
647
- # Show redacted content with added content highlighted
648
- for line in diff:
649
- if line.startswith(' '): # Unchanged line
650
- escaped_line = html.escape(line[2:])
651
- html_lines.append(f'<div style="padding: 2px 4px; margin: 1px 0; color: #333;">{escaped_line}</div>')
652
- elif line.startswith('- '): # Removed line (show as empty space in redacted view)
653
- html_lines.append(f'<div style="background-color: #ffebee; color: #c62828; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #f44336; font-style: italic; opacity: 0.7;">- (removed from original)</div>')
654
- elif line.startswith('+ '): # Added line
655
- escaped_line = html.escape(line[2:])
656
- html_lines.append(f'<div style="background-color: #e8f5e8; color: #2e7d32; padding: 2px 4px; margin: 1px 0; border-left: 3px solid #4caf50; font-weight: bold;">+ {escaped_line}</div>')
657
- elif line.startswith('? '): # Ignore difflib hints
658
- continue
659
-
660
- return '\n'.join(html_lines)
661
-
662
  if uploaded_files:
663
  # UI to select which file to work with (if multiple files uploaded)
664
  file_names = [f.name for f in uploaded_files]
@@ -698,32 +462,42 @@ if uploaded_files:
698
  # Save uploaded file to a temporary location
699
  temp_path = save_uploaded_file(uploaded_file, selected_file)
700
 
701
- # Create a DocumentProcessor with a SectionExtractor for our target sections
702
- section_extractor = ReasoningSectionExtractor(
703
- endpoint=AZURE_OPENAI_ENDPOINT,
704
- api_key=AZURE_OPENAI_KEY,
705
- api_version=AZURE_OPENAI_VERSION,
706
- deployment=AZURE_OPENAI_DEPLOYMENT,
707
- )
708
- processor = DocumentProcessor(section_extractor=section_extractor)
709
 
710
  # Attach an in-memory log handler to capture logs for this file
711
  log_handler, log_buffer = get_log_handler()
712
  root_logger = logging.getLogger()
713
  root_logger.addHandler(log_handler)
 
714
  try:
715
- # Process the document (Docling parse + section redaction)
716
- result = processor.process(temp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
717
  finally:
718
  # Remove handler and stop capturing logs
719
  root_logger.removeHandler(log_handler)
720
 
721
- # Save results in session state
722
- st.session_state.processed_results[selected_file] = {
723
- "structured_json": result.structured_json,
724
- "redacted_md": result.redacted_markdown,
725
- "redacted_json": result.redacted_json
726
- }
727
  # Combine log records into a single text
728
  log_text = "\n".join(log_buffer)
729
  st.session_state.logs[selected_file] = log_text
@@ -807,25 +581,7 @@ if uploaded_files:
807
  structured_json = data["structured_json"]
808
  redacted_md = data["redacted_md"]
809
  redacted_json = data["redacted_json"]
810
-
811
- # Get the original markdown from the structured JSON
812
- # We need to reconstruct the original markdown from the structured JSON
813
- # For now, we'll use the structured_markdown from the DocumentResult
814
- # But we need to store this in the session state
815
-
816
- # Create a DocumentProcessor to get the original markdown
817
- if "original_markdown" not in st.session_state.processed_results[selected_file]:
818
- # Save uploaded file to a temporary location
819
- temp_path = save_uploaded_file(uploaded_file, selected_file)
820
-
821
- # Create a DocumentProcessor without section extraction to get original markdown
822
- processor = DocumentProcessor(section_extractor=None)
823
- result = processor.process(temp_path)
824
-
825
- # Store the original markdown
826
- st.session_state.processed_results[selected_file]["original_markdown"] = result.structured_markdown
827
-
828
- original_md = st.session_state.processed_results[selected_file]["original_markdown"]
829
 
830
  # Show processing summary
831
  original_texts = structured_json.get("texts", [])
@@ -844,67 +600,230 @@ if uploaded_files:
844
  st.subheader("Original vs Redacted Content")
845
  st.caption("Compare the original document content with the redacted version")
846
 
847
- # Add status indicator
848
- st.markdown("""
849
- <div id="sync-status" style="padding: 8px; background-color: #e8f5e8; border: 1px solid #4caf50; border-radius: 4px; margin-bottom: 10px; display: none;">
850
- <strong>Synchronized scrolling is active</strong> - Scroll either panel to sync both views
851
- </div>
852
- """, unsafe_allow_html=True)
853
 
854
- # Create a diff-like interface with synchronized scrolling and highlighting
855
- diff_html = f"""
856
- <div class="sync-scroll-container">
857
- <div class="sync-scroll-panel">
858
- <div class="sync-scroll-header">
859
- 📋 Original Document
860
- </div>
861
- <div id="original-content" class="sync-scroll-content">
862
- {create_diff_content(original_md, redacted_md, 'original')}
863
- </div>
864
- </div>
865
- <div class="sync-scroll-panel">
866
- <div class="sync-scroll-header">
867
- 🔒 Redacted Document
868
- </div>
869
- <div id="redacted-content" class="sync-scroll-content">
870
- {create_diff_content(original_md, redacted_md, 'redacted')}
871
- </div>
872
- </div>
873
- </div>
874
- """
875
 
876
- st.markdown(diff_html, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
877
 
878
- # Add a hidden component to trigger JavaScript setup after Streamlit reruns
879
- st.markdown("""
880
- <script>
881
- // Trigger setup after Streamlit rerun
882
- if (window.parent && window.parent.postMessage) {
883
- // Wait for Streamlit to finish rendering
884
- setTimeout(function() {
885
- setupSyncScroll();
886
- }, 500);
887
- }
888
- </script>
889
- """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
890
 
891
-
892
- # Add legend for the diff highlighting
893
  st.markdown("---")
894
  col1, col2 = st.columns(2)
895
  with col1:
896
- st.markdown("**🎨 Diff Legend:**")
897
  st.markdown("🔴 **Red background** = Removed content")
898
- st.markdown("🟢 **Green background** = Added content")
899
- st.markdown(" **White background** = Unchanged content")
900
 
901
  with col2:
902
- st.markdown("**💡 Tips:**")
903
- st.markdown("Look for red-highlighted sections")
904
- st.markdown("These show what was redacted")
905
- st.markdown("Use scroll to navigate long documents")
906
 
907
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
908
 
909
  with tab2:
910
  st.subheader("Document Structure Analysis")
@@ -922,19 +841,139 @@ if uploaded_files:
922
  with tab3:
923
  st.subheader("Processing Details")
924
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
925
  # Show what was removed
926
  if removed_count > 0:
927
  st.info(f"**Removed {removed_count} text elements from the document structure.**")
928
 
929
- # Show the removed text elements
930
  st.subheader("Removed Text Elements:")
931
- removed_texts = []
932
- for i, text_elem in enumerate(original_texts):
933
- if i >= len(redacted_texts) or text_elem.get("text", "") != redacted_texts[i].get("text", ""):
934
- removed_texts.append((i, text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")))
935
 
936
- for idx, text in removed_texts:
937
- st.text(f"Text {idx}: {text}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
938
  else:
939
  st.info("No text elements were removed during processing.")
940
 
 
3
 
4
  import os
5
  import tempfile
6
+ import json
7
+ from datetime import datetime
8
 
9
  # Get a writable temp directory first
10
  try:
 
93
  os.environ['ACCELERATE_CACHE'],
94
  ]
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  for directory in directories_to_create:
97
  try:
98
+ # Create directory and all parent directories
99
+ os.makedirs(directory, mode=0o777, exist_ok=True)
100
+ # Ensure the directory has write permissions
101
+ os.chmod(directory, 0o777)
102
  except Exception as e:
103
  print(f"Warning: Could not create directory {directory}: {e}")
104
 
 
109
  from processing.document_processor import DocumentProcessor
110
  from processing.sections import ReasoningSectionExtractor
111
  from utils.logging_utils import get_log_handler
112
+ from utils.cost_tracker import cost_tracker
113
  from dotenv import load_dotenv
114
  import sys
 
115
  import difflib
 
116
  import time
117
 
118
  # Configure logging early to avoid issues
 
 
         files = os.listdir(TEMP_DIR)
         total_size = 0
+        file_details = []
 
         for filename in files:
             try:
                 file_path = os.path.join(TEMP_DIR, filename)
                 if os.path.isfile(file_path):
+                    file_size = os.path.getsize(file_path)
+                    total_size += file_size
+                    file_details.append({
+                        'name': filename,
+                        'size': file_size,
+                        'type': 'file'
+                    })
+                elif os.path.isdir(file_path):
+                    file_details.append({
+                        'name': filename,
+                        'size': 0,
+                        'type': 'directory'
+                    })
             except (PermissionError, OSError) as e:
                 logging.warning(f"Error accessing file {filename}: {e}")
+                file_details.append({
+                    'name': filename,
+                    'size': 0,
+                    'type': 'error'
+                })
                 continue
 
+        # Log detailed information for debugging
+        if file_details:
+            logging.info(f"Temp directory contents ({TEMP_DIR}):")
+            for detail in file_details:
+                logging.info(f"  - {detail['name']} ({detail['type']}): {detail['size']} bytes")
+
         return len(files), total_size
     except PermissionError as e:
         logging.warning(f"Permission error accessing temp directory {TEMP_DIR}: {e}")
 
         border-radius: 10px;
         border: 1px solid #e9ecef;
     }
     </style>
 """, unsafe_allow_html=True)
 
 # Configure root logger only once (avoid duplicate handlers on reruns)
 
     if st.button("🧹 Clear All Data", type="secondary", help="Remove all temporary files and reset the application"):
         if clear_all_data():
             st.success("✅ All data cleared successfully! The application has been reset.")
+            cost_tracker.reset_session()  # Reset cost tracking when clearing data
             st.rerun()
         else:
             st.error("❌ Error clearing data. Please try again.")
 
         # Show warning if total size is large
         if total_size > 50 * 1024 * 1024:  # 50MB
             st.warning("⚠️ Large temporary files detected. Consider clearing data to free up space.")
+
+        # Debug: Show temp files (expandable)
+        with st.expander("🔍 Debug: View temporary files"):
+            try:
+                if os.path.exists(TEMP_DIR):
+                    files = os.listdir(TEMP_DIR)
+                    if files:
+                        st.write("**Temporary files in directory:**")
+                        for filename in files:
+                            file_path = os.path.join(TEMP_DIR, filename)
+                            try:
+                                if os.path.isfile(file_path):
+                                    size = os.path.getsize(file_path)
+                                    st.write(f"📄 {filename} ({format_file_size(size)})")
+                                elif os.path.isdir(file_path):
+                                    st.write(f"📁 {filename} (directory)")
+                                else:
+                                    st.write(f"❓ {filename} (unknown)")
+                            except Exception as e:
+                                st.write(f"❌ {filename} (error: {e})")
+                    else:
+                        st.write("No files found in temp directory")
+                else:
+                    st.write("Temp directory does not exist")
+            except Exception as e:
+                st.write(f"Error accessing temp directory: {e}")
     else:
         st.caption("📁 No temporary files")
 
 
     else:
         st.caption("No files to delete")
 
 if uploaded_files:
     # UI to select which file to work with (if multiple files uploaded)
     file_names = [f.name for f in uploaded_files]
 
         # Save uploaded file to a temporary location
         temp_path = save_uploaded_file(uploaded_file, selected_file)
 
+        # Ensure the deployment name is in the cost tracker
+        if AZURE_OPENAI_DEPLOYMENT and AZURE_OPENAI_DEPLOYMENT not in cost_tracker.get_available_models():
+            model_type = cost_tracker.guess_model_type(AZURE_OPENAI_DEPLOYMENT)
+            cost_tracker.add_deployment_pricing(AZURE_OPENAI_DEPLOYMENT, model_type)
+
+        # Use the new processing function
+        from processing.document_processor import process_document_with_redaction
 
         # Attach an in-memory log handler to capture logs for this file
         log_handler, log_buffer = get_log_handler()
         root_logger = logging.getLogger()
         root_logger.addHandler(log_handler)
+
         try:
+            # Process the document using the new function
+            processing_result = process_document_with_redaction(
+                file_path=temp_path,
+                endpoint=AZURE_OPENAI_ENDPOINT,
+                api_key=AZURE_OPENAI_KEY,
+                api_version=AZURE_OPENAI_VERSION,
+                deployment=AZURE_OPENAI_DEPLOYMENT,
+            )
+
+            # Save results in session state (maintaining compatibility with existing UI)
+            st.session_state.processed_results[selected_file] = {
+                "structured_json": processing_result.original_document_json,
+                "redacted_md": processing_result.redacted_document_md,
+                "redacted_json": processing_result.redacted_document_json,  # Now this is actually redacted!
+                "original_markdown": processing_result.original_document_md,
+                "processing_result": processing_result  # Store the new result
+            }
+
         finally:
             # Remove handler and stop capturing logs
             root_logger.removeHandler(log_handler)
 
             # Combine log records into a single text
             log_text = "\n".join(log_buffer)
             st.session_state.logs[selected_file] = log_text
 
     structured_json = data["structured_json"]
     redacted_md = data["redacted_md"]
     redacted_json = data["redacted_json"]
+    original_md = data["original_markdown"]
 
     # Show processing summary
     original_texts = structured_json.get("texts", [])
 
         st.subheader("Original vs Redacted Content")
         st.caption("Compare the original document content with the redacted version")
 
+        # Get the actual removed indices from the processing result
+        actual_removed_indices = []
+        if "processing_result" in st.session_state.processed_results[selected_file]:
+            processing_result = st.session_state.processed_results[selected_file]["processing_result"]
+            actual_removed_indices = processing_result.removed_indices
+
+        # Create a more intelligent side-by-side comparison based on JSON structure
+        col1, col2 = st.columns(2)
+
+        with col1:
+            st.markdown("**📋 Original Document**")
+
+            # Display original content with removed sections highlighted
+            for i, text_elem in enumerate(original_texts):
+                text_content = text_elem.get("text", "")
+                label = text_elem.get("label", "")
+
+                # Check if this element was removed
+                is_removed = i in actual_removed_indices
+
+                if is_removed:
+                    # Highlight removed content in red
+                    st.markdown(f"""
+                    <div style="background-color: #ffebee; color: #c62828; padding: 8px; margin: 4px 0; border-left: 4px solid #f44336; border-radius: 4px;">
+                    <strong>Text {i} ({label}) - REMOVED:</strong><br>
+                    {text_content}
+                    </div>
+                    """, unsafe_allow_html=True)
+                else:
+                    # Show normal content
+                    content_preview = text_content[:150] + "..." if len(text_content) > 150 else text_content
+                    st.markdown(f"""
+                    <div style="padding: 4px; margin: 2px 0; border-radius: 4px;">
+                    <strong>Text {i} ({label}) - {len(text_content)} chars:</strong><br>
+                    <code style="background-color: #f5f5f5; padding: 2px; border-radius: 2px;">{content_preview}</code>
+                    </div>
+                    """, unsafe_allow_html=True)
 
+        with col2:
+            st.markdown("**🔒 Redacted Document**")
+
+            # Display redacted content (only non-removed elements)
+            redacted_index = 0
+            for i, text_elem in enumerate(original_texts):
+                text_content = text_elem.get("text", "")
+                label = text_elem.get("label", "")
+
+                # Check if this element was removed
+                is_removed = i in actual_removed_indices
+
+                if is_removed:
+                    # Show placeholder for removed content
+                    st.markdown(f"""
+                    <div style="background-color: #ffebee; color: #c62828; padding: 8px; margin: 4px 0; border-left: 4px solid #f44336; border-radius: 4px; font-style: italic; opacity: 0.7;">
+                    <strong>Text {i} ({label}) - REMOVED</strong><br>
+                    [Content removed by redaction]
+                    </div>
+                    """, unsafe_allow_html=True)
+                else:
+                    # Show the actual content from redacted texts
+                    if redacted_index < len(redacted_texts):
+                        redacted_content = redacted_texts[redacted_index].get("text", "")
+                        content_preview = redacted_content[:150] + "..." if len(redacted_content) > 150 else redacted_content
+                        st.markdown(f"""
+                        <div style="padding: 4px; margin: 2px 0; border-radius: 4px;">
+                        <strong>Text {i} ({label}) - {len(redacted_content)} chars:</strong><br>
+                        <code style="background-color: #f5f5f5; padding: 2px; border-radius: 2px;">{content_preview}</code>
+                        </div>
+                        """, unsafe_allow_html=True)
+                        redacted_index += 1
+                    else:
+                        st.markdown(f"""
+                        <div style="padding: 4px; margin: 2px 0; border-radius: 4px; background-color: #f5f5f5;">
+                        <strong>Text {i} ({label}):</strong><br>
+                        [Content preserved]
+                        </div>
+                        """, unsafe_allow_html=True)
 
+        # Add legend
         st.markdown("---")
         col1, col2 = st.columns(2)
         with col1:
+            st.markdown("**🎨 Comparison Legend:**")
             st.markdown("🔴 **Red background** = Removed content")
+            st.markdown(" **White background** = Preserved content")
+            st.markdown("📝 **Italic text** = Placeholder for removed content")
 
         with col2:
+            st.markdown("**💡 How to read:**")
+            st.markdown("Left panel shows original with removed sections highlighted")
+            st.markdown("Right panel shows redacted version with placeholders")
+            st.markdown("Compare corresponding text indices to see changes")
 
+        # Add debug information to help identify missing content
+        with st.expander("🔍 Debug: Content Analysis"):
+            st.write("**Searching for table content...**")
+
+            # Search for table-related content in original texts
+            table_elements = []
+            for i, text_elem in enumerate(original_texts):
+                text_content = text_elem.get("text", "")
+                label = text_elem.get("label", "")
+
+                if "Bespreking" in text_content or "|" in text_content or "table" in label.lower():
+                    table_elements.append({
+                        "index": i,
+                        "label": label,
+                        "content": text_content[:200] + "..." if len(text_content) > 200 else text_content,
+                        "is_removed": i in actual_removed_indices
+                    })
+
+            if table_elements:
+                st.write(f"**Found {len(table_elements)} table-related elements:**")
+                for elem in table_elements:
+                    status = "🔴 REMOVED" if elem["is_removed"] else "✅ PRESERVED"
+                    st.write(f"**Text {elem['index']} ({elem['label']}) - {status}:**")
+                    st.write(f"`{elem['content']}`")
+                    st.write("---")
+            else:
+                st.write("**No table-related content found in original texts**")
+
+            # Also check redacted texts
+            st.write("**Table content in redacted texts:**")
+            table_elements_redacted = []
+            for i, text_elem in enumerate(redacted_texts):
+                text_content = text_elem.get("text", "")
+                label = text_elem.get("label", "")
+
+                if "Bespreking" in text_content or "|" in text_content or "table" in label.lower():
+                    table_elements_redacted.append({
+                        "index": i,
+                        "label": label,
+                        "content": text_content[:200] + "..." if len(text_content) > 200 else text_content
+                    })
+
+            if table_elements_redacted:
+                st.write(f"**Found {len(table_elements_redacted)} table-related elements in redacted content:**")
+                for elem in table_elements_redacted:
+                    st.write(f"**Text {elem['index']} ({elem['label']}):**")
+                    st.write(f"`{elem['content']}`")
+                    st.write("---")
+            else:
+                st.write("**No table-related content found in redacted texts**")
+
+        # Add download buttons for redacted content
+        st.markdown("---")
+        st.subheader("📥 Download Redacted Content")
+
+        col1, col2, col3 = st.columns(3)
+
+        with col1:
+            # Download redacted markdown
+            st.download_button(
+                label="📄 Download Redacted Markdown",
+                data=redacted_md,
+                file_name=f"{selected_file}_redacted.md",
+                mime="text/markdown",
+                help="Download the redacted document as Markdown format"
+            )
+
+        with col2:
+            # Generate and download redacted PDF
+            pdf_generated = False
+            pdf_bytes = None
+
+            if st.button("📋 Generate Redacted PDF", help="Generate a PDF version of the redacted document"):
+                with st.spinner("Generating redacted PDF..."):
+                    try:
+                        # Create a DocumentProcessor to access PDF generation
+                        temp_path = save_uploaded_file(uploaded_file, selected_file)
+                        processor = DocumentProcessor(section_extractor=None)
+
+                        # Generate PDF path
+                        base_name = os.path.splitext(selected_file)[0]
+                        pdf_path = os.path.join(TEMP_DIR, f"{base_name}_redacted.pdf")
+
+                        # Generate the PDF
+                        success = processor.generate_redacted_pdf(redacted_json, pdf_path)
+
+                        if success:
+                            # Read the generated PDF and store for download
+                            with open(pdf_path, "rb") as pdf_file:
+                                pdf_bytes = pdf_file.read()
+                            pdf_generated = True
+                            st.success("✅ PDF generated successfully!")
+                        else:
+                            st.error("❌ Failed to generate PDF. Check logs for details.")
+
+                    except Exception as e:
+                        st.error(f"❌ Error generating PDF: {e}")
+                        st.info("💡 Make sure reportlab is installed: `pip install reportlab`")
+
+            # Show download button if PDF was generated
+            if pdf_generated and pdf_bytes:
+                st.download_button(
+                    label="📥 Download Redacted PDF",
+                    data=pdf_bytes,
+                    file_name=f"{os.path.splitext(selected_file)[0]}_redacted.pdf",
+                    mime="application/pdf",
+                    help="Download the redacted document as PDF"
+                )
+
+                # Show debug information about what's in the PDF
+                with st.expander("🔍 Debug: PDF Content Analysis"):
+                    st.write("**Content that will be included in the PDF:**")
+                    texts_in_pdf = redacted_json.get("texts", [])
+                    st.write(f"Total text elements: {len(texts_in_pdf)}")
+
+                    for i, text_elem in enumerate(texts_in_pdf):
+                        text_content = text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")
+                        label = text_elem.get("label", "")
+                        st.write(f"**Text {i} ({label}):** {text_content}")
+            elif not pdf_generated:
+                st.info("💡 Click 'Generate Redacted PDF' to create a PDF version")
 
+        with col3:
+            # Download redacted JSON structure
+            st.download_button(
+                label="🔧 Download Redacted JSON",
+                data=json.dumps(redacted_json, indent=2, ensure_ascii=False),
+                file_name=f"{selected_file}_redacted.json",
+                mime="application/json",
+                help="Download the redacted document structure as JSON"
+            )
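The `processor.generate_redacted_pdf(redacted_json, pdf_path)` call above lives in `DocumentProcessor`, whose internals are not shown in this diff. For orientation, here is a minimal sketch of how such a function could render the redacted JSON structure with reportlab's platypus layer; the function name, the label-to-style mapping, and the `"header"` substring check are illustrative assumptions, not the actual implementation:

```python
# Hypothetical sketch only -- the real logic is DocumentProcessor.generate_redacted_pdf.
from xml.sax.saxutils import escape

from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer


def generate_redacted_pdf_sketch(redacted_json: dict, pdf_path: str) -> bool:
    """Render the redacted "texts" elements into a simple flowing PDF."""
    styles = getSampleStyleSheet()
    doc = SimpleDocTemplate(pdf_path, pagesize=A4)
    story = []
    for text_elem in redacted_json.get("texts", []):
        label = text_elem.get("label", "")
        content = escape(text_elem.get("text", ""))  # Paragraph parses XML-ish markup
        # Assumed mapping: header-like labels get a heading style, the rest body text
        style = styles["Heading2"] if "header" in label.lower() else styles["BodyText"]
        story.append(Paragraph(content, style))
        story.append(Spacer(1, 6))
    try:
        doc.build(story)
        return True
    except Exception:
        return False
```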
 
     with tab2:
         st.subheader("Document Structure Analysis")
 
     with tab3:
         st.subheader("Processing Details")
 
+        # Show cost analysis for this processing session
+        st.subheader("💰 Cost Analysis")
+
+        # Get cost data from the processing result
+        if "processing_result" in st.session_state.processed_results[selected_file]:
+            processing_result = st.session_state.processed_results[selected_file]["processing_result"]
+
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.metric("Total Cost", f"${processing_result.cost:.4f}")
+            with col2:
+                st.metric("Input Tokens", f"{processing_result.input_tokens:,}")
+            with col3:
+                st.metric("Output Tokens", f"{processing_result.output_tokens:,}")
+
+            # Add download button for cost report
+            cost_report = {
+                "timestamp": datetime.now().isoformat(),
+                "total_cost": processing_result.cost,
+                "input_tokens": processing_result.input_tokens,
+                "output_tokens": processing_result.output_tokens,
+                "total_tokens": processing_result.input_tokens + processing_result.output_tokens,
+                "document_processed": selected_file,
+                "model_used": AZURE_OPENAI_DEPLOYMENT
+            }
+
+            st.download_button(
+                label="📥 Download Cost Report (JSON)",
+                data=json.dumps(cost_report, indent=2),
+                file_name=f"cost_report_{selected_file}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
+                mime="application/json"
+            )
+
+            # Show model information
+            model_info = cost_tracker.get_model_info(AZURE_OPENAI_DEPLOYMENT)
+            if model_info:
+                st.subheader("Model Information")
+                st.write(f"**Model:** {model_info.description}")
+                st.write(f"**Input cost:** ${model_info.input_cost_per_1k_tokens:.4f}/1K tokens")
+                st.write(f"**Output cost:** ${model_info.output_cost_per_1k_tokens:.4f}/1K tokens")
+
+                # Calculate cost breakdown
+                input_cost = (processing_result.input_tokens / 1000) * model_info.input_cost_per_1k_tokens
+                output_cost = (processing_result.output_tokens / 1000) * model_info.output_cost_per_1k_tokens
+                st.write(f"**Cost breakdown:** Input: ${input_cost:.4f}, Output: ${output_cost:.4f}")
+        else:
+            # Fallback to old cost summary method
+            cost_summary = cost_tracker.get_session_summary()
+
+            if cost_summary["usage_count"] > 0:
+                col1, col2, col3 = st.columns(3)
+                with col1:
+                    st.metric("Total Cost", f"${cost_summary['total_cost']:.4f}")
+                with col2:
+                    st.metric("Total Tokens", f"{cost_summary['total_tokens']:,}")
+                with col3:
+                    st.metric("API Calls", cost_summary["usage_count"])
+
+                # Add download button for cost report
+                cost_report = {
+                    "timestamp": datetime.now().isoformat(),
+                    "total_cost": cost_summary["total_cost"],
+                    "total_tokens": cost_summary["total_tokens"],
+                    "api_calls": cost_summary["usage_count"],
+                    "model_breakdown": cost_summary["model_breakdown"],
+                    "document_processed": selected_file
+                }
+
+                st.download_button(
+                    label="📥 Download Cost Report (JSON)",
+                    data=json.dumps(cost_report, indent=2),
+                    file_name=f"cost_report_{selected_file}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
+                    mime="application/json"
+                )
+
+                # Show detailed model breakdown
+                if cost_summary["model_breakdown"]:
+                    st.subheader("Model Usage Breakdown")
+                    for model, stats in cost_summary["model_breakdown"].items():
+                        model_info = cost_tracker.get_model_info(model)
+                        model_display_name = model_info.description if model_info else model
+
+                        with st.expander(f"{model_display_name} - ${stats['cost']:.4f}"):
+                            col1, col2 = st.columns(2)
+                            with col1:
+                                st.write(f"**Input tokens:** {stats['input_tokens']:,}")
+                                st.write(f"**Output tokens:** {stats['output_tokens']:,}")
+                            with col2:
+                                st.write(f"**Total tokens:** {stats['total_tokens']:,}")
+                                st.write(f"**API calls:** {stats['usage_count']}")
+
+                            # Show cost breakdown
+                            if model_info:
+                                input_cost = (stats['input_tokens'] / 1000) * model_info.input_cost_per_1k_tokens
+                                output_cost = (stats['output_tokens'] / 1000) * model_info.output_cost_per_1k_tokens
+                                st.write(f"**Cost breakdown:** Input: ${input_cost:.4f}, Output: ${output_cost:.4f}")
+            else:
+                st.info("No API calls recorded for this session")
+
         # Show what was removed
         if removed_count > 0:
             st.info(f"**Removed {removed_count} text elements from the document structure.**")
 
+            # Show the removed text elements - use the actual indices from the processing result
             st.subheader("Removed Text Elements:")
 
+            # Get the actual indices that were removed from the processing result
+            if "processing_result" in st.session_state.processed_results[selected_file]:
+                # Get the actual removed indices from the LLM response
+                processing_result = st.session_state.processed_results[selected_file]["processing_result"]
+                actual_removed_indices = processing_result.removed_indices
+
+                if actual_removed_indices:
+                    st.info(f"**Elements removed by LLM analysis ({len(actual_removed_indices)} elements):**")
+
+                    for idx in actual_removed_indices:
+                        if idx < len(original_texts):
+                            text_content = original_texts[idx].get("text", "")
+                            st.text(f"Text {idx}: {text_content[:100]}{'...' if len(text_content) > 100 else ''}")
+                        else:
+                            st.text(f"Text {idx}: [Index out of bounds]")
+                else:
+                    st.info("**No elements were identified for removal by the LLM.**")
+            else:
+                # Fallback to the old method if processing result not available
+                st.warning("**Note: Using fallback calculation method**")
+                removed_texts = []
+                for i, text_elem in enumerate(original_texts):
+                    if i >= len(redacted_texts) or text_elem.get("text", "") != redacted_texts[i].get("text", ""):
+                        removed_texts.append((i, text_elem.get("text", "")[:100] + "..." if len(text_elem.get("text", "")) > 100 else text_elem.get("text", "")))
+
+                for idx, text in removed_texts:
+                    st.text(f"Text {idx}: {text}")
         else:
             st.info("No text elements were removed during processing.")
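The index bookkeeping in the comparison views above (walking `redacted_texts` with a separate counter while `removed_indices` refers to positions in `original_texts`) is easier to see on a toy example; the data below is made up purely for illustration:

```python
# Toy data illustrating how removed_indices relates the two text lists.
original_texts = [
    {"text": "Intro paragraph", "label": "text"},
    {"text": "Bespreking: internal discussion", "label": "section_header"},
    {"text": "Final conclusion", "label": "text"},
]
removed_indices = [1]  # as reported in processing_result.removed_indices
redacted_texts = [t for i, t in enumerate(original_texts) if i not in removed_indices]

redacted_index = 0
for i, elem in enumerate(original_texts):
    if i in removed_indices:
        print(f"Text {i}: [Content removed by redaction]")
    else:
        # Surviving elements line up with redacted_texts in order
        print(f"Text {i}: {redacted_texts[redacted_index]['text']}")
        redacted_index += 1
```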
src/utils/cost_tracker.py ADDED
@@ -0,0 +1,241 @@
+import logging
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+from datetime import datetime
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class ModelPricing:
+    """Pricing information for Azure OpenAI models."""
+    model_name: str
+    input_cost_per_1k_tokens: float   # Cost per 1000 input tokens
+    output_cost_per_1k_tokens: float  # Cost per 1000 output tokens
+    description: str
+
+@dataclass
+class TokenUsage:
+    """Token usage statistics for a single API call."""
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+    model: str
+    timestamp: datetime
+
+@dataclass
+class CostAnalysis:
+    """Cost analysis for document processing."""
+    total_input_tokens: int
+    total_output_tokens: int
+    total_cost: float
+    model_breakdown: Dict[str, Dict[str, float]]  # {model: {"input_cost": x, "output_cost": y, "total_cost": z}}
+    processing_time: float
+    timestamp: datetime
+
+class CostTracker:
+    """Tracks token usage and calculates costs for Azure OpenAI API calls."""
+
+    # Hardcoded pricing for Azure OpenAI models (current as of 2024)
+    # Source: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
+    MODEL_PRICING = {
+        # Standard model names
+        "gpt-4o-mini": ModelPricing(
+            model_name="gpt-4o-mini",
+            input_cost_per_1k_tokens=0.00015,  # $0.00015 per 1K input tokens
+            output_cost_per_1k_tokens=0.0006,  # $0.0006 per 1K output tokens
+            description="GPT-4o Mini (O3 Mini)"
+        ),
+        "gpt-4o": ModelPricing(
+            model_name="gpt-4o",
+            input_cost_per_1k_tokens=0.0025,  # $0.0025 per 1K input tokens
+            output_cost_per_1k_tokens=0.01,   # $0.01 per 1K output tokens
+            description="GPT-4o (O4)"
+        ),
+        "gpt-35-turbo": ModelPricing(
+            model_name="gpt-35-turbo",
+            input_cost_per_1k_tokens=0.0005,   # $0.0005 per 1K input tokens
+            output_cost_per_1k_tokens=0.0015,  # $0.0015 per 1K output tokens
+            description="GPT-3.5 Turbo (O3)"
+        ),
+        # Azure deployment names (custom names set in Azure)
+        "o3-mini": ModelPricing(
+            model_name="o3-mini",
+            input_cost_per_1k_tokens=0.00015,  # $0.00015 per 1K input tokens
+            output_cost_per_1k_tokens=0.0006,  # $0.0006 per 1K output tokens
+            description="O3 Mini (GPT-4o Mini)"
+        ),
+        "o4-mini": ModelPricing(
+            model_name="o4-mini",
+            input_cost_per_1k_tokens=0.00015,  # $0.00015 per 1K input tokens
+            output_cost_per_1k_tokens=0.0006,  # $0.0006 per 1K output tokens
+            description="O4 Mini (GPT-4o Mini)"
+        ),
+        "o3": ModelPricing(
+            model_name="o3",
+            input_cost_per_1k_tokens=0.0005,   # $0.0005 per 1K input tokens
+            output_cost_per_1k_tokens=0.0015,  # $0.0015 per 1K output tokens
+            description="O3 (GPT-3.5 Turbo)"
+        ),
+        "o4": ModelPricing(
+            model_name="o4",
+            input_cost_per_1k_tokens=0.0025,  # $0.0025 per 1K input tokens
+            output_cost_per_1k_tokens=0.01,   # $0.01 per 1K output tokens
+            description="O4 (GPT-4o)"
+        ),
+        # Alternative model names that might be used in Azure deployments
+        "gpt-4o-mini-2024-07-18": ModelPricing(
+            model_name="gpt-4o-mini-2024-07-18",
+            input_cost_per_1k_tokens=0.00015,  # $0.00015 per 1K input tokens
+            output_cost_per_1k_tokens=0.0006,  # $0.0006 per 1K output tokens
+            description="GPT-4o Mini (O3 Mini) - Latest"
+        ),
+        "gpt-4o-2024-05-13": ModelPricing(
+            model_name="gpt-4o-2024-05-13",
+            input_cost_per_1k_tokens=0.0025,  # $0.0025 per 1K input tokens
+            output_cost_per_1k_tokens=0.01,   # $0.01 per 1K output tokens
+            description="GPT-4o (O4) - Latest"
+        ),
+        "gpt-35-turbo-0125": ModelPricing(
+            model_name="gpt-35-turbo-0125",
+            input_cost_per_1k_tokens=0.0005,   # $0.0005 per 1K input tokens
+            output_cost_per_1k_tokens=0.0015,  # $0.0015 per 1K output tokens
+            description="GPT-3.5 Turbo (O3) - Latest"
+        ),
+    }
+
+    def __init__(self):
+        self.usage_history: list[TokenUsage] = []
+        self.current_session_tokens = 0
+        self.current_session_cost = 0.0
+
+    def record_usage(self, prompt_tokens: int, completion_tokens: int, model: str) -> TokenUsage:
+        """Record token usage from an API call."""
+        total_tokens = prompt_tokens + completion_tokens
+        usage = TokenUsage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+            model=model,
+            timestamp=datetime.now()
+        )
+
+        self.usage_history.append(usage)
+        self.current_session_tokens += total_tokens
+
+        # Calculate cost for this usage
+        cost = self._calculate_cost(prompt_tokens, completion_tokens, model)
+        self.current_session_cost += cost
+
+        logger.info(f"Recorded usage: {prompt_tokens} input + {completion_tokens} output = {total_tokens} total tokens "
+                    f"for model {model}, cost: ${cost:.6f}")
+
+        return usage
+
+    def _calculate_cost(self, input_tokens: int, output_tokens: int, model: str) -> float:
+        """Calculate cost for given token usage and model."""
+        if model not in self.MODEL_PRICING:
+            logger.warning(f"Unknown model pricing for {model}, using default pricing")
+            # Try to guess the model type based on the name
+            if "mini" in model.lower():
+                # Assume it's a mini model (cheapest)
+                model = "o3-mini"
+            elif "o4" in model.lower():
+                # Assume it's O4 (most expensive)
+                model = "o4"
+            elif "o3" in model.lower():
+                # Assume it's O3 (medium)
+                model = "o3"
+            else:
+                # Default to cheapest option
+                model = "o3-mini"
+
+        pricing = self.MODEL_PRICING[model]
+
+        input_cost = (input_tokens / 1000) * pricing.input_cost_per_1k_tokens
+        output_cost = (output_tokens / 1000) * pricing.output_cost_per_1k_tokens
+
+        return input_cost + output_cost
+
+    def get_session_summary(self) -> Dict[str, Any]:
+        """Get summary of current session usage."""
+        if not self.usage_history:
+            return {
+                "total_tokens": 0,
+                "total_cost": 0.0,
+                "model_breakdown": {},
+                "usage_count": 0
+            }
+
+        model_breakdown = {}
+        for usage in self.usage_history:
+            if usage.model not in model_breakdown:
+                model_breakdown[usage.model] = {
+                    "input_tokens": 0,
+                    "output_tokens": 0,
+                    "total_tokens": 0,
+                    "cost": 0.0,
+                    "usage_count": 0
+                }
+
+            model_breakdown[usage.model]["input_tokens"] += usage.prompt_tokens
+            model_breakdown[usage.model]["output_tokens"] += usage.completion_tokens
+            model_breakdown[usage.model]["total_tokens"] += usage.total_tokens
+            model_breakdown[usage.model]["usage_count"] += 1
+            model_breakdown[usage.model]["cost"] += self._calculate_cost(
+                usage.prompt_tokens, usage.completion_tokens, usage.model
+            )
+
+        return {
+            "total_tokens": self.current_session_tokens,
+            "total_cost": self.current_session_cost,
+            "model_breakdown": model_breakdown,
+            "usage_count": len(self.usage_history)
+        }
+
+    def reset_session(self):
+        """Reset current session statistics."""
+        self.usage_history = []
+        self.current_session_tokens = 0
+        self.current_session_cost = 0.0
+        logger.info("Cost tracker session reset")
+
+    def get_available_models(self) -> list[str]:
+        """Get list of available models with pricing."""
+        return list(self.MODEL_PRICING.keys())
+
+    def get_model_info(self, model: str) -> Optional[ModelPricing]:
+        """Get pricing information for a specific model."""
+        return self.MODEL_PRICING.get(model)
+
+    def add_deployment_pricing(self, deployment_name: str, model_type: str = "o3-mini"):
+        """Add pricing for a custom deployment name by mapping it to an existing model type."""
+        if deployment_name in self.MODEL_PRICING:
+            return  # Already exists
+
+        # Map deployment name to existing model pricing
+        if model_type in self.MODEL_PRICING:
+            base_pricing = self.MODEL_PRICING[model_type]
+            self.MODEL_PRICING[deployment_name] = ModelPricing(
+                model_name=deployment_name,
+                input_cost_per_1k_tokens=base_pricing.input_cost_per_1k_tokens,
+                output_cost_per_1k_tokens=base_pricing.output_cost_per_1k_tokens,
+                description=f"{deployment_name} ({base_pricing.description})"
+            )
+            logger.info(f"Added pricing for deployment {deployment_name} based on {model_type}")
+        else:
+            logger.warning(f"Unknown model type {model_type} for deployment {deployment_name}")
+
+    def guess_model_type(self, deployment_name: str) -> str:
+        """Guess the model type based on deployment name."""
+        deployment_lower = deployment_name.lower()
+        if "mini" in deployment_lower:
+            return "o3-mini"
+        elif "o4" in deployment_lower:
+            return "o4"
+        elif "o3" in deployment_lower:
+            return "o3"
+        else:
+            return "o3-mini"  # Default to cheapest
+
+# Global cost tracker instance
+cost_tracker = CostTracker()
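A quick sanity check of the tracker above; every call exists in the file, while the deployment name and token counts are invented for the example:

```python
from utils.cost_tracker import cost_tracker

# Register a custom Azure deployment name against known pricing
cost_tracker.add_deployment_pricing("my-o3-mini-deploy", model_type="o3-mini")

# Record one API call: 10,000 input + 2,000 output tokens
cost_tracker.record_usage(prompt_tokens=10_000, completion_tokens=2_000,
                          model="my-o3-mini-deploy")

summary = cost_tracker.get_session_summary()
print(summary["total_tokens"])          # 12000
print(f"${summary['total_cost']:.4f}")  # (10 * 0.00015) + (2 * 0.0006) = $0.0027
cost_tracker.reset_session()
```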
uv.lock CHANGED
@@ -200,6 +200,7 @@ dependencies = [
     { name = "openai" },
     { name = "python-dotenv" },
     { name = "pyyaml" },
+    { name = "reportlab" },
     { name = "streamlit" },
 ]
 
@@ -209,6 +210,7 @@ requires-dist = [
     { name = "openai", specifier = ">=1.91.0" },
     { name = "python-dotenv", specifier = ">=1.1.1" },
     { name = "pyyaml", specifier = ">=6.0" },
+    { name = "reportlab", specifier = ">=4.4.2" },
     { name = "streamlit", specifier = ">=1.46.0" },
 ]
 
@@ -1333,6 +1335,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/45/94/bc295babb3062a731f52621cdc992d123111282e291abaf23faa413443ea/regex-2024.11.6-cp313-cp313-win_amd64.whl", hash = "sha256:2b3361af3198667e99927da8b84c1b010752fa4b1115ee30beaa332cabc3ef1a", size = 273545, upload-time = "2024-11-06T20:11:15Z" },
 ]
 
+[[package]]
+name = "reportlab"
+version = "4.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "charset-normalizer" },
+    { name = "pillow" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ec/9b/3483c7e4ad33d15f22d528872439e5bc92485814d7e7d10dbc3130368a83/reportlab-4.4.2.tar.gz", hash = "sha256:fc6283048ddd0781a9db1d671715990e6aa059c8d40ec9baf34294c4bd583a36", size = 3509063, upload-time = "2025-06-18T12:20:19.526Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9f/74/ed990bc9586605d4e46f6b0e0b978a5b8e757aa599e39664bee26d6dc666/reportlab-4.4.2-py3-none-any.whl", hash = "sha256:58e11be387457928707c12153b7e41e52533a5da3f587b15ba8f8fd0805c6ee2", size = 1953624, upload-time = "2025-06-18T12:20:16.152Z" },
+]
+
 [[package]]
 name = "requests"
 version = "2.32.4"