# Scraped file-viewer header — Spaces status: Sleeping; file size: 6,211 Bytes; bb68eb6.
# (The viewer's line-number gutter, 1-145, has been dropped from this header.)
# src/processing/llm_extractor.py
import json
import logging
from typing import Any, Dict, Optional

from openai import AzureOpenAI
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class AzureO1MedicationExtractor:
    """Ask an Azure OpenAI chat deployment which text elements to redact.

    The extractor sends the document's text elements to the model together
    with instructions to pick out only the formal medication lists, then
    parses and sanitises the JSON answer. Every failure path returns a
    fallback result with no indices, so a failed call never removes anything.
    """

    # Maximum number of element indices accepted from a single response;
    # anything beyond this is cut off to prevent over-removal.
    MAX_REMOVALS = 10

    # Value passed as ``max_completion_tokens`` to the chat completion call.
    MAX_COMPLETION_TOKENS = 100000

    def __init__(
        self,
        endpoint: str,
        api_key: str,
        api_version: str,
        deployment: str,
        model_name: Optional[str] = None,
    ):
        """Create the Azure OpenAI client and remember the deployment.

        Args:
            endpoint: Passed to ``AzureOpenAI`` as ``azure_endpoint``.
            api_key: Passed to ``AzureOpenAI`` as ``api_key``.
            api_version: Passed to ``AzureOpenAI`` as ``api_version``.
            deployment: Deployment name; used as ``model`` in completion calls.
            model_name: Optional display name; falls back to ``deployment``.
        """
        self.client = AzureOpenAI(
            api_version=api_version,
            azure_endpoint=endpoint,
            api_key=api_key,
        )
        self.deployment = deployment
        self.model_name = model_name or deployment

    @staticmethod
    def _empty_result() -> Dict[str, Any]:
        """Return the fallback result: nothing to remove, low confidence."""
        return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}

    @staticmethod
    def _strip_code_fence(content: str) -> str:
        """Return *content* without a surrounding Markdown code fence, if any."""
        text = content.strip()
        if not text.startswith("```"):
            return text
        # Drop the opening fence line, including an optional language tag.
        newline_pos = text.find("\n")
        text = text[newline_pos + 1:] if newline_pos != -1 else text[3:]
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    @staticmethod
    def _valid_indices(raw_indices: Any, element_count: int) -> list:
        """Keep unique, in-range integer indices from *raw_indices*, in order."""
        if not isinstance(raw_indices, list):
            return []
        seen = set()
        valid = []
        for idx in raw_indices:
            # bool is a subclass of int; True/False are never real indices.
            if isinstance(idx, bool) or not isinstance(idx, int):
                continue
            if not 0 <= idx < element_count or idx in seen:
                continue
            seen.add(idx)
            valid.append(idx)
        return valid

    @staticmethod
    def _build_prompt(text_analysis: list) -> str:
        """Render the user prompt with *text_analysis* embedded in it."""
        return f"""
You are a medical document analysis expert specializing in discharge letters. Your task is to identify ONLY the two formal medication lists that should be redacted, while preserving all medication mentions in clinical discussion.
**CRITICAL: You should ONLY remove the two formal medication lists:**
1. **Current medication list** (usually at the beginning of the document)
2. **Discharge medication list** (usually at the end of the document, often under headers like "Als verdere behandeling stellen wij voor" or "Thuismedicatie")
**Typical discharge letter structure:**
- Patient information and admission details
- Clinical discussion and treatment narrative (KEEP medication mentions here)
- Current medication list (REMOVE this formal list)
- Discharge instructions and follow-up
- Discharge medication list (REMOVE this formal list)
**DO NOT remove:**
- Medication mentions in clinical discussion (e.g., "patient was treated with Eliquis")
- Medication adjustments mentioned in the narrative
- Dosage information in clinical context
- Any medication information that appears in the main clinical text
- Treatment decisions and clinical reasoning
**ONLY remove:**
- Complete medication lists with multiple drugs
- Formal medication sections with headers
- Standalone medication lists that are clearly separated from clinical text
- Lists that appear to be formal medication documentation
Document structure:
{text_analysis}
**Analysis Instructions:**
1. Look for formal medication sections with clear headers (e.g., "Thuismedicatie", "Als verdere behandeling stellen wij voor")
2. Identify complete medication lists that contain multiple drugs with dosages
3. **IGNORE** any medication mentions that appear within clinical discussion or narrative text
4. Focus on structural elements that represent formal medication documentation
5. Be conservative - if in doubt, do NOT remove
6. Consider the position in the document (beginning/end vs. middle)
**Examples of what to REMOVE:**
- Complete lists under "Thuismedicatie" header
- Formal medication lists under "Als verdere behandeling stellen wij voor"
- Standalone medication sections with multiple drugs
- Lists that appear at the beginning or end of the document
**Examples of what to KEEP:**
- "Patient was treated with Eliquis 2x 2.5mg" (clinical discussion)
- "Stop Clopidogrel bij opname" (clinical instruction)
- "Jardiance 10mg & Burinex 5mg" (if mentioned in clinical context)
- Any medication mentioned in the context of treatment discussion
Return your analysis as JSON:
{{
"indices_to_remove": [list of integer indices - ONLY formal medication lists],
"reasoning": {{
"formal_medication_lists": [list of identified formal medication list indices with explanations],
"clinical_medication_mentions": [list of clinical mentions that were correctly preserved],
"justification": "explanation of why only formal lists were selected for removal",
"confidence": "high/medium/low"
}}
}}
"""

    def extract_medication_sections(self, doc_json: Dict[str, Any]) -> Dict[str, Any]:
        """Return the indices of text elements the model flags for removal.

        Args:
            doc_json: Document dict. Its ``"texts"`` entry is read as a list
                of dicts, from which ``text``, ``label``, ``level`` and
                ``parent`` are forwarded to the model.

        Returns:
            A dict with ``"indices_to_remove"`` (unique integer positions
            into ``doc_json["texts"]``, at most ``MAX_REMOVALS`` of them) and
            ``"reasoning"`` (always a dict). If the call fails or the answer
            cannot be parsed, the fallback with an empty index list and
            ``{"confidence": "low"}`` is returned instead of raising.
        """
        # A missing or null "texts" entry is treated as an empty document.
        texts = doc_json.get("texts") or []
        text_analysis = [
            {
                "index": i,
                "text": text_elem.get("text", ""),
                "label": text_elem.get("label", ""),
                "level": text_elem.get("level", 0),
                "parent": text_elem.get("parent", {}),
            }
            for i, text_elem in enumerate(texts)
        ]
        prompt = self._build_prompt(text_analysis)
        logger.info("Prompt length: %d", len(prompt))
        logger.info("Number of text elements: %d", len(text_analysis))

        # Best-effort boundary: any client failure means "remove nothing".
        try:
            response = self.client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant.",
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
                max_completion_tokens=self.MAX_COMPLETION_TOKENS,
                model=self.deployment,
            )
        except Exception as e:
            logger.error("Exception during LLM call: %s", e, exc_info=True)
            return self._empty_result()

        try:
            content = response.choices[0].message.content
            # Debug level: this is the normal path, not an error.
            logger.debug("Raw LLM response: %r", content)
            if not isinstance(content, str) or not content.strip():
                logger.error("Failed to parse LLM response: empty or missing message content")
                return self._empty_result()
            result = json.loads(self._strip_code_fence(content))
        except Exception as e:
            logger.error("Failed to parse LLM response: %s", e)
            return self._empty_result()

        if not isinstance(result, dict):
            logger.error(
                "Failed to parse LLM response: expected a JSON object, got %s",
                type(result).__name__,
            )
            return self._empty_result()

        # Guarantee a dict so the truncation note below (and callers) can
        # rely on it even when the model omitted or mangled "reasoning".
        reasoning = result.get("reasoning")
        if not isinstance(reasoning, dict):
            reasoning = {}
            result["reasoning"] = reasoning

        raw_indices = result.get("indices_to_remove", [])
        indices_to_remove = self._valid_indices(raw_indices, len(texts))
        if indices_to_remove != raw_indices:
            logger.warning(
                "Discarded invalid, duplicate or out-of-range indices from LLM response: %r",
                raw_indices,
            )

        # Be conservative - cap the number of removals to prevent over-removal.
        if len(indices_to_remove) > self.MAX_REMOVALS:
            logger.warning(
                "LLM suggested removing %d elements, limiting to %d most likely formal medication lists",
                len(indices_to_remove),
                self.MAX_REMOVALS,
            )
            # Keep only the first entries (assuming they're ordered by importance).
            indices_to_remove = indices_to_remove[: self.MAX_REMOVALS]
            note = (
                f" [LIMITED: Only top {self.MAX_REMOVALS} elements selected"
                " to prevent over-removal]"
            )
            reasoning["justification"] = f"{reasoning.get('justification') or ''}{note}"
        result["indices_to_remove"] = indices_to_remove

        # Log the reasoning for transparency.
        logger.info("LLM reasoning: %s", reasoning)
        return result