# src/processing/llm_extractor.py
"""LLM-backed detection of formal medication lists in discharge letters."""

import json
import logging
from typing import Any, Dict, List, Optional

from openai import AzureOpenAI

logger = logging.getLogger(__name__)

# Upper bound on tokens the model may generate for one request; adjust as needed.
_MAX_COMPLETION_TOKENS = 100_000

# Safety cap: never remove more than this many elements on the LLM's say-so.
_MAX_INDICES_TO_REMOVE = 10

# Appended to the LLM's justification whenever the cap above kicks in.
_LIMIT_NOTE = " [LIMITED: Only top 10 elements selected to prevent over-removal]"

# Prompt sent as the user message. ``{document_structure}`` is filled with the
# per-element summary list; doubled braces are literal braces in the output.
_PROMPT_TEMPLATE = """
You are a medical document analysis expert specializing in discharge letters. Your task is to identify ONLY the two formal medication lists that should be redacted, while preserving all medication mentions in clinical discussion.

**CRITICAL: You should ONLY remove the two formal medication lists:**
1. **Current medication list** (usually at the beginning of the document)
2. **Discharge medication list** (usually at the end of the document, often under headers like "Als verdere behandeling stellen wij voor" or "Thuismedicatie")

**Typical discharge letter structure:**
- Patient information and admission details
- Clinical discussion and treatment narrative (KEEP medication mentions here)
- Current medication list (REMOVE this formal list)
- Discharge instructions and follow-up
- Discharge medication list (REMOVE this formal list)

**DO NOT remove:**
- Medication mentions in clinical discussion (e.g., "patient was treated with Eliquis")
- Medication adjustments mentioned in the narrative
- Dosage information in clinical context
- Any medication information that appears in the main clinical text
- Treatment decisions and clinical reasoning

**ONLY remove:**
- Complete medication lists with multiple drugs
- Formal medication sections with headers
- Standalone medication lists that are clearly separated from clinical text
- Lists that appear to be formal medication documentation

Document structure:
{document_structure}

**Analysis Instructions:**
1. Look for formal medication sections with clear headers (e.g., "Thuismedicatie", "Als verdere behandeling stellen wij voor")
2. Identify complete medication lists that contain multiple drugs with dosages
3. **IGNORE** any medication mentions that appear within clinical discussion or narrative text
4. Focus on structural elements that represent formal medication documentation
5. Be conservative - if in doubt, do NOT remove
6. Consider the position in the document (beginning/end vs. middle)

**Examples of what to REMOVE:**
- Complete lists under "Thuismedicatie" header
- Formal medication lists under "Als verdere behandeling stellen wij voor"
- Standalone medication sections with multiple drugs
- Lists that appear at the beginning or end of the document

**Examples of what to KEEP:**
- "Patient was treated with Eliquis 2x 2.5mg" (clinical discussion)
- "Stop Clopidogrel bij opname" (clinical instruction)
- "Jardiance 10mg & Burinex 5mg" (if mentioned in clinical context)
- Any medication mentioned in the context of treatment discussion

Return your analysis as JSON:
{{
    "indices_to_remove": [list of integer indices - ONLY formal medication lists],
    "reasoning": {{
        "formal_medication_lists": [list of identified formal medication list indices with explanations],
        "clinical_medication_mentions": [list of clinical mentions that were correctly preserved],
        "justification": "explanation of why only formal lists were selected for removal",
        "confidence": "high/medium/low"
    }}
}}
"""


def _fallback_result() -> Dict[str, Any]:
    """Return a fresh "remove nothing" result used whenever the LLM path fails."""
    return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}


def _summarize_texts(texts: List[Any]) -> List[Dict[str, Any]]:
    """Build the per-element summary (index, text, label, level, parent) for the prompt."""
    summary = []
    for position, element in enumerate(texts):
        # Keep one entry per input element so indices stay aligned with
        # ``texts`` even when an element is not a mapping.
        fields = element if isinstance(element, dict) else {}
        summary.append({
            "index": position,
            "text": fields.get("text", ""),
            "label": fields.get("label", ""),
            "level": fields.get("level", 0),
            "parent": fields.get("parent", {}),
        })
    return summary


def _strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence (``` or ```json) if present."""
    body = text.strip()
    if not body.startswith("```"):
        return body
    # Drop the opening fence line, including an optional language tag.
    _, newline, remainder = body.partition("\n")
    body = remainder if newline else body[3:]
    body = body.rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def _clean_indices(raw_indices: Any, element_count: int) -> List[int]:
    """Keep only unique, in-range integer indices, preserving the LLM's order."""
    if not isinstance(raw_indices, list):
        return []
    cleaned: List[int] = []
    seen = set()
    for value in raw_indices:
        # bool is a subclass of int, but True/False are never meant as indices.
        if isinstance(value, bool) or not isinstance(value, int):
            logger.warning("Ignoring non-integer index from LLM: %r", value)
            continue
        # Negative values would silently address elements from the end of the
        # list, so they are rejected along with values past the end.
        if not 0 <= value < element_count:
            logger.warning("Ignoring out-of-range index from LLM: %r", value)
            continue
        if value in seen:
            continue
        seen.add(value)
        cleaned.append(value)
    return cleaned


def _parse_llm_result(raw_content: Any, element_count: int) -> Dict[str, Any]:
    """Parse and sanitize the LLM's JSON answer.

    Args:
        raw_content: Message content returned by the model.
        element_count: Number of text elements that were sent; valid indices
            are ``0 .. element_count - 1``.

    Returns:
        The parsed JSON object with ``indices_to_remove`` guaranteed to be a
        validated list of at most ``_MAX_INDICES_TO_REMOVE`` integers and
        ``reasoning`` guaranteed to be a dict.

    Raises:
        ValueError: If the content is empty, not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``), or not a JSON object.
    """
    if not isinstance(raw_content, str) or not raw_content.strip():
        raise ValueError("LLM returned no text content")

    result = json.loads(_strip_code_fence(raw_content))
    if not isinstance(result, dict):
        raise ValueError(f"expected a JSON object, got {type(result).__name__}")

    reasoning = result.get("reasoning")
    if reasoning is None:
        reasoning = {}
    elif not isinstance(reasoning, dict):
        # Preserve whatever the model said instead of dropping it.
        reasoning = {"justification": str(reasoning)}
    result["reasoning"] = reasoning

    indices = _clean_indices(result.get("indices_to_remove"), element_count)

    # Be conservative - cap the number of removals to prevent over-removal.
    if len(indices) > _MAX_INDICES_TO_REMOVE:
        logger.warning(
            "LLM suggested removing %d elements, limiting to %d most likely "
            "formal medication lists",
            len(indices),
            _MAX_INDICES_TO_REMOVE,
        )
        # Keep only the first ones (assuming they're ordered by importance).
        indices = indices[:_MAX_INDICES_TO_REMOVE]
        # The model may have omitted the justification; never fail on that.
        reasoning["justification"] = (
            str(reasoning.get("justification", "")) + _LIMIT_NOTE
        )

    result["indices_to_remove"] = indices
    return result


class AzureO1MedicationExtractor:
    """Ask an Azure OpenAI deployment which text elements are formal medication lists."""

    def __init__(
        self,
        endpoint: str,
        api_key: str,
        api_version: str,
        deployment: str,
        model_name: Optional[str] = None,
    ):
        """Create the Azure OpenAI client.

        Args:
            endpoint: Azure OpenAI endpoint, passed as ``azure_endpoint``.
            api_key: API key for the endpoint.
            api_version: API version string passed to the client.
            deployment: Deployment name; used as ``model`` in requests.
            model_name: Optional model name; falls back to ``deployment``.
        """
        self.client = AzureOpenAI(
            api_version=api_version,
            azure_endpoint=endpoint,
            api_key=api_key,
        )
        self.deployment = deployment
        self.model_name = model_name or deployment

    def extract_medication_sections(self, doc_json: Dict[str, Any]) -> Dict[str, Any]:
        """Identify the text elements that form formal medication lists.

        Each entry of ``doc_json["texts"]`` is summarized, sent to the LLM in
        a single prompt, and the JSON answer is parsed and sanitized.

        Args:
            doc_json: Document dict; its ``"texts"`` entry is read as a list of
                dicts with optional ``text``, ``label``, ``level`` and
                ``parent`` keys.

        Returns:
            A dict with ``indices_to_remove`` (validated list of at most 10
            indices into ``texts``) and ``reasoning`` (dict). If the LLM call
            or the parsing fails, returns
            ``{"indices_to_remove": [], "reasoning": {"confidence": "low"}}``;
            failures are logged, not raised.
        """
        # ``or []`` also covers an explicit ``"texts": None``.
        texts = doc_json.get("texts") or []
        text_analysis = _summarize_texts(texts)
        prompt = _PROMPT_TEMPLATE.format(document_structure=text_analysis)

        logger.info("Prompt length: %d", len(prompt))
        logger.info("Number of text elements: %d", len(text_analysis))

        try:
            response = self.client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant.",
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
                max_completion_tokens=_MAX_COMPLETION_TOKENS,
                model=self.deployment,
            )
        except Exception as e:
            # Best-effort boundary: any client/transport failure means
            # "remove nothing" rather than crashing the pipeline.
            logger.error("Exception during LLM call: %s", e, exc_info=True)
            return _fallback_result()

        try:
            raw_content = response.choices[0].message.content
            # DEBUG, not ERROR: this is routine output and may contain
            # document text.
            logger.debug("Raw LLM response: %r", raw_content)
            result = _parse_llm_result(raw_content, len(text_analysis))
        except (ValueError, TypeError, AttributeError, IndexError, KeyError) as e:
            logger.error("Failed to parse LLM response: %s", e)
            return _fallback_result()

        # Log the reasoning for transparency.
        logger.info("LLM reasoning: %s", result["reasoning"])
        return result