"""Gradio app that extracts structured medical referral data from eFax PDFs.

Each PDF page is rendered to an image, sent to a MiniCPM-V vision-language
model together with an extraction prompt, and the JSON the model returns is
stripped of empty fields before being shown in the UI / returned via the API.
"""

# NOTE: import order is kept exactly as in the original file (``spaces`` first).
import spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import base64
import io
import os
import json
from huggingface_hub import login
from pdf2image import convert_from_bytes
from datetime import datetime

# Set your HF token
HF_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Checkpoints tried by load_model(), in order.
PRIMARY_MODEL_ID = "openbmb/MiniCPM-V-2_6"
FALLBACK_MODEL_ID = "openbmb/MiniCPM-V-2"

# Global variables for model caching
_model = None
_tokenizer = None


def _load_checkpoint(repo_id):
    """Load the tokenizer and model for ``repo_id`` and return ``(model, tokenizer)``."""
    tokenizer = AutoTokenizer.from_pretrained(
        repo_id,
        trust_remote_code=True,
        use_fast=True,
    )
    model = AutoModel.from_pretrained(
        repo_id,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return model, tokenizer


def load_model():
    """Load MiniCPM model.

    Returns the cached ``(model, tokenizer)`` pair when one exists. Otherwise
    tries ``PRIMARY_MODEL_ID`` and, if that raises for any reason, falls back
    to ``FALLBACK_MODEL_ID``. An exception from the fallback load propagates
    to the caller.
    """
    global _model, _tokenizer
    if _model is not None and _tokenizer is not None:
        return _model, _tokenizer

    try:
        # Assign both globals together so a failure half-way through a load
        # never leaves the cache holding a tokenizer without its model.
        _model, _tokenizer = _load_checkpoint(PRIMARY_MODEL_ID)
    except Exception as e:
        print(f"Error loading gated model: {e}")
        _model, _tokenizer = _load_checkpoint(FALLBACK_MODEL_ID)
    return _model, _tokenizer


def pdf_to_images(pdf_file):
    """Convert PDF file to list of PIL images.

    ``pdf_file`` may be a file-like object (anything with ``read``) or a path.
    Pages are rendered at 300 dpi. On any failure the error is printed and an
    empty list is returned, which callers treat as "could not convert".
    """
    try:
        if hasattr(pdf_file, 'read'):
            pdf_bytes = pdf_file.read()
        else:
            with open(pdf_file, 'rb') as f:
                pdf_bytes = f.read()
        return convert_from_bytes(pdf_bytes, dpi=300)
    except Exception as e:
        print(f"Error converting PDF to images: {e}")
        return []


def clean_empty_fields(data):
    """Recursively remove empty fields from dictionary.

    Non-dict input is returned unchanged. Within a dict:

    * nested dicts are cleaned recursively and dropped when they end up empty;
    * lists keep cleaned non-empty dict items and truthy non-dict items, and
      are dropped when nothing remains;
    * any other value is dropped only when it equals ``None``, ``""``, ``[]``
      or ``{}`` (so numeric zeros such as a ``0.0`` confidence are kept).
    """
    if not isinstance(data, dict):
        return data

    cleaned = {}
    for key, value in data.items():
        if isinstance(value, dict):
            nested = clean_empty_fields(value)
            if nested:
                cleaned[key] = nested
        elif isinstance(value, list):
            kept = []
            for item in value:
                if isinstance(item, dict):
                    item = clean_empty_fields(item)
                if item:
                    kept.append(item)
            if kept:
                cleaned[key] = kept
        elif value not in [None, "", [], {}]:
            cleaned[key] = value
    return cleaned


def get_comprehensive_medical_extraction_prompt():
    """Complete medical data extraction prompt with all fields."""
    return """You are a deterministic medical data extraction engine. You will receive a single page from a medical document. Your task is to extract ALL visible information from this page and return it in the exact JSON format below.

Your response MUST follow this exact JSON format:

{
  "page_analysis": {
    "page_contains_text": true,
    "page_type": "cover_page|patient_demographics|insurance|medical_history|referral_info|other",
    "overall_page_confidence": 0.0,
    "all_visible_text": "Complete text transcription of everything visible on this page"
  },
  "extracted_data": {
    "date_of_receipt": "",
    "patient_first_name": "",
    "patient_last_name": "",
    "patient_dob": "",
    "patient_gender": "",
    "patient_primary_phone_number": "",
    "patient_secondary_phone_number": "",
    "patient_email": "",
    "patient_address": "",
    "patient_zip_code": "",
    "referral_source": "",
    "referral_source_phone_no": "",
    "referral_source_fax_no": "",
    "referral_source_email": "",
    "primary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "secondary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "tertiary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "priority": "",
    "reason_for_referral": "",
    "diagnosis_informations": [
      {
        "code": "",
        "description": ""
      }
    ],
    "refine_reason": "",
    "additional_medical_info": "",
    "provider_names": [],
    "appointment_dates": [],
    "medication_info": [],
    "other_important_details": ""
  },
  "confidence_scores": {
    "date_of_receipt": 0.0,
    "patient_first_name": 0.0,
    "patient_last_name": 0.0,
    "patient_dob": 0.0,
    "patient_gender": 0.0,
    "patient_primary_phone_number": 0.0,
    "patient_secondary_phone_number": 0.0,
    "patient_email": 0.0,
    "patient_address": 0.0,
    "patient_zip_code": 0.0,
    "referral_source": 0.0,
    "referral_source_phone_no": 0.0,
    "referral_source_fax_no": 0.0,
    "referral_source_email": 0.0,
    "primary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "secondary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "tertiary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "priority": 0.0,
    "reason_for_referral": 0.0,
    "diagnosis_informations": 0.0,
    "refine_reason": 0.0
  },
  "fields_found_on_this_page": [],
  "metadata": {
    "extraction_timestamp": "",
    "model_used": "MiniCPM-V-2_6-GPU",
    "page_processing_notes": ""
  }
}

--------------------------------
STRICT FIELD FORMATTING RULES:
--------------------------------
• Dates: Format as MM/DD/YYYY only
• Phone numbers: Use digits and hyphens only (e.g., 406-596-1901), no extensions or parentheses
• Gender: "Male", "Female", or "Other" only
• Email: Must contain @ and valid domain, otherwise leave empty
• Zip code: Only extract as last 5 digits of address

--------------------------------
REFERRAL SOURCE RULES:
--------------------------------
• Extract clinic/hospital/facility name ONLY – never the provider's name
• Use facility's phone/fax/email, not individual provider's contact
• Prefer header/fax banner for referral source over body text
• Do not extract receiver clinic names (e.g., Frontier Psychiatry) as referral source

--------------------------------
INSURANCE EXTRACTION FORMAT:
--------------------------------
Each tier must follow this structure:
"primary_insurance": {
  "payer_name": "string",
  "member_id": "string",
  "group_id": "string"
},
"secondary_insurance": { ... },
"tertiary_insurance": { ... }

• Use "member_id" for any ID (Policy, Insurance ID, Subscriber ID, etc.)
• Use "group_id" ONLY if explicitly labeled as "Group ID", "Group Number", etc.
• Leave all fields empty if "Self Pay" is indicated

--------------------------------
DIAGNOSIS EXTRACTION RULES:
--------------------------------
• Extract diagnosis codes AND their descriptions
• If only code is present, set description to "" and confidence ≤ 0.6
• DO NOT infer description from ICD code

--------------------------------
CONFIDENCE SCORING:
--------------------------------
Assign realistic confidence (0.0–1.0) per field, e.g.:
• 0.95–1.0 → Clearly labeled, unambiguous data
• 0.7–0.94 → Some uncertainty (low quality, odd format)
• 0.0–0.6 → Missing, ambiguous, or noisy data
• Use float precision (e.g., 0.87, not just 1.0)

Always populate the `confidence_scores` dictionary with the same structure as `extracted_data`.

--------------------------------
CRITICAL INSTRUCTIONS:
--------------------------------
1. READ EVERYTHING: Transcribe all visible text in "all_visible_text"
2. EXTRACT PRECISELY: Only extract what's actually visible on THIS page
3. NO ASSUMPTIONS: Don't guess or infer information not present
4. FIELD CLASSIFICATION: List which fields were actually found in "fields_found_on_this_page"
5. CONFIDENCE: Be realistic - 0.0 if not found, up to 1.0 if completely certain
6. FORMAT EXACTLY: Follow date/phone/address formatting rules strictly
7. JSON ONLY: Return only valid JSON, no other text

This is ONE PAGE of a multi-page document. Extract only what's visible on this specific page."""


def _parse_model_json(response):
    """Parse the model's reply as JSON, tolerating text around the object.

    The reply is first parsed as-is. If that fails, the span from the first
    ``{`` to the last ``}`` is parsed instead, so a reply wrapped in a Markdown
    code fence or preceded by a sentence is still usable. Returns the parsed
    value, or ``None`` when no valid JSON can be recovered.
    """
    text = response.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end <= start:
        return None
    try:
        return json.loads(text[start:end + 1])
    except json.JSONDecodeError:
        return None


def extract_single_page(image, extraction_prompt, model, tokenizer):
    """Extract data from a single page with comprehensive medical fields.

    Sends ``image`` and ``extraction_prompt`` to ``model.chat`` and returns the
    reply parsed as JSON with empty fields removed. Returns ``None`` when the
    reply is not parseable, when nothing is left after cleaning, or when any
    exception occurs (the exception is printed, not raised).
    """
    try:
        if hasattr(image, 'convert'):
            image = image.convert('RGB')

        response = model.chat(
            image=image,
            msgs=[{
                "role": "user",
                "content": extraction_prompt
            }],
            tokenizer=tokenizer,
            sampling=False,
            temperature=0.1,
            max_new_tokens=4000
        )

        parsed_data = _parse_model_json(response)
        if parsed_data is None:
            return None

        cleaned_data = clean_empty_fields(parsed_data)
        return cleaned_data if cleaned_data else None
    except Exception as e:
        print(f"Error extracting from page: {e}")
        return None


@spaces.GPU(duration=180)  # 3 minutes
def extract_pages_clean_json(pdf_file, custom_prompt=None):
    """Extract each page individually - RETURN ONLY NON-EMPTY JSON DATA.

    Returns a dict mapping ``"page_<n>"`` (1-based) to that page's cleaned
    JSON, containing only the pages for which extraction produced data. On
    failure the dict has a single ``"error"`` key instead.
    """
    try:
        if pdf_file is None:
            return {"error": "No PDF provided"}

        # Convert PDF to images
        print("Converting PDF to images...")
        images = pdf_to_images(pdf_file)
        if not images:
            return {"error": "Could not convert PDF"}

        print(f"Processing {len(images)} pages individually...")

        # Load model once
        model, tokenizer = load_model()
        extraction_prompt = custom_prompt or get_comprehensive_medical_extraction_prompt()

        # Process each page and collect only non-empty JSON
        page_results = {}
        for i, image in enumerate(images):
            print(f"Extracting page {i+1}/{len(images)}...")
            page_json = extract_single_page(image, extraction_prompt, model, tokenizer)
            # Only add to results if page contains data
            if page_json:
                page_results[f"page_{i+1}"] = page_json

        return page_results  # Return only pages with data
    except Exception as e:
        return {"error": str(e)}


def create_gradio_interface():
    """Build and return the Gradio Blocks UI (extraction tab plus two help tabs)."""
    with gr.Blocks(title="Clean Medical eFax Extractor", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🏥 Clean Medical eFax Data Extractor")
        gr.Markdown("📋 **Returns Only Non-Empty Data** - Clean page-by-page extraction without empty fields")

        with gr.Tab("📄 Clean JSON Extraction"):
            with gr.Row():
                with gr.Column():
                    pdf_input = gr.File(
                        file_types=[".pdf"],
                        label="Upload Medical eFax PDF",
                        file_count="single"
                    )

                    with gr.Accordion("🔧 Custom Prompt", open=False):
                        prompt_input = gr.Textbox(
                            value="",
                            label="Custom Extraction Prompt (optional)",
                            lines=4,
                            placeholder="Leave empty for comprehensive medical extraction..."
                        )

                    extract_btn = gr.Button("📋 Extract Clean JSON", variant="primary", size="lg")

                    gr.Markdown("""
                    ### ✅ Clean Output Features
                    - **No Empty Fields**: Only fields with actual data
                    - **No Empty Pages**: Only pages containing information
                    - **Easier Combination**: Clean structure for AI merging
                    - **Optimized Size**: Reduced JSON payload
                    """)

                with gr.Column():
                    status_output = gr.Textbox(label="📊 Processing Status", interactive=False)
                    output = gr.JSON(label="📋 Clean JSON Results", show_label=True)

        with gr.Tab("🔌 API Usage Instructions"):
            gr.Markdown("""
            ## Updated API Instructions

            ### Method 1: Python Client (Recommended)

            ```
            pip install gradio_client
            ```

            ```
            from gradio_client import Client, handle_file
            import json

            # Connect to your deployed Space
            client = Client("crimsons-uv/miniCPM")

            # Extract medical data from eFax PDF
            def extract_efax_clean(pdf_path, custom_prompt=""):
                result = client.predict(
                    pdf_file=handle_file(pdf_path),
                    custom_prompt=custom_prompt,
                    api_name="/process_with_status"
                )

                # result is tuple: [status_message, clean_json_data]
                status, clean_data = result
                print(f"Status: {status}")

                # Process only pages with data
                for page_key, page_data in clean_data.items():
                    if page_key.startswith('page_'):
                        print(f"\\n{page_key.upper()}:")
                        if 'extracted_data' in page_data:
                            data = page_data['extracted_data']
                            if 'patient_first_name' in data:
                                print(f" Patient: {data['patient_first_name']} {data.get('patient_last_name', '')}")
                            if 'primary_insurance' in data:
                                print(f" Insurance: {data['primary_insurance'].get('payer_name', '')}")
                            if 'reason_for_referral' in data:
                                print(f" Reason: {data['reason_for_referral']}")

                return clean_data

            # Usage
            results = extract_efax_clean("path/to/your/efax.pdf")
            ```

            ### Method 2: cURL Commands

            ```
            # Step 1: Make POST request
            curl -X POST https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status \\
              -H "Content-Type: application/json" \\
              -d '{
                "data": [
                  {"path": "your_efax.pdf", "meta": {"_type": "gradio.FileData"}},
                  ""
                ]
              }' \\
              | awk -F'"' '{ print $4}' \\
              | read EVENT_ID; curl -N https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status/$EVENT_ID
            ```

            ### Method 3: Direct HTTP API

            ```
            import requests
            import base64
            import json

            def call_clean_extraction_api(pdf_path, custom_prompt=""):
                # Read and encode PDF
                with open(pdf_path, 'rb') as f:
                    pdf_b64 = base64.b64encode(f.read()).decode()

                # API payload
                payload = {
                    "data": [
                        {"name": "efax.pdf", "data": f"data:application/pdf;base64,{pdf_b64}"},
                        custom_prompt
                    ]
                }

                # Make request
                response = requests.post(
                    "https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status",
                    json=payload,
                    headers={"Content-Type": "application/json"}
                )

                return response.json()

            # Usage
            clean_results = call_clean_extraction_api("your_efax.pdf")
            ```
            """)

        with gr.Tab("📋 Response Format"):
            gr.Markdown("""
            ## Clean Response Structure

            ### Input: 5-page PDF with mixed content
            ### Output: Only pages with data

            ```
            {
              "page_2": {
                "page_analysis": {
                  "page_type": "patient_demographics",
                  "overall_page_confidence": 0.95,
                  "all_visible_text": "Patient: John Doe..."
                },
                "extracted_data": {
                  "patient_first_name": "John",
                  "patient_last_name": "Doe",
                  "patient_dob": "01/15/1980",
                  "patient_gender": "Male",
                  "patient_primary_phone_number": "555-123-4567",
                  "patient_address": "123 Main St, City, State 12345",
                  "patient_zip_code": "12345"
                },
                "confidence_scores": {
                  "patient_first_name": 1.0,
                  "patient_last_name": 1.0,
                  "patient_dob": 0.95,
                  "patient_gender": 1.0
                },
                "fields_found_on_this_page": ["patient_first_name", "patient_last_name", "patient_dob"]
              },
              "page_3": {
                "extracted_data": {
                  "primary_insurance": {
                    "payer_name": "Blue Cross Blue Shield",
                    "member_id": "ABC123456789",
                    "group_id": "GRP001"
                  },
                  "reason_for_referral": "Cardiology consultation"
                },
                "confidence_scores": {
                  "primary_insurance": {
                    "payer_name": 1.0,
                    "member_id": 0.98,
                    "group_id": 0.95
                  },
                  "reason_for_referral": 1.0
                }
              }
            }
            ```

            ### Benefits for AI Combination:
            - ✅ **No empty pages**: Pages 1, 4, 5 had no data, so not included
            - ✅ **No empty fields**: Only fields with actual values
            - ✅ **Smaller payload**: Reduced data size for faster processing
            - ✅ **Easy merging**: Clear structure for combining with ChatGPT/Claude
            """)

        def process_with_status(pdf_file, custom_prompt):
            """Run the extraction, yielding ``(status_message, json_result)`` updates."""
            if pdf_file is None:
                # This function is a generator, so a value attached to ``return``
                # would never reach the outputs; the message must be yielded.
                yield "❌ No PDF uploaded", {"error": "Upload a PDF file"}
                return

            yield "📄 Converting PDF to images...", {}

            try:
                # ``custom_prompt`` can be None when the caller omits it.
                has_custom_prompt = bool((custom_prompt or "").strip())
                result = extract_pages_clean_json(
                    pdf_file,
                    custom_prompt if has_custom_prompt else None,
                )

                if "error" not in result:
                    page_count = len([k for k in result.keys() if k.startswith("page_")])
                    yield f"✅ Extracted clean data from {page_count} pages with content", result
                else:
                    yield f"❌ Error: {result['error']}", result
            except Exception as e:
                yield f"❌ Failed: {str(e)}", {"error": str(e)}

        extract_btn.click(
            fn=process_with_status,
            inputs=[pdf_input, prompt_input],
            outputs=[status_output, output],
            queue=True
        )

    return demo


if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.queue(
        default_concurrency_limit=1,
        max_size=10
    ).launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )