# Hugging Face Space: crimson-suv/miniCPM
# Status: Paused
# File size: 20,124 Bytes

import spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import base64
import io
import os
import json
from huggingface_hub import login
from pdf2image import convert_from_bytes
from datetime import datetime

# Log in to the Hugging Face Hub only when a token is present in the environment
HF_TOKEN = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Process-wide cache slots filled by load_model() on first use
_model = _tokenizer = None

# Checkpoint tried first, and the one used when the first cannot be loaded.
_PRIMARY_MODEL_ID = "openbmb/MiniCPM-V-2_6"
_FALLBACK_MODEL_ID = "openbmb/MiniCPM-V-2"


def _load_model_pair(model_id):
    """Load and return ``(model, tokenizer)`` for a single Hub checkpoint id."""
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
        use_fast=True,
    )
    model = AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return model, tokenizer


def load_model():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    The primary checkpoint is tried first. If loading it raises for any
    reason, the error is printed and the fallback checkpoint is loaded
    instead; an exception from the fallback load propagates to the caller.

    The module-level cache is only written once BOTH objects have loaded, so
    a failed attempt never leaves a tokenizer cached without its model.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    global _model, _tokenizer

    if _model is not None and _tokenizer is not None:
        return _model, _tokenizer

    try:
        model, tokenizer = _load_model_pair(_PRIMARY_MODEL_ID)
    except Exception as e:
        print(f"Error loading gated model: {e}")
        model, tokenizer = _load_model_pair(_FALLBACK_MODEL_ID)

    # Commit to the cache atomically, after every load step succeeded.
    _model, _tokenizer = model, tokenizer
    return _model, _tokenizer

def pdf_to_images(pdf_file, dpi=300):
    """Convert a PDF into a list of page images.

    Args:
        pdf_file: Either an object exposing ``read()`` that yields the PDF
            bytes, or a path that can be opened in binary mode.
        dpi: Rendering resolution forwarded to ``convert_from_bytes``.
            Defaults to 300, the value that was previously hard-coded.

    Returns:
        The result of ``convert_from_bytes`` for the document, or an empty
        list when the input cannot be read, is empty, or fails to convert
        (the reason is printed rather than raised).
    """
    try:
        if hasattr(pdf_file, 'read'):
            pdf_bytes = pdf_file.read()
        else:
            with open(pdf_file, 'rb') as f:
                pdf_bytes = f.read()

        # An empty payload can never be a valid PDF; report it plainly
        # instead of surfacing a converter-specific error.
        if not pdf_bytes:
            print("Error converting PDF to images: input is empty")
            return []

        return convert_from_bytes(pdf_bytes, dpi=dpi)
    except Exception as e:
        # Best-effort by design: callers treat an empty list as failure.
        print(f"Error converting PDF to images: {e}")
        return []

def _is_empty_value(value):
    """Return True for the values treated as "no data": None, "", [] and {}."""
    return value is None or (isinstance(value, (str, list, dict)) and not value)


def _prune_value(value):
    """Return ``value`` with empty entries removed from nested dicts and lists."""
    if isinstance(value, dict):
        return clean_empty_fields(value)
    if isinstance(value, list):
        pruned_items = (_prune_value(item) for item in value)
        return [item for item in pruned_items if not _is_empty_value(item)]
    return value


def clean_empty_fields(data):
    """Recursively remove empty fields from a dictionary.

    A field is empty when its value is ``None``, ``""``, ``[]`` or ``{}``,
    or when it becomes ``[]``/``{}`` after its own contents were cleaned.
    The same rule is applied to dict values and to list items, so meaningful
    falsy values such as ``0``, ``0.0`` and ``False`` are kept everywhere
    (previously they were kept as dict values but dropped from lists), and
    lists nested inside lists are cleaned as well.

    Args:
        data: The value to clean. Anything that is not a dict is returned
            unchanged.

    Returns:
        A new dict without empty fields, or ``data`` itself when it is not
        a dict.
    """
    if not isinstance(data, dict):
        return data

    cleaned = {}
    for key, value in data.items():
        pruned = _prune_value(value)
        if not _is_empty_value(pruned):
            cleaned[key] = pruned
    return cleaned

def get_comprehensive_medical_extraction_prompt() -> str:
    """Return the default per-page extraction prompt.

    The prompt tells the model that it receives one page of a medical
    document and must answer with JSON only, using the template embedded in
    the text: ``page_analysis``, ``extracted_data``, ``confidence_scores``,
    ``fields_found_on_this_page`` and ``metadata``. It also spells out the
    formatting rules for dates, phone numbers, gender, email and zip code,
    plus the rules for referral source, insurance tiers, diagnoses and
    confidence scoring.

    Returns:
        The prompt text as a single string.
    """
    # NOTE(review): the prompt asks for `confidence_scores` to mirror
    # `extracted_data`, but the template's `confidence_scores` omits several
    # `extracted_data` keys (e.g. "additional_medical_info", "provider_names",
    # "appointment_dates", "medication_info", "other_important_details").
    # NOTE(review): "model_used" is fixed to "MiniCPM-V-2_6-GPU" in the
    # template even though load_model() can fall back to another checkpoint.
    return """You are a deterministic medical data extraction engine. You will receive a single page from a medical document. Your task is to extract ALL visible information from this page and return it in the exact JSON format below.

Your response MUST follow this exact JSON format:

{
  "page_analysis": {
    "page_contains_text": true,
    "page_type": "cover_page|patient_demographics|insurance|medical_history|referral_info|other",
    "overall_page_confidence": 0.0,
    "all_visible_text": "Complete text transcription of everything visible on this page"
  },
  "extracted_data": {
    "date_of_receipt": "",
    "patient_first_name": "",
    "patient_last_name": "",
    "patient_dob": "",
    "patient_gender": "",
    "patient_primary_phone_number": "",
    "patient_secondary_phone_number": "",
    "patient_email": "",
    "patient_address": "",
    "patient_zip_code": "",
    "referral_source": "",
    "referral_source_phone_no": "",
    "referral_source_fax_no": "",
    "referral_source_email": "",
    "primary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "secondary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "tertiary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "priority": "",
    "reason_for_referral": "",
    "diagnosis_informations": [
      {
        "code": "",
        "description": ""
      }
    ],
    "refine_reason": "",
    "additional_medical_info": "",
    "provider_names": [],
    "appointment_dates": [],
    "medication_info": [],
    "other_important_details": ""
  },
  "confidence_scores": {
    "date_of_receipt": 0.0,
    "patient_first_name": 0.0,
    "patient_last_name": 0.0,
    "patient_dob": 0.0,
    "patient_gender": 0.0,
    "patient_primary_phone_number": 0.0,
    "patient_secondary_phone_number": 0.0,
    "patient_email": 0.0,
    "patient_address": 0.0,
    "patient_zip_code": 0.0,
    "referral_source": 0.0,
    "referral_source_phone_no": 0.0,
    "referral_source_fax_no": 0.0,
    "referral_source_email": 0.0,
    "primary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "secondary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "tertiary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "priority": 0.0,
    "reason_for_referral": 0.0,
    "diagnosis_informations": 0.0,
    "refine_reason": 0.0
  },
  "fields_found_on_this_page": [],
  "metadata": {
    "extraction_timestamp": "",
    "model_used": "MiniCPM-V-2_6-GPU",
    "page_processing_notes": ""
  }
}

--------------------------------
STRICT FIELD FORMATTING RULES:
--------------------------------

• Dates: Format as MM/DD/YYYY only
• Phone numbers: Use digits and hyphens only (e.g., 406-596-1901), no extensions or parentheses
• Gender: "Male", "Female", or "Other" only
• Email: Must contain @ and valid domain, otherwise leave empty
• Zip code: Only extract as last 5 digits of address

--------------------------------
REFERRAL SOURCE RULES:
--------------------------------

• Extract clinic/hospital/facility name ONLY – never the provider's name
• Use facility's phone/fax/email, not individual provider's contact
• Prefer header/fax banner for referral source over body text
• Do not extract receiver clinic names (e.g., Frontier Psychiatry) as referral source

--------------------------------
INSURANCE EXTRACTION FORMAT:
--------------------------------

Each tier must follow this structure:
"primary_insurance": {
  "payer_name": "string",
  "member_id": "string", 
  "group_id": "string"
},
"secondary_insurance": { ... },
"tertiary_insurance": { ... }

• Use "member_id" for any ID (Policy, Insurance ID, Subscriber ID, etc.)
• Use "group_id" ONLY if explicitly labeled as "Group ID", "Group Number", etc.
• Leave all fields empty if "Self Pay" is indicated

--------------------------------
DIAGNOSIS EXTRACTION RULES:
--------------------------------

• Extract diagnosis codes AND their descriptions
• If only code is present, set description to "" and confidence ≤ 0.6
• DO NOT infer description from ICD code

--------------------------------
CONFIDENCE SCORING:
--------------------------------

Assign realistic confidence (0.0–1.0) per field, e.g.:

• 0.95–1.0 → Clearly labeled, unambiguous data
• 0.7–0.94 → Some uncertainty (low quality, odd format)
• 0.0–0.6 → Missing, ambiguous, or noisy data
• Use float precision (e.g., 0.87, not just 1.0)

Always populate the `confidence_scores` dictionary with the same structure as `extracted_data`.

--------------------------------
CRITICAL INSTRUCTIONS:
--------------------------------

1. READ EVERYTHING: Transcribe all visible text in "all_visible_text"
2. EXTRACT PRECISELY: Only extract what's actually visible on THIS page
3. NO ASSUMPTIONS: Don't guess or infer information not present
4. FIELD CLASSIFICATION: List which fields were actually found in "fields_found_on_this_page"
5. CONFIDENCE: Be realistic - 0.0 if not found, up to 1.0 if completely certain
6. FORMAT EXACTLY: Follow date/phone/address formatting rules strictly
7. JSON ONLY: Return only valid JSON, no other text

This is ONE PAGE of a multi-page document. Extract only what's visible on this specific page."""

def _parse_json_response(text):
    """Decode the JSON value carried by a model reply.

    The reply is first parsed as-is. If that fails, the span from the first
    ``{`` to the last ``}`` is parsed instead, which tolerates Markdown code
    fences or commentary wrapped around the JSON object.

    Returns:
        The decoded value, or ``None`` when nothing can be decoded (a reply
        that is literally ``null`` also yields ``None``).
    """
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end <= start:
        return None
    try:
        return json.loads(text[start:end + 1])
    except json.JSONDecodeError:
        return None


def extract_single_page(image, extraction_prompt, model, tokenizer):
    """Run the extraction prompt against one page and return its cleaned JSON.

    Args:
        image: The page image. If it exposes ``convert`` it is converted to
            RGB before being sent to the model.
        extraction_prompt: Prompt text sent as the single user message.
        model: Object whose ``chat(...)`` method produces the reply.
        tokenizer: Tokenizer forwarded to ``model.chat``.

    Returns:
        The parsed reply with empty fields removed, or ``None`` when the
        model call fails, the reply holds no decodable JSON, or nothing is
        left after cleaning. Failures are printed, never raised.
    """
    try:
        if hasattr(image, 'convert'):
            image = image.convert('RGB')

        # NOTE(review): temperature is presumably ignored by the model when
        # sampling=False — confirm against the checkpoint's chat() code.
        response = model.chat(
            image=image,
            msgs=[{
                "role": "user",
                "content": extraction_prompt
            }],
            tokenizer=tokenizer,
            sampling=False,
            temperature=0.1,
            max_new_tokens=4000
        )

        # NOTE(review): some chat() implementations may return a tuple whose
        # first element is the answer text — TODO confirm for the fallback
        # checkpoint.
        if isinstance(response, tuple) and response:
            response = response[0]
        if not isinstance(response, str):
            print(f"Unexpected response type from model: {type(response).__name__}")
            return None

        parsed_data = _parse_json_response(response)
        if parsed_data is None:
            # Make dropped pages visible in the logs instead of vanishing.
            print("Model response did not contain valid JSON; skipping page")
            return None

        cleaned_data = clean_empty_fields(parsed_data)
        return cleaned_data if cleaned_data else None
    except Exception as e:
        print(f"Error extracting from page: {e}")
        return None

@spaces.GPU(duration=180)  # 3 minutes
def extract_pages_clean_json(pdf_file, custom_prompt=None):
    """Run per-page extraction over a PDF and keep only pages that yielded data.

    Args:
        pdf_file: The PDF to process, handed to ``pdf_to_images``.
        custom_prompt: Optional prompt; when falsy the comprehensive medical
            extraction prompt is used.

    Returns:
        A dict mapping ``"page_<n>"`` (1-based) to that page's extracted JSON,
        containing only pages with a non-empty result, or
        ``{"error": <message>}`` when no PDF was given, the PDF could not be
        converted, or any exception was raised along the way.
    """
    try:
        if pdf_file is None:
            return {"error": "No PDF provided"}

        # Render every page to an image up front
        print("Converting PDF to images...")
        pages = pdf_to_images(pdf_file)
        if not pages:
            return {"error": "Could not convert PDF"}

        total_pages = len(pages)
        print(f"Processing {total_pages} pages individually...")

        # One model load and one prompt shared by all pages
        model, tokenizer = load_model()
        if custom_prompt:
            prompt = custom_prompt
        else:
            prompt = get_comprehensive_medical_extraction_prompt()

        results = {}
        for page_no, page_image in enumerate(pages, start=1):
            print(f"Extracting page {page_no}/{total_pages}...")
            extracted = extract_single_page(page_image, prompt, model, tokenizer)
            if not extracted:
                # Nothing usable came back for this page; leave it out
                continue
            results[f"page_{page_no}"] = extracted

        return results

    except Exception as exc:
        return {"error": str(exc)}

def create_gradio_interface():
    """Build the Gradio Blocks app for the medical eFax extractor.

    The app has three tabs: the extraction form (PDF upload, optional custom
    prompt, status box and JSON result), API usage instructions, and a
    description of the response format. The extract button is wired to the
    nested ``process_with_status`` generator, which streams a status message
    together with the JSON result.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(title="Clean Medical eFax Extractor", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🏥 Clean Medical eFax Data Extractor")
        gr.Markdown("📋 **Returns Only Non-Empty Data** - Clean page-by-page extraction without empty fields")

        with gr.Tab("📄 Clean JSON Extraction"):
            with gr.Row():
                with gr.Column():
                    pdf_input = gr.File(
                        file_types=[".pdf"], 
                        label="Upload Medical eFax PDF",
                        file_count="single"
                    )

                    with gr.Accordion("🔧 Custom Prompt", open=False):
                        prompt_input = gr.Textbox(
                            value="",
                            label="Custom Extraction Prompt (optional)",
                            lines=4,
                            placeholder="Leave empty for comprehensive medical extraction..."
                        )

                    extract_btn = gr.Button("📋 Extract Clean JSON", variant="primary", size="lg")

                    gr.Markdown("""
                    ### ✅ Clean Output Features
                    - **No Empty Fields**: Only fields with actual data
                    - **No Empty Pages**: Only pages containing information
                    - **Easier Combination**: Clean structure for AI merging
                    - **Optimized Size**: Reduced JSON payload
                    """)

                with gr.Column():
                    status_output = gr.Textbox(label="📊 Processing Status", interactive=False)
                    output = gr.JSON(label="📋 Clean JSON Results", show_label=True)

        with gr.Tab("🔌 API Usage Instructions"):
            gr.Markdown("""
            ## Updated API Instructions
            
            ### Method 1: Python Client (Recommended)
            ```
            pip install gradio_client
            ```
            
            ```
            from gradio_client import Client, handle_file
            import json
            
            # Connect to your deployed Space
            client = Client("crimsons-uv/miniCPM")
            
            # Extract medical data from eFax PDF
            def extract_efax_clean(pdf_path, custom_prompt=""):
                result = client.predict(
                    pdf_file=handle_file(pdf_path),
                    custom_prompt=custom_prompt,
                    api_name="/process_with_status"
                )
                
                # result is tuple: [status_message, clean_json_data]
                status, clean_data = result
                
                print(f"Status: {status}")
                
                # Process only pages with data
                for page_key, page_data in clean_data.items():
                    if page_key.startswith('page_'):
                        print(f"\\n{page_key.upper()}:")
                        
                        if 'extracted_data' in page_data:
                            data = page_data['extracted_data']
                            if 'patient_first_name' in data:
                                print(f"  Patient: {data['patient_first_name']} {data.get('patient_last_name', '')}")
                            if 'primary_insurance' in data:
                                print(f"  Insurance: {data['primary_insurance'].get('payer_name', '')}")
                            if 'reason_for_referral' in data:
                                print(f"  Reason: {data['reason_for_referral']}")
                
                return clean_data
            
            # Usage
            results = extract_efax_clean("path/to/your/efax.pdf")
            ```
            
            ### Method 2: cURL Commands
            ```
            # Step 1: Make POST request
            curl -X POST https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status \\
              -H "Content-Type: application/json" \\
              -d '{
                "data": [
                  {"path": "your_efax.pdf", "meta": {"_type": "gradio.FileData"}},
                  ""
                ]
              }' \\
              | awk -F'"' '{ print $4}' \\
              | read EVENT_ID; curl -N https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status/$EVENT_ID
            ```
            
            ### Method 3: Direct HTTP API
            ```
            import requests
            import base64
            import json
            
            def call_clean_extraction_api(pdf_path, custom_prompt=""):
                # Read and encode PDF
                with open(pdf_path, 'rb') as f:
                    pdf_b64 = base64.b64encode(f.read()).decode()
                
                # API payload
                payload = {
                    "data": [
                        {"name": "efax.pdf", "data": f"data:application/pdf;base64,{pdf_b64}"},
                        custom_prompt
                    ]
                }
                
                # Make request
                response = requests.post(
                    "https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status",
                    json=payload,
                    headers={"Content-Type": "application/json"}
                )
                
                return response.json()
            
            # Usage
            clean_results = call_clean_extraction_api("your_efax.pdf")
            ```
            """)

        with gr.Tab("📋 Response Format"):
            gr.Markdown("""
            ## Clean Response Structure
            
            ### Input: 5-page PDF with mixed content
            ### Output: Only pages with data
            
            ```
            {
              "page_2": {
                "page_analysis": {
                  "page_type": "patient_demographics",
                  "overall_page_confidence": 0.95,
                  "all_visible_text": "Patient: John Doe..."
                },
                "extracted_data": {
                  "patient_first_name": "John",
                  "patient_last_name": "Doe",
                  "patient_dob": "01/15/1980",
                  "patient_gender": "Male",
                  "patient_primary_phone_number": "555-123-4567",
                  "patient_address": "123 Main St, City, State 12345",
                  "patient_zip_code": "12345"
                },
                "confidence_scores": {
                  "patient_first_name": 1.0,
                  "patient_last_name": 1.0,
                  "patient_dob": 0.95,
                  "patient_gender": 1.0
                },
                "fields_found_on_this_page": ["patient_first_name", "patient_last_name", "patient_dob"]
              },
              "page_3": {
                "extracted_data": {
                  "primary_insurance": {
                    "payer_name": "Blue Cross Blue Shield",
                    "member_id": "ABC123456789",
                    "group_id": "GRP001"
                  },
                  "reason_for_referral": "Cardiology consultation"
                },
                "confidence_scores": {
                  "primary_insurance": {
                    "payer_name": 1.0,
                    "member_id": 0.98,
                    "group_id": 0.95
                  },
                  "reason_for_referral": 1.0
                }
              }
            }
            ```
            
            ### Benefits for AI Combination:
            - ✅ **No empty pages**: Pages 1, 4, 5 had no data, so not included
            - ✅ **No empty fields**: Only fields with actual values
            - ✅ **Smaller payload**: Reduced data size for faster processing
            - ✅ **Easy merging**: Clear structure for combining with ChatGPT/Claude
            """)

        def process_with_status(pdf_file, custom_prompt):
            """Stream ``(status_message, json_result)`` updates for one extraction.

            This is a generator, so every update for the UI must be yielded;
            a value handed to ``return`` would never reach the outputs.
            """
            if pdf_file is None:
                # Yield (not return) so the message is actually displayed.
                yield "❌ No PDF uploaded", {"error": "Upload a PDF file"}
                return

            yield "📄 Converting PDF to images...", {}

            try:
                # custom_prompt may be None (e.g. omitted by an API caller);
                # blank or whitespace-only prompts select the default prompt.
                has_custom_prompt = bool(custom_prompt and custom_prompt.strip())
                result = extract_pages_clean_json(pdf_file, custom_prompt if has_custom_prompt else None)

                if "error" not in result:
                    page_count = sum(1 for key in result if key.startswith("page_"))
                    yield f"✅ Extracted clean data from {page_count} pages with content", result
                else:
                    yield f"❌ Error: {result['error']}", result

            except Exception as e:
                yield f"❌ Failed: {str(e)}", {"error": str(e)}

        extract_btn.click(
            fn=process_with_status,
            inputs=[pdf_input, prompt_input],
            outputs=[status_output, output],
            # Pin the endpoint name that the API instructions above refer to.
            api_name="process_with_status",
            queue=True
        )

    return demo

if __name__ == "__main__":
    # Build the UI, enable a bounded queue that handles one request at a
    # time, then serve on all interfaces at port 7860.
    demo = create_gradio_interface()
    queued_demo = demo.queue(default_concurrency_limit=1, max_size=10)
    queued_demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)