# Hugging Face Spaces page header captured with this source (Space status: Paused)
import spaces | |
import gradio as gr | |
import torch | |
from transformers import AutoModel, AutoTokenizer | |
from PIL import Image | |
import base64 | |
import io | |
import os | |
import json | |
from huggingface_hub import login | |
from pdf2image import convert_from_bytes | |
from datetime import datetime | |
# Authenticate with the Hugging Face Hub only when a token is present in the environment
HF_TOKEN = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Module-level cache for the model and tokenizer (filled in by load_model)
_model, _tokenizer = None, None
def load_model():
    """Load the MiniCPM-V model and tokenizer, caching them in module globals.

    The first call tries "openbmb/MiniCPM-V-2_6"; if anything goes wrong the
    error is printed and "openbmb/MiniCPM-V-2" is loaded instead. Later calls
    return the cached pair without touching the Hub again.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        Exception: whatever the fallback load raises when it fails as well.
    """
    global _model, _tokenizer
    if _model is not None and _tokenizer is not None:
        return _model, _tokenizer

    try:
        tokenizer, model = _load_checkpoint("openbmb/MiniCPM-V-2_6")
    except Exception as e:
        print(f"Error loading gated model: {e}")
        tokenizer, model = _load_checkpoint("openbmb/MiniCPM-V-2")

    # Commit to the cache only once BOTH artefacts loaded, so a failed attempt
    # never leaves a tokenizer cached without its matching model.
    _tokenizer, _model = tokenizer, model
    return _model, _tokenizer


def _load_checkpoint(model_id):
    """Load the tokenizer and the fp16 model for one Hub checkpoint id."""
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
        use_fast=True,
    )
    model = AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return tokenizer, model
def pdf_to_images(pdf_file, dpi=300):
    """Convert a PDF into a list of PIL images, one per page.

    Args:
        pdf_file: Either an object exposing ``read()`` (its return value is
            used as the PDF bytes) or a filesystem path that is opened in
            binary mode.
        dpi: Rendering resolution forwarded to ``convert_from_bytes``.
            Defaults to 300, the value that used to be hard-coded.

    Returns:
        The list of page images, or ``[]`` when the input is unreadable,
        empty, or cannot be converted (the error is printed, not raised).
    """
    try:
        if hasattr(pdf_file, 'read'):
            pdf_bytes = pdf_file.read()
        else:
            with open(pdf_file, 'rb') as f:
                pdf_bytes = f.read()
        if not pdf_bytes:
            # Nothing to render; report it explicitly instead of relying on
            # the converter to fail on an empty buffer.
            print("Error converting PDF to images: file is empty")
            return []
        return convert_from_bytes(pdf_bytes, dpi=dpi)
    except Exception as e:
        # Best effort by design: callers treat an empty list as "conversion failed".
        print(f"Error converting PDF to images: {e}")
        return []
def clean_empty_fields(data):
    """Recursively remove empty fields from a dictionary.

    A value counts as empty when it is ``None``, ``""``, ``[]`` or ``{}``,
    either originally or after its own contents were cleaned. Numbers and
    booleans are never empty, so ``0``, ``0.0`` and ``False`` survive both as
    dictionary values and as list items. Lists are cleaned at every depth.

    Args:
        data: The object to clean. Anything that is not a ``dict`` is
            returned unchanged.

    Returns:
        A new dictionary without empty fields (the input is not mutated).
    """
    if not isinstance(data, dict):
        return data
    cleaned = {}
    for key, value in data.items():
        pruned = _prune_value(value)
        if not _is_empty(pruned):
            cleaned[key] = pruned
    return cleaned


def _is_empty(value):
    """Return True for None and for zero-length strings, lists and dicts."""
    return value is None or (isinstance(value, (str, list, dict)) and len(value) == 0)


def _prune_value(value):
    """Clean one value: recurse into dicts and lists, pass scalars through."""
    if isinstance(value, dict):
        return clean_empty_fields(value)
    if isinstance(value, list):
        pruned_items = (_prune_value(item) for item in value)
        return [item for item in pruned_items if not _is_empty(item)]
    return value
def get_comprehensive_medical_extraction_prompt():
    """Return the default per-page extraction prompt sent to the model.

    The prompt contains, in order: a JSON template listing every field the
    model must fill in, formatting rules, referral-source rules, insurance
    and diagnosis rules, confidence-scoring guidance, and a final list of
    critical instructions. The template section is itself valid JSON.

    Returns:
        The prompt as a single string.
    """
    return """You are a deterministic medical data extraction engine. You will receive a single page from a medical document. Your task is to extract ALL visible information from this page and return it in the exact JSON format below.
Your response MUST follow this exact JSON format:
{
  "page_analysis": {
    "page_contains_text": true,
    "page_type": "cover_page|patient_demographics|insurance|medical_history|referral_info|other",
    "overall_page_confidence": 0.0,
    "all_visible_text": "Complete text transcription of everything visible on this page"
  },
  "extracted_data": {
    "date_of_receipt": "",
    "patient_first_name": "",
    "patient_last_name": "",
    "patient_dob": "",
    "patient_gender": "",
    "patient_primary_phone_number": "",
    "patient_secondary_phone_number": "",
    "patient_email": "",
    "patient_address": "",
    "patient_zip_code": "",
    "referral_source": "",
    "referral_source_phone_no": "",
    "referral_source_fax_no": "",
    "referral_source_email": "",
    "primary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "secondary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "tertiary_insurance": {
      "payer_name": "",
      "member_id": "",
      "group_id": ""
    },
    "priority": "",
    "reason_for_referral": "",
    "diagnosis_informations": [
      {
        "code": "",
        "description": ""
      }
    ],
    "refine_reason": "",
    "additional_medical_info": "",
    "provider_names": [],
    "appointment_dates": [],
    "medication_info": [],
    "other_important_details": ""
  },
  "confidence_scores": {
    "date_of_receipt": 0.0,
    "patient_first_name": 0.0,
    "patient_last_name": 0.0,
    "patient_dob": 0.0,
    "patient_gender": 0.0,
    "patient_primary_phone_number": 0.0,
    "patient_secondary_phone_number": 0.0,
    "patient_email": 0.0,
    "patient_address": 0.0,
    "patient_zip_code": 0.0,
    "referral_source": 0.0,
    "referral_source_phone_no": 0.0,
    "referral_source_fax_no": 0.0,
    "referral_source_email": 0.0,
    "primary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "secondary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "tertiary_insurance": {
      "payer_name": 0.0,
      "member_id": 0.0,
      "group_id": 0.0
    },
    "priority": 0.0,
    "reason_for_referral": 0.0,
    "diagnosis_informations": 0.0,
    "refine_reason": 0.0
  },
  "fields_found_on_this_page": [],
  "metadata": {
    "extraction_timestamp": "",
    "model_used": "MiniCPM-V-2_6-GPU",
    "page_processing_notes": ""
  }
}
--------------------------------
STRICT FIELD FORMATTING RULES:
--------------------------------
• Dates: Format as MM/DD/YYYY only
• Phone numbers: Use digits and hyphens only (e.g., 406-596-1901), no extensions or parentheses
• Gender: "Male", "Female", or "Other" only
• Email: Must contain @ and valid domain, otherwise leave empty
• Zip code: Only extract as last 5 digits of address
--------------------------------
REFERRAL SOURCE RULES:
--------------------------------
• Extract clinic/hospital/facility name ONLY — never the provider's name
• Use facility's phone/fax/email, not individual provider's contact
• Prefer header/fax banner for referral source over body text
• Do not extract receiver clinic names (e.g., Frontier Psychiatry) as referral source
--------------------------------
INSURANCE EXTRACTION FORMAT:
--------------------------------
Each tier must follow this structure:
"primary_insurance": {
  "payer_name": "string",
  "member_id": "string",
  "group_id": "string"
},
"secondary_insurance": { ... },
"tertiary_insurance": { ... }
• Use "member_id" for any ID (Policy, Insurance ID, Subscriber ID, etc.)
• Use "group_id" ONLY if explicitly labeled as "Group ID", "Group Number", etc.
• Leave all fields empty if "Self Pay" is indicated
--------------------------------
DIAGNOSIS EXTRACTION RULES:
--------------------------------
• Extract diagnosis codes AND their descriptions
• If only code is present, set description to "" and confidence ≤ 0.6
• DO NOT infer description from ICD code
--------------------------------
CONFIDENCE SCORING:
--------------------------------
Assign realistic confidence (0.0–1.0) per field, e.g.:
• 0.95–1.0 → Clearly labeled, unambiguous data
• 0.7–0.94 → Some uncertainty (low quality, odd format)
• 0.0–0.6 → Missing, ambiguous, or noisy data
• Use float precision (e.g., 0.87, not just 1.0)
Always populate the `confidence_scores` dictionary with the same structure as `extracted_data`.
--------------------------------
CRITICAL INSTRUCTIONS:
--------------------------------
1. READ EVERYTHING: Transcribe all visible text in "all_visible_text"
2. EXTRACT PRECISELY: Only extract what's actually visible on THIS page
3. NO ASSUMPTIONS: Don't guess or infer information not present
4. FIELD CLASSIFICATION: List which fields were actually found in "fields_found_on_this_page"
5. CONFIDENCE: Be realistic - 0.0 if not found, up to 1.0 if completely certain
6. FORMAT EXACTLY: Follow date/phone/address formatting rules strictly
7. JSON ONLY: Return only valid JSON, no other text
This is ONE PAGE of a multi-page document. Extract only what's visible on this specific page."""
def extract_single_page(image, extraction_prompt, model, tokenizer):
    """Run the model on one page image and return its cleaned JSON payload.

    Args:
        image: The page image. If it has a ``convert`` method it is converted
            to RGB before being sent to the model.
        extraction_prompt: Prompt text sent as the single user message.
        model: Object exposing the ``chat(...)`` method used below.
        tokenizer: Tokenizer forwarded to ``model.chat``.

    Returns:
        The parsed response with empty fields removed, or ``None`` when the
        model call fails, the response holds no parseable JSON, or nothing
        is left after cleaning. Errors are printed, never raised.
    """
    try:
        if hasattr(image, 'convert'):
            image = image.convert('RGB')
        response = model.chat(
            image=image,
            msgs=[{
                "role": "user",
                "content": extraction_prompt
            }],
            tokenizer=tokenizer,
            sampling=False,
            # NOTE(review): temperature presumably has no effect while
            # sampling is disabled; kept as-is, confirm against the model's chat API.
            temperature=0.1,
            max_new_tokens=4000
        )
        parsed_data = _parse_json_response(response)
        if parsed_data is None:
            # Make dropped pages visible in the logs instead of vanishing silently.
            print("Error extracting from page: response did not contain valid JSON")
            return None
        cleaned_data = clean_empty_fields(parsed_data)
        return cleaned_data if cleaned_data else None
    except Exception as e:
        print(f"Error extracting from page: {e}")
        return None


def _parse_json_response(response):
    """Parse model output as JSON, tolerating Markdown fences or stray prose around it."""
    text = response.strip()
    candidates = [text]
    # Fallback: the outermost {...} span, which skips code fences and any
    # text the model added before or after the JSON object.
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return None
# 3 minutes | |
def extract_pages_clean_json(pdf_file, custom_prompt=None):
    """Run the extraction page by page and keep only pages that yielded data.

    Args:
        pdf_file: The PDF to process, handed to ``pdf_to_images`` as-is.
        custom_prompt: Optional prompt; when falsy the comprehensive medical
            prompt is used instead.

    Returns:
        A dict mapping ``"page_<n>"`` (1-based) to that page's cleaned JSON,
        containing only pages with data, or ``{"error": <message>}`` when
        the input is missing, conversion fails, or any exception occurs.
    """
    if pdf_file is None:
        return {"error": "No PDF provided"}

    try:
        # Render the PDF into one image per page
        print("Converting PDF to images...")
        page_images = pdf_to_images(pdf_file)
        if not page_images:
            return {"error": "Could not convert PDF"}

        total_pages = len(page_images)
        print(f"Processing {total_pages} pages individually...")

        # The model is loaded (or fetched from cache) once for all pages
        model, tokenizer = load_model()
        if custom_prompt:
            extraction_prompt = custom_prompt
        else:
            extraction_prompt = get_comprehensive_medical_extraction_prompt()

        results_by_page = {}
        for page_number, page_image in enumerate(page_images, start=1):
            print(f"Extracting page {page_number}/{total_pages}...")
            extracted = extract_single_page(page_image, extraction_prompt, model, tokenizer)
            # Pages without any extracted data are left out entirely
            if extracted:
                results_by_page[f"page_{page_number}"] = extracted
        return results_by_page
    except Exception as exc:
        return {"error": str(exc)}
def create_gradio_interface():
    """Build the Gradio Blocks app for the medical eFax extractor.

    The UI has three tabs: the extraction form (PDF upload, optional custom
    prompt, status box and JSON output), API usage instructions, and a
    description of the response format. The extract button is wired to the
    nested generator ``process_with_status``, which streams a status message
    and the JSON result.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(title="Clean Medical eFax Extractor", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🏥 Clean Medical eFax Data Extractor")
        gr.Markdown("🔍 **Returns Only Non-Empty Data** - Clean page-by-page extraction without empty fields")
        with gr.Tab("📄 Clean JSON Extraction"):
            with gr.Row():
                with gr.Column():
                    pdf_input = gr.File(
                        file_types=[".pdf"],
                        label="Upload Medical eFax PDF",
                        file_count="single"
                    )
                    with gr.Accordion("🔧 Custom Prompt", open=False):
                        prompt_input = gr.Textbox(
                            value="",
                            label="Custom Extraction Prompt (optional)",
                            lines=4,
                            placeholder="Leave empty for comprehensive medical extraction..."
                        )
                    extract_btn = gr.Button("🚀 Extract Clean JSON", variant="primary", size="lg")
                    gr.Markdown("""
### ✅ Clean Output Features
- **No Empty Fields**: Only fields with actual data
- **No Empty Pages**: Only pages containing information
- **Easier Combination**: Clean structure for AI merging
- **Optimized Size**: Reduced JSON payload
""")
                with gr.Column():
                    status_output = gr.Textbox(label="📊 Processing Status", interactive=False)
                    output = gr.JSON(label="📋 Clean JSON Results", show_label=True)
        with gr.Tab("🔗 API Usage Instructions"):
            gr.Markdown("""
## Updated API Instructions
### Method 1: Python Client (Recommended)
```
pip install gradio_client
```
```
from gradio_client import Client, handle_file
import json
# Connect to your deployed Space
client = Client("crimsons-uv/miniCPM")
# Extract medical data from eFax PDF
def extract_efax_clean(pdf_path, custom_prompt=""):
    result = client.predict(
        pdf_file=handle_file(pdf_path),
        custom_prompt=custom_prompt,
        api_name="/process_with_status"
    )
    # result is tuple: [status_message, clean_json_data]
    status, clean_data = result
    print(f"Status: {status}")
    # Process only pages with data
    for page_key, page_data in clean_data.items():
        if page_key.startswith('page_'):
            print(f"\\n{page_key.upper()}:")
            if 'extracted_data' in page_data:
                data = page_data['extracted_data']
                if 'patient_first_name' in data:
                    print(f" Patient: {data['patient_first_name']} {data.get('patient_last_name', '')}")
                if 'primary_insurance' in data:
                    print(f" Insurance: {data['primary_insurance'].get('payer_name', '')}")
                if 'reason_for_referral' in data:
                    print(f" Reason: {data['reason_for_referral']}")
    return clean_data
# Usage
results = extract_efax_clean("path/to/your/efax.pdf")
```
### Method 2: cURL Commands
```
# Step 1: Make POST request
curl -X POST https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status \\
  -H "Content-Type: application/json" \\
  -d '{
    "data": [
      {"path": "your_efax.pdf", "meta": {"_type": "gradio.FileData"}},
      ""
    ]
  }' \\
  | awk -F'"' '{ print $4}' \\
  | read EVENT_ID; curl -N https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status/$EVENT_ID
```
### Method 3: Direct HTTP API
```
import requests
import base64
import json
def call_clean_extraction_api(pdf_path, custom_prompt=""):
    # Read and encode PDF
    with open(pdf_path, 'rb') as f:
        pdf_b64 = base64.b64encode(f.read()).decode()
    # API payload
    payload = {
        "data": [
            {"name": "efax.pdf", "data": f"application/pdf;base64,{pdf_b64}"},
            custom_prompt
        ]
    }
    # Make request
    response = requests.post(
        "https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status",
        json=payload,
        headers={"Content-Type": "application/json"}
    )
    return response.json()
# Usage
clean_results = call_clean_extraction_api("your_efax.pdf")
```
""")
        with gr.Tab("📋 Response Format"):
            gr.Markdown("""
## Clean Response Structure
### Input: 5-page PDF with mixed content
### Output: Only pages with data
```
{
  "page_2": {
    "page_analysis": {
      "page_type": "patient_demographics",
      "overall_page_confidence": 0.95,
      "all_visible_text": "Patient: John Doe..."
    },
    "extracted_data": {
      "patient_first_name": "John",
      "patient_last_name": "Doe",
      "patient_dob": "01/15/1980",
      "patient_gender": "Male",
      "patient_primary_phone_number": "555-123-4567",
      "patient_address": "123 Main St, City, State 12345",
      "patient_zip_code": "12345"
    },
    "confidence_scores": {
      "patient_first_name": 1.0,
      "patient_last_name": 1.0,
      "patient_dob": 0.95,
      "patient_gender": 1.0
    },
    "fields_found_on_this_page": ["patient_first_name", "patient_last_name", "patient_dob"]
  },
  "page_3": {
    "extracted_data": {
      "primary_insurance": {
        "payer_name": "Blue Cross Blue Shield",
        "member_id": "ABC123456789",
        "group_id": "GRP001"
      },
      "reason_for_referral": "Cardiology consultation"
    },
    "confidence_scores": {
      "primary_insurance": {
        "payer_name": 1.0,
        "member_id": 0.98,
        "group_id": 0.95
      },
      "reason_for_referral": 1.0
    }
  }
}
```
### Benefits for AI Combination:
- ✅ **No empty pages**: Pages 1, 4, 5 had no data, so not included
- ✅ **No empty fields**: Only fields with actual values
- ✅ **Smaller payload**: Reduced data size for faster processing
- ✅ **Easy merging**: Clear structure for combining with ChatGPT/Claude
""")

        def process_with_status(pdf_file, custom_prompt):
            """Stream ``(status_message, json_result)`` pairs for one extraction request."""
            if pdf_file is None:
                # This function is a generator, so the message must be yielded:
                # a value passed to `return` would never reach the UI.
                yield "❌ No PDF uploaded", {"error": "Upload a PDF file"}
                return
            yield "🔄 Converting PDF to images...", {}
            try:
                # custom_prompt may be None (e.g. omitted by an API caller);
                # blank or missing prompts select the default prompt.
                has_custom_prompt = bool((custom_prompt or "").strip())
                result = extract_pages_clean_json(pdf_file, custom_prompt if has_custom_prompt else None)
                if "error" not in result:
                    page_count = sum(1 for key in result if key.startswith("page_"))
                    yield f"✅ Extracted clean data from {page_count} pages with content", result
                else:
                    yield f"❌ Error: {result['error']}", result
            except Exception as e:
                yield f"❌ Failed: {str(e)}", {"error": str(e)}

        extract_btn.click(
            fn=process_with_status,
            inputs=[pdf_input, prompt_input],
            outputs=[status_output, output],
            queue=True
        )
    return demo
if __name__ == "__main__":
    # Build the UI, then serve it with a bounded request queue:
    # one job at a time, at most 10 requests waiting.
    demo = create_gradio_interface()
    queue_options = {"default_concurrency_limit": 1, "max_size": 10}
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.queue(**queue_options).launch(**launch_options)