Spaces:

crimson-suv
/

miniCPM

Paused

App Files Files Community

Suvadeep Das committed 27 days ago

Commit

b21a788

verified ·

1 Parent(s): 998302b

Update app.py

Browse files

Files changed (1) hide show

app.py +201 -197

app.py CHANGED Viewed

@@ -70,6 +70,34 @@ def pdf_to_images(pdf_file):
         print(f"Error converting PDF to images: {e}")
         return []
 def get_comprehensive_medical_extraction_prompt():
     """Complete medical data extraction prompt with all fields"""
     return """You are a deterministic medical data extraction engine. You will receive a single page from a medical document. Your task is to extract ALL visible information from this page and return it in the exact JSON format below.
@@ -257,120 +285,64 @@ def extract_single_page(image, extraction_prompt, model, tokenizer):
             tokenizer=tokenizer,
             sampling=False,
             temperature=0.1,
-            max_new_tokens=4000  # More tokens for comprehensive extraction
         )
         # Try to parse JSON
         try:
             parsed_data = json.loads(response)
-            return {
-                "status": "success",
-                "data": parsed_data,
-                "raw_response": response,
-                "model": "MiniCPM-V-2_6-GPU"
-            }
         except json.JSONDecodeError:
-            # Return structured error with raw text
-            return {
-                "status": "json_parse_error",
-                "data": {
-                    "page_analysis": {
-                        "page_contains_text": True,
-                        "page_type": "unknown",
-                        "overall_page_confidence": 0.5,
-                        "all_visible_text": response
-                    },
-                    "extracted_data": {},
-                    "confidence_scores": {},
-                    "fields_found_on_this_page": [],
-                    "parsing_error": "Could not parse JSON response"
-                },
-                "raw_response": response,
-                "model": "MiniCPM-V-2_6-GPU",
-                "error": "JSON parsing failed - returned raw text"
-            }
     except Exception as e:
-        return {
-            "status": "extraction_error",
-            "error": str(e),
-            "data": None,
-            "raw_response": ""
-        }
 @spaces.GPU(duration=600)  # 10 minutes
-def extract_pages_individually(pdf_file, custom_prompt=None):
-    """Extract each page individually with comprehensive medical data"""
     try:
         if pdf_file is None:
-            return {"status": "error", "error": "No PDF provided"}
         # Convert PDF to images
         print("Converting PDF to images...")
         images = pdf_to_images(pdf_file)
         if not images:
-            return {"status": "error", "error": "Could not convert PDF"}
-        print(f"Processing {len(images)} pages individually with comprehensive extraction...")
         # Load model once
         model, tokenizer = load_model()
         extraction_prompt = custom_prompt or get_comprehensive_medical_extraction_prompt()
-        # Process each page independently
-        results = []
-        successful_extractions = 0
         for i, image in enumerate(images):
-            print(f"Extracting page {i+1}/{len(images)} with full medical fields...")
-            page_result = extract_single_page(image, extraction_prompt, model, tokenizer)
-            if page_result["status"] == "success":
-                successful_extractions += 1
-            results.append({
-                "page_number": i + 1,
-                "extraction_result": page_result,
-                "timestamp": datetime.now().isoformat()
-            })
-        return {
-            "status": "success",
-            "total_pages": len(images),
-            "successful_extractions": successful_extractions,
-            "individual_pages": results,
-            "processing_info": {
-                "model_used": "MiniCPM-V-2_6-GPU",
-                "extraction_timestamp": datetime.now().isoformat(),
-                "processing_method": "comprehensive_individual_page_extraction",
-                "extraction_prompt_used": "comprehensive_medical_fields",
-                "note": "Each page processed with full medical field extraction - combine results with separate AI"
-            },
-            "next_step_instructions": {
-                "combination_method": "Use ChatGPT/Claude to combine all pages into final medical record",
-                "fields_to_aggregate": [
-                    "date_of_receipt", "patient_demographics", "insurance_info",
-                    "referral_source", "diagnosis_codes", "reason_for_referral"
-                ],
-                "confidence_handling": "Take highest confidence values across pages for each field"
-            }
-        }
     except Exception as e:
-        return {
-            "status": "error",
-            "error": str(e),
-            "total_pages": 0,
-            "individual_pages": []
-        }
 def create_gradio_interface():
-    with gr.Blocks(title="Comprehensive Medical Page Extractor", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# 🏥 Comprehensive Medical Data Extractor")
-        gr.Markdown("📋 **Complete Field Extraction** - All medical fields extracted per page, ready for AI combination")
-        with gr.Tab("📄 Comprehensive Page Extraction"):
             with gr.Row():
                 with gr.Column():
                     pdf_input = gr.File(
@@ -384,147 +356,178 @@ def create_gradio_interface():
                             value="",
                             label="Custom Extraction Prompt (optional)",
                             lines=4,
-                            placeholder="Leave empty for comprehensive medical extraction with all fields..."
                         )
-                    extract_btn = gr.Button("🏥 Extract All Medical Fields Per Page", variant="primary", size="lg")
                     gr.Markdown("""
-                    ### 📋 Comprehensive Fields Extracted:
-                    - ✅ **Patient Demographics** (name, DOB, gender, address, phone, email)
-                    - ✅ **Insurance Information** (primary/secondary/tertiary with IDs)
-                    - ✅ **Referral Source** (clinic, phone, fax, email)
-                    - ✅ **Medical Codes** (diagnosis codes with descriptions)
-                    - ✅ **Clinical Info** (priority, reason for referral, medical history)
-                    - ✅ **Confidence Scores** (0.0-1.0 for each field)
-                    - ✅ **Full Text Transcription** (everything visible on each page)
                     """)
                 with gr.Column():
                     status_output = gr.Textbox(label="📊 Processing Status", interactive=False)
-                    output = gr.JSON(label="📋 Comprehensive Page Results", show_label=True)
-        with gr.Tab("🔌 API Usage"):
             gr.Markdown("""
-            ## Comprehensive Medical Extraction API
-            ### Python Usage
             ```
-            import requests
-            import base64
-            with open("medical_efax.pdf", "rb") as f:
-                pdf_b64 = base64.b64encode(f.read()).decode()
-            response = requests.post(
-                "https://your-username-extracting-efax.hf.space/api/predict",
-                json={
-                    "data": [
-                        {"name": "efax.pdf", "data": f"application/pdf;base64,{pdf_b64}"},
-                        ""  # Custom prompt (optional)
-                    ]
-                }
-            )
-            result = response.json()
-            # Access comprehensive page results
-            for page in result["data"]["individual_pages"]:
-                page_num = page["page_number"]
-                extraction = page["extraction_result"]
-                if extraction["status"] == "success":
-                    data = extraction["data"]
-                    # Page analysis
-                    print(f"Page {page_num} Type: {data['page_analysis']['page_type']}")
-                    print(f"Confidence: {data['page_analysis']['overall_page_confidence']}")
-                    # Extracted medical fields
-                    extracted = data['extracted_data']
-                    print(f"Patient: {extracted['patient_first_name']} {extracted['patient_last_name']}")
-                    print(f"Insurance: {extracted['primary_insurance']['payer_name']}")
-                    print(f"Diagnosis: {extracted['diagnosis_informations']}")
-                    # Fields found on this page
-                    print(f"Fields found: {data['fields_found_on_this_page']}")
-            ```
-            ### Use ChatGPT/Claude for Final Combination
             ```
-            # Prepare all page data for combination
-            all_pages_data = []
-            for page in result["data"]["individual_pages"]:
-                if page["extraction_result"]["status"] == "success":
-                    all_pages_data.append({
-                        "page": page["page_number"],
-                        "extracted_data": page["extraction_result"]["data"]["extracted_data"],
-                        "confidence_scores": page["extraction_result"]["data"]["confidence_scores"],
-                        "fields_found": page["extraction_result"]["data"]["fields_found_on_this_page"]
-                    })
-            # Send to ChatGPT for combination
-            combination_prompt = f'''
-            Combine these {len(all_pages_data)} medical document pages into a single comprehensive patient record.
-            For each field, choose the value with highest confidence across all pages.
-            If multiple pages have the same field, verify consistency.
-            Page Data:
-            {json.dumps(all_pages_data, indent=2)}
-            Return the final medical record in the same structure with:
-            - Combined data from all pages
-            - Highest confidence scores per field
-            - List of pages where each field was found
-            - Fields needing human review (confidence < 0.9)
-            '''
             ```
             """)
-        with gr.Tab("📊 Field Mapping"):
             gr.Markdown("""
-            ## Complete Medical Fields Extracted Per Page
-            ### Patient Demographics
-            - `date_of_receipt` - Document receipt date (MM/DD/YYYY)
-            - `patient_first_name` - Patient's first name
-            - `patient_last_name` - Patient's last name
-            - `patient_dob` - Date of birth (MM/DD/YYYY)
-            - `patient_gender` - Male/Female/Other only
-            - `patient_primary_phone_number` - Main phone (###-###-####)
-            - `patient_secondary_phone_number` - Secondary phone
-            - `patient_email` - Email address (must have @ and domain)
-            - `patient_address` - Full address
-            - `patient_zip_code` - Last 5 digits only
-            ### Referral Information
-            - `referral_source` - Clinic/hospital name (NOT provider name)
-            - `referral_source_phone_no` - Facility phone
-            - `referral_source_fax_no` - Facility fax
-            - `referral_source_email` - Facility email
-            ### Insurance (Primary/Secondary/Tertiary)
-            - `payer_name` - Insurance company name
-            - `member_id` - Any ID (policy, subscriber, member, etc.)
-            - `group_id` - Only if explicitly labeled as "Group"
-            ### Medical Information
-            - `priority` - "Routine" or "Urgent" only
-            - `reason_for_referral` - Why patient was referred
-            - `diagnosis_informations` - Array of {code, description}
-            - `refine_reason` - Additional refinement details
-            ### Page Analysis
-            - `page_type` - Classification of page content
-            - `all_visible_text` - Complete text transcription
-            - `overall_page_confidence` - Page extraction confidence
-            - `fields_found_on_this_page` - List of fields with data
-            ### Confidence Scoring (0.0 - 1.0)
-            - `0.95-1.0` → Clearly visible, unambiguous
-            - `0.7-0.94` → Some uncertainty, formatting issues
-            - `0.0-0.6` → Missing, unclear, or poor quality
             """)
         def process_with_status(pdf_file, custom_prompt):
@@ -534,12 +537,13 @@ def create_gradio_interface():
             yield "📄 Converting PDF to images...", {}
             try:
-                result = extract_pages_individually(pdf_file, custom_prompt if custom_prompt.strip() else None)
-                if result["status"] == "success":
-                    yield f"✅ Extracted comprehensive medical data from {result['successful_extractions']}/{result['total_pages']} pages", result
                 else:
-                    yield f"❌ Error: {result.get('error')}", result
             except Exception as e:
                 yield f"❌ Failed: {str(e)}", {"error": str(e)}

         print(f"Error converting PDF to images: {e}")
         return []
def clean_empty_fields(data):
    """Recursively remove empty fields from a dictionary.

    A value counts as empty only when it is ``None``, an empty string, an
    empty list, or an empty dict. Containers that become empty after their
    own contents are cleaned are removed as well. Falsy-but-meaningful
    scalars such as ``0``, ``0.0`` and ``False`` are always kept, both as
    dict values and as list items, so numeric scores of zero survive.

    Args:
        data: The value to clean. Anything that is not a ``dict`` is
            returned unchanged.

    Returns:
        A new dict without empty fields (possibly ``{}``), or ``data``
        itself when it is not a dict. The input is never mutated.
    """

    def _is_empty(value):
        # Deliberately narrower than plain truthiness: 0 and False are data.
        return value is None or (isinstance(value, (str, list, dict)) and not value)

    def _prune(value):
        # Clean children first so a container holding only empty values
        # is itself seen as empty by the caller.
        if isinstance(value, dict):
            pruned = {key: _prune(item) for key, item in value.items()}
            return {key: item for key, item in pruned.items() if not _is_empty(item)}
        if isinstance(value, list):
            return [item for item in map(_prune, value) if not _is_empty(item)]
        return value

    if not isinstance(data, dict):
        return data
    return _prune(data)
 def get_comprehensive_medical_extraction_prompt():
     """Complete medical data extraction prompt with all fields"""
     return """You are a deterministic medical data extraction engine. You will receive a single page from a medical document. Your task is to extract ALL visible information from this page and return it in the exact JSON format below.
             tokenizer=tokenizer,
             sampling=False,
             temperature=0.1,
+            max_new_tokens=4000
         )
         # Try to parse JSON
         try:
             parsed_data = json.loads(response)
+            # Clean empty fields
+            cleaned_data = clean_empty_fields(parsed_data)
+            return cleaned_data if cleaned_data else None
         except json.JSONDecodeError:
+            return None
     except Exception as e:
+        print(f"Error extracting from page: {e}")
+        return None
 @spaces.GPU(duration=600)  # 10 minutes
def extract_pages_clean_json(pdf_file, custom_prompt=None):
    """Run per-page extraction on a PDF and keep only pages that produced data.

    Args:
        pdf_file: The PDF to process; ``None`` yields an error dict.
        custom_prompt: Optional prompt text. When falsy, the prompt from
            ``get_comprehensive_medical_extraction_prompt()`` is used.

    Returns:
        A dict mapping ``"page_<n>"`` (1-based) to that page's extraction
        result, containing only pages whose result was truthy. On failure,
        a dict with a single ``"error"`` key is returned instead; no
        exception propagates to the caller.
    """
    try:
        if pdf_file is None:
            return {"error": "No PDF provided"}

        print("Converting PDF to images...")
        images = pdf_to_images(pdf_file)
        if not images:
            return {"error": "Could not convert PDF"}

        total_pages = len(images)
        print(f"Processing {total_pages} pages individually...")

        # The model is loaded a single time and shared by every page.
        model, tokenizer = load_model()
        extraction_prompt = custom_prompt or get_comprehensive_medical_extraction_prompt()

        page_results = {}
        for page_number, image in enumerate(images, start=1):
            print(f"Extracting page {page_number}/{total_pages}...")
            page_json = extract_single_page(image, extraction_prompt, model, tokenizer)
            if not page_json:
                # Nothing usable came back for this page; leave it out.
                continue
            page_results[f"page_{page_number}"] = page_json
        return page_results
    except Exception as e:
        return {"error": str(e)}
 def create_gradio_interface():
+    with gr.Blocks(title="Clean Medical eFax Extractor", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🏥 Clean Medical eFax Data Extractor")
+        gr.Markdown("📋 **Returns Only Non-Empty Data** - Clean page-by-page extraction without empty fields")
+        with gr.Tab("📄 Clean JSON Extraction"):
             with gr.Row():
                 with gr.Column():
                     pdf_input = gr.File(
                             value="",
                             label="Custom Extraction Prompt (optional)",
                             lines=4,
+                            placeholder="Leave empty for comprehensive medical extraction..."
                         )
+                    extract_btn = gr.Button("📋 Extract Clean JSON", variant="primary", size="lg")
                     gr.Markdown("""
+                    ### ✅ Clean Output Features
+                    - **No Empty Fields**: Only fields with actual data
+                    - **No Empty Pages**: Only pages containing information
+                    - **Easier Combination**: Clean structure for AI merging
+                    - **Optimized Size**: Reduced JSON payload
                     """)
                 with gr.Column():
                     status_output = gr.Textbox(label="📊 Processing Status", interactive=False)
+                    output = gr.JSON(label="📋 Clean JSON Results", show_label=True)
+        with gr.Tab("🔌 API Usage Instructions"):
             gr.Markdown("""
+            ## Updated API Instructions
+            ### Method 1: Python Client (Recommended)
+            ```
+            pip install gradio_client
             ```
+            ```
+            from gradio_client import Client, handle_file
+            import json
+            # Connect to your deployed Space
+            client = Client("crimson-suv/miniCPM")
+            # Extract medical data from eFax PDF
+            def extract_efax_clean(pdf_path, custom_prompt=""):
+                result = client.predict(
+                    pdf_file=handle_file(pdf_path),
+                    custom_prompt=custom_prompt,
+                    api_name="/process_with_status"
+                )
+                # result is tuple: [status_message, clean_json_data]
+                status, clean_data = result
+                print(f"Status: {status}")
+                # Process only pages with data
+                for page_key, page_data in clean_data.items():
+                    if page_key.startswith('page_'):
+                        print(f"\\n{page_key.upper()}:")
+                        if 'extracted_data' in page_data:
+                            data = page_data['extracted_data']
+                            if 'patient_first_name' in data:
+                                print(f"  Patient: {data['patient_first_name']} {data.get('patient_last_name', '')}")
+                            if 'primary_insurance' in data:
+                                print(f"  Insurance: {data['primary_insurance'].get('payer_name', '')}")
+                            if 'reason_for_referral' in data:
+                                print(f"  Reason: {data['reason_for_referral']}")
+                return clean_data
+            # Usage
+            results = extract_efax_clean("path/to/your/efax.pdf")
             ```
+            ### Method 2: cURL Commands
+            ```
+            # Step 1: Make POST request
+            curl -X POST https://crimson-suv-minicpm.hf.space/gradio_api/call/process_with_status \\
+              -H "Content-Type: application/json" \\
+              -d '{
+                "data": [
+                  {"path": "your_efax.pdf", "meta": {"_type": "gradio.FileData"}},
+                  ""
+                ]
+              }' \\
+              | awk -F'"' '{ print $4}' \\
+              | read EVENT_ID; curl -N https://crimson-suv-minicpm.hf.space/gradio_api/call/process_with_status/$EVENT_ID
+            ```
+            ### Method 3: Direct HTTP API
+            ```
+            import requests
+            import base64
+            import json
+            def call_clean_extraction_api(pdf_path, custom_prompt=""):
+                # Read and encode PDF
+                with open(pdf_path, 'rb') as f:
+                    pdf_b64 = base64.b64encode(f.read()).decode()
+                # API payload
+                payload = {
+                    "data": [
+                        {"name": "efax.pdf", "data": f"application/pdf;base64,{pdf_b64}"},
+                        custom_prompt
+                    ]
+                }
+                # Make request
+                response = requests.post(
+                    "https://crimson-suv-minicpm.hf.space/gradio_api/call/process_with_status",
+                    json=payload,
+                    headers={"Content-Type": "application/json"}
+                )
+                return response.json()
+            # Usage
+            clean_results = call_clean_extraction_api("your_efax.pdf")
             ```
             """)
+        with gr.Tab("📋 Response Format"):
             gr.Markdown("""
+            ## Clean Response Structure
+            ### Input: 5-page PDF with mixed content
+            ### Output: Only pages with data
+            ```
+            {
+              "page_2": {
+                "page_analysis": {
+                  "page_type": "patient_demographics",
+                  "overall_page_confidence": 0.95,
+                  "all_visible_text": "Patient: John Doe..."
+                },
+                "extracted_data": {
+                  "patient_first_name": "John",
+                  "patient_last_name": "Doe",
+                  "patient_dob": "01/15/1980",
+                  "patient_gender": "Male",
+                  "patient_primary_phone_number": "555-123-4567",
+                  "patient_address": "123 Main St, City, State 12345",
+                  "patient_zip_code": "12345"
+                },
+                "confidence_scores": {
+                  "patient_first_name": 1.0,
+                  "patient_last_name": 1.0,
+                  "patient_dob": 0.95,
+                  "patient_gender": 1.0
+                },
+                "fields_found_on_this_page": ["patient_first_name", "patient_last_name", "patient_dob"]
+              },
+              "page_3": {
+                "extracted_data": {
+                  "primary_insurance": {
+                    "payer_name": "Blue Cross Blue Shield",
+                    "member_id": "ABC123456789",
+                    "group_id": "GRP001"
+                  },
+                  "reason_for_referral": "Cardiology consultation"
+                },
+                "confidence_scores": {
+                  "primary_insurance": {
+                    "payer_name": 1.0,
+                    "member_id": 0.98,
+                    "group_id": 0.95
+                  },
+                  "reason_for_referral": 1.0
+                }
+              }
+            }
+            ```
+            ### Benefits for AI Combination:
+            - ✅ **No empty pages**: Pages 1, 4, 5 had no data, so not included
+            - ✅ **No empty fields**: Only fields with actual values
+            - ✅ **Smaller payload**: Reduced data size for faster processing
+            - ✅ **Easy merging**: Clear structure for combining with ChatGPT/Claude
             """)
         def process_with_status(pdf_file, custom_prompt):
             yield "📄 Converting PDF to images...", {}
             try:
+                result = extract_pages_clean_json(pdf_file, custom_prompt if custom_prompt.strip() else None)
+                if "error" not in result:
+                    page_count = len([k for k in result.keys() if k.startswith("page_")])
+                    yield f"✅ Extracted clean data from {page_count} pages with content", result
                 else:
+                    yield f"❌ Error: {result['error']}", result
             except Exception as e:
                 yield f"❌ Failed: {str(e)}", {"error": str(e)}