Suvadeep Das committed on
Commit
b21a788
·
verified ·
1 Parent(s): 998302b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +201 -197
app.py CHANGED
@@ -70,6 +70,34 @@ def pdf_to_images(pdf_file):
70
  print(f"Error converting PDF to images: {e}")
71
  return []
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def get_comprehensive_medical_extraction_prompt():
74
  """Complete medical data extraction prompt with all fields"""
75
  return """You are a deterministic medical data extraction engine. You will receive a single page from a medical document. Your task is to extract ALL visible information from this page and return it in the exact JSON format below.
@@ -257,120 +285,64 @@ def extract_single_page(image, extraction_prompt, model, tokenizer):
257
  tokenizer=tokenizer,
258
  sampling=False,
259
  temperature=0.1,
260
- max_new_tokens=4000 # More tokens for comprehensive extraction
261
  )
262
 
263
  # Try to parse JSON
264
  try:
265
  parsed_data = json.loads(response)
266
- return {
267
- "status": "success",
268
- "data": parsed_data,
269
- "raw_response": response,
270
- "model": "MiniCPM-V-2_6-GPU"
271
- }
272
  except json.JSONDecodeError:
273
- # Return structured error with raw text
274
- return {
275
- "status": "json_parse_error",
276
- "data": {
277
- "page_analysis": {
278
- "page_contains_text": True,
279
- "page_type": "unknown",
280
- "overall_page_confidence": 0.5,
281
- "all_visible_text": response
282
- },
283
- "extracted_data": {},
284
- "confidence_scores": {},
285
- "fields_found_on_this_page": [],
286
- "parsing_error": "Could not parse JSON response"
287
- },
288
- "raw_response": response,
289
- "model": "MiniCPM-V-2_6-GPU",
290
- "error": "JSON parsing failed - returned raw text"
291
- }
292
  except Exception as e:
293
- return {
294
- "status": "extraction_error",
295
- "error": str(e),
296
- "data": None,
297
- "raw_response": ""
298
- }
299
 
300
  @spaces.GPU(duration=600) # 10 minutes
301
- def extract_pages_individually(pdf_file, custom_prompt=None):
302
- """Extract each page individually with comprehensive medical data"""
303
  try:
304
  if pdf_file is None:
305
- return {"status": "error", "error": "No PDF provided"}
306
 
307
  # Convert PDF to images
308
  print("Converting PDF to images...")
309
  images = pdf_to_images(pdf_file)
310
 
311
  if not images:
312
- return {"status": "error", "error": "Could not convert PDF"}
313
 
314
- print(f"Processing {len(images)} pages individually with comprehensive extraction...")
315
 
316
  # Load model once
317
  model, tokenizer = load_model()
318
  extraction_prompt = custom_prompt or get_comprehensive_medical_extraction_prompt()
319
 
320
- # Process each page independently
321
- results = []
322
- successful_extractions = 0
323
 
324
  for i, image in enumerate(images):
325
- print(f"Extracting page {i+1}/{len(images)} with full medical fields...")
326
 
327
- page_result = extract_single_page(image, extraction_prompt, model, tokenizer)
328
 
329
- if page_result["status"] == "success":
330
- successful_extractions += 1
331
-
332
- results.append({
333
- "page_number": i + 1,
334
- "extraction_result": page_result,
335
- "timestamp": datetime.now().isoformat()
336
- })
337
 
338
- return {
339
- "status": "success",
340
- "total_pages": len(images),
341
- "successful_extractions": successful_extractions,
342
- "individual_pages": results,
343
- "processing_info": {
344
- "model_used": "MiniCPM-V-2_6-GPU",
345
- "extraction_timestamp": datetime.now().isoformat(),
346
- "processing_method": "comprehensive_individual_page_extraction",
347
- "extraction_prompt_used": "comprehensive_medical_fields",
348
- "note": "Each page processed with full medical field extraction - combine results with separate AI"
349
- },
350
- "next_step_instructions": {
351
- "combination_method": "Use ChatGPT/Claude to combine all pages into final medical record",
352
- "fields_to_aggregate": [
353
- "date_of_receipt", "patient_demographics", "insurance_info",
354
- "referral_source", "diagnosis_codes", "reason_for_referral"
355
- ],
356
- "confidence_handling": "Take highest confidence values across pages for each field"
357
- }
358
- }
359
 
360
  except Exception as e:
361
- return {
362
- "status": "error",
363
- "error": str(e),
364
- "total_pages": 0,
365
- "individual_pages": []
366
- }
367
 
368
  def create_gradio_interface():
369
- with gr.Blocks(title="Comprehensive Medical Page Extractor", theme=gr.themes.Soft()) as demo:
370
- gr.Markdown("# 🏥 Comprehensive Medical Data Extractor")
371
- gr.Markdown("📋 **Complete Field Extraction** - All medical fields extracted per page, ready for AI combination")
372
 
373
- with gr.Tab("📄 Comprehensive Page Extraction"):
374
  with gr.Row():
375
  with gr.Column():
376
  pdf_input = gr.File(
@@ -384,147 +356,178 @@ def create_gradio_interface():
384
  value="",
385
  label="Custom Extraction Prompt (optional)",
386
  lines=4,
387
- placeholder="Leave empty for comprehensive medical extraction with all fields..."
388
  )
389
 
390
- extract_btn = gr.Button("🏥 Extract All Medical Fields Per Page", variant="primary", size="lg")
391
 
392
  gr.Markdown("""
393
- ### 📋 Comprehensive Fields Extracted:
394
- - ✅ **Patient Demographics** (name, DOB, gender, address, phone, email)
395
- - ✅ **Insurance Information** (primary/secondary/tertiary with IDs)
396
- - ✅ **Referral Source** (clinic, phone, fax, email)
397
- - ✅ **Medical Codes** (diagnosis codes with descriptions)
398
- - ✅ **Clinical Info** (priority, reason for referral, medical history)
399
- - ✅ **Confidence Scores** (0.0-1.0 for each field)
400
- - ✅ **Full Text Transcription** (everything visible on each page)
401
  """)
402
 
403
  with gr.Column():
404
  status_output = gr.Textbox(label="📊 Processing Status", interactive=False)
405
- output = gr.JSON(label="📋 Comprehensive Page Results", show_label=True)
406
 
407
- with gr.Tab("🔌 API Usage"):
408
  gr.Markdown("""
409
- ## Comprehensive Medical Extraction API
410
 
411
- ### Python Usage
 
 
412
  ```
413
- import requests
414
- import base64
415
-
416
- with open("medical_efax.pdf", "rb") as f:
417
- pdf_b64 = base64.b64encode(f.read()).decode()
418
 
419
- response = requests.post(
420
- "https://your-username-extracting-efax.hf.space/api/predict",
421
- json={
422
- "data": [
423
- {"name": "efax.pdf", "data": f"application/pdf;base64,{pdf_b64}"},
424
- "" # Custom prompt (optional)
425
- ]
426
- }
427
- )
428
 
429
- result = response.json()
 
430
 
431
- # Access comprehensive page results
432
- for page in result["data"]["individual_pages"]:
433
- page_num = page["page_number"]
434
- extraction = page["extraction_result"]
 
 
 
435
 
436
- if extraction["status"] == "success":
437
- data = extraction["data"]
438
-
439
- # Page analysis
440
- print(f"Page {page_num} Type: {data['page_analysis']['page_type']}")
441
- print(f"Confidence: {data['page_analysis']['overall_page_confidence']}")
442
-
443
- # Extracted medical fields
444
- extracted = data['extracted_data']
445
- print(f"Patient: {extracted['patient_first_name']} {extracted['patient_last_name']}")
446
- print(f"Insurance: {extracted['primary_insurance']['payer_name']}")
447
- print(f"Diagnosis: {extracted['diagnosis_informations']}")
448
-
449
- # Fields found on this page
450
- print(f"Fields found: {data['fields_found_on_this_page']}")
451
- ```
 
 
 
 
452
 
453
- ### Use ChatGPT/Claude for Final Combination
 
454
  ```
455
- # Prepare all page data for combination
456
- all_pages_data = []
457
- for page in result["data"]["individual_pages"]:
458
- if page["extraction_result"]["status"] == "success":
459
- all_pages_data.append({
460
- "page": page["page_number"],
461
- "extracted_data": page["extraction_result"]["data"]["extracted_data"],
462
- "confidence_scores": page["extraction_result"]["data"]["confidence_scores"],
463
- "fields_found": page["extraction_result"]["data"]["fields_found_on_this_page"]
464
- })
465
 
466
- # Send to ChatGPT for combination
467
- combination_prompt = f'''
468
- Combine these {len(all_pages_data)} medical document pages into a single comprehensive patient record.
 
 
 
 
 
 
 
 
 
 
 
469
 
470
- For each field, choose the value with highest confidence across all pages.
471
- If multiple pages have the same field, verify consistency.
 
 
 
472
 
473
- Page Data:
474
- {json.dumps(all_pages_data, indent=2)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
475
 
476
- Return the final medical record in the same structure with:
477
- - Combined data from all pages
478
- - Highest confidence scores per field
479
- - List of pages where each field was found
480
- - Fields needing human review (confidence < 0.9)
481
- '''
482
  ```
483
  """)
484
 
485
- with gr.Tab("📊 Field Mapping"):
486
  gr.Markdown("""
487
- ## Complete Medical Fields Extracted Per Page
488
 
489
- ### Patient Demographics
490
- - `date_of_receipt` - Document receipt date (MM/DD/YYYY)
491
- - `patient_first_name` - Patient's first name
492
- - `patient_last_name` - Patient's last name
493
- - `patient_dob` - Date of birth (MM/DD/YYYY)
494
- - `patient_gender` - Male/Female/Other only
495
- - `patient_primary_phone_number` - Main phone (###-###-####)
496
- - `patient_secondary_phone_number` - Secondary phone
497
- - `patient_email` - Email address (must have @ and domain)
498
- - `patient_address` - Full address
499
- - `patient_zip_code` - Last 5 digits only
500
 
501
- ### Referral Information
502
- - `referral_source` - Clinic/hospital name (NOT provider name)
503
- - `referral_source_phone_no` - Facility phone
504
- - `referral_source_fax_no` - Facility fax
505
- - `referral_source_email` - Facility email
506
-
507
- ### Insurance (Primary/Secondary/Tertiary)
508
- - `payer_name` - Insurance company name
509
- - `member_id` - Any ID (policy, subscriber, member, etc.)
510
- - `group_id` - Only if explicitly labeled as "Group"
511
-
512
- ### Medical Information
513
- - `priority` - "Routine" or "Urgent" only
514
- - `reason_for_referral` - Why patient was referred
515
- - `diagnosis_informations` - Array of {code, description}
516
- - `refine_reason` - Additional refinement details
517
-
518
- ### Page Analysis
519
- - `page_type` - Classification of page content
520
- - `all_visible_text` - Complete text transcription
521
- - `overall_page_confidence` - Page extraction confidence
522
- - `fields_found_on_this_page` - List of fields with data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
 
524
- ### Confidence Scoring (0.0 - 1.0)
525
- - `0.95-1.0` → Clearly visible, unambiguous
526
- - `0.7-0.94` → Some uncertainty, formatting issues
527
- - `0.0-0.6` → Missing, unclear, or poor quality
 
528
  """)
529
 
530
  def process_with_status(pdf_file, custom_prompt):
@@ -534,12 +537,13 @@ def create_gradio_interface():
534
  yield "📄 Converting PDF to images...", {}
535
 
536
  try:
537
- result = extract_pages_individually(pdf_file, custom_prompt if custom_prompt.strip() else None)
538
 
539
- if result["status"] == "success":
540
- yield f"✅ Extracted comprehensive medical data from {result['successful_extractions']}/{result['total_pages']} pages", result
 
541
  else:
542
- yield f"❌ Error: {result.get('error')}", result
543
 
544
  except Exception as e:
545
  yield f"❌ Failed: {str(e)}", {"error": str(e)}
 
70
  print(f"Error converting PDF to images: {e}")
71
  return []
72
 
73
def clean_empty_fields(data):
    """Recursively remove empty fields from a dictionary.

    A value counts as empty when it is ``None`` or an empty string, list or
    dict (after its own contents have been cleaned). Numeric zero and
    ``False`` are real data and are kept everywhere, both as dict values and
    as list items, so the same rule applies at every nesting depth.

    Args:
        data: Parsed JSON value. Anything that is not a dict is returned
            unchanged.

    Returns:
        A new dict without empty fields (possibly ``{}``), or ``data`` itself
        when it is not a dict.
    """
    if not isinstance(data, dict):
        return data

    def _is_empty(value):
        # Only None and empty str/list/dict are "empty"; 0 and False are not.
        return value is None or (
            isinstance(value, (str, list, dict)) and not value
        )

    def _clean(value):
        if isinstance(value, dict):
            return clean_empty_fields(value)
        if isinstance(value, list):
            # Clean every item first so nested containers that become empty
            # after cleaning are filtered out as well.
            cleaned_items = (_clean(item) for item in value)
            return [item for item in cleaned_items if not _is_empty(item)]
        return value

    cleaned = {}
    for key, value in data.items():
        cleaned_value = _clean(value)
        if not _is_empty(cleaned_value):
            cleaned[key] = cleaned_value

    return cleaned
100
+
101
  def get_comprehensive_medical_extraction_prompt():
102
  """Complete medical data extraction prompt with all fields"""
103
  return """You are a deterministic medical data extraction engine. You will receive a single page from a medical document. Your task is to extract ALL visible information from this page and return it in the exact JSON format below.
 
285
  tokenizer=tokenizer,
286
  sampling=False,
287
  temperature=0.1,
288
+ max_new_tokens=4000
289
  )
290
 
291
  # Try to parse JSON
292
  try:
293
  parsed_data = json.loads(response)
294
+ # Clean empty fields
295
+ cleaned_data = clean_empty_fields(parsed_data)
296
+ return cleaned_data if cleaned_data else None
 
 
 
297
  except json.JSONDecodeError:
298
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  except Exception as e:
300
+ print(f"Error extracting from page: {e}")
301
+ return None
 
 
 
 
302
 
303
  @spaces.GPU(duration=600) # 10 minutes
304
def extract_pages_clean_json(pdf_file, custom_prompt=None):
    """Run per-page extraction on a PDF and keep only pages that produced data.

    Args:
        pdf_file: The uploaded PDF, handed unchanged to ``pdf_to_images``.
        custom_prompt: Optional prompt text; when falsy, the prompt from
            ``get_comprehensive_medical_extraction_prompt()`` is used.

    Returns:
        A dict keyed ``"page_<n>"`` (1-based) holding each page's truthy
        extraction result, or ``{"error": <message>}`` when no PDF was given,
        the PDF yielded no images, or any exception was raised.
    """
    try:
        if pdf_file is None:
            return {"error": "No PDF provided"}

        print("Converting PDF to images...")
        page_images = pdf_to_images(pdf_file)
        if not page_images:
            return {"error": "Could not convert PDF"}

        total_pages = len(page_images)
        print(f"Processing {total_pages} pages individually...")

        # The model is loaded a single time and shared by every page.
        model, tokenizer = load_model()
        if custom_prompt:
            prompt = custom_prompt
        else:
            prompt = get_comprehensive_medical_extraction_prompt()

        collected = {}
        for page_number, page_image in enumerate(page_images, start=1):
            print(f"Extracting page {page_number}/{total_pages}...")
            extracted = extract_single_page(page_image, prompt, model, tokenizer)
            if not extracted:
                continue  # pages without data are left out of the result
            collected[f"page_{page_number}"] = extracted

        return collected
    except Exception as exc:
        return {"error": str(exc)}
 
 
 
 
 
339
 
340
  def create_gradio_interface():
341
+ with gr.Blocks(title="Clean Medical eFax Extractor", theme=gr.themes.Soft()) as demo:
342
+ gr.Markdown("# 🏥 Clean Medical eFax Data Extractor")
343
+ gr.Markdown("📋 **Returns Only Non-Empty Data** - Clean page-by-page extraction without empty fields")
344
 
345
+ with gr.Tab("📄 Clean JSON Extraction"):
346
  with gr.Row():
347
  with gr.Column():
348
  pdf_input = gr.File(
 
356
  value="",
357
  label="Custom Extraction Prompt (optional)",
358
  lines=4,
359
+ placeholder="Leave empty for comprehensive medical extraction..."
360
  )
361
 
362
+ extract_btn = gr.Button("📋 Extract Clean JSON", variant="primary", size="lg")
363
 
364
  gr.Markdown("""
365
+ ### ✅ Clean Output Features
366
+ - **No Empty Fields**: Only fields with actual data
367
+ - **No Empty Pages**: Only pages containing information
368
+ - **Easier Combination**: Clean structure for AI merging
369
+ - **Optimized Size**: Reduced JSON payload
 
 
 
370
  """)
371
 
372
  with gr.Column():
373
  status_output = gr.Textbox(label="📊 Processing Status", interactive=False)
374
+ output = gr.JSON(label="📋 Clean JSON Results", show_label=True)
375
 
376
+ with gr.Tab("🔌 API Usage Instructions"):
377
  gr.Markdown("""
378
+ ## Updated API Instructions
379
 
380
+ ### Method 1: Python Client (Recommended)
381
+ ```
382
+ pip install gradio_client
383
  ```
 
 
 
 
 
384
 
385
+ ```
386
+ from gradio_client import Client, handle_file
387
+ import json
 
 
 
 
 
 
388
 
389
+ # Connect to your deployed Space
390
+ client = Client("crimsons-uv/miniCPM")
391
 
392
+ # Extract medical data from eFax PDF
393
+ def extract_efax_clean(pdf_path, custom_prompt=""):
394
+ result = client.predict(
395
+ pdf_file=handle_file(pdf_path),
396
+ custom_prompt=custom_prompt,
397
+ api_name="/process_with_status"
398
+ )
399
 
400
+ # result is tuple: [status_message, clean_json_data]
401
+ status, clean_data = result
402
+
403
+ print(f"Status: {status}")
404
+
405
+ # Process only pages with data
406
+ for page_key, page_data in clean_data.items():
407
+ if page_key.startswith('page_'):
408
+ print(f"\\n{page_key.upper()}:")
409
+
410
+ if 'extracted_data' in page_data:
411
+ data = page_data['extracted_data']
412
+ if 'patient_first_name' in data:
413
+ print(f" Patient: {data['patient_first_name']} {data.get('patient_last_name', '')}")
414
+ if 'primary_insurance' in data:
415
+ print(f" Insurance: {data['primary_insurance'].get('payer_name', '')}")
416
+ if 'reason_for_referral' in data:
417
+ print(f" Reason: {data['reason_for_referral']}")
418
+
419
+ return clean_data
420
 
421
+ # Usage
422
+ results = extract_efax_clean("path/to/your/efax.pdf")
423
  ```
 
 
 
 
 
 
 
 
 
 
424
 
425
+ ### Method 2: cURL Commands
426
+ ```
427
+ # Step 1: Make POST request
428
+ curl -X POST https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status \\
429
+ -H "Content-Type: application/json" \\
430
+ -d '{
431
+ "data": [
432
+ {"path": "your_efax.pdf", "meta": {"_type": "gradio.FileData"}},
433
+ ""
434
+ ]
435
+ }' \\
436
+ | awk -F'"' '{ print $4}' \\
437
+ | read EVENT_ID; curl -N https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status/$EVENT_ID
438
+ ```
439
 
440
+ ### Method 3: Direct HTTP API
441
+ ```
442
+ import requests
443
+ import base64
444
+ import json
445
 
446
+ def call_clean_extraction_api(pdf_path, custom_prompt=""):
447
+ # Read and encode PDF
448
+ with open(pdf_path, 'rb') as f:
449
+ pdf_b64 = base64.b64encode(f.read()).decode()
450
+
451
+ # API payload
452
+ payload = {
453
+ "data": [
454
+ {"name": "efax.pdf", "data": f"application/pdf;base64,{pdf_b64}"},
455
+ custom_prompt
456
+ ]
457
+ }
458
+
459
+ # Make request
460
+ response = requests.post(
461
+ "https://crimsons-uv-minicpm.hf.space/gradio_api/call/process_with_status",
462
+ json=payload,
463
+ headers={"Content-Type": "application/json"}
464
+ )
465
+
466
+ return response.json()
467
 
468
+ # Usage
469
+ clean_results = call_clean_extraction_api("your_efax.pdf")
 
 
 
 
470
  ```
471
  """)
472
 
473
+ with gr.Tab("📋 Response Format"):
474
  gr.Markdown("""
475
+ ## Clean Response Structure
476
 
477
+ ### Input: 5-page PDF with mixed content
478
+ ### Output: Only pages with data
 
 
 
 
 
 
 
 
 
479
 
480
+ ```
481
+ {
482
+ "page_2": {
483
+ "page_analysis": {
484
+ "page_type": "patient_demographics",
485
+ "overall_page_confidence": 0.95,
486
+ "all_visible_text": "Patient: John Doe..."
487
+ },
488
+ "extracted_data": {
489
+ "patient_first_name": "John",
490
+ "patient_last_name": "Doe",
491
+ "patient_dob": "01/15/1980",
492
+ "patient_gender": "Male",
493
+ "patient_primary_phone_number": "555-123-4567",
494
+ "patient_address": "123 Main St, City, State 12345",
495
+ "patient_zip_code": "12345"
496
+ },
497
+ "confidence_scores": {
498
+ "patient_first_name": 1.0,
499
+ "patient_last_name": 1.0,
500
+ "patient_dob": 0.95,
501
+ "patient_gender": 1.0
502
+ },
503
+ "fields_found_on_this_page": ["patient_first_name", "patient_last_name", "patient_dob"]
504
+ },
505
+ "page_3": {
506
+ "extracted_data": {
507
+ "primary_insurance": {
508
+ "payer_name": "Blue Cross Blue Shield",
509
+ "member_id": "ABC123456789",
510
+ "group_id": "GRP001"
511
+ },
512
+ "reason_for_referral": "Cardiology consultation"
513
+ },
514
+ "confidence_scores": {
515
+ "primary_insurance": {
516
+ "payer_name": 1.0,
517
+ "member_id": 0.98,
518
+ "group_id": 0.95
519
+ },
520
+ "reason_for_referral": 1.0
521
+ }
522
+ }
523
+ }
524
+ ```
525
 
526
+ ### Benefits for AI Combination:
527
+ - ✅ **No empty pages**: Pages 1, 4, 5 had no data, so not included
528
+ - ✅ **No empty fields**: Only fields with actual values
529
+ - ✅ **Smaller payload**: Reduced data size for faster processing
530
+ - ✅ **Easy merging**: Clear structure for combining with ChatGPT/Claude
531
  """)
532
 
533
  def process_with_status(pdf_file, custom_prompt):
 
537
  yield "📄 Converting PDF to images...", {}
538
 
539
  try:
540
+ result = extract_pages_clean_json(pdf_file, custom_prompt if custom_prompt.strip() else None)
541
 
542
+ if "error" not in result:
543
+ page_count = len([k for k in result.keys() if k.startswith("page_")])
544
+ yield f"✅ Extracted clean data from {page_count} pages with content", result
545
  else:
546
+ yield f"❌ Error: {result['error']}", result
547
 
548
  except Exception as e:
549
  yield f"❌ Failed: {str(e)}", {"error": str(e)}