Suvadeep Das committed on
Commit
998302b
·
verified ·
1 Parent(s): f1ee120

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +302 -221
app.py CHANGED
@@ -9,10 +9,9 @@ import os
9
  import json
10
  from huggingface_hub import login
11
  from pdf2image import convert_from_bytes
12
- import tempfile
13
  from datetime import datetime
14
 
15
- # Set your HF token (add this to your Space secrets)
16
  HF_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN")
17
  if HF_TOKEN:
18
  login(token=HF_TOKEN)
@@ -22,7 +21,7 @@ _model = None
22
  _tokenizer = None
23
 
24
  def load_model():
25
- """Load MiniCPM model (CPU loading, GPU usage happens in main function)"""
26
  global _model, _tokenizer
27
 
28
  if _model is not None and _tokenizer is not None:
@@ -57,7 +56,7 @@ def load_model():
57
  return _model, _tokenizer
58
 
59
  def pdf_to_images(pdf_file):
60
- """Convert PDF file to list of PIL images (CPU operation)"""
61
  try:
62
  if hasattr(pdf_file, 'read'):
63
  pdf_bytes = pdf_file.read()
@@ -71,12 +70,20 @@ def pdf_to_images(pdf_file):
71
  print(f"Error converting PDF to images: {e}")
72
  return []
73
 
74
- def get_medical_extraction_prompt():
75
- """Get the medical data extraction prompt"""
76
- return """You are a medical document OCR and data extraction specialist. Analyze this medical document image and extract ALL visible information. Return the data in this exact JSON format:
 
 
77
 
78
  {
79
- "data": {
 
 
 
 
 
 
80
  "date_of_receipt": "",
81
  "patient_first_name": "",
82
  "patient_last_name": "",
@@ -114,7 +121,12 @@ def get_medical_extraction_prompt():
114
  "description": ""
115
  }
116
  ],
117
- "refine_reason": ""
 
 
 
 
 
118
  },
119
  "confidence_scores": {
120
  "date_of_receipt": 0.0,
@@ -136,28 +148,106 @@ def get_medical_extraction_prompt():
136
  "member_id": 0.0,
137
  "group_id": 0.0
138
  },
 
 
 
 
 
 
 
 
 
 
139
  "priority": 0.0,
140
- "reason_for_referral": 0.0
 
 
 
 
 
 
 
 
141
  }
142
  }
143
 
144
- INSTRUCTIONS:
145
- 1. Read ALL text visible in the document
146
- 2. Extract exact values as they appear (no modifications)
147
- 3. For dates, use MM/DD/YYYY format
148
- 4. For phone numbers, use format like 850-463-0143
149
- 5. Assign confidence scores 0.0-1.0 (1.0 = completely certain, 0.0 = not found)
150
- 6. If information is not visible, leave field empty but still include it
151
- 7. Return ONLY the JSON, no other text"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
- def extract_data_from_image(image, extraction_prompt, model, tokenizer):
154
- """Extract data from a single image using MiniCPM (runs within GPU session)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  try:
156
- # Convert PIL image to proper format if needed
157
  if hasattr(image, 'convert'):
158
  image = image.convert('RGB')
159
 
160
- # Use the correct MiniCPM chat interface
161
  response = model.chat(
162
  image=image,
163
  msgs=[{
@@ -167,301 +257,293 @@ def extract_data_from_image(image, extraction_prompt, model, tokenizer):
167
  tokenizer=tokenizer,
168
  sampling=False,
169
  temperature=0.1,
170
- max_new_tokens=2048
171
  )
172
 
173
- # Try to parse JSON response
174
  try:
175
  parsed_data = json.loads(response)
176
  return {
177
  "status": "success",
178
- "extracted_data": parsed_data,
179
  "raw_response": response,
180
- "model_used": "MiniCPM-V-2_6-GPU"
181
  }
182
  except json.JSONDecodeError:
 
183
  return {
184
- "status": "partial_success",
185
- "extracted_data": response,
 
 
 
 
 
 
 
 
 
 
 
186
  "raw_response": response,
187
- "model_used": "MiniCPM-V-2_6-GPU",
188
- "note": "Response was not valid JSON"
189
  }
190
-
191
  except Exception as e:
192
  return {
193
- "status": "error",
194
  "error": str(e),
195
- "extracted_data": None
 
196
  }
197
 
198
- def safe_merge_field(combined_data, field, value, page_num, extracted_pages):
199
- """Safely merge field data with type checking"""
200
- try:
201
- if field in combined_data and value:
202
- # Handle nested dictionaries (like insurance)
203
- if isinstance(value, dict) and isinstance(combined_data[field], dict):
204
- for sub_field, sub_value in value.items():
205
- if sub_field in combined_data[field] and sub_value and not combined_data[field][sub_field]:
206
- combined_data[field][sub_field] = sub_value
207
- if page_num not in extracted_pages:
208
- extracted_pages.append(page_num)
209
- # Handle simple fields
210
- elif not isinstance(value, (dict, list)) and not combined_data[field]:
211
- combined_data[field] = value
212
- if page_num not in extracted_pages:
213
- extracted_pages.append(page_num)
214
- except Exception as e:
215
- print(f"Warning: Error merging field {field}: {e}")
216
-
217
- def safe_merge_confidence(combined_confidence, field, score):
218
- """Safely merge confidence scores with type checking"""
219
- try:
220
- # Handle nested confidence scores (like primary_insurance)
221
- if isinstance(score, dict):
222
- if field not in combined_confidence:
223
- combined_confidence[field] = {}
224
- for sub_field, sub_score in score.items():
225
- if (sub_field not in combined_confidence[field] and
226
- isinstance(sub_score, (int, float)) and sub_score > 0):
227
- combined_confidence[field][sub_field] = sub_score
228
- # Handle simple confidence scores
229
- elif isinstance(score, (int, float)) and score > 0:
230
- if field not in combined_confidence:
231
- combined_confidence[field] = score
232
- except Exception as e:
233
- print(f"Warning: Error merging confidence for {field}: {e}")
234
-
235
- def combine_page_data(pages_data):
236
- """Combine extracted data from multiple pages into final medical record - FIXED VERSION"""
237
- combined_data = {
238
- "date_of_receipt": "",
239
- "patient_first_name": "",
240
- "patient_last_name": "",
241
- "patient_dob": "",
242
- "patient_gender": "",
243
- "patient_primary_phone_number": "",
244
- "patient_secondary_phone_number": "",
245
- "patient_email": "",
246
- "patient_address": "",
247
- "patient_zip_code": "",
248
- "referral_source": "",
249
- "referral_source_phone_no": "",
250
- "referral_source_fax_no": "",
251
- "referral_source_email": "",
252
- "primary_insurance": {
253
- "payer_name": "",
254
- "member_id": "",
255
- "group_id": ""
256
- },
257
- "secondary_insurance": {
258
- "payer_name": None,
259
- "member_id": None,
260
- "group_id": None
261
- },
262
- "tertiary_insurance": {
263
- "payer_name": None,
264
- "member_id": None,
265
- "group_id": None
266
- },
267
- "priority": "",
268
- "reason_for_referral": "",
269
- "diagnosis_informations": [],
270
- "refine_reason": "",
271
- "extracted_page_numbers": []
272
- }
273
-
274
- combined_confidence = {}
275
-
276
- # Combine data from all pages
277
- for page_num, page_data in enumerate(pages_data, 1):
278
- try:
279
- if page_data.get("page_data", {}).get("status") == "success":
280
- extracted = page_data["page_data"].get("extracted_data", {})
281
-
282
- # If we got JSON data, merge it
283
- if isinstance(extracted, dict) and "data" in extracted:
284
- page_info = extracted["data"]
285
-
286
- # Safely merge each field
287
- for field, value in page_info.items():
288
- safe_merge_field(combined_data, field, value, page_num, combined_data["extracted_page_numbers"])
289
-
290
- # Safely merge confidence scores
291
- if "confidence_scores" in extracted:
292
- for field, score in extracted["confidence_scores"].items():
293
- safe_merge_confidence(combined_confidence, field, score)
294
- except Exception as e:
295
- print(f"Warning: Error processing page {page_num}: {e}")
296
- continue
297
-
298
- return {
299
- "data": combined_data,
300
- "confidence_scores": combined_confidence,
301
- "fields_needing_review": [],
302
- "metadata": {
303
- "extraction_timestamp": datetime.now().isoformat(),
304
- "model_used": "MiniCPM-V-2_6-GPU",
305
- "confidence_threshold": 0.9,
306
- "requires_human_review": False,
307
- "total_pages_processed": len(pages_data)
308
- }
309
- }
310
-
311
- @spaces.GPU(duration=600) # 10 minutes for large documents
312
- def extract_efax_from_pdf(pdf_file, custom_prompt=None):
313
- """Main function to process multi-page PDF eFax - ALL GPU processing happens here"""
314
  try:
315
  if pdf_file is None:
316
- return {
317
- "status": "error",
318
- "error": "No PDF file provided",
319
- "total_pages": 0,
320
- "pages_data": []
321
- }
322
 
323
- # Step 1: Convert PDF to images (CPU operation)
324
  print("Converting PDF to images...")
325
  images = pdf_to_images(pdf_file)
326
 
327
  if not images:
328
- return {
329
- "status": "error",
330
- "error": "Could not convert PDF to images",
331
- "total_pages": 0,
332
- "pages_data": []
333
- }
334
 
335
- print(f"Converted {len(images)} pages. Starting GPU processing...")
336
 
337
- # Step 2: Load model on GPU
338
  model, tokenizer = load_model()
 
339
 
340
- # Step 3: Use custom prompt or default
341
- extraction_prompt = custom_prompt if custom_prompt else get_medical_extraction_prompt()
 
342
 
343
- # Step 4: Process all pages within single GPU session
344
- pages_data = []
345
  for i, image in enumerate(images):
346
- print(f"Processing page {i+1}/{len(images)} on GPU...")
347
- page_result = extract_data_from_image(image, extraction_prompt, model, tokenizer)
348
- pages_data.append({
 
 
 
 
 
349
  "page_number": i + 1,
350
- "page_data": page_result
 
351
  })
352
 
353
- print("GPU processing complete. Combining results...")
354
-
355
- # Step 5: Combine data from all pages (with error handling)
356
- combined_result = combine_page_data(pages_data)
357
-
358
- # Final result
359
- result = {
360
- "status": "success",
361
  "total_pages": len(images),
362
- "pages_data": pages_data,
363
- "combined_extraction": combined_result,
364
- "model_used": "MiniCPM-V-2_6-ZeroGPU",
365
- "hardware": "ZeroGPU",
366
- "processing_time": "Within 10-minute GPU session"
 
 
 
 
 
 
 
 
 
 
 
 
367
  }
368
 
369
- return result
370
-
371
  except Exception as e:
372
- print(f"Error in extract_efax_from_pdf: {e}")
373
  return {
374
  "status": "error",
375
  "error": str(e),
376
  "total_pages": 0,
377
- "pages_data": []
378
  }
379
 
380
- # Create Gradio Interface
381
  def create_gradio_interface():
382
- with gr.Blocks(title="eFax PDF Data Extractor - Fixed", theme=gr.themes.Soft()) as demo:
383
- gr.Markdown("# πŸ₯ eFax Medical Data Extraction API")
384
- gr.Markdown("πŸš€ **Fixed Version** - Single 10-minute GPU session with proper error handling")
385
 
386
- with gr.Tab("πŸ“„ PDF Upload & Extraction"):
387
  with gr.Row():
388
  with gr.Column():
389
  pdf_input = gr.File(
390
  file_types=[".pdf"],
391
- label="Upload eFax PDF (up to 20 pages)",
392
  file_count="single"
393
  )
394
 
395
- with gr.Accordion("πŸ”§ Advanced Options", open=False):
396
  prompt_input = gr.Textbox(
397
  value="",
398
- label="Custom Extraction Prompt (leave empty for default medical extraction)",
399
- lines=5,
400
- placeholder="Leave empty to use optimized medical data extraction prompt..."
401
  )
402
 
403
- extract_btn = gr.Button("πŸš€ Extract Medical Data (Fixed)", variant="primary", size="lg")
404
 
405
  gr.Markdown("""
406
- ### βœ… Bug Fixes Applied
407
- - **Fixed**: Dict/int comparison error
408
- - **Added**: Safe type checking for all operations
409
- - **Improved**: Error handling and logging
410
- - **Single GPU Session**: No more timeouts
 
 
 
411
  """)
412
 
413
  with gr.Column():
414
  status_output = gr.Textbox(label="πŸ“Š Processing Status", interactive=False)
415
- output = gr.JSON(label="πŸ“‹ Extracted Medical Data", show_label=True)
416
 
417
  with gr.Tab("πŸ”Œ API Usage"):
418
  gr.Markdown("""
419
- ## Fixed API (No More Errors)
420
 
421
  ### Python Usage
422
  ```
423
  import requests
424
  import base64
425
 
426
- with open("large_medical_fax.pdf", "rb") as f:
427
  pdf_b64 = base64.b64encode(f.read()).decode()
428
 
429
  response = requests.post(
430
  "https://your-username-extracting-efax.hf.space/api/predict",
431
  json={
432
  "data": [
433
- {"name": "medical_fax.pdf", "data": f"application/pdf;base64,{pdf_b64}"},
434
- "" # Empty for default prompt
435
  ]
436
  }
437
  )
438
 
439
- # Should work without dict/int comparison errors
440
  result = response.json()
441
- if result["data"]["status"] == "success":
442
- medical_data = result["data"]["combined_extraction"]
443
- print("Patient:", medical_data["data"]["patient_first_name"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  ```
445
  """)
446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
  def process_with_status(pdf_file, custom_prompt):
448
  if pdf_file is None:
449
- return "❌ No PDF file uploaded", {"error": "Please upload a PDF file"}
450
 
451
  yield "πŸ“„ Converting PDF to images...", {}
452
 
453
  try:
454
- result = extract_efax_from_pdf(pdf_file, custom_prompt if custom_prompt.strip() else None)
455
 
456
  if result["status"] == "success":
457
- yield f"βœ… Successfully processed {result['total_pages']} pages", result
458
  else:
459
- yield f"❌ Error: {result.get('error', 'Unknown error')}", result
460
 
461
  except Exception as e:
462
- yield f"❌ Processing failed: {str(e)}", {"error": str(e)}
463
 
464
- # Connect the interface
465
  extract_btn.click(
466
  fn=process_with_status,
467
  inputs=[pdf_input, prompt_input],
@@ -471,7 +553,6 @@ def create_gradio_interface():
471
 
472
  return demo
473
 
474
- # Launch the app
475
  if __name__ == "__main__":
476
  demo = create_gradio_interface()
477
  demo.queue(
 
9
  import json
10
  from huggingface_hub import login
11
  from pdf2image import convert_from_bytes
 
12
  from datetime import datetime
13
 
14
+ # Set your HF token
15
  HF_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN")
16
  if HF_TOKEN:
17
  login(token=HF_TOKEN)
 
21
  _tokenizer = None
22
 
23
  def load_model():
24
+ """Load MiniCPM model"""
25
  global _model, _tokenizer
26
 
27
  if _model is not None and _tokenizer is not None:
 
56
  return _model, _tokenizer
57
 
58
  def pdf_to_images(pdf_file):
59
+ """Convert PDF file to list of PIL images"""
60
  try:
61
  if hasattr(pdf_file, 'read'):
62
  pdf_bytes = pdf_file.read()
 
70
  print(f"Error converting PDF to images: {e}")
71
  return []
72
 
73
+ def get_comprehensive_medical_extraction_prompt():
74
+ """Complete medical data extraction prompt with all fields"""
75
+ return """You are a deterministic medical data extraction engine. You will receive a single page from a medical document. Your task is to extract ALL visible information from this page and return it in the exact JSON format below.
76
+
77
+ Your response MUST follow this exact JSON format:
78
 
79
  {
80
+ "page_analysis": {
81
+ "page_contains_text": true,
82
+ "page_type": "cover_page|patient_demographics|insurance|medical_history|referral_info|other",
83
+ "overall_page_confidence": 0.0,
84
+ "all_visible_text": "Complete text transcription of everything visible on this page"
85
+ },
86
+ "extracted_data": {
87
  "date_of_receipt": "",
88
  "patient_first_name": "",
89
  "patient_last_name": "",
 
121
  "description": ""
122
  }
123
  ],
124
+ "refine_reason": "",
125
+ "additional_medical_info": "",
126
+ "provider_names": [],
127
+ "appointment_dates": [],
128
+ "medication_info": [],
129
+ "other_important_details": ""
130
  },
131
  "confidence_scores": {
132
  "date_of_receipt": 0.0,
 
148
  "member_id": 0.0,
149
  "group_id": 0.0
150
  },
151
+ "secondary_insurance": {
152
+ "payer_name": 0.0,
153
+ "member_id": 0.0,
154
+ "group_id": 0.0
155
+ },
156
+ "tertiary_insurance": {
157
+ "payer_name": 0.0,
158
+ "member_id": 0.0,
159
+ "group_id": 0.0
160
+ },
161
  "priority": 0.0,
162
+ "reason_for_referral": 0.0,
163
+ "diagnosis_informations": 0.0,
164
+ "refine_reason": 0.0
165
+ },
166
+ "fields_found_on_this_page": [],
167
+ "metadata": {
168
+ "extraction_timestamp": "",
169
+ "model_used": "MiniCPM-V-2_6-GPU",
170
+ "page_processing_notes": ""
171
  }
172
  }
173
 
174
+ --------------------------------
175
+ STRICT FIELD FORMATTING RULES:
176
+ --------------------------------
177
+
178
+ β€’ Dates: Format as MM/DD/YYYY only
179
+ β€’ Phone numbers: Use digits and hyphens only (e.g., 406-596-1901), no extensions or parentheses
180
+ β€’ Gender: "Male", "Female", or "Other" only
181
+ β€’ Email: Must contain @ and valid domain, otherwise leave empty
182
+ β€’ Zip code: Only extract as last 5 digits of address
183
+
184
+ --------------------------------
185
+ REFERRAL SOURCE RULES:
186
+ --------------------------------
187
+
188
+ β€’ Extract clinic/hospital/facility name ONLY – never the provider's name
189
+ β€’ Use facility's phone/fax/email, not individual provider's contact
190
+ β€’ Prefer header/fax banner for referral source over body text
191
+ β€’ Do not extract receiver clinic names (e.g., Frontier Psychiatry) as referral source
192
+
193
+ --------------------------------
194
+ INSURANCE EXTRACTION FORMAT:
195
+ --------------------------------
196
+
197
+ Each tier must follow this structure:
198
+ "primary_insurance": {
199
+ "payer_name": "string",
200
+ "member_id": "string",
201
+ "group_id": "string"
202
+ },
203
+ "secondary_insurance": { ... },
204
+ "tertiary_insurance": { ... }
205
+
206
+ β€’ Use "member_id" for any ID (Policy, Insurance ID, Subscriber ID, etc.)
207
+ β€’ Use "group_id" ONLY if explicitly labeled as "Group ID", "Group Number", etc.
208
+ β€’ Leave all fields empty if "Self Pay" is indicated
209
+
210
+ --------------------------------
211
+ DIAGNOSIS EXTRACTION RULES:
212
+ --------------------------------
213
 
214
+ β€’ Extract diagnosis codes AND their descriptions
215
+ β€’ If only code is present, set description to "" and confidence ≀ 0.6
216
+ β€’ DO NOT infer description from ICD code
217
+
218
+ --------------------------------
219
+ CONFIDENCE SCORING:
220
+ --------------------------------
221
+
222
+ Assign realistic confidence (0.0–1.0) per field, e.g.:
223
+
224
+ β€’ 0.95–1.0 β†’ Clearly labeled, unambiguous data
225
+ β€’ 0.7–0.94 β†’ Some uncertainty (low quality, odd format)
226
+ β€’ 0.0–0.6 β†’ Missing, ambiguous, or noisy data
227
+ β€’ Use float precision (e.g., 0.87, not just 1.0)
228
+
229
+ Always populate the `confidence_scores` dictionary with the same structure as `extracted_data`.
230
+
231
+ --------------------------------
232
+ CRITICAL INSTRUCTIONS:
233
+ --------------------------------
234
+
235
+ 1. READ EVERYTHING: Transcribe all visible text in "all_visible_text"
236
+ 2. EXTRACT PRECISELY: Only extract what's actually visible on THIS page
237
+ 3. NO ASSUMPTIONS: Don't guess or infer information not present
238
+ 4. FIELD CLASSIFICATION: List which fields were actually found in "fields_found_on_this_page"
239
+ 5. CONFIDENCE: Be realistic - 0.0 if not found, up to 1.0 if completely certain
240
+ 6. FORMAT EXACTLY: Follow date/phone/address formatting rules strictly
241
+ 7. JSON ONLY: Return only valid JSON, no other text
242
+
243
+ This is ONE PAGE of a multi-page document. Extract only what's visible on this specific page."""
244
+
245
+ def extract_single_page(image, extraction_prompt, model, tokenizer):
246
+ """Extract data from a single page with comprehensive medical fields"""
247
  try:
 
248
  if hasattr(image, 'convert'):
249
  image = image.convert('RGB')
250
 
 
251
  response = model.chat(
252
  image=image,
253
  msgs=[{
 
257
  tokenizer=tokenizer,
258
  sampling=False,
259
  temperature=0.1,
260
+ max_new_tokens=4000 # More tokens for comprehensive extraction
261
  )
262
 
263
+ # Try to parse JSON
264
  try:
265
  parsed_data = json.loads(response)
266
  return {
267
  "status": "success",
268
+ "data": parsed_data,
269
  "raw_response": response,
270
+ "model": "MiniCPM-V-2_6-GPU"
271
  }
272
  except json.JSONDecodeError:
273
+ # Return structured error with raw text
274
  return {
275
+ "status": "json_parse_error",
276
+ "data": {
277
+ "page_analysis": {
278
+ "page_contains_text": True,
279
+ "page_type": "unknown",
280
+ "overall_page_confidence": 0.5,
281
+ "all_visible_text": response
282
+ },
283
+ "extracted_data": {},
284
+ "confidence_scores": {},
285
+ "fields_found_on_this_page": [],
286
+ "parsing_error": "Could not parse JSON response"
287
+ },
288
  "raw_response": response,
289
+ "model": "MiniCPM-V-2_6-GPU",
290
+ "error": "JSON parsing failed - returned raw text"
291
  }
 
292
  except Exception as e:
293
  return {
294
+ "status": "extraction_error",
295
  "error": str(e),
296
+ "data": None,
297
+ "raw_response": ""
298
  }
299
 
300
@spaces.GPU(duration=600)  # 10 minutes
def extract_pages_individually(pdf_file, custom_prompt=None):
    """Run the extraction prompt over each page of a PDF, one page at a time.

    Args:
        pdf_file: The uploaded PDF, handed unchanged to ``pdf_to_images``.
        custom_prompt: Optional prompt text. Any falsy value selects the
            prompt from ``get_comprehensive_medical_extraction_prompt``.

    Returns:
        A dict. On success it has ``status == "success"`` plus the page
        count, the number of pages whose extraction reported ``"success"``,
        one entry per page under ``individual_pages``, and descriptive
        ``processing_info`` / ``next_step_instructions`` sections. On
        failure it has ``status == "error"`` and an ``error`` message.
        Exceptions are not propagated; they are reported in the dict.
    """
    try:
        if pdf_file is None:
            return {"status": "error", "error": "No PDF provided"}

        print("Converting PDF to images...")
        page_images = pdf_to_images(pdf_file)
        if not page_images:
            return {"status": "error", "error": "Could not convert PDF"}

        page_count = len(page_images)
        print(f"Processing {page_count} pages individually with comprehensive extraction...")

        # The model is loaded once and reused for every page.
        model, tokenizer = load_model()
        extraction_prompt = custom_prompt or get_comprehensive_medical_extraction_prompt()

        individual_pages = []
        success_count = 0
        for page_number, page_image in enumerate(page_images, start=1):
            print(f"Extracting page {page_number}/{page_count} with full medical fields...")

            outcome = extract_single_page(page_image, extraction_prompt, model, tokenizer)
            if outcome["status"] == "success":
                success_count += 1

            individual_pages.append({
                "page_number": page_number,
                "extraction_result": outcome,
                "timestamp": datetime.now().isoformat(),
            })

        processing_info = {
            "model_used": "MiniCPM-V-2_6-GPU",
            "extraction_timestamp": datetime.now().isoformat(),
            "processing_method": "comprehensive_individual_page_extraction",
            "extraction_prompt_used": "comprehensive_medical_fields",
            "note": "Each page processed with full medical field extraction - combine results with separate AI",
        }
        next_steps = {
            "combination_method": "Use ChatGPT/Claude to combine all pages into final medical record",
            "fields_to_aggregate": [
                "date_of_receipt", "patient_demographics", "insurance_info",
                "referral_source", "diagnosis_codes", "reason_for_referral",
            ],
            "confidence_handling": "Take highest confidence values across pages for each field",
        }

        return {
            "status": "success",
            "total_pages": page_count,
            "successful_extractions": success_count,
            "individual_pages": individual_pages,
            "processing_info": processing_info,
            "next_step_instructions": next_steps,
        }

    except Exception as e:
        # Top-level boundary: report the failure to the caller as data.
        return {
            "status": "error",
            "error": str(e),
            "total_pages": 0,
            "individual_pages": [],
        }
367
 
 
368
  def create_gradio_interface():
369
+ with gr.Blocks(title="Comprehensive Medical Page Extractor", theme=gr.themes.Soft()) as demo:
370
+ gr.Markdown("# πŸ₯ Comprehensive Medical Data Extractor")
371
+ gr.Markdown("πŸ“‹ **Complete Field Extraction** - All medical fields extracted per page, ready for AI combination")
372
 
373
+ with gr.Tab("πŸ“„ Comprehensive Page Extraction"):
374
  with gr.Row():
375
  with gr.Column():
376
  pdf_input = gr.File(
377
  file_types=[".pdf"],
378
+ label="Upload Medical eFax PDF",
379
  file_count="single"
380
  )
381
 
382
+ with gr.Accordion("πŸ”§ Custom Prompt", open=False):
383
  prompt_input = gr.Textbox(
384
  value="",
385
+ label="Custom Extraction Prompt (optional)",
386
+ lines=4,
387
+ placeholder="Leave empty for comprehensive medical extraction with all fields..."
388
  )
389
 
390
+ extract_btn = gr.Button("πŸ₯ Extract All Medical Fields Per Page", variant="primary", size="lg")
391
 
392
  gr.Markdown("""
393
+ ### πŸ“‹ Comprehensive Fields Extracted:
394
+ - βœ… **Patient Demographics** (name, DOB, gender, address, phone, email)
395
+ - βœ… **Insurance Information** (primary/secondary/tertiary with IDs)
396
+ - βœ… **Referral Source** (clinic, phone, fax, email)
397
+ - βœ… **Medical Codes** (diagnosis codes with descriptions)
398
+ - βœ… **Clinical Info** (priority, reason for referral, medical history)
399
+ - βœ… **Confidence Scores** (0.0-1.0 for each field)
400
+ - βœ… **Full Text Transcription** (everything visible on each page)
401
  """)
402
 
403
  with gr.Column():
404
  status_output = gr.Textbox(label="πŸ“Š Processing Status", interactive=False)
405
+ output = gr.JSON(label="πŸ“‹ Comprehensive Page Results", show_label=True)
406
 
407
  with gr.Tab("πŸ”Œ API Usage"):
408
  gr.Markdown("""
409
+ ## Comprehensive Medical Extraction API
410
 
411
  ### Python Usage
412
  ```
413
  import requests
414
  import base64
415
 
416
+ with open("medical_efax.pdf", "rb") as f:
417
  pdf_b64 = base64.b64encode(f.read()).decode()
418
 
419
  response = requests.post(
420
  "https://your-username-extracting-efax.hf.space/api/predict",
421
  json={
422
  "data": [
423
+ {"name": "efax.pdf", "data": f"application/pdf;base64,{pdf_b64}"},
424
+ "" # Custom prompt (optional)
425
  ]
426
  }
427
  )
428
 
 
429
  result = response.json()
430
+
431
+ # Access comprehensive page results
432
+ for page in result["data"]["individual_pages"]:
433
+ page_num = page["page_number"]
434
+ extraction = page["extraction_result"]
435
+
436
+ if extraction["status"] == "success":
437
+ data = extraction["data"]
438
+
439
+ # Page analysis
440
+ print(f"Page {page_num} Type: {data['page_analysis']['page_type']}")
441
+ print(f"Confidence: {data['page_analysis']['overall_page_confidence']}")
442
+
443
+ # Extracted medical fields
444
+ extracted = data['extracted_data']
445
+ print(f"Patient: {extracted['patient_first_name']} {extracted['patient_last_name']}")
446
+ print(f"Insurance: {extracted['primary_insurance']['payer_name']}")
447
+ print(f"Diagnosis: {extracted['diagnosis_informations']}")
448
+
449
+ # Fields found on this page
450
+ print(f"Fields found: {data['fields_found_on_this_page']}")
451
+ ```
452
+
453
+ ### Use ChatGPT/Claude for Final Combination
454
+ ```
455
+ # Prepare all page data for combination
456
+ all_pages_data = []
457
+ for page in result["data"]["individual_pages"]:
458
+ if page["extraction_result"]["status"] == "success":
459
+ all_pages_data.append({
460
+ "page": page["page_number"],
461
+ "extracted_data": page["extraction_result"]["data"]["extracted_data"],
462
+ "confidence_scores": page["extraction_result"]["data"]["confidence_scores"],
463
+ "fields_found": page["extraction_result"]["data"]["fields_found_on_this_page"]
464
+ })
465
+
466
+ # Send to ChatGPT for combination
467
+ combination_prompt = f'''
468
+ Combine these {len(all_pages_data)} medical document pages into a single comprehensive patient record.
469
+
470
+ For each field, choose the value with highest confidence across all pages.
471
+ If multiple pages have the same field, verify consistency.
472
+
473
+ Page Data:
474
+ {json.dumps(all_pages_data, indent=2)}
475
+
476
+ Return the final medical record in the same structure with:
477
+ - Combined data from all pages
478
+ - Highest confidence scores per field
479
+ - List of pages where each field was found
480
+ - Fields needing human review (confidence < 0.9)
481
+ '''
482
  ```
483
  """)
484
 
485
+ with gr.Tab("πŸ“Š Field Mapping"):
486
+ gr.Markdown("""
487
+ ## Complete Medical Fields Extracted Per Page
488
+
489
+ ### Patient Demographics
490
+ - `date_of_receipt` - Document receipt date (MM/DD/YYYY)
491
+ - `patient_first_name` - Patient's first name
492
+ - `patient_last_name` - Patient's last name
493
+ - `patient_dob` - Date of birth (MM/DD/YYYY)
494
+ - `patient_gender` - Male/Female/Other only
495
+ - `patient_primary_phone_number` - Main phone (###-###-####)
496
+ - `patient_secondary_phone_number` - Secondary phone
497
+ - `patient_email` - Email address (must have @ and domain)
498
+ - `patient_address` - Full address
499
+ - `patient_zip_code` - Last 5 digits only
500
+
501
+ ### Referral Information
502
+ - `referral_source` - Clinic/hospital name (NOT provider name)
503
+ - `referral_source_phone_no` - Facility phone
504
+ - `referral_source_fax_no` - Facility fax
505
+ - `referral_source_email` - Facility email
506
+
507
+ ### Insurance (Primary/Secondary/Tertiary)
508
+ - `payer_name` - Insurance company name
509
+ - `member_id` - Any ID (policy, subscriber, member, etc.)
510
+ - `group_id` - Only if explicitly labeled as "Group"
511
+
512
+ ### Medical Information
513
+ - `priority` - "Routine" or "Urgent" only
514
+ - `reason_for_referral` - Why patient was referred
515
+ - `diagnosis_informations` - Array of {code, description}
516
+ - `refine_reason` - Additional refinement details
517
+
518
+ ### Page Analysis
519
+ - `page_type` - Classification of page content
520
+ - `all_visible_text` - Complete text transcription
521
+ - `overall_page_confidence` - Page extraction confidence
522
+ - `fields_found_on_this_page` - List of fields with data
523
+
524
+ ### Confidence Scoring (0.0 - 1.0)
525
+ - `0.95-1.0` β†’ Clearly visible, unambiguous
526
+ - `0.7-0.94` β†’ Some uncertainty, formatting issues
527
+ - `0.0-0.6` β†’ Missing, unclear, or poor quality
528
+ """)
529
+
530
  def process_with_status(pdf_file, custom_prompt):
531
  if pdf_file is None:
532
+ return "❌ No PDF uploaded", {"error": "Upload a PDF file"}
533
 
534
  yield "πŸ“„ Converting PDF to images...", {}
535
 
536
  try:
537
+ result = extract_pages_individually(pdf_file, custom_prompt if custom_prompt.strip() else None)
538
 
539
  if result["status"] == "success":
540
+ yield f"βœ… Extracted comprehensive medical data from {result['successful_extractions']}/{result['total_pages']} pages", result
541
  else:
542
+ yield f"❌ Error: {result.get('error')}", result
543
 
544
  except Exception as e:
545
+ yield f"❌ Failed: {str(e)}", {"error": str(e)}
546
 
 
547
  extract_btn.click(
548
  fn=process_with_status,
549
  inputs=[pdf_input, prompt_input],
 
553
 
554
  return demo
555
 
 
556
  if __name__ == "__main__":
557
  demo = create_gradio_interface()
558
  demo.queue(