Spaces:
Paused
Paused
Suvadeep Das
commited on
Update app.py
Browse files
app.py
CHANGED
@@ -38,7 +38,7 @@ def load_model():
|
|
38 |
"openbmb/MiniCPM-V-2_6",
|
39 |
trust_remote_code=True,
|
40 |
torch_dtype=torch.float16,
|
41 |
-
device_map="auto"
|
42 |
)
|
43 |
return _model, _tokenizer
|
44 |
except Exception as e:
|
@@ -195,8 +195,45 @@ def extract_data_from_image(image, extraction_prompt, model, tokenizer):
|
|
195 |
"extracted_data": None
|
196 |
}
|
197 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
def combine_page_data(pages_data):
|
199 |
-
"""Combine extracted data from multiple pages into final medical record"""
|
200 |
combined_data = {
|
201 |
"date_of_receipt": "",
|
202 |
"patient_first_name": "",
|
@@ -238,24 +275,25 @@ def combine_page_data(pages_data):
|
|
238 |
|
239 |
# Combine data from all pages
|
240 |
for page_num, page_data in enumerate(pages_data, 1):
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
# If we got JSON data, merge it
|
245 |
-
if isinstance(extracted, dict) and "data" in extracted:
|
246 |
-
page_info = extracted["data"]
|
247 |
-
|
248 |
-
# Merge non-empty fields (first non-empty value wins)
|
249 |
-
for field, value in page_info.items():
|
250 |
-
if field in combined_data and value and not combined_data[field]:
|
251 |
-
combined_data[field] = value
|
252 |
-
combined_data["extracted_page_numbers"].append(page_num)
|
253 |
|
254 |
-
#
|
255 |
-
if "
|
256 |
-
|
257 |
-
|
258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
259 |
|
260 |
return {
|
261 |
"data": combined_data,
|
@@ -282,7 +320,7 @@ def extract_efax_from_pdf(pdf_file, custom_prompt=None):
|
|
282 |
"pages_data": []
|
283 |
}
|
284 |
|
285 |
-
# Step 1: Convert PDF to images (CPU operation
|
286 |
print("Converting PDF to images...")
|
287 |
images = pdf_to_images(pdf_file)
|
288 |
|
@@ -296,7 +334,7 @@ def extract_efax_from_pdf(pdf_file, custom_prompt=None):
|
|
296 |
|
297 |
print(f"Converted {len(images)} pages. Starting GPU processing...")
|
298 |
|
299 |
-
# Step 2: Load model on GPU
|
300 |
model, tokenizer = load_model()
|
301 |
|
302 |
# Step 3: Use custom prompt or default
|
@@ -314,7 +352,7 @@ def extract_efax_from_pdf(pdf_file, custom_prompt=None):
|
|
314 |
|
315 |
print("GPU processing complete. Combining results...")
|
316 |
|
317 |
-
# Step 5: Combine data from all pages
|
318 |
combined_result = combine_page_data(pages_data)
|
319 |
|
320 |
# Final result
|
@@ -331,6 +369,7 @@ def extract_efax_from_pdf(pdf_file, custom_prompt=None):
|
|
331 |
return result
|
332 |
|
333 |
except Exception as e:
|
|
|
334 |
return {
|
335 |
"status": "error",
|
336 |
"error": str(e),
|
@@ -340,9 +379,9 @@ def extract_efax_from_pdf(pdf_file, custom_prompt=None):
|
|
340 |
|
341 |
# Create Gradio Interface
|
342 |
def create_gradio_interface():
|
343 |
-
with gr.Blocks(title="eFax PDF Data Extractor -
|
344 |
gr.Markdown("# 🏥 eFax Medical Data Extraction API")
|
345 |
-
gr.Markdown("π **
|
346 |
|
347 |
with gr.Tab("π PDF Upload & Extraction"):
|
348 |
with gr.Row():
|
@@ -361,13 +400,14 @@ def create_gradio_interface():
|
|
361 |
placeholder="Leave empty to use optimized medical data extraction prompt..."
|
362 |
)
|
363 |
|
364 |
-
extract_btn = gr.Button("π Extract Medical Data (
|
365 |
|
366 |
gr.Markdown("""
|
367 |
-
###
|
368 |
-
- **
|
369 |
-
- **
|
370 |
-
- **
|
|
|
371 |
""")
|
372 |
|
373 |
with gr.Column():
|
@@ -376,7 +416,7 @@ def create_gradio_interface():
|
|
376 |
|
377 |
with gr.Tab("π API Usage"):
|
378 |
gr.Markdown("""
|
379 |
-
##
|
380 |
|
381 |
### Python Usage
|
382 |
```
|
@@ -396,36 +436,14 @@ def create_gradio_interface():
|
|
396 |
}
|
397 |
)
|
398 |
|
399 |
-
#
|
400 |
result = response.json()
|
401 |
-
|
|
|
|
|
402 |
```
|
403 |
""")
|
404 |
|
405 |
-
with gr.Tab("β‘ Performance Info"):
|
406 |
-
gr.Markdown("""
|
407 |
-
## Optimized ZeroGPU Performance
|
408 |
-
|
409 |
-
### Before Optimization (β Had Timeout Issues)
|
410 |
-
- GPU session per page = 13 × 30 seconds = 6.5 minutes
|
411 |
-
- Model loading repeated = wasted time
|
412 |
-
- Timeout around page 11/13
|
413 |
-
|
414 |
-
### After Optimization (✅ No Timeouts)
|
415 |
-
- **Single 10-minute GPU session** for entire document
|
416 |
-
- Model loads once, processes all pages
|
417 |
-
- Handles 15-20+ page documents easily
|
418 |
-
- PDF conversion on CPU (doesn't count toward GPU time)
|
419 |
-
|
420 |
-
### Processing Flow
|
421 |
-
1. **PDF β Images** (CPU, before GPU starts)
|
422 |
-
2. **π GPU Session Starts** (10 minutes allocated)
|
423 |
-
3. **Load Model** (once, on GPU)
|
424 |
-
4. **Process All Pages** (GPU, sequential)
|
425 |
-
5. **GPU Session Ends**
|
426 |
-
6. **Combine Results** (CPU, after GPU)
|
427 |
-
""")
|
428 |
-
|
429 |
def process_with_status(pdf_file, custom_prompt):
|
430 |
if pdf_file is None:
|
431 |
return "β No PDF file uploaded", {"error": "Please upload a PDF file"}
|
@@ -436,7 +454,7 @@ def create_gradio_interface():
|
|
436 |
result = extract_efax_from_pdf(pdf_file, custom_prompt if custom_prompt.strip() else None)
|
437 |
|
438 |
if result["status"] == "success":
|
439 |
-
yield f"β
Successfully processed {result['total_pages']} pages
|
440 |
else:
|
441 |
yield f"β Error: {result.get('error', 'Unknown error')}", result
|
442 |
|
@@ -463,4 +481,4 @@ if __name__ == "__main__":
|
|
463 |
server_name="0.0.0.0",
|
464 |
server_port=7860,
|
465 |
show_error=True
|
466 |
-
)
|
|
|
38 |
"openbmb/MiniCPM-V-2_6",
|
39 |
trust_remote_code=True,
|
40 |
torch_dtype=torch.float16,
|
41 |
+
device_map="auto"
|
42 |
)
|
43 |
return _model, _tokenizer
|
44 |
except Exception as e:
|
|
|
195 |
"extracted_data": None
|
196 |
}
|
197 |
|
198 |
+
def safe_merge_field(combined_data, field, value, page_num, extracted_pages):
    """Merge one extracted field into ``combined_data`` (first non-empty value wins).

    The merge is best-effort: any unexpected error is reported with a warning
    on stdout and the field is skipped, so one malformed value cannot abort
    the processing of a whole document.

    Args:
        combined_data: Accumulated record, mutated in place. Only keys that
            already exist in it are ever written.
        field: Name of the field reported for the current page.
        value: Extracted value. Falsy values are ignored.
        page_num: Number of the page the value came from.
        extracted_pages: List of contributing page numbers, mutated in place;
            ``page_num`` is appended (once) whenever a value is merged.

    Merge rules:
        * dict value into dict slot: each sub-field that already exists in
          the slot and is still empty takes the first non-empty sub-value.
        * scalar value into an empty scalar slot: the value is stored.
        * any other combination (e.g. scalar into a dict/list slot, or a
          dict/list into a scalar slot) is ignored, so the slot keeps its
          original type.
    """
    try:
        if field not in combined_data or not value:
            return

        current = combined_data[field]

        if isinstance(current, dict):
            # Nested record (like insurance): only dicts may be merged into it.
            if isinstance(value, dict):
                for sub_field, sub_value in value.items():
                    if sub_field in current and sub_value and not current[sub_field]:
                        current[sub_field] = sub_value
                        if page_num not in extracted_pages:
                            extracted_pages.append(page_num)
        elif (
            not isinstance(current, list)
            and not isinstance(value, (dict, list))
            and not current
        ):
            # Simple field. The list check matters: an empty list is falsy,
            # and without it a scalar would replace list-typed slots such as
            # the page-number bookkeeping list held inside combined_data.
            combined_data[field] = value
            if page_num not in extracted_pages:
                extracted_pages.append(page_num)
    except Exception as e:
        # Deliberate best-effort boundary: report and carry on.
        print(f"Warning: Error merging field {field}: {e}")
|
216 |
+
|
217 |
+
def safe_merge_confidence(combined_confidence, field, score):
    """Merge one confidence score into ``combined_confidence`` (first value wins).

    The merge is best-effort: any unexpected error is reported with a warning
    on stdout and the score is skipped.

    Args:
        combined_confidence: Accumulated scores, mutated in place. Values are
            either a number or, for nested fields (like primary_insurance),
            a dict mapping sub-field name to a number.
        field: Name of the field the score belongs to.
        score: A positive number, or a dict of sub-field -> positive number.
            Anything else (zero, negatives, strings, None, booleans) is
            ignored.
    """

    def is_valid(candidate):
        # bool is a subclass of int, so it has to be excluded explicitly;
        # otherwise a JSON `true` would be recorded as a confidence value.
        return (
            isinstance(candidate, (int, float))
            and not isinstance(candidate, bool)
            and candidate > 0
        )

    try:
        if isinstance(score, dict):
            # A nested score cannot be merged into an entry that already
            # holds a plain number; keep the earlier value untouched.
            if field in combined_confidence and not isinstance(
                combined_confidence[field], dict
            ):
                return
            for sub_field, sub_score in score.items():
                if not is_valid(sub_score):
                    continue
                # Create the nested entry lazily so a page with no usable
                # sub-scores does not leave an empty placeholder behind.
                nested = combined_confidence.setdefault(field, {})
                if sub_field not in nested:
                    nested[sub_field] = sub_score
        elif is_valid(score) and field not in combined_confidence:
            combined_confidence[field] = score
    except Exception as e:
        # Deliberate best-effort boundary: report and carry on.
        print(f"Warning: Error merging confidence for {field}: {e}")
|
234 |
+
|
235 |
def combine_page_data(pages_data):
|
236 |
+
"""Combine extracted data from multiple pages into final medical record - FIXED VERSION"""
|
237 |
combined_data = {
|
238 |
"date_of_receipt": "",
|
239 |
"patient_first_name": "",
|
|
|
275 |
|
276 |
# Combine data from all pages
|
277 |
for page_num, page_data in enumerate(pages_data, 1):
|
278 |
+
try:
|
279 |
+
if page_data.get("page_data", {}).get("status") == "success":
|
280 |
+
extracted = page_data["page_data"].get("extracted_data", {})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
281 |
|
282 |
+
# If we got JSON data, merge it
|
283 |
+
if isinstance(extracted, dict) and "data" in extracted:
|
284 |
+
page_info = extracted["data"]
|
285 |
+
|
286 |
+
# Safely merge each field
|
287 |
+
for field, value in page_info.items():
|
288 |
+
safe_merge_field(combined_data, field, value, page_num, combined_data["extracted_page_numbers"])
|
289 |
+
|
290 |
+
# Safely merge confidence scores
|
291 |
+
if "confidence_scores" in extracted:
|
292 |
+
for field, score in extracted["confidence_scores"].items():
|
293 |
+
safe_merge_confidence(combined_confidence, field, score)
|
294 |
+
except Exception as e:
|
295 |
+
print(f"Warning: Error processing page {page_num}: {e}")
|
296 |
+
continue
|
297 |
|
298 |
return {
|
299 |
"data": combined_data,
|
|
|
320 |
"pages_data": []
|
321 |
}
|
322 |
|
323 |
+
# Step 1: Convert PDF to images (CPU operation)
|
324 |
print("Converting PDF to images...")
|
325 |
images = pdf_to_images(pdf_file)
|
326 |
|
|
|
334 |
|
335 |
print(f"Converted {len(images)} pages. Starting GPU processing...")
|
336 |
|
337 |
+
# Step 2: Load model on GPU
|
338 |
model, tokenizer = load_model()
|
339 |
|
340 |
# Step 3: Use custom prompt or default
|
|
|
352 |
|
353 |
print("GPU processing complete. Combining results...")
|
354 |
|
355 |
+
# Step 5: Combine data from all pages (with error handling)
|
356 |
combined_result = combine_page_data(pages_data)
|
357 |
|
358 |
# Final result
|
|
|
369 |
return result
|
370 |
|
371 |
except Exception as e:
|
372 |
+
print(f"Error in extract_efax_from_pdf: {e}")
|
373 |
return {
|
374 |
"status": "error",
|
375 |
"error": str(e),
|
|
|
379 |
|
380 |
# Create Gradio Interface
|
381 |
def create_gradio_interface():
|
382 |
+
with gr.Blocks(title="eFax PDF Data Extractor - Fixed", theme=gr.themes.Soft()) as demo:
|
383 |
gr.Markdown("# 🏥 eFax Medical Data Extraction API")
|
384 |
+
gr.Markdown("π **Fixed Version** - Single 10-minute GPU session with proper error handling")
|
385 |
|
386 |
with gr.Tab("π PDF Upload & Extraction"):
|
387 |
with gr.Row():
|
|
|
400 |
placeholder="Leave empty to use optimized medical data extraction prompt..."
|
401 |
)
|
402 |
|
403 |
+
extract_btn = gr.Button("π Extract Medical Data (Fixed)", variant="primary", size="lg")
|
404 |
|
405 |
gr.Markdown("""
|
406 |
+
### ✅ Bug Fixes Applied
|
407 |
+
- **Fixed**: Dict/int comparison error
|
408 |
+
- **Added**: Safe type checking for all operations
|
409 |
+
- **Improved**: Error handling and logging
|
410 |
+
- **Single GPU Session**: No more timeouts
|
411 |
""")
|
412 |
|
413 |
with gr.Column():
|
|
|
416 |
|
417 |
with gr.Tab("π API Usage"):
|
418 |
gr.Markdown("""
|
419 |
+
## Fixed API (No More Errors)
|
420 |
|
421 |
### Python Usage
|
422 |
```
|
|
|
436 |
}
|
437 |
)
|
438 |
|
439 |
+
# Should work without dict/int comparison errors
|
440 |
result = response.json()
|
441 |
+
if result["data"]["status"] == "success":
|
442 |
+
medical_data = result["data"]["combined_extraction"]
|
443 |
+
print("Patient:", medical_data["data"]["patient_first_name"])
|
444 |
```
|
445 |
""")
|
446 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
447 |
def process_with_status(pdf_file, custom_prompt):
|
448 |
if pdf_file is None:
|
449 |
return "β No PDF file uploaded", {"error": "Please upload a PDF file"}
|
|
|
454 |
result = extract_efax_from_pdf(pdf_file, custom_prompt if custom_prompt.strip() else None)
|
455 |
|
456 |
if result["status"] == "success":
|
457 |
+
yield f"✅ Successfully processed {result['total_pages']} pages", result
|
458 |
else:
|
459 |
yield f"β Error: {result.get('error', 'Unknown error')}", result
|
460 |
|
|
|
481 |
server_name="0.0.0.0",
|
482 |
server_port=7860,
|
483 |
show_error=True
|
484 |
+
)
|