Suvadeep Das commited on
Commit
b3a4de8
·
verified ·
1 Parent(s): 02dd474

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +267 -0
app.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import base64
import io
import os
from huggingface_hub import login
from pdf2image import convert_from_bytes
import tempfile

# Set your HF token (add this to your Space secrets)
# Authenticates with the Hugging Face Hub so the gated MiniCPM-V-2_6
# checkpoint can be downloaded. When no token is configured the login is
# skipped and load_model() falls back to the public checkpoint.
HF_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)
+
17
# Load MiniCPM model
def load_model():
    """Load the MiniCPM vision-language model and its tokenizer.

    Tries the gated MiniCPM-V-2_6 checkpoint first; on any load failure
    (e.g. missing gated-repo access) falls back to the public
    MiniCPM-V-2 checkpoint.

    Returns:
        tuple: ``(model, tokenizer)`` mapped to CPU.
    """
    # FIX: the original decorated this with ``@gr.cache``, but Gradio has no
    # such attribute, so the module crashed with AttributeError at import
    # time. The function is called exactly once at module load (below), so
    # no caching decorator is needed at all.
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            "openbmb/MiniCPM-V-2_6",
            trust_remote_code=True
        )
        model = AutoModel.from_pretrained(
            "openbmb/MiniCPM-V-2_6",
            trust_remote_code=True,
            # NOTE(review): float16 on CPU is slow and some ops are
            # unsupported in fp16 on CPU backends — confirm this dtype is
            # intentional for the free-tier deployment.
            torch_dtype=torch.float16,
            device_map="cpu"  # Use CPU for free tier
        )
        return model, tokenizer
    except Exception as e:
        # Fallback to non-gated version if access issues
        print(f"Error loading gated model: {e}")
        tokenizer = AutoTokenizer.from_pretrained(
            "openbmb/MiniCPM-V-2",
            trust_remote_code=True
        )
        model = AutoModel.from_pretrained(
            "openbmb/MiniCPM-V-2",
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="cpu"
        )
        return model, tokenizer


# Initialize model once at import time (downloads weights on first run).
model, tokenizer = load_model()
49
+
50
def pdf_to_images(pdf_file):
    """Convert PDF file to list of PIL images.

    Accepts either a file-like object (anything with ``.read``) or a
    filesystem path. On any failure the error is logged and an empty
    list is returned so the caller can report a conversion error.
    """
    try:
        # Pull raw bytes out of whatever we were handed.
        if hasattr(pdf_file, 'read'):
            raw = pdf_file.read()
        else:
            with open(pdf_file, 'rb') as handle:
                raw = handle.read()

        # Rasterize every page at 300 DPI for good OCR quality.
        return convert_from_bytes(raw, dpi=300)
    except Exception as e:
        print(f"Error converting PDF to images: {e}")
        return []
66
+
67
def extract_data_from_image(image, extraction_prompt):
    """Extract data from a single page image using MiniCPM.

    Returns a dict with ``status`` plus either ``extracted_data`` (the
    model's raw text response) on success or ``error`` on failure.
    """
    # Single user turn carrying the instruction text and the page image.
    # Built outside the try-block: pure literals cannot raise.
    messages = [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': extraction_prompt},
                {'type': 'image', 'image': image}
            ]
        }
    ]

    try:
        # NOTE(review): ``image=`` and ``msgs=`` both carry the image here;
        # confirm this matches the checkpoint's chat() signature.
        response = model.chat(
            image=image,
            msgs=messages,
            tokenizer=tokenizer,
            sampling=True,
            temperature=0.1
        )
    except Exception as e:
        return {
            "status": "error",
            "error": str(e),
            "extracted_data": None
        }

    return {
        "status": "success",
        "extracted_data": response,
        "model_used": "MiniCPM-V-2_6"
    }
102
+
103
# Default per-page extraction prompt, hoisted out of the function signature
# so the signature stays readable. FIX: the embedded JSON template
# originally instructed the model to report "model_used": "gpt-4o", which
# is wrong for this app — it now names the model actually used.
DEFAULT_EXTRACTION_PROMPT = "You are a deterministic medical data extraction engine. You will receive medical documents in various layouts. Your task is to extract specific fields into a strictly structured JSON format, including realistic confidence scores, with no assumptions or corrections.\n\nYour response MUST follow this exact JSON format:\n\n{\n \"data\": { ... },\n \"confidence_scores\": { ... },\n \"fields_needing_review\": [ ... ],\n \"metadata\": {\n \"extraction_timestamp\": \"<ISO 8601 or UUID>\",\n \"model_used\": \"MiniCPM-V-2_6\",\n \"confidence_threshold\": 0.9,\n \"requires_human_review\": <true|false>\n }\n}\n\n— All extracted fields must appear exactly as found in the document.\n— Confidence scores MUST be realistic floats between 0.0 and 1.0.\n— NEVER default to 0.0 unless data is missing or unreadable.\n— Include all mandatory fields below, even if empty.\n— If any field has confidence < 0.9, add it to `fields_needing_review` and set `requires_human_review` to true.\n\n--------------------------------\nSTRICT FIELD FORMATTING RULES:\n--------------------------------\n\n• Dates: Format as MM/DD/YYYY only\n• Phone numbers: Use digits and hyphens only (e.g., 406-596-1901), no extensions or parentheses\n• Gender: \"Male\", \"Female\", or \"Other\" only\n• Email: Must contain @ and valid domain, otherwise leave empty\n• Zip code: Only extract as last 5 digits of address\n\n--------------------------------\nREFERRAL SOURCE RULES:\n--------------------------------\n\n• Extract clinic/hospital/facility name ONLY – never the provider's name\n• Use facility’s phone/fax/email, not individual provider’s contact\n• Prefer header/fax banner for referral source over body text\n• Do not extract receiver clinic names (e.g., Frontier Psychiatry) as referral source\n\n--------------------------------\nINSURANCE EXTRACTION FORMAT:\n--------------------------------\n\nEach tier must follow this structure:\n\"primary_insurance\": {\n \"payer_name\": \"string\",\n \"member_id\": \"string\",\n \"group_id\": \"string\"\n},\n\"secondary_insurance\": { ... },\n\"tertiary_insurance\": { ... }\n\n• Use \"member_id\" for any ID (Policy, Insurance ID, Subscriber ID, etc.)\n• Use \"group_id\" ONLY if explicitly labeled as \"Group ID\", \"Group Number\", etc.\n• Leave all fields empty if \"Self Pay\" is indicated\n\n--------------------------------\nDIAGNOSIS EXTRACTION RULES:\n--------------------------------\n\n• Extract diagnosis codes AND their descriptions\n• If only code is present, set description to \"\" and confidence ≤ 0.6\n• DO NOT infer description from ICD code\n\n--------------------------------\nMANDATORY FIELDS TO EXTRACT:\n--------------------------------\n\n• date_of_receipt\n• patient_first_name\n• patient_last_name\n• patient_dob\n• patient_gender\n• patient_primary_phone_number\n• patient_secondary_phone_number\n• patient_email\n• patient_address\n• patient_zip_code\n• referral_source\n• referral_source_phone_no\n• referral_source_fax_no\n• referral_source_email\n• primary_insurance\n• secondary_insurance\n• tertiary_insurance\n• priority (\"Routine\" or \"Urgent\" ONLY)\n• reason_for_referral\n• diagnosis_informations (list of { code, description })\n• refine_reason\n• extracted_page_numbers (list of page numbers where data was found)\n\n--------------------------------\nCONFIDENCE SCORING:\n--------------------------------\n\nAssign realistic confidence (0.0–1.0) per field, e.g.:\n\n• 0.95–1.0 → Clearly labeled, unambiguous data\n• 0.7–0.94 → Some uncertainty (low quality, odd format)\n• 0.0–0.6 → Missing, ambiguous, or noisy data\n• Use float precision (e.g., 0.87, not just 1.0)\n\nAlways populate the `confidence_scores` dictionary with the same structure as `data`.\n\nIf any score < 0.9, populate `fields_needing_review` and set `requires_human_review = true`.\n\n--------------------------------\nFINAL REMINDERS:\n--------------------------------\n\n• No assumptions or corrections – only extract what’s visible\n• Follow exact field formatting and nesting\n• Maintain reproducibility and determinism\n• Return full structure even if some fields are empty\n• NEVER skip the confidence_scores section\n\nRespond only with the valid JSON."


def extract_efax_from_pdf(pdf_file, extraction_prompt=DEFAULT_EXTRACTION_PROMPT):
    """Main function to process a multi-page PDF eFax.

    Args:
        pdf_file: Filesystem path or file-like object for the uploaded PDF.
        extraction_prompt: Instruction text applied independently to each page.

    Returns:
        dict: on success — ``status``, ``total_pages``, ``pages_data``
        (one entry per page), ``pages_with_errors``, ``model_used`` and
        ``extraction_prompt``; on failure — ``status`` "error" plus
        ``error``, ``total_pages`` 0 and empty ``pages_data``.
    """
    try:
        if pdf_file is None:
            return {
                "status": "error",
                "error": "No PDF file provided",
                "total_pages": 0,
                "pages_data": []
            }

        # Convert PDF to one image per page (empty list signals failure).
        images = pdf_to_images(pdf_file)
        if not images:
            return {
                "status": "error",
                "error": "Could not convert PDF to images",
                "total_pages": 0,
                "pages_data": []
            }

        # Run extraction on every page independently.
        pages_data = [
            {
                "page_number": index + 1,
                "page_data": extract_data_from_image(image, extraction_prompt)
            }
            for index, image in enumerate(images)
        ]

        # FIX: the original reported unconditional "success" even when every
        # single page failed; surface per-page failures additively so
        # existing callers keep working.
        failed_pages = sum(
            1 for page in pages_data
            if page["page_data"].get("status") != "success"
        )

        return {
            "status": "success",
            "total_pages": len(images),
            "pages_data": pages_data,
            "pages_with_errors": failed_pages,
            "model_used": "MiniCPM-V-2_6",
            "extraction_prompt": extraction_prompt
        }

    except Exception as e:
        return {
            "status": "error",
            "error": str(e),
            "total_pages": 0,
            "pages_data": []
        }
152
+
153
# Create Gradio Interface
def create_gradio_interface():
    """Build the Gradio Blocks UI: an upload/extract tab wired to
    extract_efax_from_pdf, plus two static documentation tabs.

    Returns:
        gr.Blocks: the assembled (not yet launched) demo app.
    """
    with gr.Blocks(title="eFax PDF Data Extractor") as demo:
        gr.Markdown("# eFax PDF Data Extraction API using MiniCPM")
        gr.Markdown("Upload a multi-page eFax PDF to extract structured data from all pages")

        with gr.Tab("PDF Upload & Extraction"):
            with gr.Row():
                with gr.Column():
                    # Left column: inputs (PDF file, per-page prompt, trigger).
                    pdf_input = gr.File(
                        file_types=[".pdf"],
                        label="Upload eFax PDF",
                        file_count="single"
                    )
                    prompt_input = gr.Textbox(
                        value="Extract patient name, email, phone number, medical details, and all relevant information from this eFax page",
                        label="Extraction Prompt (applied to each page)",
                        lines=3
                    )
                    extract_btn = gr.Button("Extract Data from PDF", variant="primary")

                with gr.Column():
                    # Right column: aggregated per-page JSON results.
                    output = gr.JSON(label="Extracted Data (All Pages)")

        # Static documentation tab: how to call this Space as an API.
        with gr.Tab("API Usage"):
            gr.Markdown("""
            ## API Endpoints

            Once deployed, you can use this Space as an API for PDF processing:

            ### Python API Usage
            ```
            import requests
            import base64

            # Convert PDF to base64
            with open("efax.pdf", "rb") as f:
                pdf_b64 = base64.b64encode(f.read()).decode()

            response = requests.post(
                "https://your-username-extracting-efax.hf.space/api/predict",
                json={
                    "data": [
                        {"name": "efax.pdf", "data": f"application/pdf;base64,{pdf_b64}"},
                        "Extract all patient data from this eFax"
                    ]
                }
            )

            result = response.json()
            print("Total pages:", result["data"]["total_pages"])
            for page in result["data"]["pages_data"]:
                print(f"Page {page['page_number']}:", page["page_data"]["extracted_data"])
            ```

            ### cURL Example
            ```
            curl -X POST "https://your-username-extracting-efax.hf.space/api/predict" \\
              -H "Content-Type: application/json" \\
              -d '{
                "data": [
                    {"name": "efax.pdf", "data": "application/pdf;base64,PDF_BASE64_HERE"},
                    "Extract patient information"
                ]
              }'
            ```

            ### Response Format
            ```
            {
                "status": "success",
                "total_pages": 7,
                "pages_data": [
                    {
                        "page_number": 1,
                        "page_data": {
                            "status": "success",
                            "extracted_data": "Patient: John Doe\\nEmail: [email protected]...",
                            "model_used": "MiniCPM-V-2_6"
                        }
                    }
                ]
            }
            ```
            """)

        # Static documentation tab: processing characteristics and compliance notes.
        with gr.Tab("Processing Info"):
            gr.Markdown("""
            ## Processing Details

            - **Supported Format**: PDF files only
            - **Page Limit**: Optimized for 6-7 page eFax documents
            - **Processing**: Each PDF page is converted to high-quality image (300 DPI)
            - **Model**: MiniCPM-V-2_6 for OCR and data extraction
            - **Output**: Structured JSON with page-by-page results

            ## Healthcare Compliance
            - All processing is done in-memory
            - No files are permanently stored
            - Suitable for HIPAA-compliant workflows when used privately
            """)

        # Connect the interface: button click runs the full PDF pipeline.
        extract_btn.click(
            fn=extract_efax_from_pdf,
            inputs=[pdf_input, prompt_input],
            outputs=output
        )

    return demo
263
+
264
# Launch the app: build the UI and serve it on all interfaces at port 7860.
# Runs only when executed as a script, not when this module is imported.
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)