Sandy2636 committed on
Commit
3d827ec
·
1 Parent(s): 0a8e31d

Update space

Browse files
Files changed (1) hide show
  1. app.py +329 -72
app.py CHANGED
@@ -1,91 +1,348 @@
1
  import gradio as gr
2
- import base64
3
  import requests
4
- from PIL import Image
5
- import io
6
-
7
import os  # imported here because this version's import list does not include it

# The OpenRouter credential is read from the OPENROUTER_API_KEY environment
# variable so that no secret is committed to source control. The key that was
# previously hard-coded on this line has been published and should be revoked.
API_KEY = os.environ.get("OPENROUTER_API_KEY")

# Vision-language model requested from OpenRouter for every image.
IMAGE_MODEL = "OpenGVLab/InternVL3-14B"
9
-
10
def extract_passport_info(images, document_type):
    """Send each uploaded image to OpenRouter and collect the model's reply.

    Args:
        images: PIL images to analyse. ``None`` is treated as "nothing
            uploaded"; a single image is accepted as a one-element batch.
        document_type: Label chosen by the user (for example
            ``"passport_front"``). It is interpolated into the prompt and
            echoed back in every result.

    Returns:
        A list with one dict per image. Each dict has ``"document_type"`` and
        ``"extracted_info"``; the latter is the model's reply text, or a
        message explaining why nothing was extracted for that image.
    """
    if images is None:
        return []
    if not isinstance(images, (list, tuple)):
        images = [images]
    if not images:
        return []

    # Prompt to extract full passport data (identical for every image).
    prompt = (
        f"Extract all passport information from the uploaded {document_type} image. "
        "Include MRZ (if present), full name, passport number, nationality, gender, "
        "date of birth, date of issue, expiry date, issuing country, and any other text or labels in other languages. "
        "Return the result in a JSON format."
    )
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }

    results = []
    for image in images:
        # Everything for one image lives inside the try so that a failure on
        # one upload is reported for that upload instead of aborting the batch.
        try:
            # JPEG cannot store alpha or palette modes; normalise first.
            if image.mode not in ("RGB", "L"):
                image = image.convert("RGB")

            # Convert image to a base64 data URL.
            buffered = io.BytesIO()
            image.save(buffered, format="JPEG")
            encoded_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
            data_url = f"data:image/jpeg;base64,{encoded_image}"

            # OpenRouter payload
            payload = {
                "model": IMAGE_MODEL,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": data_url}},
                        ],
                    }
                ],
            }

            # A timeout keeps a stalled connection from hanging the UI forever.
            response = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=120,
            )
            result = response.json()
            print("📡 Status:", response.status_code)
            # NOTE(review): this logs the full model reply, which may contain
            # personal data from the document — confirm that is acceptable.
            print("📡 Raw Result:", result)

            choices = result.get("choices") if isinstance(result, dict) else None
            if choices:
                extracted = choices[0]["message"]["content"]
            else:
                extracted = "❌ No data extracted"
        except Exception as e:
            extracted = f"⚠️ Error: {str(e)}"

        results.append({
            "document_type": document_type,
            "extracted_info": extracted,
        })

    return results
72
-
73
-
74
- # Gradio UI
75
# Single-function interface: the uploaded image(s) and the selected document
# type are passed to extract_passport_info, and its return value is rendered
# by the "json" output component.
demo = gr.Interface(
    fn=extract_passport_info,
    inputs=[
        # NOTE(review): the callback iterates over `images`, so it expects a
        # list here; `multiple=True` may not be an accepted gr.Image argument
        # in the installed Gradio version — confirm before relying on it.
        gr.Image(type="pil", label="Upload Passport/Document Images", multiple=True),
        gr.Dropdown(
            choices=["passport_front", "passport_back", "photo", "hotel_reservation"],
            label="Document Type",
            value="passport_front",
        )
    ],
    outputs="json",
    title="Passport & Document Info Extractor",
    description="Upload one or more document images. Extracted information will include MRZ and all available text, structured in JSON format.",
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()
 
 
 
 
 
 
 
1
  import gradio as gr
 
2
  import requests
3
+ import base64
4
+ import os
5
+ import json
6
+ import mimetypes
7
+
8
# --- Configuration ---
# The OpenRouter key is read from the OPENROUTER_API_KEY environment variable
# so that no credential lives in source control. For example, in a terminal:
#     export OPENROUTER_API_KEY='your_key_here'
# When the variable is missing the value is None, which is what the
# `if not OPENROUTER_API_KEY` guards elsewhere in this file check for.
# The key that used to be hard-coded here has been published and should be revoked.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
IMAGE_MODEL = "opengvlab/internvl3-14b:free"
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# --- Application State ---
# Module-level list holding the documents of the current batch.
# Each item: {"path": "image_file_path", "type": "document_type_string", "filename": "display_filename"}
current_batch = []
19
+
20
+ # --- Helper Functions ---
21
+
22
def generate_extraction_prompt(doc_type_provided_by_user):
    """Build the instruction text sent to the model alongside the image.

    The user-supplied document type is embedded twice: once in the opening
    description and once as the expected value of ``document_type_provided``.
    The text asks for a single JSON object with a fixed set of top-level keys.
    """
    return f"""You are an advanced OCR and information extraction AI.
The user has provided an image and identified it as a '{doc_type_provided_by_user}'.
Your task is to meticulously analyze this image and extract all relevant information.

Output Format Instructions:
Provide your response as a SINGLE, VALID JSON OBJECT. Do not include any explanatory text before or after the JSON.
The JSON object should have the following top-level keys:
- "document_type_provided": (string) The type provided by the user: "{doc_type_provided_by_user}".
- "document_type_detected": (string) Your best guess of the specific document type (e.g., "Passport", "National ID Card", "Driver's License", "Visa Sticker", "Hotel Confirmation Voucher", "Boarding Pass", "Photograph of a person").
- "extracted_fields": (object) A key-value map of all extracted information. Be comprehensive. Examples:
- For passports/IDs: "Surname", "Given Names", "Document Number", "Nationality", "Date of Birth", "Sex", "Place of Birth", "Date of Issue", "Date of Expiry", "Issuing Authority", "Country Code".
- For hotel reservations: "Guest Name", "Hotel Name", "Booking Reference", "Check-in Date", "Check-out Date", "Room Type".
- For photos: "Description" (e.g., "Portrait of a person", "Image contains text: [text if any]").
- "mrz_data": (object or null) If a Machine Readable Zone (MRZ) is present:
- "raw_mrz_lines": (array of strings) Each line of the MRZ.
- "parsed_mrz": (object) Key-value pairs of parsed MRZ fields (e.g., "passport_type", "issuing_country", "surname", "given_names", "passport_number", "nationality", "dob", "sex", "expiry_date", "personal_number").
If no MRZ, this field should be null.
- "multilingual_info": (array of objects or null) For any text segments not in English:
- Each object: {{"language_detected": "ISO 639-1 code", "original_text": "...", "english_translation_or_transliteration": "..."}}
If no non-English text, this field can be null or an empty array.
- "full_text_ocr": (string) Concatenation of all text found on the document.

Extraction Guidelines:
1. Prioritize accuracy. If unsure about a character or word, indicate uncertainty if possible, or extract the most likely interpretation.
2. Extract all visible text, including small print, stamps, and handwritten annotations if legible.
3. For dates, try to use ISO 8601 format (YYYY-MM-DD) if possible, but retain original format if conversion is ambiguous.
4. If the image is a photo of a person without much text, the "extracted_fields" might contain a description, and "full_text_ocr" might be minimal.
5. If the document is multi-page and only one page is provided, note this if apparent.

Ensure the entire output strictly adheres to the JSON format.
"""
59
+
60
def process_single_image_with_openrouter(image_path, doc_type):
    """Send one image to OpenRouter and return the model's reply parsed as JSON.

    Args:
        image_path: Path of the image file to read and embed as a data URL.
        doc_type: Document type chosen by the user; used in the prompt and
            echoed back as ``document_type_provided``.

    Returns:
        The JSON object produced by the model (with ``document_type_provided``
        filled in if the model omitted it). On any failure, a dict with an
        ``"error"`` message and ``document_type_provided`` is returned
        instead; this function does not raise for API or parsing problems.
    """
    if not OPENROUTER_API_KEY:
        return {"error": "OpenRouter API key not set.", "document_type_provided": doc_type}

    # Computed once, defensively, so the log lines in the except handlers
    # below cannot raise themselves when image_path is not a string.
    image_name = os.path.basename(str(image_path))

    try:
        with open(image_path, "rb") as f:
            encoded_image_string = base64.b64encode(f.read()).decode("utf-8")

        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type:
            # Fallback: infer from the extension, defaulting to JPEG.
            ext = os.path.splitext(image_path)[1].lower()
            mime_type = {
                ".png": "image/png",
                ".jpg": "image/jpeg",
                ".jpeg": "image/jpeg",
                ".webp": "image/webp",
            }.get(ext, "image/jpeg")

        data_url = f"data:{mime_type};base64,{encoded_image_string}"
        prompt_text = generate_extraction_prompt(doc_type)

        payload = {
            "model": IMAGE_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt_text},
                        {"type": "image_url", "image_url": {"url": data_url}},
                    ],
                }
            ],
            "max_tokens": 3000,  # Room for potentially large JSON replies
            "temperature": 0.1,  # Low temperature for more deterministic output
            # "response_format": {"type": "json_object"} could be added if the
            # model supports it; the prompt is the primary mechanism for now.
        }

        headers = {
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://huggingface.co/spaces/YOUR_SPACE_NAME",  # Optional: Replace with your app's URL
            "X-Title": "Gradio Document Extractor",  # Optional: Replace with your app's name
        }

        print(f"Sending request to OpenRouter for image: {image_name}, type: {doc_type}")
        response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=120)
        response.raise_for_status()  # Raises HTTPError for 4XX/5XX responses
        result = response.json()
        print(f"Received response from OpenRouter. Status: {response.status_code}")

        if not ("choices" in result and result["choices"]):
            print(f"No 'choices' in API response: {result}")
            return {"error": "No choices in API response.", "details": result, "document_type_provided": doc_type}

        content_text = result["choices"][0]["message"]["content"]
        if not isinstance(content_text, str):
            # Without this check a null/structured content would surface as an
            # opaque AttributeError from .strip().
            return {"error": "No text content in API response.", "details": result, "document_type_provided": doc_type}

        try:
            parsed_json = json.loads(_strip_code_fence(content_text))
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}. Raw content was:\n{content_text}")
            return {
                "error": "Failed to parse LLM output as JSON.",
                "raw_content_from_llm": content_text,
                "document_type_provided": doc_type,
            }

        if not isinstance(parsed_json, dict):
            # Callers look up keys on the result, so arrays/scalars are errors.
            return {
                "error": "LLM output was valid JSON but not a JSON object.",
                "raw_content_from_llm": content_text,
                "document_type_provided": doc_type,
            }

        # Ensure document_type_provided is in the root, even if the LLM missed it.
        parsed_json.setdefault("document_type_provided", doc_type)
        return parsed_json

    except requests.exceptions.Timeout:
        print(f"API Request Timeout for {image_name}")
        return {"error": "API request timed out.", "document_type_provided": doc_type}
    except requests.exceptions.RequestException as e:
        error_message = f"API Request Error: {str(e)}"
        if e.response is not None:
            error_message += f" Status: {e.response.status_code}, Response: {e.response.text}"
        print(error_message)
        return {"error": error_message, "document_type_provided": doc_type}
    except Exception as e:
        # Last-resort boundary: report the failure for this image to the caller.
        print(f"An unexpected error occurred during processing {image_name}: {str(e)}")
        return {"error": f"An unexpected error: {str(e)}", "document_type_provided": doc_type}


def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown fence or single backticks.

    Handles a triple-backtick fence with or without a ``json`` language tag,
    and a reply wrapped in single backticks. Other text is only trimmed.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    elif len(cleaned) >= 2 and cleaned.startswith("`") and cleaned.endswith("`"):
        cleaned = cleaned[1:-1]
    return cleaned.strip()
156
+
157
+ # --- Gradio Interface Callbacks ---
158
+
159
def add_document_to_batch_ui(image_filepath, doc_type_selection):
    """Queue an uploaded image under the chosen type and report the batch.

    Returns a ``(rows, status)`` pair: ``rows`` is the whole batch as
    ``[filename, type]`` lists for the dataframe, ``status`` is the message
    for the status box. When either input is missing nothing is queued.
    """
    # NOTE(review): the batch is a module-level list, so it is presumably
    # shared by every session of the app — confirm whether that is intended.
    global current_batch

    if image_filepath and doc_type_selection:
        filename = os.path.basename(image_filepath)
        # The path is whatever Gradio handed over for the upload; only the
        # path is stored here, not the file contents.
        entry = {"path": image_filepath, "type": doc_type_selection, "filename": filename}
        current_batch.append(entry)
        status = f"Added '{filename}' as '{doc_type_selection}'."
    else:
        status = "Failed to add: Image or document type missing."

    rows = []
    for queued in current_batch:
        rows.append([queued["filename"], queued["type"]])
    return rows, status
176
+
177
+
178
def process_batch_ui():
    """Run every queued document through the model and bundle the results.

    Returns a ``(output, status)`` pair. ``output`` holds a summary line, the
    results grouped by a heuristic person identifier, and the results that
    could not be attributed (including errors). The batch is left untouched
    so the user decides when to clear it.
    """
    global current_batch

    if not OPENROUTER_API_KEY:
        return {"error": "OPENROUTER_API_KEY is not set. Please configure it."}, "API Key Missing."
    if not current_batch:
        return {"message": "Batch is empty. Add documents first."}, "Batch is empty."

    all_results = []
    status_updates = []
    for position, queued in enumerate(current_batch, start=1):
        status_msg = f"Processing document {position}/{len(current_batch)}: {queued['filename']} ({queued['type']})..."
        print(status_msg)

        extracted = process_single_image_with_openrouter(queued["path"], queued["type"])
        all_results.append(extracted)
        if "error" in extracted:
            outcome = f"Error processing {queued['filename']}: {extracted['error']}"
        else:
            outcome = f"Successfully processed {queued['filename']}."
        status_updates.append(outcome)

    # Basic heuristic grouping: results sharing an identifier are listed together.
    grouped_by_person = {}
    unidentified_docs = []
    for result_item in all_results:
        person_key = _person_identifier(result_item)
        if not person_key:
            unidentified_docs.append(result_item)
            continue
        bucket = grouped_by_person.setdefault(
            person_key, {"person_identifier": person_key, "documents": []}
        )
        bucket["documents"].append(result_item)

    final_structured_output = {
        "summary": f"Processed {len(current_batch)} documents.",
        "grouped_by_person": list(grouped_by_person.values()),
        "unidentified_documents_or_errors": unidentified_docs,
    }

    final_status = "Batch processing complete. " + " | ".join(status_updates)
    print(final_status)
    return final_structured_output, final_status


def _person_identifier(result_item):
    """Derive a grouping key from one result, or None when none can be built.

    Preference order: document/passport number, then given name + surname +
    date of birth, then given name + surname.
    """
    if not isinstance(result_item, dict):
        return None
    fields = result_item.get("extracted_fields")
    if not isinstance(fields, dict):
        return None

    def squeeze(value):
        # Same normalisation for every component: stringify and drop spaces.
        return str(value).replace(' ', '')

    passport_no = fields.get("Document Number") or fields.get("Passport Number") or fields.get("passport_number")
    name = fields.get("Given Names") or fields.get("Given Name") or fields.get("Name")
    surname = fields.get("Surname") or fields.get("Family Name")
    dob = fields.get("Date of Birth") or fields.get("DOB")

    if passport_no:
        return f"passport_{squeeze(passport_no).lower()}"
    if name and surname:
        base = f"{squeeze(name).lower()}_{squeeze(surname).lower()}"
        return f"{base}_{squeeze(dob)}" if dob else base
    return None
252
+
253
+
254
def clear_batch_ui():
    """Empty the batch and return blank table rows plus a confirmation message."""
    global current_batch
    current_batch = list()  # rebind to a fresh list, as before

    cleared_rows = []
    confirmation = "Batch cleared successfully."
    return cleared_rows, confirmation
259
+
260
+
261
+ # --- Gradio UI Layout ---
262
# Page layout: upload + type selection on the left, batch review on the right,
# then the processing button, a status box and the JSON result viewer.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 Document Information Extractor (OpenGVLab/InternVL3-14B via OpenRouter)")
    gr.Markdown(
        "**Instructions:**\n"
        "1. Upload a document image (e.g., passport front/back, photo, hotel reservation).\n"
        "2. Select the correct document type.\n"
        "3. Click 'Add Document to Current Batch'. Repeat for all documents of a person or a related set.\n"
        "4. Review the batch. Click 'Clear Entire Batch' to start over.\n"
        "5. Click 'Process Batch and Extract Information' to send documents to the AI.\n"
        "6. View the extracted information in JSON format below."
    )

    # Shown only when no key is configured, so the user knows calls will fail.
    if not OPENROUTER_API_KEY:
        gr.Markdown(
            "<h3 style='color:red;'>⚠️ Warning: `OPENROUTER_API_KEY` environment variable is not detected. "
            "API calls will fail. Please set it and restart this application.</h3>"
        )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Step 1: Add Document")
            image_input = gr.Image(
                label="Upload Document Image",
                type="filepath",  # the callback receives a path to the uploaded file
                sources=["upload"],
                height=300
            )
            doc_type_choices = [
                'passport_front', 'passport_back', 'national_id_front', 'national_id_back',
                'drivers_license_front', 'drivers_license_back', 'visa_sticker',
                'photo', 'hotel_reservation', 'boarding_pass', 'utility_bill', 'other_document'
            ]
            doc_type_input = gr.Dropdown(
                label="Select Document Type",
                choices=doc_type_choices,
                value='passport_front',
                filterable=True
            )
            add_button = gr.Button("➕ Add Document to Current Batch", variant="secondary")

        with gr.Column(scale=2):
            gr.Markdown("### Step 2: Review Current Batch")
            batch_dataframe = gr.Dataframe(
                headers=["Filename", "Document Type"],
                datatype=["str", "str"],
                row_count=(0, "dynamic"),  # starts empty and grows with the batch
                col_count=(2, "fixed"),
                wrap=True,
                height=380,
            )
            clear_batch_button = gr.Button("🗑️ Clear Entire Batch", variant="stop")

    gr.Markdown("### Step 3: Process Batch")
    process_button = gr.Button("🚀 Process Batch and Extract Information", variant="primary")

    status_message_textbox = gr.Textbox(label="Processing Status", interactive=False, lines=2)

    gr.Markdown("### Step 4: View Results")
    output_json_display = gr.JSON(label="Extracted Information (JSON Format)")

    # --- Connect UI elements to functions ---
    add_button.click(
        fn=add_document_to_batch_ui,
        inputs=[image_input, doc_type_input],
        outputs=[batch_dataframe, status_message_textbox]
    ).then(lambda: None, outputs=image_input)  # Clear image input after adding

    clear_batch_button.click(
        fn=clear_batch_ui,
        inputs=[],
        outputs=[batch_dataframe, status_message_textbox]
    )

    process_button.click(
        fn=process_batch_ui,
        inputs=[],
        outputs=[output_json_display, status_message_textbox]
    )

if __name__ == "__main__":
    # Warn on the console too; the app still starts so the UI warning is visible.
    if not OPENROUTER_API_KEY:
        print("ERROR: The OPENROUTER_API_KEY environment variable is not set.")
        print("Please set it before running the application, e.g.:")
        print("    export OPENROUTER_API_KEY='your_openrouter_key_here'")
        print("The application will launch, but API calls will fail.")

    demo.launch()