Sandy2636 committed on
Commit
ba14e67
·
1 Parent(s): e08f157

Add application file

Browse files
Files changed (3) hide show
  1. .gradio/certificate.pem +31 -0
  2. app.py +390 -198
  3. old_app.py +267 -0
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
app.py CHANGED
@@ -1,70 +1,105 @@
1
  import gradio as gr
2
- import requests
3
  import base64
4
- import os
5
  import json
6
- import mimetypes
 
 
 
7
 
8
  # --- Configuration ---
9
- OPENROUTER_API_KEY = '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'
10
- IMAGE_MODEL = "opengvlab/internvl3-14b:free"
 
11
  OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
12
 
13
- # --- Application State ---
14
- current_batch = []
 
 
 
15
 
16
  # --- Helper Functions ---
17
 
18
- def generate_extraction_prompt(doc_type_provided_by_user):
19
- prompt = f"""You are an advanced OCR and information extraction AI.
20
- The user has provided an image and identified it as a '{doc_type_provided_by_user}'.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  Your task is to meticulously analyze this image and extract all relevant information.
22
 
23
  Output Format Instructions:
24
  Provide your response as a SINGLE, VALID JSON OBJECT. Do not include any explanatory text before or after the JSON.
25
  The JSON object should have the following top-level keys:
26
- - "document_type_provided": (string) The type provided by the user: "{doc_type_provided_by_user}".
27
- - "document_type_detected": (string) Your best guess of the specific document type (e.g., "Passport", "National ID Card", "Driver's License", "Visa Sticker", "Hotel Confirmation Voucher", "Boarding Pass", "Photograph of a person").
28
  - "extracted_fields": (object) A key-value map of all extracted information. Be comprehensive. Examples:
29
- - For passports/IDs: "Surname", "Given Names", "Document Number", "Nationality", "Date of Birth", "Sex", "Place of Birth", "Date of Issue", "Date of Expiry", "Issuing Authority", "Country Code".
30
- - For hotel reservations: "Guest Name", "Hotel Name", "Booking Reference", "Check-in Date", "Check-out Date", "Room Type".
31
- - For photos: "Description" (e.g., "Portrait of a person", "Image contains text: [text if any]").
 
32
  - "mrz_data": (object or null) If a Machine Readable Zone (MRZ) is present:
33
  - "raw_mrz_lines": (array of strings) Each line of the MRZ.
34
- - "parsed_mrz": (object) Key-value pairs of parsed MRZ fields (e.g., "passport_type", "issuing_country", "surname", "given_names", "passport_number", "nationality", "dob", "sex", "expiry_date", "personal_number").
35
  If no MRZ, this field should be null.
36
- - "multilingual_info": (array of objects or null) For any text segments not in English:
37
- - Each object: {{"language_detected": "ISO 639-1 code", "original_text": "...", "english_translation_or_transliteration": "..."}}
38
- If no non-English text, this field can be null or an empty array.
39
  - "full_text_ocr": (string) Concatenation of all text found on the document.
40
 
41
  Extraction Guidelines:
42
- 1. Prioritize accuracy. If unsure about a character or word, indicate uncertainty if possible, or extract the most likely interpretation.
43
- 2. Extract all visible text, including small print, stamps, and handwritten annotations if legible.
44
  3. For dates, try to use ISO 8601 format (YYYY-MM-DD) if possible, but retain original format if conversion is ambiguous.
45
- 4. If the image is a photo of a person without much text, the "extracted_fields" might contain a description, and "full_text_ocr" might be minimal.
46
- 5. If the document is multi-page and only one page is provided, note this if apparent.
47
 
48
  Ensure the entire output strictly adheres to the JSON format.
49
  """
50
- return prompt
51
 
52
- def process_single_image_with_openrouter(image_path, doc_type):
53
  if not OPENROUTER_API_KEY:
54
- return {"error": "OpenRouter API key not set.", "document_type_provided": doc_type}
55
  try:
56
- with open(image_path, "rb") as f:
57
- encoded_image_bytes = f.read()
58
- encoded_image_string = base64.b64encode(encoded_image_bytes).decode("utf-8")
59
- mime_type, _ = mimetypes.guess_type(image_path)
60
- if not mime_type:
61
- ext = os.path.splitext(image_path)[1].lower()
62
- if ext == ".png": mime_type = "image/png"
63
- elif ext in [".jpg", ".jpeg"]: mime_type = "image/jpeg"
64
- elif ext == ".webp": mime_type = "image/webp"
65
- else: mime_type = "image/jpeg"
66
- data_url = f"data:{mime_type};base64,{encoded_image_string}"
67
- prompt_text = generate_extraction_prompt(doc_type)
 
68
  payload = {
69
  "model": IMAGE_MODEL,
70
  "messages": [
@@ -76,192 +111,349 @@ def process_single_image_with_openrouter(image_path, doc_type):
76
  ]
77
  }
78
  ],
79
- "max_tokens": 3000,
80
  "temperature": 0.1,
81
  }
82
  headers = {
83
  "Authorization": f"Bearer {OPENROUTER_API_KEY}",
84
  "Content-Type": "application/json",
85
- "HTTP-Referer": "https://huggingface.co/spaces/YOUR_SPACE_NAME",
86
- "X-Title": "Gradio Document Extractor"
87
  }
88
- print(f"Sending request to OpenRouter for image: {os.path.basename(image_path)}, type: {doc_type}")
89
- response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=120)
90
  response.raise_for_status()
91
  result = response.json()
92
- print(f"Received response from OpenRouter. Status: {response.status_code}")
93
  if "choices" in result and result["choices"]:
94
- content_text = result["choices"][0]["message"]["content"]
95
- clean_content = content_text.strip()
96
- if clean_content.startswith("```json"):
97
- clean_content = clean_content[7:]
98
- if clean_content.endswith("```"):
99
- clean_content = clean_content[:-3]
100
- elif clean_content.startswith("`") and clean_content.endswith("`"):
101
- clean_content = clean_content[1:-1]
102
- try:
103
- parsed_json = json.loads(clean_content)
104
- if "document_type_provided" not in parsed_json:
105
- parsed_json["document_type_provided"] = doc_type
106
- return parsed_json
107
- except json.JSONDecodeError as e:
108
- print(f"JSONDecodeError: {e}. Raw content was:\n{content_text}")
109
- return {
110
- "error": "Failed to parse LLM output as JSON.",
111
- "raw_content_from_llm": content_text,
112
- "document_type_provided": doc_type
113
- }
114
  else:
115
- print(f"No 'choices' in API response: {result}")
116
- return {"error": "No choices in API response.", "details": result, "document_type_provided": doc_type}
117
  except requests.exceptions.Timeout:
118
- print(f"API Request Timeout for {os.path.basename(image_path)}")
119
- return {"error": "API request timed out.", "document_type_provided": doc_type}
120
  except requests.exceptions.RequestException as e:
121
  error_message = f"API Request Error: {str(e)}"
122
- if e.response is not None:
123
  error_message += f" Status: {e.response.status_code}, Response: {e.response.text}"
124
- print(error_message)
125
- return {"error": error_message, "document_type_provided": doc_type}
126
  except Exception as e:
127
- print(f"An unexpected error occurred during processing {os.path.basename(image_path)}: {str(e)}")
128
- return {"error": f"An unexpected error: {str(e)}", "document_type_provided": doc_type}
129
-
130
- def add_document_to_batch_ui(image_filepath, doc_type_selection):
131
- global current_batch
132
- if image_filepath and doc_type_selection:
133
- filename = os.path.basename(image_filepath)
134
- current_batch.append({"path": image_filepath, "type": doc_type_selection, "filename": filename})
135
- batch_display_data = [[item["filename"], item["type"]] for item in current_batch]
136
- return batch_display_data, f"Added '{filename}' as '{doc_type_selection}'."
137
- batch_display_data = [[item["filename"], item["type"]] for item in current_batch]
138
- return batch_display_data, "Failed to add: Image or document type missing."
139
-
140
- def process_batch_ui():
141
- global current_batch
142
- if not OPENROUTER_API_KEY:
143
- return {"error": "OPENROUTER_API_KEY is not set. Please configure it."}, "API Key Missing."
144
- if not current_batch:
145
- return {"message": "Batch is empty. Add documents first."}, "Batch is empty."
146
- all_results = []
147
- status_updates = []
148
- for i, item_to_process in enumerate(current_batch):
149
- status_msg = f"Processing document {i+1}/{len(current_batch)}: {item_to_process['filename']} ({item_to_process['type']})..."
150
- print(status_msg)
151
- extracted_data = process_single_image_with_openrouter(item_to_process["path"], item_to_process["type"])
152
- all_results.append(extracted_data)
153
- if "error" in extracted_data:
154
- status_updates.append(f"Error processing {item_to_process['filename']}: {extracted_data['error']}")
155
- else:
156
- status_updates.append(f"Successfully processed {item_to_process['filename']}.")
157
- grouped_by_person = {}
158
- unidentified_docs = []
159
- for result_item in all_results:
160
- doc_id = None
161
- if isinstance(result_item, dict) and "extracted_fields" in result_item and isinstance(result_item["extracted_fields"], dict):
162
- fields = result_item["extracted_fields"]
163
- passport_no = fields.get("Document Number") or fields.get("Passport Number") or fields.get("passport_number")
164
- name = fields.get("Given Names") or fields.get("Given Name") or fields.get("Name")
165
- surname = fields.get("Surname") or fields.get("Family Name")
166
- dob = fields.get("Date of Birth") or fields.get("DOB")
167
- if passport_no:
168
- doc_id = f"passport_{str(passport_no).replace(' ', '').lower()}"
169
- elif name and surname and dob:
170
- doc_id = f"{str(name).replace(' ', '').lower()}_{str(surname).replace(' ', '').lower()}_{str(dob).replace(' ', '')}"
171
- elif name and surname:
172
- doc_id = f"{str(name).replace(' ', '').lower()}_{str(surname).replace(' ', '').lower()}"
173
- if doc_id:
174
- if doc_id not in grouped_by_person:
175
- grouped_by_person[doc_id] = {"person_identifier": doc_id, "documents": []}
176
- grouped_by_person[doc_id]["documents"].append(result_item)
177
- else:
178
- unidentified_docs.append(result_item)
179
- final_structured_output = {
180
- "summary": f"Processed {len(current_batch)} documents.",
181
- "grouped_by_person": list(grouped_by_person.values()) if grouped_by_person else [],
182
- "unidentified_documents_or_errors": unidentified_docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  }
184
- final_status = "Batch processing complete. " + " | ".join(status_updates)
185
- print(final_status)
186
- return final_structured_output, final_status
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
- def clear_batch_ui():
189
- global current_batch
190
- current_batch = []
191
- return [], "Batch cleared successfully."
 
 
 
 
 
 
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
194
- gr.Markdown("# πŸ“„ Document Information Extractor (OpenGVLab/InternVL3-14B via OpenRouter)")
195
  gr.Markdown(
196
- "**Instructions:**\n"
197
- "1. Upload a document image (e.g., passport front/back, photo, hotel reservation).\n"
198
- "2. Select the correct document type.\n"
199
- "3. Click 'Add Document to Current Batch'. Repeat for all documents of a person or a related set.\n"
200
- "4. Review the batch. Click 'Clear Entire Batch' to start over.\n"
201
- "5. Click 'Process Batch and Extract Information' to send documents to the AI.\n"
202
- "6. View the extracted information in JSON format below."
203
  )
 
204
  if not OPENROUTER_API_KEY:
205
- gr.Markdown(
206
- "<h3 style='color:red;'>⚠️ Warning: `OPENROUTER_API_KEY` environment variable is not detected. "
207
- "API calls will fail. Please set it and restart this application.</h3>"
208
- )
209
  with gr.Row():
210
  with gr.Column(scale=1):
211
- gr.Markdown("### Step 1: Add Document")
212
- image_input = gr.Image(
213
- label="Upload Document Image",
214
- type="filepath",
215
- sources=["upload"],
216
- height=300
217
- )
218
- doc_type_choices = [
219
- 'passport_front', 'passport_back', 'national_id_front', 'national_id_back',
220
- 'drivers_license_front', 'drivers_license_back', 'visa_sticker',
221
- 'photo', 'hotel_reservation', 'boarding_pass', 'utility_bill', 'other_document'
222
- ]
223
- doc_type_input = gr.Dropdown(
224
- label="Select Document Type",
225
- choices=doc_type_choices,
226
- value='passport_front',
227
- filterable=True
228
- )
229
- add_button = gr.Button("βž• Add Document to Current Batch", variant="secondary")
230
- with gr.Column(scale=2):
231
- gr.Markdown("### Step 2: Review Current Batch")
232
- batch_dataframe = gr.Dataframe(
233
- headers=["Filename", "Document Type"],
234
- datatype=["str", "str"],
235
- row_count=1, # Changed: Start with 1 row, should grow dynamically
236
- col_count=2, # Changed: Simpler integer for fixed columns
237
- wrap=True
238
- )
239
- clear_batch_button = gr.Button("πŸ—‘οΈ Clear Entire Batch", variant="stop")
240
- gr.Markdown("### Step 3: Process Batch")
241
- process_button = gr.Button("πŸš€ Process Batch and Extract Information", variant="primary")
242
- status_message_textbox = gr.Textbox(label="Processing Status", interactive=False, lines=2)
243
- gr.Markdown("### Step 4: View Results")
244
- output_json_display = gr.JSON(label="Extracted Information (JSON Format)")
245
- add_button.click(
246
- fn=add_document_to_batch_ui,
247
- inputs=[image_input, doc_type_input],
248
- outputs=[batch_dataframe, status_message_textbox]
249
- ).then(lambda: None, outputs=image_input)
250
- clear_batch_button.click(
251
- fn=clear_batch_ui,
252
- inputs=[],
253
- outputs=[batch_dataframe, status_message_textbox]
254
  )
 
 
 
 
 
 
 
255
  process_button.click(
256
- fn=process_batch_ui,
257
- inputs=[],
258
- outputs=[output_json_display, status_message_textbox]
 
 
 
 
 
259
  )
260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  if __name__ == "__main__":
262
- if not OPENROUTER_API_KEY:
263
- print("ERROR: The OPENROUTER_API_KEY environment variable is not set.")
264
- print("Please set it before running the application, e.g.:")
265
- print(" export OPENROUTER_API_KEY='your_openrouter_key_here'")
266
- print("The application will launch, but API calls will fail.")
267
- demo.launch(share=True) # Added share=True
 
1
  import gradio as gr
 
2
  import base64
3
+ import requests
4
  import json
5
+ import re
6
+ import os
7
+ import uuid
8
+ from datetime import datetime
9
 
10
  # --- Configuration ---
11
# IMPORTANT: Set OPENROUTER_API_KEY as a Hugging Face Space Secret (or a local
# environment variable). The key is read from the environment so that no
# credential is ever stored in the repository.
# NOTE(review): any key that was previously committed in this file must be
# considered compromised and should be revoked/rotated.
# An empty value is tolerated here; the call sites check `if not OPENROUTER_API_KEY`.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "").strip()
IMAGE_MODEL = "opengvlab/internvl3-14b:free"  # Using the free tier model as specified
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# --- Global State (managed within Gradio's session if possible, or module-level for simplicity here) ---
# Both containers are rebound at the start of each processing run.
# For a multi-user or more robust app, session state or a proper backend DB would be needed.
processed_files_data = []  # Stores dicts for each file's details and status
person_profiles = {}  # Stores dicts for each identified person and their documents
21
 
22
  # --- Helper Functions ---
23
 
24
def extract_json_from_text(text):
    """
    Extract a JSON object from a string of model output.

    Candidates are tried in this order: the contents of a ```json ... ``` code
    block, the text with one surrounding backtick pair removed, the stripped
    text itself, and finally the substring from the first '{' to the last '}'.

    Returns the decoded dict on success. On any failure returns a dict that
    contains an "error" message (and "original_text" when there was text to
    report), so callers can always test `"error" in result`.
    """
    if not text:
        return {"error": "Empty text provided for JSON extraction."}
    if not isinstance(text, str):
        # The regex and str methods below require a string.
        return {"error": "Non-text content provided for JSON extraction.", "original_text": text}

    # Try to match ```json ... ``` code block
    fenced = re.search(r"```json\s*(\{.*?\})\s*```", text, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidate = fenced.group(1)
    else:
        # If no block, assume the text itself might be JSON or wrapped in single backticks
        candidate = text.strip()
        if candidate.startswith("`") and candidate.endswith("`"):
            candidate = candidate[1:-1]

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError as first_err:
        # Fallback: the model may have added prose around the object, so try
        # the span between the first '{' and the last '}'.
        start = candidate.find("{")
        end = candidate.rfind("}")
        if start == -1 or end <= start:
            return {"error": f"Invalid JSON structure: {str(first_err)}", "original_text": text}
        try:
            parsed = json.loads(candidate[start:end + 1])
        except json.JSONDecodeError as second_err:
            return {
                "error": f"Invalid JSON structure after attempting substring: {str(second_err)}",
                "original_text": text,
            }

    # Valid JSON is not necessarily an object (it may be a list, string, number...).
    # Callers index the result like a dict, so anything else is reported as an error.
    if not isinstance(parsed, dict):
        return {
            "error": f"Parsed JSON is not an object (got {type(parsed).__name__}).",
            "original_text": text,
        }
    return parsed
57
+
58
+
59
def get_ocr_prompt():
    """Return the fixed instruction prompt sent to the vision model with each image.

    The prompt has no interpolated values, so it is a plain string literal
    rather than an f-string.
    """
    return """You are an advanced OCR and information extraction AI.
Your task is to meticulously analyze this image and extract all relevant information.

Output Format Instructions:
Provide your response as a SINGLE, VALID JSON OBJECT. Do not include any explanatory text before or after the JSON.
The JSON object should have the following top-level keys:
- "document_type_detected": (string) Your best guess of the specific document type (e.g., "Passport", "National ID Card", "Driver's License", "Visa Sticker", "Hotel Confirmation Voucher", "Bank Statement", "Photo of a person").
- "extracted_fields": (object) A key-value map of all extracted information. Be comprehensive. Examples:
- For passports/IDs: "Surname", "Given Names", "Full Name", "Document Number", "Nationality", "Date of Birth", "Sex", "Place of Birth", "Date of Issue", "Date of Expiry", "Issuing Authority", "Country Code".
- For hotel reservations: "Guest Name", "Hotel Name", "Booking Reference", "Check-in Date", "Check-out Date".
- For bank statements: "Account Holder Name", "Account Number", "Bank Name", "Statement Period", "Ending Balance".
- For photos: "Description" (e.g., "Portrait of a person", "Group photo at a location"), "People Present" (array of strings if multiple).
- "mrz_data": (object or null) If a Machine Readable Zone (MRZ) is present:
- "raw_mrz_lines": (array of strings) Each line of the MRZ.
- "parsed_mrz": (object) Key-value pairs of parsed MRZ fields.
If no MRZ, this field should be null.
- "full_text_ocr": (string) Concatenation of all text found on the document.

Extraction Guidelines:
1. Prioritize accuracy.
2. Extract all visible text. Include "Full Name" by combining given and surnames if possible.
3. For dates, try to use ISO 8601 format (YYYY-MM-DD) if possible, but retain original format if conversion is ambiguous.

Ensure the entire output strictly adheres to the JSON format.
"""
 
85
 
86
+ def call_openrouter_ocr(image_filepath):
87
  if not OPENROUTER_API_KEY:
88
+ return {"error": "OpenRouter API Key not configured."}
89
  try:
90
+ with open(image_filepath, "rb") as f:
91
+ encoded_image = base64.b64encode(f.read()).decode("utf-8")
92
+
93
+ # Basic MIME type guessing, default to jpeg
94
+ mime_type = "image/jpeg"
95
+ if image_filepath.lower().endswith(".png"):
96
+ mime_type = "image/png"
97
+ elif image_filepath.lower().endswith(".webp"):
98
+ mime_type = "image/webp"
99
+
100
+ data_url = f"data:{mime_type};base64,{encoded_image}"
101
+ prompt_text = get_ocr_prompt()
102
+
103
  payload = {
104
  "model": IMAGE_MODEL,
105
  "messages": [
 
111
  ]
112
  }
113
  ],
114
+ "max_tokens": 3500, # Increased for detailed JSON
115
  "temperature": 0.1,
116
  }
117
  headers = {
118
  "Authorization": f"Bearer {OPENROUTER_API_KEY}",
119
  "Content-Type": "application/json",
120
+ "HTTP-Referer": "https://huggingface.co/spaces/DoClassifier", # Optional: Update with your Space URL
121
+ "X-Title": "DoClassifier Processor" # Optional
122
  }
123
+
124
+ response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=180) # 3 min timeout
125
  response.raise_for_status()
126
  result = response.json()
127
+
128
  if "choices" in result and result["choices"]:
129
+ raw_content = result["choices"][0]["message"]["content"]
130
+ return extract_json_from_text(raw_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  else:
132
+ return {"error": "No 'choices' in API response from OpenRouter.", "details": result}
133
+
134
  except requests.exceptions.Timeout:
135
+ return {"error": "API request timed out."}
 
136
  except requests.exceptions.RequestException as e:
137
  error_message = f"API Request Error: {str(e)}"
138
+ if hasattr(e, 'response') and e.response is not None:
139
  error_message += f" Status: {e.response.status_code}, Response: {e.response.text}"
140
+ return {"error": error_message}
 
141
  except Exception as e:
142
+ return {"error": f"An unexpected error occurred during OCR: {str(e)}"}
143
+
144
def _first_field_value(fields, candidate_keys):
    """Return, as a string, the first non-empty value stored under one of candidate_keys (case-insensitive), else None."""
    by_lower_key = {}
    for field_key, value in fields.items():
        # Keep the first occurrence when two keys differ only by case/whitespace.
        by_lower_key.setdefault(str(field_key).strip().lower(), value)
    for key in candidate_keys:
        value = by_lower_key.get(key)
        if value:
            return str(value)
    return None


def extract_entities_from_ocr(ocr_json):
    """
    Pull the identifying entities out of one OCR result.

    Args:
        ocr_json: The decoded OCR response. Expected to be a dict with an
            "extracted_fields" dict and optionally "document_type_detected";
            None or any other shape is tolerated.

    Returns:
        A dict with keys "name", "dob", "passport_no" (each a string or None)
        and "doc_type" ("Unknown" when the OCR result does not provide one).
    """
    empty = {"name": None, "dob": None, "passport_no": None, "doc_type": "Unknown"}
    if not isinstance(ocr_json, dict):
        # Covers None as well as non-object JSON; nothing can be read from it.
        return empty

    doc_type = ocr_json.get("document_type_detected", "Unknown")
    fields = ocr_json.get("extracted_fields")
    if not isinstance(fields, dict):
        empty["doc_type"] = doc_type
        return empty

    # Candidate keys are listed in priority order and compared in lower case.
    name = _first_field_value(fields, ("full name", "name", "account holder name", "guest name"))
    if name is None:
        # The OCR prompt also lists separate "Given Names" / "Surname" fields;
        # combine them when no single full-name field was returned.
        given = _first_field_value(fields, ("given names", "given name"))
        surname = _first_field_value(fields, ("surname", "family name"))
        if given and surname:
            name = f"{given} {surname}"

    dob = _first_field_value(fields, ("date of birth", "dob"))

    passport_raw = _first_field_value(fields, ("document number", "passport number"))
    # Normalize so the same number matches regardless of spacing or case.
    passport_no = (passport_raw.replace(" ", "").upper() or None) if passport_raw else None

    return {
        "name": name,
        "dob": dob,
        "passport_no": passport_no,
        "doc_type": doc_type,
    }
189
+
190
def normalize_name(name):
    """
    Reduce a name to a comparison key: alphanumeric characters only, case-folded.

    Returns "" for None or any other falsy input. Non-string input is converted
    with str() first. casefold() is used instead of lower() so that caseless
    matching also works for non-ASCII names (e.g. "Straße" and "STRASSE").
    """
    if not name:
        return ""
    return "".join(ch for ch in str(name) if ch.isalnum()).casefold()
193
+
194
def get_person_id_and_update_profiles(doc_id, entities, current_persons_data):
    """
    Assign a document to an existing person profile or create a new profile.

    Matching order:
      1. Document/passport number already recorded on a profile.
      2. Normalized name + date of birth already recorded on a profile. This
         also runs when the document carries a number that has not been seen,
         so grouping does not depend on the order documents arrive in.
      3. Otherwise a new profile is created (keyed by number, by name + DOB,
         by name alone, or as an unidentified person).

    Args:
        doc_id: Identifier of the document being classified.
        entities: Dict with optional "passport_no", "name" and "dob" values.
        current_persons_data: Dict of person_key -> profile; updated in place.

    Returns:
        The person key the document was assigned to.
    """
    passport_no = entities.get("passport_no")
    name = entities.get("name")
    dob = entities.get("dob")
    norm_name = normalize_name(name) if name else ""

    def _merge_into(profile):
        # Record everything this document contributes, so that later documents
        # can match this profile on any of its identifiers.
        profile.setdefault("doc_ids", set()).add(doc_id)
        if passport_no:
            profile.setdefault("passport_numbers", set()).add(passport_no)
        if norm_name:
            profile.setdefault("names", set()).add(norm_name)
        if dob:
            profile.setdefault("dobs", set()).add(dob)
        if name and not profile.get("canonical_name"):
            profile["canonical_name"] = name
            # The profile had no name so far, so its display name was a placeholder.
            profile["display_name"] = name
        if dob and not profile.get("canonical_dob"):
            profile["canonical_dob"] = dob

    # 1. Match by Passport Number (strongest identifier)
    if passport_no:
        for p_key, p_data in current_persons_data.items():
            if passport_no in p_data.get("passport_numbers", set()):
                _merge_into(p_data)
                return p_key

    # 2. Match by Normalized Name + DOB (if passport not found or not present)
    if norm_name and dob:
        for p_key, p_data in current_persons_data.items():
            if norm_name in p_data.get("names", set()) and dob in p_data.get("dobs", set()):
                _merge_into(p_data)
                return p_key

    # 3. No existing profile matched: create one, keyed by the strongest identifier available.
    if passport_no:
        new_person_key = f"person_{passport_no}"
        canonical_name = name
        display_name = name or f"Person (ID: {passport_no})"
    elif name and dob:
        new_person_key = f"person_{norm_name}_{dob}_{str(uuid.uuid4())[:4]}"
        canonical_name = name
        display_name = name
    elif name:
        # Name alone is a weak identifier, so such documents are never merged.
        new_person_key = f"person_{norm_name}_{str(uuid.uuid4())[:4]}"
        canonical_name = name
        display_name = name
    else:
        # Unclassifiable for now, assign a generic unique person key
        new_person_key = f"unidentified_person_{str(uuid.uuid4())[:6]}"
        canonical_name = "Unknown"
        display_name = f"Unknown Person ({doc_id[:6]})"

    current_persons_data[new_person_key] = {
        "canonical_name": canonical_name,
        "canonical_dob": dob if name or passport_no else None,
        "names": {norm_name} if norm_name else set(),
        "dobs": {dob} if dob and (name or passport_no) else set(),
        "passport_numbers": {passport_no} if passport_no else set(),
        "doc_ids": {doc_id},
        "display_name": display_name,
    }
    return new_person_key
269
+
270
+
271
def format_dataframe_data(current_files_data):
    """
    Build the rows for the per-file status table.

    Column order: "ID", "Filename", "Status", "Detected Type", "Extracted Name",
    "Extracted DOB", "Main ID", "Person Key".

    Args:
        current_files_data: List of per-file dicts with "doc_id", "filename"
            and "status", plus optional "entities" and "assigned_person_key".

    Returns:
        A list of 8-element rows, one per file. Values that are missing or
        None are shown as "N/A".
    """
    df_rows = []
    for f_data in current_files_data:
        entities = f_data.get("entities") or {}
        # `or "N/A"` rather than a .get() default: these keys are normally
        # present with value None, in which case a .get() default is not used.
        df_rows.append([
            f_data["doc_id"][:8],  # Short ID
            f_data["filename"],
            f_data["status"],
            entities.get("doc_type") or "N/A",
            entities.get("name") or "N/A",
            entities.get("dob") or "N/A",
            entities.get("passport_no") or "N/A",
            f_data.get("assigned_person_key") or "N/A",
        ])
    return df_rows
288
+
289
def format_persons_markdown(current_persons_data, current_files_data):
    """
    Render the identified persons and their documents as Markdown.

    Args:
        current_persons_data: Dict of person_key -> profile dict (reads
            "display_name", "canonical_dob", "passport_numbers", "doc_ids").
        current_files_data: List of per-file dicts (reads "doc_id",
            "filename", "entities").

    Returns:
        A Markdown string, or a short placeholder message when there are no
        persons yet. Passport numbers are sorted and documents follow the
        order of current_files_data, so repeated renders of the same data
        produce the same text.
    """
    if not current_persons_data:
        return "No persons identified yet."

    # Index files once instead of scanning the list for every document.
    files_by_id = {}
    upload_order = {}
    for position, f_data in enumerate(current_files_data):
        files_by_id.setdefault(f_data["doc_id"], f_data)
        upload_order.setdefault(f_data["doc_id"], position)
    unknown_rank = len(current_files_data)

    md_parts = ["## Classified Persons & Documents\n"]
    for p_key, p_data in current_persons_data.items():
        display_name = p_data.get("display_name", p_key)
        md_parts.append(f"### Person: {display_name} (Profile Key: {p_key})")
        if p_data.get("canonical_dob"):
            md_parts.append(f"* DOB: {p_data['canonical_dob']}")
        if p_data.get("passport_numbers"):
            passports = ", ".join(sorted(p_data["passport_numbers"]))
            md_parts.append(f"* Passport(s): {passports}")

        md_parts.append("* Documents:")
        doc_ids_for_person = p_data.get("doc_ids") or set()
        if doc_ids_for_person:
            # Known documents in upload order; unknown ids last, sorted by id.
            ordered_ids = sorted(
                doc_ids_for_person,
                key=lambda d: (upload_order.get(d, unknown_rank), d),
            )
            for doc_id in ordered_ids:
                doc_detail = files_by_id.get(doc_id)
                if doc_detail:
                    filename = doc_detail["filename"]
                    # "entities" may be present with value None.
                    doc_type = (doc_detail.get("entities") or {}).get("doc_type") or "Unknown Type"
                    md_parts.append(f" - {filename} (`{doc_type}`)")
                else:
                    md_parts.append(f" - Document ID: {doc_id[:8]} (details not found, unexpected)")
        else:
            md_parts.append(" - No documents currently assigned.")
        md_parts.append("\n---\n")
    return "\n".join(md_parts)
316
+
317
+ # --- Main Gradio Processing Function (Generator) ---
318
def process_uploaded_files(files_list, progress=gr.Progress(track_tqdm=True)):
    """Process uploaded files one by one, streaming UI updates as a generator.

    Each yielded value is a 4-tuple:
    ``(dataframe rows, persons markdown, OCR JSON text, status message)``.
    For every file the stages are OCR, entity extraction, then person
    linking; an update is yielded after each stage. A file whose OCR result
    contains an ``"error"`` key is marked as failed and skipped.

    The module-level ``processed_files_data`` and ``person_profiles`` are
    reset at the start of every call and filled in as processing advances.
    """
    global processed_files_data, person_profiles
    # Every run starts from a clean slate.
    processed_files_data = []
    person_profiles = {}

    if not OPENROUTER_API_KEY:
        key_error_row = [
            "N/A", "ERROR", "OpenRouter API Key not configured.",
            "N/A", "N/A", "N/A", "N/A", "N/A",
        ]
        yield (
            [key_error_row],
            "Error: OpenRouter API Key not configured. Please set it in Space Secrets.",
            "{}",
            "API Key Missing. Processing halted.",
        )
        return

    if not files_list:
        yield ([], "No files uploaded.", "{}", "Upload files to begin.")
        return

    # Register every upload as "Queued" before any processing starts.
    for uploaded in files_list:
        new_doc_id = str(uuid.uuid4())
        temp_path = uploaded.name  # .name is the temp path of the upload
        processed_files_data.append({
            "doc_id": new_doc_id,
            "filename": os.path.basename(temp_path),
            "filepath": temp_path,
            "status": "Queued",
            "ocr_json": None,
            "entities": None,
            "assigned_person_key": None
        })

    yield (
        format_dataframe_data(processed_files_data),
        format_persons_markdown(person_profiles, processed_files_data),
        "{}",
        f"Initialized. Found {len(files_list)} files.",
    )

    tracked_docs = progress.tqdm(processed_files_data, desc="Processing Documents")
    for position, record in enumerate(tracked_docs, start=1):
        doc_id = record["doc_id"]
        doc_name = record["filename"]
        counter = f"({position}/{len(processed_files_data)})"

        # Stage 1: OCR.
        record["status"] = "OCR in Progress..."
        table_rows = format_dataframe_data(processed_files_data)
        # The persons view is unchanged until classification finishes below.
        persons_view = format_persons_markdown(person_profiles, processed_files_data)
        yield (table_rows, persons_view, "{}", f"{counter} OCR for: {doc_name}")

        ocr_payload = call_openrouter_ocr(record["filepath"])
        record["ocr_json"] = ocr_payload  # keep the full JSON for later display

        if "error" in ocr_payload:
            # Truncate long error messages so the status column stays readable.
            record["status"] = f"OCR Error: {ocr_payload['error'][:50]}..."
            table_rows = format_dataframe_data(processed_files_data)
            yield (
                table_rows,
                persons_view,
                json.dumps(ocr_payload, indent=2),
                f"{counter} OCR Error on {doc_name}",
            )
            continue  # move on to the next file

        record["status"] = "OCR Done. Extracting Entities..."
        table_rows = format_dataframe_data(processed_files_data)
        yield (
            table_rows,
            persons_view,
            json.dumps(ocr_payload, indent=2),
            f"{counter} OCR Done for {doc_name}",
        )

        # Stage 2: entity extraction.
        found_entities = extract_entities_from_ocr(ocr_payload)
        record["entities"] = found_entities
        record["status"] = "Entities Extracted. Classifying..."
        table_rows = format_dataframe_data(processed_files_data)
        yield (
            table_rows,
            persons_view,
            json.dumps(ocr_payload, indent=2),
            f"{counter} Entities for {doc_name}",
        )

        # Stage 3: link the document to a person profile.
        owner_key = get_person_id_and_update_profiles(doc_id, found_entities, person_profiles)
        record["assigned_person_key"] = owner_key
        record["status"] = "Classified"

        table_rows = format_dataframe_data(processed_files_data)
        persons_view = format_persons_markdown(person_profiles, processed_files_data)
        yield (
            table_rows,
            persons_view,
            json.dumps(ocr_payload, indent=2),
            f"{counter} Classified {doc_name} -> {owner_key}",
        )

    yield (
        format_dataframe_data(processed_files_data),
        format_persons_markdown(person_profiles, processed_files_data),
        "{}",
        f"All {len(processed_files_data)} documents processed.",
    )
395
+
396
+
397
# --- Gradio UI Layout ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # The heading emoji had been stored as mis-decoded bytes; restored here.
    gr.Markdown("# 📄 Intelligent Document Processor & Classifier")
    gr.Markdown(
        "**Upload multiple documents (images of passports, bank statements, hotel reservations, photos, etc.). "
        "The system will perform OCR, attempt to extract key entities, and classify documents by the person they belong to.**\n"
        "Ensure `OPENROUTER_API_KEY` is set as a Secret in your Hugging Face Space."
    )

    # Warn up front: without the key every OCR call will fail.
    if not OPENROUTER_API_KEY:
        gr.Markdown("<h3 style='color:red;'>⚠️ ERROR: `OPENROUTER_API_KEY` is not set in Space Secrets! OCR will fail.</h3>")

    with gr.Row():
        with gr.Column(scale=1):
            files_input = gr.Files(label="Upload Document Images (Bulk)", file_count="multiple", type="filepath")
            process_button = gr.Button("Process Uploaded Documents", variant="primary")
            overall_status_textbox = gr.Textbox(label="Overall Progress", interactive=False, lines=1)

    gr.Markdown("---")
    gr.Markdown("## Document Processing Details")
    dataframe_headers = ["Doc ID (short)", "Filename", "Status", "Detected Type", "Name", "DOB", "Passport No.", "Assigned Person Key"]
    document_status_df = gr.Dataframe(
        headers=dataframe_headers,
        datatype=["str"] * len(dataframe_headers),  # all strings, for display simplicity
        label="Individual Document Status & Extracted Entities",
        row_count=(0, "dynamic"),  # start empty, grow as documents are added
        col_count=(len(dataframe_headers), "fixed"),
        wrap=True
    )

    ocr_json_output = gr.Code(label="Selected Document OCR JSON", language="json", interactive=False)

    gr.Markdown("---")
    person_classification_output_md = gr.Markdown("## Classified Persons & Documents\nNo persons identified yet.")

    # Event handlers
    process_button.click(
        fn=process_uploaded_files,
        inputs=[files_input],
        outputs=[
            document_status_df,
            person_classification_output_md,
            ocr_json_output,  # shows the most recent OCR result while processing
            overall_status_textbox
        ]
    )

    @document_status_df.select(inputs=None, outputs=ocr_json_output, show_progress="hidden")
    def display_selected_ocr(evt: gr.SelectData):
        """Return the stored OCR JSON of the clicked table row as text.

        Reads the module-level ``processed_files_data``; the row index of the
        selection is used as the index into that list. Returns ``"{}"`` when
        nothing is selected and a small JSON message when the row has no OCR
        data or the index is past the end of the list.
        """
        if evt.index is None or evt.index[0] is None:  # evt.index is (row, col)
            return "{}"  # nothing selected or invalid selection

        selected_row_index = evt.index[0]
        if selected_row_index < len(processed_files_data):
            selected_doc_data = processed_files_data[selected_row_index]
            if selected_doc_data and selected_doc_data["ocr_json"]:
                return json.dumps(selected_doc_data["ocr_json"], indent=2)
        return "{ \"message\": \"No OCR data found for selected row or selection out of bounds.\" }"
456
+
457
+
458
if __name__ == "__main__":
    # Enable the request queue first (used for the long-running processing
    # handler), then start the server with debugging and a share link.
    queued_demo = demo.queue()
    queued_demo.launch(debug=True, share=True)
 
 
 
 
 
old_app.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import base64
4
+ import os
5
+ import json
6
+ import mimetypes
7
+
8
# --- Configuration ---
# The OpenRouter key is a secret and must never be committed to the repository.
# It is read from the environment; the value is None when the variable is
# unset, which the `if not OPENROUTER_API_KEY` checks in this file treat as
# "not configured".
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
IMAGE_MODEL = "opengvlab/internvl3-14b:free"
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# --- Application State ---
# Documents queued for processing; each entry is a dict with the keys
# "path", "type" and "filename".
current_batch = []
15
+
16
+ # --- Helper Functions ---
17
+
18
def generate_extraction_prompt(doc_type_provided_by_user):
    """Build the OCR/extraction instruction text sent alongside the image.

    The user-selected document type is interpolated into the prompt twice:
    once in the introduction and once in the description of the
    "document_type_provided" output key. The prompt asks for a single JSON
    object and spells out its expected top-level keys.
    """
    return f"""You are an advanced OCR and information extraction AI.
The user has provided an image and identified it as a '{doc_type_provided_by_user}'.
Your task is to meticulously analyze this image and extract all relevant information.

Output Format Instructions:
Provide your response as a SINGLE, VALID JSON OBJECT. Do not include any explanatory text before or after the JSON.
The JSON object should have the following top-level keys:
- "document_type_provided": (string) The type provided by the user: "{doc_type_provided_by_user}".
- "document_type_detected": (string) Your best guess of the specific document type (e.g., "Passport", "National ID Card", "Driver's License", "Visa Sticker", "Hotel Confirmation Voucher", "Boarding Pass", "Photograph of a person").
- "extracted_fields": (object) A key-value map of all extracted information. Be comprehensive. Examples:
    - For passports/IDs: "Surname", "Given Names", "Document Number", "Nationality", "Date of Birth", "Sex", "Place of Birth", "Date of Issue", "Date of Expiry", "Issuing Authority", "Country Code".
    - For hotel reservations: "Guest Name", "Hotel Name", "Booking Reference", "Check-in Date", "Check-out Date", "Room Type".
    - For photos: "Description" (e.g., "Portrait of a person", "Image contains text: [text if any]").
- "mrz_data": (object or null) If a Machine Readable Zone (MRZ) is present:
    - "raw_mrz_lines": (array of strings) Each line of the MRZ.
    - "parsed_mrz": (object) Key-value pairs of parsed MRZ fields (e.g., "passport_type", "issuing_country", "surname", "given_names", "passport_number", "nationality", "dob", "sex", "expiry_date", "personal_number").
    If no MRZ, this field should be null.
- "multilingual_info": (array of objects or null) For any text segments not in English:
    - Each object: {{"language_detected": "ISO 639-1 code", "original_text": "...", "english_translation_or_transliteration": "..."}}
    If no non-English text, this field can be null or an empty array.
- "full_text_ocr": (string) Concatenation of all text found on the document.

Extraction Guidelines:
1. Prioritize accuracy. If unsure about a character or word, indicate uncertainty if possible, or extract the most likely interpretation.
2. Extract all visible text, including small print, stamps, and handwritten annotations if legible.
3. For dates, try to use ISO 8601 format (YYYY-MM-DD) if possible, but retain original format if conversion is ambiguous.
4. If the image is a photo of a person without much text, the "extracted_fields" might contain a description, and "full_text_ocr" might be minimal.
5. If the document is multi-page and only one page is provided, note this if apparent.

Ensure the entire output strictly adheres to the JSON format.
"""
51
+
52
def _guess_image_mime_type(image_path):
    """Return a MIME type for *image_path*, falling back to JPEG when unknown."""
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type:
        return mime_type
    ext = os.path.splitext(image_path)[1].lower()
    known_types = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".webp": "image/webp",
    }
    return known_types.get(ext, "image/jpeg")


def _strip_code_fences(text):
    """Remove a surrounding Markdown code fence (or single backticks) from *text*.

    Handles "```json ... ```" with the language tag in any letter case, a bare
    "``` ... ```" fence, and text wrapped in single backticks.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Optional language tag directly after the opening fence.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    elif cleaned.startswith("`") and cleaned.endswith("`"):
        cleaned = cleaned[1:-1]
    return cleaned.strip()


def process_single_image_with_openrouter(image_path, doc_type):
    """Send one image to OpenRouter and return the extracted data as a dict.

    Args:
        image_path: Path of the image file to read and upload as a data URL.
        doc_type: Document type chosen by the user; embedded in the prompt
            and echoed back under "document_type_provided".

    Returns:
        The JSON object produced by the model (with "document_type_provided"
        added when the model omitted it), or a dict containing an "error" key
        when the key is missing, the request fails, or the reply cannot be
        used. This function does not raise; every failure is reported through
        the returned dict.
    """
    if not OPENROUTER_API_KEY:
        return {"error": "OpenRouter API key not set.", "document_type_provided": doc_type}
    try:
        with open(image_path, "rb") as f:
            encoded_image_string = base64.b64encode(f.read()).decode("utf-8")
        mime_type = _guess_image_mime_type(image_path)
        data_url = f"data:{mime_type};base64,{encoded_image_string}"
        prompt_text = generate_extraction_prompt(doc_type)
        payload = {
            "model": IMAGE_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt_text},
                        {"type": "image_url", "image_url": {"url": data_url}}
                    ]
                }
            ],
            "max_tokens": 3000,
            "temperature": 0.1,
        }
        headers = {
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://huggingface.co/spaces/Passport_Extractor",
            "X-Title": "Document Classifier"
        }
        print(f"Sending request to OpenRouter for image: {os.path.basename(image_path)}, type: {doc_type}")
        response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
        result = response.json()
        print(f"Received response from OpenRouter. Status: {response.status_code}")

        if not ("choices" in result and result["choices"]):
            print(f"No 'choices' in API response: {result}")
            return {"error": "No choices in API response.", "details": result, "document_type_provided": doc_type}

        content_text = result["choices"][0]["message"]["content"]
        clean_content = _strip_code_fences(content_text)
        try:
            parsed_json = json.loads(clean_content)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}. Raw content was:\n{content_text}")
            return {
                "error": "Failed to parse LLM output as JSON.",
                "raw_content_from_llm": content_text,
                "document_type_provided": doc_type
            }
        if not isinstance(parsed_json, dict):
            # Callers look up keys on the result, so a bare list/string/number
            # is reported as an error instead of being passed through.
            print(f"LLM output is valid JSON but not an object. Raw content was:\n{content_text}")
            return {
                "error": "LLM output was valid JSON but not a JSON object.",
                "raw_content_from_llm": content_text,
                "document_type_provided": doc_type
            }
        if "document_type_provided" not in parsed_json:
            parsed_json["document_type_provided"] = doc_type
        return parsed_json
    except requests.exceptions.Timeout:
        print(f"API Request Timeout for {os.path.basename(image_path)}")
        return {"error": "API request timed out.", "document_type_provided": doc_type}
    except requests.exceptions.RequestException as e:
        error_message = f"API Request Error: {str(e)}"
        if e.response is not None:
            error_message += f" Status: {e.response.status_code}, Response: {e.response.text}"
        print(error_message)
        return {"error": error_message, "document_type_provided": doc_type}
    except Exception as e:
        # Last-resort boundary (file errors, unexpected response shapes):
        # report the failure in the result rather than crashing the batch.
        print(f"An unexpected error occurred during processing {os.path.basename(image_path)}: {str(e)}")
        return {"error": f"An unexpected error: {str(e)}", "document_type_provided": doc_type}
129
+
130
def add_document_to_batch_ui(image_filepath, doc_type_selection):
    """Queue one document in ``current_batch`` and return the refreshed table.

    Returns a ``(rows, message)`` pair: ``rows`` is a list of
    ``[filename, type]`` lists for every queued document, and ``message``
    says whether the document was added. Nothing is added when either
    argument is falsy.
    """
    global current_batch
    if image_filepath and doc_type_selection:
        filename = os.path.basename(image_filepath)
        current_batch.append({"path": image_filepath, "type": doc_type_selection, "filename": filename})
        message = f"Added '{filename}' as '{doc_type_selection}'."
    else:
        message = "Failed to add: Image or document type missing."
    rows = [[entry["filename"], entry["type"]] for entry in current_batch]
    return rows, message
139
+
140
def process_batch_ui():
    """Process every queued document and group the results by person.

    Returns a ``(result, status)`` pair. ``result`` is a dict holding a
    summary, the documents grouped under a derived person identifier, and the
    documents for which no identifier could be derived (including failed
    ones). The identifier is built from the document/passport number when
    present, otherwise from given name + surname (+ date of birth when
    available).
    """
    global current_batch
    if not OPENROUTER_API_KEY:
        return {"error": "OPENROUTER_API_KEY is not set. Please configure it."}, "API Key Missing."
    if not current_batch:
        return {"message": "Batch is empty. Add documents first."}, "Batch is empty."

    def squash(value):
        # Identifier parts are compared with all spaces removed.
        return str(value).replace(' ', '')

    all_results = []
    status_updates = []
    for position, entry in enumerate(current_batch, start=1):
        print(f"Processing document {position}/{len(current_batch)}: {entry['filename']} ({entry['type']})...")
        outcome = process_single_image_with_openrouter(entry["path"], entry["type"])
        all_results.append(outcome)
        if "error" in outcome:
            status_updates.append(f"Error processing {entry['filename']}: {outcome['error']}")
        else:
            status_updates.append(f"Successfully processed {entry['filename']}.")

    grouped_by_person = {}
    unidentified_docs = []
    for outcome in all_results:
        person_id = None
        fields = outcome.get("extracted_fields") if isinstance(outcome, dict) else None
        if isinstance(fields, dict):
            passport_no = fields.get("Document Number") or fields.get("Passport Number") or fields.get("passport_number")
            name = fields.get("Given Names") or fields.get("Given Name") or fields.get("Name")
            surname = fields.get("Surname") or fields.get("Family Name")
            dob = fields.get("Date of Birth") or fields.get("DOB")
            if passport_no:
                person_id = f"passport_{squash(passport_no).lower()}"
            elif name and surname:
                person_id = f"{squash(name).lower()}_{squash(surname).lower()}"
                if dob:
                    person_id += f"_{squash(dob)}"

        if not person_id:
            unidentified_docs.append(outcome)
            continue
        group = grouped_by_person.setdefault(
            person_id, {"person_identifier": person_id, "documents": []}
        )
        group["documents"].append(outcome)

    final_structured_output = {
        "summary": f"Processed {len(current_batch)} documents.",
        "grouped_by_person": list(grouped_by_person.values()),
        "unidentified_documents_or_errors": unidentified_docs
    }
    final_status = "Batch processing complete. " + " | ".join(status_updates)
    print(final_status)
    return final_structured_output, final_status
187
+
188
def clear_batch_ui():
    """Empty the document batch.

    Rebinds the module-level ``current_batch`` to a new empty list and
    returns ``(rows, message)``: an empty row list plus a confirmation
    message.
    """
    global current_batch
    current_batch = list()
    cleared_rows = []
    return cleared_rows, "Batch cleared successfully."
192
+
193
# UI for the batch workflow: add documents one at a time, review the batch,
# then process it. The emoji in the heading and button labels had been stored
# as mis-decoded bytes; they are restored here.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 Document Information Extractor (OpenGVLab/InternVL3-14B via OpenRouter)")
    gr.Markdown(
        "**Instructions:**\n"
        "1. Upload a document image (e.g., passport front/back, photo, hotel reservation).\n"
        "2. Select the correct document type.\n"
        "3. Click 'Add Document to Current Batch'. Repeat for all documents of a person or a related set.\n"
        "4. Review the batch. Click 'Clear Entire Batch' to start over.\n"
        "5. Click 'Process Batch and Extract Information' to send documents to the AI.\n"
        "6. View the extracted information in JSON format below."
    )
    # Warn in the page itself when the key is missing: API calls will fail.
    if not OPENROUTER_API_KEY:
        gr.Markdown(
            "<h3 style='color:red;'>⚠️ Warning: `OPENROUTER_API_KEY` environment variable is not detected. "
            "API calls will fail. Please set it and restart this application.</h3>"
        )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Step 1: Add Document")
            image_input = gr.Image(
                label="Upload Document Image",
                type="filepath",
                sources=["upload"],
                height=300
            )
            doc_type_choices = [
                'passport_front', 'passport_back', 'national_id_front', 'national_id_back',
                'drivers_license_front', 'drivers_license_back', 'visa_sticker',
                'photo', 'hotel_reservation', 'boarding_pass', 'utility_bill', 'other_document'
            ]
            doc_type_input = gr.Dropdown(
                label="Select Document Type",
                choices=doc_type_choices,
                value='passport_front',
                filterable=True
            )
            add_button = gr.Button("➕ Add Document to Current Batch", variant="secondary")
        with gr.Column(scale=2):
            gr.Markdown("### Step 2: Review Current Batch")
            batch_dataframe = gr.Dataframe(
                headers=["Filename", "Document Type"],
                datatype=["str", "str"],
                row_count=1,  # start with a single row
                col_count=2,  # two fixed columns: filename and type
                wrap=True
            )
            clear_batch_button = gr.Button("🗑️ Clear Entire Batch", variant="stop")
    gr.Markdown("### Step 3: Process Batch")
    process_button = gr.Button("🚀 Process Batch and Extract Information", variant="primary")
    status_message_textbox = gr.Textbox(label="Processing Status", interactive=False, lines=2)
    gr.Markdown("### Step 4: View Results")
    output_json_display = gr.JSON(label="Extracted Information (JSON Format)")

    # After a document is added, reset the image input for the next upload.
    add_button.click(
        fn=add_document_to_batch_ui,
        inputs=[image_input, doc_type_input],
        outputs=[batch_dataframe, status_message_textbox]
    ).then(lambda: None, outputs=image_input)
    clear_batch_button.click(
        fn=clear_batch_ui,
        inputs=[],
        outputs=[batch_dataframe, status_message_textbox]
    )
    process_button.click(
        fn=process_batch_ui,
        inputs=[],
        outputs=[output_json_display, status_message_textbox]
    )
260
+
261
if __name__ == "__main__":
    if not OPENROUTER_API_KEY:
        # Explain how to configure the key; the app is still launched below.
        startup_warning = (
            "ERROR: The OPENROUTER_API_KEY environment variable is not set.",
            "Please set it before running the application, e.g.:",
            " export OPENROUTER_API_KEY='your_openrouter_key_here'",
            "The application will launch, but API calls will fail.",
        )
        for warning_line in startup_warning:
            print(warning_line)
    demo.launch(share=True)