Sandy2636 committed
Commit 77f26de · 1 Parent(s): 2d6f97d

Add application file

Files changed (3):
  1. app.py +310 -166
  2. old_app.py +320 -196
  3. requirements.txt +8 -4
app.py CHANGED
@@ -6,20 +6,45 @@ import re
 import os
 import uuid
 from datetime import datetime
+import time # For potential sleeps if needed, or timing
+
+# Attempt to import deepface and handle import error gracefully
+try:
+    from deepface import DeepFace
+    from deepface.commons import functions as deepface_functions
+    DEEPFACE_AVAILABLE = True
+except ImportError:
+    DEEPFACE_AVAILABLE = False
+    print("Warning: deepface library not found. Facial recognition features will be disabled.")
+    # Mock DeepFace object if not available to prevent NameErrors, though functions won't work
+    class DeepFaceMock:
+        def represent(self, *args, **kwargs): return []
+        def verify(self, *args, **kwargs): return {'verified': False, 'distance': float('inf')}
+        def detectFace(self, *args, **kwargs): raise NotImplementedError("DeepFace not installed")
+    DeepFace = DeepFaceMock()
+

 # --- Configuration ---
-# IMPORTANT: Set your OPENROUTER_API_KEY as a Hugging Face Space Secret
-OPENROUTER_API_KEY = "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"
-IMAGE_MODEL = "opengvlab/internvl3-14b:free" # Using the free tier model as specified
+OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
+IMAGE_MODEL = "opengvlab/internvl3-14b:free"
 OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

-# --- Global State (managed within Gradio's session if possible, or module-level for simplicity here) ---
-# This will be reset each time the processing function is called.
-processed_files_data = [] # Stores dicts for each file's details and status
-person_profiles = {} # Stores dicts for each identified person and their documents
+# Facial Recognition Configuration
+FACE_DETECTOR_BACKEND = 'retinaface' # common and effective
+FACE_RECOGNITION_MODEL_NAME = 'VGG-Face' # good balance
+# Threshold for deepface.verify (model-specific, VGG-Face with cosine is often around 0.40 for verification)
+# Lower threshold means stricter match for verify. For similarity search, we might use raw distance.
+# DeepFace.verify uses model-specific thresholds internally. Let's rely on its 'verified' flag.
+FACE_SIMILARITY_THRESHOLD = 0.60 # For cosine distance, lower is more similar. For similarity, higher is better.
+# Deepface verify returns 'distance'. For cosine, lower distance = more similar.
+# Let's use a distance threshold. For VGG-Face with cosine, this might be < 0.4 for a match.
+# We will use deepface.verify which handles this internally.

-# --- Helper Functions ---
+# --- Global State ---
+processed_files_data = []
+person_profiles = {}

+# --- Helper Functions ---
 def extract_json_from_text(text):
     if not text:
         return {"error": "Empty text provided for JSON extraction."}
@@ -42,68 +67,56 @@ def extract_json_from_text(text):
             potential_json_str = json_str[first_brace : last_brace+1]
             return json.loads(potential_json_str)
         else:
-            return {"error": f"Invalid JSON structure: {str(e)}", "original_text": text}
+            return {"error": f"Invalid JSON structure (no outer braces found): {str(e)}", "original_text": text}
     except json.JSONDecodeError as e2:
         return {"error": f"Invalid JSON structure after attempting substring: {str(e2)}", "original_text": text}

 def get_ocr_prompt():
+    # Enhanced prompt
     return f"""You are an advanced OCR and information extraction AI.
 Your task is to meticulously analyze this image and extract all relevant information.

 Output Format Instructions:
 Provide your response as a SINGLE, VALID JSON OBJECT. Do not include any explanatory text before or after the JSON.
 The JSON object should have the following top-level keys:
-- "document_type_detected": (string) Your best guess of the specific document type (e.g., "Passport", "National ID Card", "Driver's License", "Visa Sticker", "Hotel Confirmation Voucher", "Bank Statement", "Photo of a person").
-- "extracted_fields": (object) A key-value map of all extracted information. Be comprehensive. Examples:
-  - For passports/IDs: "Surname", "Given Names", "Full Name", "Document Number", "Nationality", "Date of Birth", "Sex", "Place of Birth", "Date of Issue", "Date of Expiry", "Issuing Authority", "Country Code".
-  - For hotel reservations: "Guest Name", "Hotel Name", "Booking Reference", "Check-in Date", "Check-out Date".
-  - For bank statements: "Account Holder Name", "Account Number", "Bank Name", "Statement Period", "Ending Balance".
-  - For photos: "Description" (e.g., "Portrait of a person", "Group photo at a location"), "People Present" (array of strings if multiple).
-- "mrz_data": (object or null) If a Machine Readable Zone (MRZ) is present:
-  - "raw_mrz_lines": (array of strings) Each line of the MRZ.
-  - "parsed_mrz": (object) Key-value pairs of parsed MRZ fields.
-  If no MRZ, this field should be null.
+- "document_type_detected": (string) Your best guess of the specific document type (e.g., "Passport Front", "Passport Back", "National ID Card", "Photo of a person", "Hotel Reservation", "Bank Statement").
+- "extracted_fields": (object) A key-value map of all extracted information. Be comprehensive.
+  - For ALL document types, if a primary person is the subject, try to include: "Primary Person Name", "Full Name".
+  - List other names found under specific keys like "Guest Name", "Account Holder Name", "Mother's Name", "Spouse's Name".
+  - Extract critical identifiers like "Passport Number", "Document Number", "ID Number", "Account Number", "Reservation Number" FROM ANY PART OF THE DOCUMENT where they appear. Use consistent key names for these if possible.
+  - For passports/IDs: "Surname", "Given Names", "Nationality", "Date of Birth", "Sex", "Place of Birth", "Date of Issue", "Date of Expiry".
+  - For photos: "Description" (e.g., "Portrait of John Doe", "User's profile photo"), "People Present" (array of names if discernible).
+- "mrz_data": (object or null) If a Machine Readable Zone (MRZ) is present.
 - "full_text_ocr": (string) Concatenation of all text found on the document.

 Extraction Guidelines:
-1. Prioritize accuracy.
-2. Extract all visible text. Include "Full Name" by combining given and surnames if possible.
-3. For dates, try to use ISO 8601 format (YYYY-MM-DD) if possible, but retain original format if conversion is ambiguous.
+1. Extract "Passport Number" or "Document Number" even from back sides or less prominent areas.
+2. Identify and list all prominent names. If one person is clearly the main subject, label their name as "Primary Person Name" or "Full Name".
+3. For dates, aim for YYYY-MM-DD.

 Ensure the entire output strictly adheres to the JSON format.
 """

 def call_openrouter_ocr(image_filepath):
+    # (User's existing function - kept mostly as is, ensure YOUR_SPACE is updated if needed)
     if not OPENROUTER_API_KEY:
         return {"error": "OpenRouter API Key not configured."}
     try:
         with open(image_filepath, "rb") as f:
             encoded_image = base64.b64encode(f.read()).decode("utf-8")
         mime_type = "image/jpeg"
-        if image_filepath.lower().endswith(".png"):
-            mime_type = "image/png"
-        elif image_filepath.lower().endswith(".webp"):
-            mime_type = "image/webp"
+        if image_filepath.lower().endswith(".png"): mime_type = "image/png"
+        elif image_filepath.lower().endswith(".webp"): mime_type = "image/webp"
         data_url = f"data:{mime_type};base64,{encoded_image}"
         prompt_text = get_ocr_prompt()
         payload = {
             "model": IMAGE_MODEL,
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": prompt_text},
-                        {"type": "image_url", "image_url": {"url": data_url}}
-                    ]
-                }
-            ],
-            "max_tokens": 3500,
-            "temperature": 0.1,
+            "messages": [{"role": "user", "content": [{"type": "text", "text": prompt_text}, {"type": "image_url", "image_url": {"url": data_url}}]}],
+            "max_tokens": 3500, "temperature": 0.1,
         }
         headers = {
-            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
-            "Content-Type": "application/json",
-            "HTTP-Referer": "https://huggingface.co/spaces/YOUR_SPACE",
+            "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json",
+            "HTTP-Referer": os.environ.get("GRADIO_ROOT_PATH", "http://localhost:7860"), # Better placeholder
             "X-Title": "Gradio Document Processor"
         }
         response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=180)
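The JSON-repair strategy in `extract_json_from_text` (strip markdown fences first, then fall back to the outermost brace pair) is worth seeing end to end. A standalone sketch of the same idea; the function name and sample reply are illustrative, not part of this commit:

```python
import json
import re

def parse_llm_json(text: str):
    """Best-effort parse of model output that may be fenced or wrapped in prose."""
    cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", text.strip())
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to the outermost brace pair, as the diff above does.
        first, last = cleaned.find("{"), cleaned.rfind("}")
        if first != -1 and last > first:
            return json.loads(cleaned[first:last + 1])
        raise

reply = 'Sure, here it is:\n```json\n{"document_type_detected": "Passport"}\n```'
print(parse_llm_json(reply))  # -> {'document_type_detected': 'Passport'}
```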
@@ -114,139 +127,250 @@ def call_openrouter_ocr(image_filepath):
             return extract_json_from_text(raw_content)
         else:
             return {"error": "No 'choices' in API response from OpenRouter.", "details": result}
-    except requests.exceptions.Timeout:
-        return {"error": "API request timed out."}
+    except requests.exceptions.Timeout: return {"error": "API request timed out."}
     except requests.exceptions.RequestException as e:
         error_message = f"API Request Error: {str(e)}"
-        if hasattr(e, 'response') and e.response is not None:
-            error_message += f" Status: {e.response.status_code}, Response: {e.response.text}"
+        if hasattr(e, 'response') and e.response is not None: error_message += f" Status: {e.response.status_code}, Response: {e.response.text}"
         return {"error": error_message}
-    except Exception as e:
-        return {"error": f"An unexpected error occurred during OCR: {str(e)}"}
+    except Exception as e: return {"error": f"An unexpected error occurred during OCR: {str(e)}"}
+
+def get_facial_embeddings_with_deepface(image_filepath):
+    if not DEEPFACE_AVAILABLE:
+        return {"error": "DeepFace library not installed.", "embeddings": []}
+    try:
+        # Use represent to get embeddings. It can find multiple faces.
+        # Setting align=True, detector_backend for robustness.
+        # enforce_detection=False will return empty list if no face, rather than error.
+        embedding_objs = DeepFace.represent(
+            img_path=image_filepath,
+            model_name=FACE_RECOGNITION_MODEL_NAME,
+            detector_backend=FACE_DETECTOR_BACKEND,
+            enforce_detection=False, # Don't raise error if no face
+            align=True
+        )
+        # DeepFace.represent returns a list of dictionaries, each with an 'embedding' key
+        embeddings = [obj['embedding'] for obj in embedding_objs if 'embedding' in obj]
+        if not embeddings:
+            return {"message": "No face detected or embedding failed.", "embeddings": []}
+        return {"embeddings": embeddings, "count": len(embeddings)}
+    except Exception as e:
+        # Catch errors from DeepFace if enforce_detection was True or other issues,
+        # like "Face detector ... could not find any face"
+        if "could not find any face" in str(e).lower():
+            return {"message": "No face detected.", "embeddings": []}
+        return {"error": f"Facial embedding extraction failed: {str(e)}", "embeddings": []}
+

 def extract_entities_from_ocr(ocr_json):
-    if not ocr_json or "extracted_fields" not in ocr_json or not isinstance(ocr_json.get("extracted_fields"), dict):
+    if not ocr_json or not isinstance(ocr_json, dict) or "extracted_fields" not in ocr_json or not isinstance(ocr_json.get("extracted_fields"), dict):
         doc_type_from_ocr = "Unknown"
-        if isinstance(ocr_json, dict): # ocr_json itself might be an error dict
+        if isinstance(ocr_json, dict):
             doc_type_from_ocr = ocr_json.get("document_type_detected", "Unknown (error in OCR)")
-        return {"name": None, "dob": None, "passport_no": None, "doc_type": doc_type_from_ocr}
+        return {"name": None, "dob": None, "main_id": None, "doc_type": doc_type_from_ocr, "all_names_roles": []}

     fields = ocr_json["extracted_fields"]
     doc_type = ocr_json.get("document_type_detected", "Unknown")
-    name_keys = ["full name", "name", "account holder name", "guest name"]
+
+    # Expanded and prioritized name keys
+    # Order matters: more specific or primary names first
+    name_keys = [
+        "primary person name", "full name", "name", "account holder name", "guest name",
+        "cardholder name", "policy holder name", "applicant name", "beneficiary name",
+        "student name", "employee name", "sender name", "receiver name",
+        "patient name", "traveler name", "customer name", "member name", "user name"
+    ]
     dob_keys = ["date of birth", "dob"]
-    passport_keys = ["document number", "passport number"]
+    # Expanded ID keys (passport, national ID, etc.)
+    id_keys = ["passport number", "document number", "id number", "personal no", "member id", "customer id", "account number", "reservation number"]
+
     extracted_name = None
+    all_names_roles = [] # To store all names found with their original JSON key
+
     for key in name_keys:
         for field_key, value in fields.items():
             if key == field_key.lower():
-                extracted_name = str(value) if value else None
-                break
-        if extracted_name: break
+                if value and isinstance(value, str) and value.strip():
+                    if not extracted_name: # Take the first one found as primary for now
+                        extracted_name = value.strip()
+                    all_names_roles.append({"name_text": value.strip(), "source_key": field_key})
+    # If "People Present" exists (e.g., for photos), add them
+    if "people present" in (k.lower() for k in fields.keys()):
+        people = fields.get([k for k in fields if k.lower() == "people present"][0])
+        if isinstance(people, list):
+            for person_name in people:
+                if isinstance(person_name, str) and person_name.strip():
+                    all_names_roles.append({"name_text": person_name.strip(), "source_key": "People Present"})
+                    if not extracted_name: extracted_name = person_name.strip() # Prioritize if no other name found
+
     extracted_dob = None
     for key in dob_keys:
         for field_key, value in fields.items():
-            if key == field_key.lower():
-                extracted_dob = str(value) if value else None
+            if key == field_key.lower() and value and isinstance(value, str):
+                extracted_dob = value.strip()
                 break
         if extracted_dob: break
-    extracted_passport_no = None
-    for key in passport_keys:
+
+    extracted_main_id = None
+    for key in id_keys:
         for field_key, value in fields.items():
-            if key == field_key.lower():
-                extracted_passport_no = str(value).replace(" ", "").upper() if value else None
+            if key == field_key.lower() and value and isinstance(value, str):
+                extracted_main_id = value.replace(" ", "").upper().strip() # Normalize
                 break
-        if extracted_passport_no: break
+        if extracted_main_id: break
+
     return {
         "name": extracted_name,
         "dob": extracted_dob,
-        "passport_no": extracted_passport_no,
-        "doc_type": doc_type
+        "main_id": extracted_main_id, # This will be used as the primary linking ID
+        "doc_type": doc_type,
+        "all_names_roles": list({tuple(d.items()): d for d in all_names_roles}.values()) # Deduplicate
     }

 def normalize_name(name):
     if not name: return ""
     return "".join(filter(str.isalnum, name)).lower()

+def are_faces_similar(emb1_list, emb2_gallery_list):
+    if not DEEPFACE_AVAILABLE or not emb1_list or not emb2_gallery_list:
+        return False
+    # Compare each embedding from emb1_list against each in emb2_gallery_list
+    for emb1 in emb1_list:
+        for emb2 in emb2_gallery_list:
+            try:
+                # DeepFace.verify expects embeddings directly if not paths
+                # It uses built-in thresholds per model.
+                result = DeepFace.verify(
+                    img1_path=emb1, # Pass embedding directly
+                    img2_path=emb2, # Pass embedding directly
+                    model_name=FACE_RECOGNITION_MODEL_NAME,
+                    detector_backend=FACE_DETECTOR_BACKEND, # Though not used for verify with embeddings
+                    distance_metric='cosine' # Or 'euclidean', 'euclidean_l2'
+                )
+                if result.get("verified", False):
+                    # print(f"Face match found: distance {result.get('distance')}")
+                    return True
+            except Exception as e:
+                print(f"DeepFace verify error: {e}") # e.g. if embeddings are not in expected format
+    return False
+
-def get_person_id_and_update_profiles(doc_id, entities, current_persons_data):
-    passport_no = entities.get("passport_no")
+def get_person_id_and_update_profiles(doc_id, entities, facial_embeddings, current_persons_data, linking_method_log):
+    main_id = entities.get("main_id") # Passport No, Document No, Account No etc.
     name = entities.get("name")
     dob = entities.get("dob")
-    if passport_no:
+
+    # Tier 1: Match by Main ID (Passport, National ID, etc.)
+    if main_id:
         for p_key, p_data in current_persons_data.items():
-            if passport_no in p_data.get("passport_numbers", set()):
+            if main_id in p_data.get("ids", set()):
                 p_data["doc_ids"].add(doc_id)
-                if name and not p_data.get("canonical_name"): p_data["canonical_name"] = name
-                if dob and not p_data.get("canonical_dob"): p_data["canonical_dob"] = dob
+                if name and normalize_name(name) not in p_data["names"]: p_data["names"].add(normalize_name(name))
+                if dob and dob not in p_data["dobs"]: p_data["dobs"].add(dob)
+                if facial_embeddings: p_data["face_gallery"].extend(facial_embeddings) # Add new faces
+                linking_method_log.append(f"Linked by Main ID ({main_id}) to {p_key}")
                 return p_key
-        new_person_key = f"person_{passport_no}"
+        # New person based on this main_id
+        new_person_key = f"person_id_{main_id}"
         current_persons_data[new_person_key] = {
-            "canonical_name": name, "canonical_dob": dob,
+            "display_name": name or f"Person (ID: {main_id})",
             "names": {normalize_name(name)} if name else set(),
             "dobs": {dob} if dob else set(),
-            "passport_numbers": {passport_no}, "doc_ids": {doc_id},
-            "display_name": name or f"Person (ID: {passport_no})"
+            "ids": {main_id},
+            "face_gallery": list(facial_embeddings or []), # Initialize gallery
+            "doc_ids": {doc_id}
         }
+        linking_method_log.append(f"New person by Main ID ({main_id}): {new_person_key}")
         return new_person_key
+
+    # Tier 2: Match by Facial Recognition
+    if facial_embeddings:
+        for p_key, p_data in current_persons_data.items():
+            if are_faces_similar(facial_embeddings, p_data.get("face_gallery", [])):
+                p_data["doc_ids"].add(doc_id)
+                if name and normalize_name(name) not in p_data["names"]: p_data["names"].add(normalize_name(name))
+                if dob and dob not in p_data["dobs"]: p_data["dobs"].add(dob)
+                p_data["face_gallery"].extend(facial_embeddings) # Freshen gallery
+                linking_method_log.append(f"Linked by Facial Match to {p_key}")
+                return p_key
+    # If no facial match to existing, but we have a face and name/dob, it will be used for new profile below
+
+    # Tier 3: Match by Normalized Name + DOB
     if name and dob:
         norm_name = normalize_name(name)
-        composite_key_nd = f"{norm_name}_{dob}"
         for p_key, p_data in current_persons_data.items():
             if norm_name in p_data.get("names", set()) and dob in p_data.get("dobs", set()):
                 p_data["doc_ids"].add(doc_id)
+                if facial_embeddings: p_data["face_gallery"].extend(facial_embeddings)
+                linking_method_log.append(f"Linked by Name+DOB to {p_key}")
                 return p_key
-        new_person_key = f"person_{composite_key_nd}_{str(uuid.uuid4())[:4]}"
+        # New person based on name and DOB
+        new_person_key = f"person_{norm_name}_{dob}_{str(uuid.uuid4())[:4]}"
         current_persons_data[new_person_key] = {
-            "canonical_name": name, "canonical_dob": dob,
-            "names": {norm_name}, "dobs": {dob},
-            "passport_numbers": set(), "doc_ids": {doc_id},
-            "display_name": name
+            "display_name": name, "names": {norm_name}, "dobs": {dob}, "ids": set(),
+            "face_gallery": list(facial_embeddings or []), "doc_ids": {doc_id}
         }
+        linking_method_log.append(f"New person by Name+DOB: {new_person_key}")
         return new_person_key
+
+    # Tier 4: Match by Normalized Name only (creates a more tentative profile)
     if name:
         norm_name = normalize_name(name)
-        new_person_key = f"person_{norm_name}_{str(uuid.uuid4())[:4]}"
+        # Check if any existing profile primarily matches this name AND has no stronger identifiers yet (e.g. no DOB, no ID, no face)
+        # This logic could be refined to prevent overly aggressive merging or splitting.
+        # For now, we'll create a new profile if not matched above.
+        new_person_key = f"person_name_{norm_name}_{str(uuid.uuid4())[:4]}"
         current_persons_data[new_person_key] = {
-            "canonical_name": name, "canonical_dob": None,
-            "names": {norm_name}, "dobs": set(), "passport_numbers": set(),
-            "doc_ids": {doc_id}, "display_name": name
+            "display_name": name, "names": {norm_name}, "dobs": set(), "ids": set(),
+            "face_gallery": list(facial_embeddings or []), "doc_ids": {doc_id}
        }
+        linking_method_log.append(f"New person by Name only: {new_person_key}")
         return new_person_key
+
+    # Tier 5: Unclassifiable by PII, but might have a face
     generic_person_key = f"unidentified_person_{str(uuid.uuid4())[:6]}"
     current_persons_data[generic_person_key] = {
-        "canonical_name": "Unknown", "canonical_dob": None,
-        "names": set(), "dobs": set(), "passport_numbers": set(),
-        "doc_ids": {doc_id}, "display_name": f"Unknown Person ({doc_id[:6]})"
+        "display_name": f"Unknown Person ({doc_id[:6]})",
+        "names": set(), "dobs": set(), "ids": set(),
+        "face_gallery": list(facial_embeddings or []), "doc_ids": {doc_id}
     }
+    linking_method_log.append(f"New Unidentified Person: {generic_person_key}")
     return generic_person_key

+
 def format_dataframe_data(current_files_data):
     df_rows = []
     for f_data in current_files_data:
-        entities = f_data.get("entities") or {} # CORRECTED LINE HERE
+        entities = f_data.get("entities") or {}
+        face_info = f_data.get("face_analysis_result", {})
+        face_detected_status = "Y" if face_info.get("count", 0) > 0 else "N"
+        if "error" in face_info: face_detected_status = "Error"
+        elif "message" in face_info and "No face detected" in face_info["message"]: face_detected_status = "N"
+
         df_rows.append([
             f_data.get("doc_id", "N/A")[:8],
             f_data.get("filename", "N/A"),
             f_data.get("status", "N/A"),
             entities.get("doc_type", "N/A"),
+            face_detected_status,
             entities.get("name", "N/A"),
             entities.get("dob", "N/A"),
-            entities.get("passport_no", "N/A"),
-            f_data.get("assigned_person_key", "N/A")
+            entities.get("main_id", "N/A"), # Changed from passport_no to main_id
+            f_data.get("assigned_person_key", "N/A"),
+            f_data.get("linking_method", "N/A")
         ])
     return df_rows

 def format_persons_markdown(current_persons_data, current_files_data):
-    if not current_persons_data:
-        return "No persons identified yet."
+    if not current_persons_data: return "No persons identified yet."
     md_parts = ["## Classified Persons & Documents\n"]
-    for p_key, p_data in current_persons_data.items():
+    for p_key, p_data in sorted(current_persons_data.items()): # Sort for consistent display
         display_name = p_data.get('display_name', p_key)
         md_parts.append(f"### Person: {display_name} (Profile Key: {p_key})")
-        if p_data.get("canonical_dob"): md_parts.append(f"* DOB: {p_data['canonical_dob']}")
-        if p_data.get("passport_numbers"): md_parts.append(f"* Passport(s): {', '.join(p_data['passport_numbers'])}")
+        if p_data.get("dobs"): md_parts.append(f"* Known DOB(s): {', '.join(p_data['dobs'])}")
+        if p_data.get("ids"): md_parts.append(f"* Known ID(s): {', '.join(p_data['ids'])}")
+        if p_data.get("face_gallery") and len(p_data.get("face_gallery")) > 0:
+            md_parts.append(f"* Facial Signatures Stored: {len(p_data.get('face_gallery'))}")
         md_parts.append("* Documents:")
-        doc_ids_for_person = p_data.get("doc_ids", set())
+        doc_ids_for_person = sorted(list(p_data.get("doc_ids", set()))) # Sort for consistency
         if doc_ids_for_person:
             for doc_id in doc_ids_for_person:
                 doc_detail = next((f for f in current_files_data if f["doc_id"] == doc_id), None)
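`are_faces_similar` hands raw embeddings to `DeepFace.verify`; whether `verify` accepts precomputed embeddings (rather than image paths or arrays) has varied across `deepface` releases, which is exactly the case the `except` branch above would swallow. A dependency-free fallback is to compare embeddings with plain cosine distance; the 0.40 cutoff below is an assumption in line with the VGG-Face/cosine threshold discussed in the configuration comments, not a value taken from this commit:

```python
import numpy as np

ASSUMED_COSINE_THRESHOLD = 0.40  # assumed VGG-Face cosine cutoff; tune per model

def cosine_distance(a, b) -> float:
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return 1.0 - float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def any_face_match(new_embeddings, gallery) -> bool:
    # True if any new embedding is close enough to any stored one,
    # mirroring the pairwise loop in are_faces_similar.
    return any(
        cosine_distance(e1, e2) < ASSUMED_COSINE_THRESHOLD
        for e1 in new_embeddings
        for e2 in gallery
    )
```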
@@ -254,11 +378,10 @@ def format_persons_markdown(current_persons_data, current_files_data):
                 filename = doc_detail.get("filename", "Unknown File")
                 doc_entities = doc_detail.get("entities") or {}
                 doc_type = doc_entities.get("doc_type", "Unknown Type")
-                md_parts.append(f" - {filename} (`{doc_type}`)")
-            else:
-                md_parts.append(f" - Document ID: {doc_id[:8]} (details error)")
-        else:
-            md_parts.append(" - No documents currently assigned.")
+                linking_method = doc_detail.get("linking_method", "")
+                md_parts.append(f" - {filename} (`{doc_type}`) {linking_method}")
+            else: md_parts.append(f" - Document ID: {doc_id[:8]} (details error)")
+        else: md_parts.append(" - No documents currently assigned.")
         md_parts.append("\n---\n")
     return "\n".join(md_parts)
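To make the rendering above concrete: a profile holding one linked passport might come out of `format_persons_markdown` looking roughly like this (all values illustrative):

```markdown
## Classified Persons & Documents

### Person: Jane Doe (Profile Key: person_id_X1234567)
* Known DOB(s): 1990-01-01
* Known ID(s): X1234567
* Facial Signatures Stored: 1
* Documents:
 - passport_front.jpg (`Passport Front`) New person by Main ID (X1234567): person_id_X1234567

---
```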
 
@@ -267,125 +390,146 @@ def process_uploaded_files(files_list, progress=gr.Progress(track_tqdm=True)):
     processed_files_data = []
     person_profiles = {}
     if not OPENROUTER_API_KEY:
-        yield (
-            [["N/A", "ERROR", "OpenRouter API Key not configured.", "N/A", "N/A", "N/A", "N/A", "N/A"]],
-            "Error: OpenRouter API Key not configured. Please set it in Space Secrets.",
-            "{}", "API Key Missing. Processing halted."
-        )
+        # Expected number of output components: df_data, persons_md, ocr_json_output, status_textbox
+        yield ([["N/A", "ERROR", "API Key Missing", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A"]], "API Key Missing.", "{}", "Error: API Key not set.")
         return
     if not files_list:
         yield ([], "No files uploaded.", "{}", "Upload files to begin.")
         return
-    for i, file_obj in enumerate(files_list):
+
+    # Initialize file data structures
+    for i, file_obj_path in enumerate(files_list): # gr.Files with type="filepath" returns a list of path strings
         doc_uid = str(uuid.uuid4())
         processed_files_data.append({
             "doc_id": doc_uid,
-            "filename": os.path.basename(file_obj.name if hasattr(file_obj, 'name') else f"file_{i+1}.unknown"),
-            "filepath": file_obj.name if hasattr(file_obj, 'name') else None, # file_obj itself is filepath if from gr.Files type="filepath"
-            "status": "Queued",
-            "ocr_json": None,
-            "entities": None,
-            "assigned_person_key": None
+            "filename": os.path.basename(file_obj_path),
+            "filepath": file_obj_path,
+            "status": "Queued", "ocr_json": None, "entities": None,
+            "face_analysis_result": None, "facial_embeddings": None,
+            "assigned_person_key": None, "linking_method": ""
         })
-    initial_df_data = format_dataframe_data(processed_files_data)
-    initial_persons_md = format_persons_markdown(person_profiles, processed_files_data)
-    yield (initial_df_data, initial_persons_md, "{}", f"Initialized. Found {len(files_list)} files.")
+
+    df_data = format_dataframe_data(processed_files_data)
+    persons_md = format_persons_markdown(person_profiles, processed_files_data)
+    yield (df_data, persons_md, "{}", f"Initialized {len(files_list)} files.")
+
     for i, file_data_item in enumerate(progress.tqdm(processed_files_data, desc="Processing Documents")):
         current_doc_id = file_data_item["doc_id"]
         current_filename = file_data_item["filename"]
-        if not file_data_item["filepath"]: # Check if filepath is valid
-            file_data_item["status"] = "Error: Invalid file path"
+        linking_method_log_for_doc = [] # To store how this doc was linked
+
+        if not file_data_item["filepath"] or not os.path.exists(file_data_item["filepath"]):
+            file_data_item["status"] = "Error: Invalid file"
+            linking_method_log_for_doc.append("File path error.")
+            file_data_item["linking_method"] = " ".join(linking_method_log_for_doc)
             df_data = format_dataframe_data(processed_files_data)
             persons_md = format_persons_markdown(person_profiles, processed_files_data)
-            yield(df_data, persons_md, "{}", f"({i+1}/{len(processed_files_data)}) Error with file {current_filename}")
+            yield(df_data, persons_md, "{}", f"({i+1}/{len(processed_files_data)}) Error for {current_filename}")
             continue

-        file_data_item["status"] = "OCR in Progress..."
-        df_data = format_dataframe_data(processed_files_data)
-        persons_md = format_persons_markdown(person_profiles, processed_files_data)
-        yield (df_data, persons_md, "{}", f"({i+1}/{len(processed_files_data)}) OCR for: {current_filename}")
+        # 1. OCR
+        file_data_item["status"] = "OCR..."
+        df_data = format_dataframe_data(processed_files_data); yield (df_data, persons_md, file_data_item.get("ocr_json_str", "{}"), f"OCR: {current_filename}")
         ocr_result = call_openrouter_ocr(file_data_item["filepath"])
         file_data_item["ocr_json"] = ocr_result
         if "error" in ocr_result:
-            file_data_item["status"] = f"OCR Error: {str(ocr_result['error'])[:50]}..."
-            df_data = format_dataframe_data(processed_files_data)
-            yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"({i+1}/{len(processed_files_data)}) OCR Error on {current_filename}")
+            file_data_item["status"] = f"OCR Err: {str(ocr_result['error'])[:30]}.."
+            linking_method_log_for_doc.append("OCR Failed.")
+            file_data_item["linking_method"] = " ".join(linking_method_log_for_doc)
+            df_data = format_dataframe_data(processed_files_data); yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"OCR Err: {current_filename}")
             continue
-        file_data_item["status"] = "OCR Done. Extracting Entities..."
-        df_data = format_dataframe_data(processed_files_data)
-        yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"({i+1}/{len(processed_files_data)}) OCR Done for {current_filename}")
+        file_data_item["status"] = "OCR OK. Entities..."
+        df_data = format_dataframe_data(processed_files_data); yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"Entities: {current_filename}")
+
+        # 2. Entity Extraction
         entities = extract_entities_from_ocr(ocr_result)
         file_data_item["entities"] = entities
-        file_data_item["status"] = "Entities Extracted. Classifying..."
-        df_data = format_dataframe_data(processed_files_data)
-        yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"({i+1}/{len(processed_files_data)}) Entities for {current_filename}")
-        person_key = get_person_id_and_update_profiles(current_doc_id, entities, person_profiles)
+        file_data_item["status"] = "Entities OK. Face..."
+        df_data = format_dataframe_data(processed_files_data); yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"Face Detect: {current_filename}")
+
+        # 3. Facial Feature Extraction
+        doc_type_lower = (entities.get("doc_type") or "").lower()
+        # Attempt face detection on photos, passports, IDs.
+        if DEEPFACE_AVAILABLE and ("photo" in doc_type_lower or "passport" in doc_type_lower or "id card" in doc_type_lower or "selfie" in doc_type_lower):
+            face_result = get_facial_embeddings_with_deepface(file_data_item["filepath"])
+            file_data_item["face_analysis_result"] = face_result
+            if "embeddings" in face_result and face_result["embeddings"]:
+                file_data_item["facial_embeddings"] = face_result["embeddings"]
+                file_data_item["status"] = f"Face OK ({face_result.get('count',0)}). Classify..."
+                linking_method_log_for_doc.append(f"{face_result.get('count',0)} face(s).")
+            elif "error" in face_result:
+                file_data_item["status"] = f"Face Err: {face_result['error'][:20]}.."
+                linking_method_log_for_doc.append("Face Ext. Error.")
+            else: # No error, but no embeddings (e.g. no face detected)
+                file_data_item["status"] = "No Face. Classify..."
+                linking_method_log_for_doc.append("No face det.")
+        else:
+            file_data_item["status"] = "No Face Ext. Classify..."
+            linking_method_log_for_doc.append("Face Ext. Skipped.")
+        df_data = format_dataframe_data(processed_files_data); yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"Classifying: {current_filename}")
+
+        # 4. Person Classification
+        person_key = get_person_id_and_update_profiles(current_doc_id, entities, file_data_item.get("facial_embeddings"), person_profiles, linking_method_log_for_doc)
         file_data_item["assigned_person_key"] = person_key
         file_data_item["status"] = "Classified"
+        file_data_item["linking_method"] = " ".join(linking_method_log_for_doc)
+
         df_data = format_dataframe_data(processed_files_data)
         persons_md = format_persons_markdown(person_profiles, processed_files_data)
-        yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"({i+1}/{len(processed_files_data)}) Classified {current_filename} -> {person_key}")
+        yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"Done: {current_filename} -> {person_key}")
+
     final_df_data = format_dataframe_data(processed_files_data)
     final_persons_md = format_persons_markdown(person_profiles, processed_files_data)
     yield (final_df_data, final_persons_md, "{}", f"All {len(processed_files_data)} documents processed.")

+
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 📄 Intelligent Document Processor & Classifier")
+    gr.Markdown("# 📄 Intelligent Document Processor & Classifier v2 (with Face ID)")
     gr.Markdown(
-        "**Upload multiple documents (images of passports, bank statements, hotel reservations, photos, etc.). "
-        "The system will perform OCR, attempt to extract key entities, and classify documents by the person they belong to.**\n"
-        "Ensure `OPENROUTER_API_KEY` is set as a Secret in your Hugging Face Space."
+        "**Upload multiple documents. The system will OCR, extract entities & faces, and classify documents by person.**\n"
+        "Ensure `OPENROUTER_API_KEY` is set as a Secret. Facial recognition uses `deepface` ('VGG-Face' model, 'retinaface' detector)."
     )
-    if not OPENROUTER_API_KEY:
-        gr.Markdown("<h3 style='color:red;'>⚠️ ERROR: `OPENROUTER_API_KEY` is not set in Space Secrets! OCR will fail.</h3>")
+    if not OPENROUTER_API_KEY: gr.Markdown("<h3 style='color:red;'>⚠️ ERROR: `OPENROUTER_API_KEY` Secret missing! OCR will fail.</h3>")
+    if not DEEPFACE_AVAILABLE: gr.Markdown("<h3 style='color:orange;'>⚠️ WARNING: `deepface` library not installed. Facial recognition features are disabled.</h3>")
+
     with gr.Row():
         with gr.Column(scale=1):
-            files_input = gr.Files(label="Upload Document Images (Bulk)", file_count="multiple", type="filepath") # Using filepath
+            files_input = gr.Files(label="Upload Document Images (Bulk)", file_count="multiple", type="filepath")
             process_button = gr.Button("🚀 Process Uploaded Documents", variant="primary")
-            overall_status_textbox = gr.Textbox(label="Overall Progress", interactive=False, lines=1)
+        with gr.Column(scale=2):
+            overall_status_textbox = gr.Textbox(label="Current Task & Overall Progress", interactive=False, lines=2)
+
     gr.Markdown("---")
     gr.Markdown("## Document Processing Details")
-    dataframe_headers = ["Doc ID (short)", "Filename", "Status", "Detected Type", "Name", "DOB", "Passport No.", "Assigned Person Key"]
+    dataframe_headers = ["Doc ID", "Filename", "Status", "Type", "Face?", "Name", "DOB", "Main ID", "Person Key", "Linking Method"]
     document_status_df = gr.Dataframe(
-        headers=dataframe_headers,
-        datatype=["str"] * len(dataframe_headers),
+        headers=dataframe_headers, datatype=["str"] * len(dataframe_headers),
         label="Individual Document Status & Extracted Entities",
-        row_count=(1, "dynamic"), # Start with 1 row, dynamically grows
-        col_count=(len(dataframe_headers), "fixed"),
-        wrap=True
+        row_count=(1, "dynamic"), col_count=(len(dataframe_headers), "fixed"), wrap=True, height=400
     )
-    ocr_json_output = gr.Code(label="Selected Document OCR JSON", language="json", interactive=False)
+
+    with gr.Accordion("Selected Document Full OCR JSON", open=False):
+        ocr_json_output = gr.Code(label="OCR JSON", language="json", interactive=False)
+
     gr.Markdown("---")
     person_classification_output_md = gr.Markdown("## Classified Persons & Documents\nNo persons identified yet.")
+
     process_button.click(
-        fn=process_uploaded_files,
-        inputs=[files_input],
-        outputs=[
-            document_status_df,
-            person_classification_output_md,
-            ocr_json_output,
-            overall_status_textbox
-        ]
+        fn=process_uploaded_files, inputs=[files_input],
+        outputs=[document_status_df, person_classification_output_md, ocr_json_output, overall_status_textbox]
     )
+
     @document_status_df.select(inputs=None, outputs=ocr_json_output, show_progress="hidden")
     def display_selected_ocr(evt: gr.SelectData):
-        if evt.index is None or evt.index[0] is None:
-            return "{}"
+        if evt.index is None or evt.index[0] is None: return "{}"
         selected_row_index = evt.index[0]
-        # Ensure processed_files_data is accessible here. If it's truly global, it should be.
-        # For safety, one might pass it or make it part of a class if this were more complex.
+        # Access global state. Be cautious with globals in complex apps.
         if 0 <= selected_row_index < len(processed_files_data):
             selected_doc_data = processed_files_data[selected_row_index]
             if selected_doc_data and selected_doc_data.get("ocr_json"):
-                # Check if ocr_json is already a dict, if not, try to parse (though it should be)
                 ocr_data_to_display = selected_doc_data["ocr_json"]
-                if isinstance(ocr_data_to_display, str): # Should not happen if stored correctly
-                    try:
-                        ocr_data_to_display = json.loads(ocr_data_to_display)
-                    except json.JSONDecodeError:
-                        return json.dumps({"error": "Stored OCR data is not valid JSON string."}, indent=2)
                 return json.dumps(ocr_data_to_display, indent=2, ensure_ascii=False)
-        return json.dumps({ "message": "No OCR data found for selected row or selection out of bounds (check if processing is complete). Current rows: " + str(len(processed_files_data))}, indent=2)
+        return json.dumps({"message": "No OCR data or selection out of bounds."}, indent=2)

 if __name__ == "__main__":
     demo.queue().launch(debug=True, share=os.environ.get("GRADIO_SHARE", "true").lower() == "true")
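`process_uploaded_files` is a generator, which is what lets the table, markdown, and status box refresh after every document: each `yield` must supply one value per component listed in `outputs`, and the app must run behind `queue()`. A stripped-down sketch of the pattern, with illustrative component names:

```python
import time
import gradio as gr

def long_task():
    # Each yield pushes an update to the single "status" output; with several
    # outputs, yield a tuple with one value per component, as app.py does.
    for step in ("OCR...", "Entities...", "Classifying..."):
        time.sleep(1)  # stand-in for real work
        yield step
    yield "Done."

with gr.Blocks() as sketch:
    status = gr.Textbox(label="Progress")
    gr.Button("Run").click(fn=long_task, inputs=None, outputs=status)

sketch.queue().launch()
```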
 
old_app.py CHANGED
@@ -1,70 +1,91 @@
 import gradio as gr
-import requests
 import base64
-import os
 import json
-import mimetypes

 # --- Configuration ---
-OPENROUTER_API_KEY = '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'
-IMAGE_MODEL = "opengvlab/internvl3-14b:free"
 OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

-# --- Application State ---
-current_batch = []

 # --- Helper Functions ---

-def generate_extraction_prompt(doc_type_provided_by_user):
-    prompt = f"""You are an advanced OCR and information extraction AI.
-The user has provided an image and identified it as a '{doc_type_provided_by_user}'.
 Your task is to meticulously analyze this image and extract all relevant information.

 Output Format Instructions:
 Provide your response as a SINGLE, VALID JSON OBJECT. Do not include any explanatory text before or after the JSON.
 The JSON object should have the following top-level keys:
-- "document_type_provided": (string) The type provided by the user: "{doc_type_provided_by_user}".
-- "document_type_detected": (string) Your best guess of the specific document type (e.g., "Passport", "National ID Card", "Driver's License", "Visa Sticker", "Hotel Confirmation Voucher", "Boarding Pass", "Photograph of a person").
 - "extracted_fields": (object) A key-value map of all extracted information. Be comprehensive. Examples:
-  - For passports/IDs: "Surname", "Given Names", "Document Number", "Nationality", "Date of Birth", "Sex", "Place of Birth", "Date of Issue", "Date of Expiry", "Issuing Authority", "Country Code".
-  - For hotel reservations: "Guest Name", "Hotel Name", "Booking Reference", "Check-in Date", "Check-out Date", "Room Type".
-  - For photos: "Description" (e.g., "Portrait of a person", "Image contains text: [text if any]").
 - "mrz_data": (object or null) If a Machine Readable Zone (MRZ) is present:
   - "raw_mrz_lines": (array of strings) Each line of the MRZ.
-  - "parsed_mrz": (object) Key-value pairs of parsed MRZ fields (e.g., "passport_type", "issuing_country", "surname", "given_names", "passport_number", "nationality", "dob", "sex", "expiry_date", "personal_number").
   If no MRZ, this field should be null.
-- "multilingual_info": (array of objects or null) For any text segments not in English:
-  - Each object: {{"language_detected": "ISO 639-1 code", "original_text": "...", "english_translation_or_transliteration": "..."}}
-  If no non-English text, this field can be null or an empty array.
 - "full_text_ocr": (string) Concatenation of all text found on the document.

 Extraction Guidelines:
-1. Prioritize accuracy. If unsure about a character or word, indicate uncertainty if possible, or extract the most likely interpretation.
-2. Extract all visible text, including small print, stamps, and handwritten annotations if legible.
 3. For dates, try to use ISO 8601 format (YYYY-MM-DD) if possible, but retain original format if conversion is ambiguous.
-4. If the image is a photo of a person without much text, the "extracted_fields" might contain a description, and "full_text_ocr" might be minimal.
-5. If the document is multi-page and only one page is provided, note this if apparent.

 Ensure the entire output strictly adheres to the JSON format.
 """
-    return prompt

-def process_single_image_with_openrouter(image_path, doc_type):
     if not OPENROUTER_API_KEY:
-        return {"error": "OpenRouter API key not set.", "document_type_provided": doc_type}
     try:
-        with open(image_path, "rb") as f:
-            encoded_image_bytes = f.read()
-        encoded_image_string = base64.b64encode(encoded_image_bytes).decode("utf-8")
-        mime_type, _ = mimetypes.guess_type(image_path)
-        if not mime_type:
-            ext = os.path.splitext(image_path)[1].lower()
-            if ext == ".png": mime_type = "image/png"
-            elif ext in [".jpg", ".jpeg"]: mime_type = "image/jpeg"
-            elif ext == ".webp": mime_type = "image/webp"
-            else: mime_type = "image/jpeg"
-        data_url = f"data:{mime_type};base64,{encoded_image_string}"
-        prompt_text = generate_extraction_prompt(doc_type)
         payload = {
             "model": IMAGE_MODEL,
             "messages": [
@@ -76,192 +97,295 @@ def process_single_image_with_openrouter(image_path, doc_type):
76
  ]
77
  }
78
  ],
79
- "max_tokens": 3000,
80
  "temperature": 0.1,
81
  }
82
  headers = {
83
  "Authorization": f"Bearer {OPENROUTER_API_KEY}",
84
  "Content-Type": "application/json",
85
- "HTTP-Referer": "https://huggingface.co/spaces/Passport_Extractor",
86
- "X-Title": "Document Classifier"
87
  }
88
- print(f"Sending request to OpenRouter for image: {os.path.basename(image_path)}, type: {doc_type}")
89
- response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=120)
90
  response.raise_for_status()
91
  result = response.json()
92
- print(f"Received response from OpenRouter. Status: {response.status_code}")
93
  if "choices" in result and result["choices"]:
94
- content_text = result["choices"][0]["message"]["content"]
95
- clean_content = content_text.strip()
96
- if clean_content.startswith("```json"):
97
- clean_content = clean_content[7:]
98
- if clean_content.endswith("```"):
99
- clean_content = clean_content[:-3]
100
- elif clean_content.startswith("`") and clean_content.endswith("`"):
101
- clean_content = clean_content[1:-1]
102
- try:
103
- parsed_json = json.loads(clean_content)
104
- if "document_type_provided" not in parsed_json:
105
- parsed_json["document_type_provided"] = doc_type
106
- return parsed_json
107
-             except json.JSONDecodeError as e:
-                 print(f"JSONDecodeError: {e}. Raw content was:\n{content_text}")
-                 return {
-                     "error": "Failed to parse LLM output as JSON.",
-                     "raw_content_from_llm": content_text,
-                     "document_type_provided": doc_type
-                 }
          else:
-             print(f"No 'choices' in API response: {result}")
-             return {"error": "No choices in API response.", "details": result, "document_type_provided": doc_type}
      except requests.exceptions.Timeout:
-         print(f"API Request Timeout for {os.path.basename(image_path)}")
-         return {"error": "API request timed out.", "document_type_provided": doc_type}
      except requests.exceptions.RequestException as e:
          error_message = f"API Request Error: {str(e)}"
-         if e.response is not None:
              error_message += f" Status: {e.response.status_code}, Response: {e.response.text}"
-         print(error_message)
-         return {"error": error_message, "document_type_provided": doc_type}
      except Exception as e:
-         print(f"An unexpected error occurred during processing {os.path.basename(image_path)}: {str(e)}")
-         return {"error": f"An unexpected error: {str(e)}", "document_type_provided": doc_type}
 
- def add_document_to_batch_ui(image_filepath, doc_type_selection):
-     global current_batch
-     if image_filepath and doc_type_selection:
-         filename = os.path.basename(image_filepath)
-         current_batch.append({"path": image_filepath, "type": doc_type_selection, "filename": filename})
-         batch_display_data = [[item["filename"], item["type"]] for item in current_batch]
-         return batch_display_data, f"Added '{filename}' as '{doc_type_selection}'."
-     batch_display_data = [[item["filename"], item["type"]] for item in current_batch]
-     return batch_display_data, "Failed to add: Image or document type missing."
 
- def process_batch_ui():
-     global current_batch
-     if not OPENROUTER_API_KEY:
-         return {"error": "OPENROUTER_API_KEY is not set. Please configure it."}, "API Key Missing."
-     if not current_batch:
-         return {"message": "Batch is empty. Add documents first."}, "Batch is empty."
-     all_results = []
-     status_updates = []
-     for i, item_to_process in enumerate(current_batch):
-         status_msg = f"Processing document {i+1}/{len(current_batch)}: {item_to_process['filename']} ({item_to_process['type']})..."
-         print(status_msg)
-         extracted_data = process_single_image_with_openrouter(item_to_process["path"], item_to_process["type"])
-         all_results.append(extracted_data)
-         if "error" in extracted_data:
-             status_updates.append(f"Error processing {item_to_process['filename']}: {extracted_data['error']}")
-         else:
-             status_updates.append(f"Successfully processed {item_to_process['filename']}.")
-     grouped_by_person = {}
-     unidentified_docs = []
-     for result_item in all_results:
-         doc_id = None
-         if isinstance(result_item, dict) and "extracted_fields" in result_item and isinstance(result_item["extracted_fields"], dict):
-             fields = result_item["extracted_fields"]
-             passport_no = fields.get("Document Number") or fields.get("Passport Number") or fields.get("passport_number")
-             name = fields.get("Given Names") or fields.get("Given Name") or fields.get("Name")
-             surname = fields.get("Surname") or fields.get("Family Name")
-             dob = fields.get("Date of Birth") or fields.get("DOB")
-             if passport_no:
-                 doc_id = f"passport_{str(passport_no).replace(' ', '').lower()}"
-             elif name and surname and dob:
-                 doc_id = f"{str(name).replace(' ', '').lower()}_{str(surname).replace(' ', '').lower()}_{str(dob).replace(' ', '')}"
-             elif name and surname:
-                 doc_id = f"{str(name).replace(' ', '').lower()}_{str(surname).replace(' ', '').lower()}"
-         if doc_id:
-             if doc_id not in grouped_by_person:
-                 grouped_by_person[doc_id] = {"person_identifier": doc_id, "documents": []}
-             grouped_by_person[doc_id]["documents"].append(result_item)
-         else:
-             unidentified_docs.append(result_item)
-     final_structured_output = {
-         "summary": f"Processed {len(current_batch)} documents.",
-         "grouped_by_person": list(grouped_by_person.values()) if grouped_by_person else [],
-         "unidentified_documents_or_errors": unidentified_docs
      }
-     final_status = "Batch processing complete. " + " | ".join(status_updates)
-     print(final_status)
-     return final_structured_output, final_status
 
- def clear_batch_ui():
-     global current_batch
-     current_batch = []
-     return [], "Batch cleared successfully."
 
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# 📄 Document Information Extractor (OpenGVLab/InternVL3-14B via OpenRouter)")
      gr.Markdown(
-         "**Instructions:**\n"
-         "1. Upload a document image (e.g., passport front/back, photo, hotel reservation).\n"
-         "2. Select the correct document type.\n"
-         "3. Click 'Add Document to Current Batch'. Repeat for all documents of a person or a related set.\n"
-         "4. Review the batch. Click 'Clear Entire Batch' to start over.\n"
-         "5. Click 'Process Batch and Extract Information' to send documents to the AI.\n"
-         "6. View the extracted information in JSON format below."
      )
      if not OPENROUTER_API_KEY:
-         gr.Markdown(
-             "<h3 style='color:red;'>⚠️ Warning: `OPENROUTER_API_KEY` environment variable is not detected. "
-             "API calls will fail. Please set it and restart this application.</h3>"
-         )
      with gr.Row():
          with gr.Column(scale=1):
-             gr.Markdown("### Step 1: Add Document")
-             image_input = gr.Image(
-                 label="Upload Document Image",
-                 type="filepath",
-                 sources=["upload"],
-                 height=300
-             )
-             doc_type_choices = [
-                 'passport_front', 'passport_back', 'national_id_front', 'national_id_back',
-                 'drivers_license_front', 'drivers_license_back', 'visa_sticker',
-                 'photo', 'hotel_reservation', 'boarding_pass', 'utility_bill', 'other_document'
-             ]
-             doc_type_input = gr.Dropdown(
-                 label="Select Document Type",
-                 choices=doc_type_choices,
-                 value='passport_front',
-                 filterable=True
-             )
-             add_button = gr.Button("➕ Add Document to Current Batch", variant="secondary")
-         with gr.Column(scale=2):
-             gr.Markdown("### Step 2: Review Current Batch")
-             batch_dataframe = gr.Dataframe(
-                 headers=["Filename", "Document Type"],
-                 datatype=["str", "str"],
-                 row_count=1,  # start with one row; the table grows as documents are added
-                 col_count=2,  # fixed two-column layout
-                 wrap=True
-             )
-             clear_batch_button = gr.Button("🗑️ Clear Entire Batch", variant="stop")
-     gr.Markdown("### Step 3: Process Batch")
-     process_button = gr.Button("🚀 Process Batch and Extract Information", variant="primary")
-     status_message_textbox = gr.Textbox(label="Processing Status", interactive=False, lines=2)
-     gr.Markdown("### Step 4: View Results")
-     output_json_display = gr.JSON(label="Extracted Information (JSON Format)")
-     add_button.click(
-         fn=add_document_to_batch_ui,
-         inputs=[image_input, doc_type_input],
-         outputs=[batch_dataframe, status_message_textbox]
-     ).then(lambda: None, outputs=image_input)
-     clear_batch_button.click(
-         fn=clear_batch_ui,
-         inputs=[],
-         outputs=[batch_dataframe, status_message_textbox]
      )
 
      process_button.click(
-         fn=process_batch_ui,
-         inputs=[],
-         outputs=[output_json_display, status_message_textbox]
      )
 
  if __name__ == "__main__":
-     if not OPENROUTER_API_KEY:
-         print("ERROR: The OPENROUTER_API_KEY environment variable is not set.")
-         print("Please set it before running the application, e.g.:")
-         print("    export OPENROUTER_API_KEY='your_openrouter_key_here'")
-         print("The application will launch, but API calls will fail.")
-     demo.launch(share=True) # Added share=True
 
old_app.py CHANGED

  import gradio as gr
  import base64
+ import requests
  import json
+ import re
+ import os
+ import uuid
+ from datetime import datetime
 
  # --- Configuration ---
+ # IMPORTANT: Set your OPENROUTER_API_KEY as a Hugging Face Space Secret
+ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # read from the environment; never hardcode the key
+ IMAGE_MODEL = "opengvlab/internvl3-14b:free"  # free-tier model, as specified
  OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
 
+ # --- Global State (module-level for simplicity; reset on each processing run) ---
+ processed_files_data = []  # one dict per uploaded file: details and status
+ person_profiles = {}       # one dict per identified person and their documents
 
  # --- Helper Functions ---
 
+ def extract_json_from_text(text):
+     if not text:
+         return {"error": "Empty text provided for JSON extraction."}
+     match_block = re.search(r"```json\s*(\{.*?\})\s*```", text, re.DOTALL | re.IGNORECASE)
+     if match_block:
+         json_str = match_block.group(1)
+     else:
+         text_stripped = text.strip()
+         if text_stripped.startswith("`") and text_stripped.endswith("`"):
+             json_str = text_stripped[1:-1]
+         else:
+             json_str = text_stripped
+     try:
+         return json.loads(json_str)
+     except json.JSONDecodeError as e:
+         try:
+             first_brace = json_str.find('{')
+             last_brace = json_str.rfind('}')
+             if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
+                 potential_json_str = json_str[first_brace : last_brace + 1]
+                 return json.loads(potential_json_str)
+             else:
+                 return {"error": f"Invalid JSON structure: {str(e)}", "original_text": text}
+         except json.JSONDecodeError as e2:
+             return {"error": f"Invalid JSON structure after attempting substring: {str(e2)}", "original_text": text}
+ 
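For illustration only (not part of the commit), the fallback chain above tolerates the fenced wrappers this model tends to emit around its JSON; a minimal check, assuming the helper is importable:

    sample = '```json\n{"document_type_detected": "Passport", "extracted_fields": {}}\n```'
    parsed = extract_json_from_text(sample)  # strips the ```json fence, then json.loads
    assert parsed["document_type_detected"] == "Passport"
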
+ def get_ocr_prompt():
+     return f"""You are an advanced OCR and information extraction AI.
  Your task is to meticulously analyze this image and extract all relevant information.
 
  Output Format Instructions:
  Provide your response as a SINGLE, VALID JSON OBJECT. Do not include any explanatory text before or after the JSON.
  The JSON object should have the following top-level keys:
+ - "document_type_detected": (string) Your best guess of the specific document type (e.g., "Passport", "National ID Card", "Driver's License", "Visa Sticker", "Hotel Confirmation Voucher", "Bank Statement", "Photo of a person").
  - "extracted_fields": (object) A key-value map of all extracted information. Be comprehensive. Examples:
+   - For passports/IDs: "Surname", "Given Names", "Full Name", "Document Number", "Nationality", "Date of Birth", "Sex", "Place of Birth", "Date of Issue", "Date of Expiry", "Issuing Authority", "Country Code".
+   - For hotel reservations: "Guest Name", "Hotel Name", "Booking Reference", "Check-in Date", "Check-out Date".
+   - For bank statements: "Account Holder Name", "Account Number", "Bank Name", "Statement Period", "Ending Balance".
+   - For photos: "Description" (e.g., "Portrait of a person", "Group photo at a location"), "People Present" (array of strings if multiple).
  - "mrz_data": (object or null) If a Machine Readable Zone (MRZ) is present:
    - "raw_mrz_lines": (array of strings) Each line of the MRZ.
+   - "parsed_mrz": (object) Key-value pairs of parsed MRZ fields.
    If no MRZ, this field should be null.
  - "full_text_ocr": (string) Concatenation of all text found on the document.
 
  Extraction Guidelines:
+ 1. Prioritize accuracy.
+ 2. Extract all visible text. Include "Full Name" by combining given and surnames if possible.
  3. For dates, try to use ISO 8601 format (YYYY-MM-DD) if possible, but retain original format if conversion is ambiguous.
 
  Ensure the entire output strictly adheres to the JSON format.
  """
 
+ def call_openrouter_ocr(image_filepath):
      if not OPENROUTER_API_KEY:
+         return {"error": "OpenRouter API Key not configured."}
      try:
+         with open(image_filepath, "rb") as f:
+             encoded_image = base64.b64encode(f.read()).decode("utf-8")
+         mime_type = "image/jpeg"
+         if image_filepath.lower().endswith(".png"):
+             mime_type = "image/png"
+         elif image_filepath.lower().endswith(".webp"):
+             mime_type = "image/webp"
+         data_url = f"data:{mime_type};base64,{encoded_image}"
+         prompt_text = get_ocr_prompt()
          payload = {
              "model": IMAGE_MODEL,
              "messages": [
                  {
                      "role": "user",
                      "content": [
                          {"type": "text", "text": prompt_text},
                          {"type": "image_url", "image_url": {"url": data_url}}
                      ]
                  }
              ],
+             "max_tokens": 3500,
              "temperature": 0.1,
          }
          headers = {
              "Authorization": f"Bearer {OPENROUTER_API_KEY}",
              "Content-Type": "application/json",
+             "HTTP-Referer": "https://huggingface.co/spaces/YOUR_SPACE",
+             "X-Title": "Gradio Document Processor"
          }
+         response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=180)
 
          response.raise_for_status()
          result = response.json()
          if "choices" in result and result["choices"]:
+             raw_content = result["choices"][0]["message"]["content"]
+             return extract_json_from_text(raw_content)
          else:
+             return {"error": "No 'choices' in API response from OpenRouter.", "details": result}
      except requests.exceptions.Timeout:
+         return {"error": "API request timed out."}
      except requests.exceptions.RequestException as e:
          error_message = f"API Request Error: {str(e)}"
+         if hasattr(e, 'response') and e.response is not None:
              error_message += f" Status: {e.response.status_code}, Response: {e.response.text}"
+         return {"error": error_message}
      except Exception as e:
+         return {"error": f"An unexpected error occurred during OCR: {str(e)}"}
 
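A hedged usage sketch (the sample path is an assumption, not a file in this repo): every code path above returns a dict, so callers branch on the presence of an "error" key rather than catching exceptions.

    result = call_openrouter_ocr("samples/passport_front.jpg")  # hypothetical test image
    if "error" in result:
        print("OCR failed:", result["error"])
    else:
        print("Detected type:", result.get("document_type_detected"))
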
+ def extract_entities_from_ocr(ocr_json):
+     if not ocr_json or "extracted_fields" not in ocr_json or not isinstance(ocr_json.get("extracted_fields"), dict):
+         doc_type_from_ocr = "Unknown"
+         if isinstance(ocr_json, dict):  # ocr_json itself might be an error dict
+             doc_type_from_ocr = ocr_json.get("document_type_detected", "Unknown (error in OCR)")
+         return {"name": None, "dob": None, "passport_no": None, "doc_type": doc_type_from_ocr}
+     fields = ocr_json["extracted_fields"]
+     doc_type = ocr_json.get("document_type_detected", "Unknown")
+     name_keys = ["full name", "name", "account holder name", "guest name"]
+     dob_keys = ["date of birth", "dob"]
+     passport_keys = ["document number", "passport number"]
+     extracted_name = None
+     for key in name_keys:
+         for field_key, value in fields.items():
+             if key == field_key.lower():
+                 extracted_name = str(value) if value else None
+                 break
+         if extracted_name: break
+     extracted_dob = None
+     for key in dob_keys:
+         for field_key, value in fields.items():
+             if key == field_key.lower():
+                 extracted_dob = str(value) if value else None
+                 break
+         if extracted_dob: break
+     extracted_passport_no = None
+     for key in passport_keys:
+         for field_key, value in fields.items():
+             if key == field_key.lower():
+                 extracted_passport_no = str(value).replace(" ", "").upper() if value else None
+                 break
+         if extracted_passport_no: break
+     return {
+         "name": extracted_name,
+         "dob": extracted_dob,
+         "passport_no": extracted_passport_no,
+         "doc_type": doc_type
      }
 
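An illustrative run with a hand-written OCR dict (not from the repo): field keys are matched case-insensitively, and the passport number is normalized by stripping spaces and upper-casing.

    mock_ocr = {
        "document_type_detected": "Passport",
        "extracted_fields": {"Full Name": "Jane Doe", "Date of Birth": "1990-01-01", "Document Number": "x 123 456"},
    }
    entities = extract_entities_from_ocr(mock_ocr)
    # -> {"name": "Jane Doe", "dob": "1990-01-01", "passport_no": "X123456", "doc_type": "Passport"}
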
+ def normalize_name(name):
+     if not name: return ""
+     return "".join(filter(str.isalnum, name)).lower()
+ 
+ def get_person_id_and_update_profiles(doc_id, entities, current_persons_data):
+     passport_no = entities.get("passport_no")
+     name = entities.get("name")
+     dob = entities.get("dob")
+     if passport_no:
+         for p_key, p_data in current_persons_data.items():
+             if passport_no in p_data.get("passport_numbers", set()):
+                 p_data["doc_ids"].add(doc_id)
+                 if name and not p_data.get("canonical_name"): p_data["canonical_name"] = name
+                 if dob and not p_data.get("canonical_dob"): p_data["canonical_dob"] = dob
+                 return p_key
+         new_person_key = f"person_{passport_no}"
+         current_persons_data[new_person_key] = {
+             "canonical_name": name, "canonical_dob": dob,
+             "names": {normalize_name(name)} if name else set(),
+             "dobs": {dob} if dob else set(),
+             "passport_numbers": {passport_no}, "doc_ids": {doc_id},
+             "display_name": name or f"Person (ID: {passport_no})"
+         }
+         return new_person_key
+     if name and dob:
+         norm_name = normalize_name(name)
+         composite_key_nd = f"{norm_name}_{dob}"
+         for p_key, p_data in current_persons_data.items():
+             if norm_name in p_data.get("names", set()) and dob in p_data.get("dobs", set()):
+                 p_data["doc_ids"].add(doc_id)
+                 return p_key
+         new_person_key = f"person_{composite_key_nd}_{str(uuid.uuid4())[:4]}"
+         current_persons_data[new_person_key] = {
+             "canonical_name": name, "canonical_dob": dob,
+             "names": {norm_name}, "dobs": {dob},
+             "passport_numbers": set(), "doc_ids": {doc_id},
+             "display_name": name
+         }
+         return new_person_key
+     if name:
+         norm_name = normalize_name(name)
+         new_person_key = f"person_{norm_name}_{str(uuid.uuid4())[:4]}"
+         current_persons_data[new_person_key] = {
+             "canonical_name": name, "canonical_dob": None,
+             "names": {norm_name}, "dobs": set(), "passport_numbers": set(),
+             "doc_ids": {doc_id}, "display_name": name
+         }
+         return new_person_key
+     generic_person_key = f"unidentified_person_{str(uuid.uuid4())[:6]}"
+     current_persons_data[generic_person_key] = {
+         "canonical_name": "Unknown", "canonical_dob": None,
+         "names": set(), "dobs": set(), "passport_numbers": set(),
+         "doc_ids": {doc_id}, "display_name": f"Unknown Person ({doc_id[:6]})"
+     }
+     return generic_person_key
+ 
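A sketch of the matching priority (illustrative values): two documents that share a passport number collapse into one profile, and the document ids accumulate on it.

    profiles = {}
    k1 = get_person_id_and_update_profiles("doc-1", {"name": "Jane Doe", "dob": "1990-01-01", "passport_no": "X123456"}, profiles)
    k2 = get_person_id_and_update_profiles("doc-2", {"name": None, "dob": None, "passport_no": "X123456"}, profiles)
    assert k1 == k2 == "person_X123456"
    assert profiles[k1]["doc_ids"] == {"doc-1", "doc-2"}
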
+ def format_dataframe_data(current_files_data):
+     df_rows = []
+     for f_data in current_files_data:
+         entities = f_data.get("entities") or {}  # entities may still be None before extraction
+         df_rows.append([
+             f_data.get("doc_id", "N/A")[:8],
+             f_data.get("filename", "N/A"),
+             f_data.get("status", "N/A"),
+             entities.get("doc_type", "N/A"),
+             entities.get("name", "N/A"),
+             entities.get("dob", "N/A"),
+             entities.get("passport_no", "N/A"),
+             f_data.get("assigned_person_key", "N/A")
+         ])
+     return df_rows
+ 
+ def format_persons_markdown(current_persons_data, current_files_data):
+     if not current_persons_data:
+         return "No persons identified yet."
+     md_parts = ["## Classified Persons & Documents\n"]
+     for p_key, p_data in current_persons_data.items():
+         display_name = p_data.get('display_name', p_key)
+         md_parts.append(f"### Person: {display_name} (Profile Key: {p_key})")
+         if p_data.get("canonical_dob"): md_parts.append(f"* DOB: {p_data['canonical_dob']}")
+         if p_data.get("passport_numbers"): md_parts.append(f"* Passport(s): {', '.join(p_data['passport_numbers'])}")
+         md_parts.append("* Documents:")
+         doc_ids_for_person = p_data.get("doc_ids", set())
+         if doc_ids_for_person:
+             for doc_id in doc_ids_for_person:
+                 doc_detail = next((f for f in current_files_data if f["doc_id"] == doc_id), None)
+                 if doc_detail:
+                     filename = doc_detail.get("filename", "Unknown File")
+                     doc_entities = doc_detail.get("entities") or {}
+                     doc_type = doc_entities.get("doc_type", "Unknown Type")
+                     md_parts.append(f" - {filename} (`{doc_type}`)")
+                 else:
+                     md_parts.append(f" - Document ID: {doc_id[:8]} (details error)")
+         else:
+             md_parts.append(" - No documents currently assigned.")
+         md_parts.append("\n---\n")
+     return "\n".join(md_parts)
+ 
+ def process_uploaded_files(files_list, progress=gr.Progress(track_tqdm=True)):
+     global processed_files_data, person_profiles
+     processed_files_data = []
+     person_profiles = {}
+     if not OPENROUTER_API_KEY:
+         yield (
+             [["N/A", "ERROR", "OpenRouter API Key not configured.", "N/A", "N/A", "N/A", "N/A", "N/A"]],
+             "Error: OpenRouter API Key not configured. Please set it in Space Secrets.",
+             "{}", "API Key Missing. Processing halted."
+         )
+         return
+     if not files_list:
+         yield ([], "No files uploaded.", "{}", "Upload files to begin.")
+         return
+     for i, file_obj in enumerate(files_list):
+         doc_uid = str(uuid.uuid4())
+         # With gr.Files(type="filepath"), file_obj may itself be the path string.
+         filepath = file_obj.name if hasattr(file_obj, 'name') else (file_obj if isinstance(file_obj, str) else None)
+         processed_files_data.append({
+             "doc_id": doc_uid,
+             "filename": os.path.basename(filepath) if filepath else f"file_{i+1}.unknown",
+             "filepath": filepath,
+             "status": "Queued",
+             "ocr_json": None,
+             "entities": None,
+             "assigned_person_key": None
+         })
+     initial_df_data = format_dataframe_data(processed_files_data)
+     initial_persons_md = format_persons_markdown(person_profiles, processed_files_data)
+     yield (initial_df_data, initial_persons_md, "{}", f"Initialized. Found {len(files_list)} files.")
+     for i, file_data_item in enumerate(progress.tqdm(processed_files_data, desc="Processing Documents")):
+         current_doc_id = file_data_item["doc_id"]
+         current_filename = file_data_item["filename"]
+         if not file_data_item["filepath"]:  # skip entries whose path could not be resolved
+             file_data_item["status"] = "Error: Invalid file path"
+             df_data = format_dataframe_data(processed_files_data)
+             persons_md = format_persons_markdown(person_profiles, processed_files_data)
+             yield (df_data, persons_md, "{}", f"({i+1}/{len(processed_files_data)}) Error with file {current_filename}")
+             continue
+         file_data_item["status"] = "OCR in Progress..."
+         df_data = format_dataframe_data(processed_files_data)
+         persons_md = format_persons_markdown(person_profiles, processed_files_data)
+         yield (df_data, persons_md, "{}", f"({i+1}/{len(processed_files_data)}) OCR for: {current_filename}")
+         ocr_result = call_openrouter_ocr(file_data_item["filepath"])
+         file_data_item["ocr_json"] = ocr_result
+         if "error" in ocr_result:
+             file_data_item["status"] = f"OCR Error: {str(ocr_result['error'])[:50]}..."
+             df_data = format_dataframe_data(processed_files_data)
+             yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"({i+1}/{len(processed_files_data)}) OCR Error on {current_filename}")
+             continue
+         file_data_item["status"] = "OCR Done. Extracting Entities..."
+         df_data = format_dataframe_data(processed_files_data)
+         yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"({i+1}/{len(processed_files_data)}) OCR Done for {current_filename}")
+         entities = extract_entities_from_ocr(ocr_result)
+         file_data_item["entities"] = entities
+         file_data_item["status"] = "Entities Extracted. Classifying..."
+         df_data = format_dataframe_data(processed_files_data)
+         yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"({i+1}/{len(processed_files_data)}) Entities for {current_filename}")
+         person_key = get_person_id_and_update_profiles(current_doc_id, entities, person_profiles)
+         file_data_item["assigned_person_key"] = person_key
+         file_data_item["status"] = "Classified"
+         df_data = format_dataframe_data(processed_files_data)
+         persons_md = format_persons_markdown(person_profiles, processed_files_data)
+         yield (df_data, persons_md, json.dumps(ocr_result, indent=2), f"({i+1}/{len(processed_files_data)}) Classified {current_filename} -> {person_key}")
+     final_df_data = format_dataframe_data(processed_files_data)
+     final_persons_md = format_persons_markdown(person_profiles, processed_files_data)
+     yield (final_df_data, final_persons_md, "{}", f"All {len(processed_files_data)} documents processed.")
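For reference, the streaming contract (hedged: Gradio normally drives this generator from the click handler below, and the `gr.Progress` default expects that event context): each yield is a 4-tuple of dataframe rows, persons markdown, an OCR JSON string, and a status line.

    for df_rows, persons_md, ocr_json_str, status in process_uploaded_files(uploaded_paths):  # uploaded_paths: hypothetical list of file paths
        print(status)
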
 
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 📄 Intelligent Document Processor & Classifier")
      gr.Markdown(
+         "**Upload multiple documents (images of passports, bank statements, hotel reservations, photos, etc.). "
+         "The system will perform OCR, attempt to extract key entities, and classify documents by the person they belong to.**\n"
+         "Ensure `OPENROUTER_API_KEY` is set as a Secret in your Hugging Face Space."
      )
      if not OPENROUTER_API_KEY:
+         gr.Markdown("<h3 style='color:red;'>⚠️ ERROR: `OPENROUTER_API_KEY` is not set in Space Secrets! OCR will fail.</h3>")
      with gr.Row():
          with gr.Column(scale=1):
+             files_input = gr.Files(label="Upload Document Images (Bulk)", file_count="multiple", type="filepath")  # filepath mode hands the callback plain paths
+             process_button = gr.Button("🚀 Process Uploaded Documents", variant="primary")
+             overall_status_textbox = gr.Textbox(label="Overall Progress", interactive=False, lines=1)
+     gr.Markdown("---")
+     gr.Markdown("## Document Processing Details")
+     dataframe_headers = ["Doc ID (short)", "Filename", "Status", "Detected Type", "Name", "DOB", "Passport No.", "Assigned Person Key"]
+     document_status_df = gr.Dataframe(
+         headers=dataframe_headers,
+         datatype=["str"] * len(dataframe_headers),
+         label="Individual Document Status & Extracted Entities",
+         row_count=(1, "dynamic"),  # start with one row; grows dynamically
+         col_count=(len(dataframe_headers), "fixed"),
+         wrap=True
      )
+     ocr_json_output = gr.Code(label="Selected Document OCR JSON", language="json", interactive=False)
+     gr.Markdown("---")
+     person_classification_output_md = gr.Markdown("## Classified Persons & Documents\nNo persons identified yet.")
      process_button.click(
+         fn=process_uploaded_files,
+         inputs=[files_input],
+         outputs=[
+             document_status_df,
+             person_classification_output_md,
+             ocr_json_output,
+             overall_status_textbox
+         ]
      )
+     @document_status_df.select(inputs=None, outputs=ocr_json_output, show_progress="hidden")
+     def display_selected_ocr(evt: gr.SelectData):
+         if evt.index is None or evt.index[0] is None:
+             return "{}"
+         selected_row_index = evt.index[0]
+         # processed_files_data is module-level state, so it is visible here; a larger
+         # app might pass it explicitly or wrap this logic in a class instead.
+         if 0 <= selected_row_index < len(processed_files_data):
+             selected_doc_data = processed_files_data[selected_row_index]
+             if selected_doc_data and selected_doc_data.get("ocr_json"):
+                 ocr_data_to_display = selected_doc_data["ocr_json"]
+                 if isinstance(ocr_data_to_display, str):  # defensive: it should already be a dict
+                     try:
+                         ocr_data_to_display = json.loads(ocr_data_to_display)
+                     except json.JSONDecodeError:
+                         return json.dumps({"error": "Stored OCR data is not a valid JSON string."}, indent=2)
+                 return json.dumps(ocr_data_to_display, indent=2, ensure_ascii=False)
+         return json.dumps({"message": "No OCR data found for the selected row, or the selection is out of bounds (processing may still be running). Current rows: " + str(len(processed_files_data))}, indent=2)
 
  if __name__ == "__main__":
+     demo.queue().launch(debug=True, share=os.environ.get("GRADIO_SHARE", "true").lower() == "true")
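One design note on the launch line: `share` is read from the `GRADIO_SHARE` environment variable (defaulting to "true"), so a deployment can keep the app private without touching code; illustratively, in Python before launch:

    import os
    os.environ["GRADIO_SHARE"] = "false"  # must be set before launch() to disable the public link
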
 
 
 
 
 
requirements.txt CHANGED
@@ -1,4 +1,8 @@
- gradio>=4.26.0
- requests>=2.25.0,<3.0.0
- # pillow might be needed explicitly if not pulled by gradio for image handling
- Pillow>=9.0.0
+ gradio>=4.0.0
+ requests>=2.25.0
+ Pillow>=9.0.0
+ deepface>=0.0.79
+ tensorflow-cpu>=2.10.0 # CPU build of TensorFlow; use the plain tensorflow package if a GPU is available
+ opencv-python-headless>=4.5.0
+ # retina-face PyPI package for the face detector, in case deepface does not pull it in automatically
+ retina-face>=0.0.12
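Nothing in the application code above imports the new face-analysis dependencies yet; they appear staged for later use. A quick import check (illustrative only) confirms the stack resolves after install:

    from deepface import DeepFace  # heavy: pulls in TensorFlow on first import
    import cv2                     # provided by opencv-python-headless
    print("face-analysis stack OK")
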