Spaces:
Runtime error
Runtime error
Sandy2636
committed on
Commit
·
ba14e67
1
Parent(s):
e08f157
Add application file
Browse files
- .gradio/certificate.pem +31 -0
- app.py +390 -198
- old_app.py +267 -0
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
-----BEGIN CERTIFICATE-----
|
2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
31 |
+
-----END CERTIFICATE-----
|
app.py
CHANGED
@@ -1,70 +1,105 @@
|
|
1 |
import gradio as gr
|
2 |
-
import requests
|
3 |
import base64
|
4 |
-
import
|
5 |
import json
|
6 |
-
import
|
|
|
|
|
|
|
7 |
|
8 |
# --- Configuration ---
|
9 |
-
OPENROUTER_API_KEY
|
10 |
-
|
|
|
11 |
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
|
12 |
|
13 |
-
# ---
|
14 |
-
|
|
|
|
|
|
|
15 |
|
16 |
# --- Helper Functions ---
|
17 |
|
18 |
-
def
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
Your task is to meticulously analyze this image and extract all relevant information.
|
22 |
|
23 |
Output Format Instructions:
|
24 |
Provide your response as a SINGLE, VALID JSON OBJECT. Do not include any explanatory text before or after the JSON.
|
25 |
The JSON object should have the following top-level keys:
|
26 |
-
- "
|
27 |
-
- "document_type_detected": (string) Your best guess of the specific document type (e.g., "Passport", "National ID Card", "Driver's License", "Visa Sticker", "Hotel Confirmation Voucher", "Boarding Pass", "Photograph of a person").
|
28 |
- "extracted_fields": (object) A key-value map of all extracted information. Be comprehensive. Examples:
|
29 |
-
- For passports/IDs: "Surname", "Given Names", "Document Number", "Nationality", "Date of Birth", "Sex", "Place of Birth", "Date of Issue", "Date of Expiry", "Issuing Authority", "Country Code".
|
30 |
-
- For hotel reservations: "Guest Name", "Hotel Name", "Booking Reference", "Check-in Date", "Check-out Date"
|
31 |
-
- For
|
|
|
32 |
- "mrz_data": (object or null) If a Machine Readable Zone (MRZ) is present:
|
33 |
- "raw_mrz_lines": (array of strings) Each line of the MRZ.
|
34 |
-
- "parsed_mrz": (object) Key-value pairs of parsed MRZ fields
|
35 |
If no MRZ, this field should be null.
|
36 |
-
- "multilingual_info": (array of objects or null) For any text segments not in English:
|
37 |
-
- Each object: {{"language_detected": "ISO 639-1 code", "original_text": "...", "english_translation_or_transliteration": "..."}}
|
38 |
-
If no non-English text, this field can be null or an empty array.
|
39 |
- "full_text_ocr": (string) Concatenation of all text found on the document.
|
40 |
|
41 |
Extraction Guidelines:
|
42 |
-
1. Prioritize accuracy.
|
43 |
-
2. Extract all visible text
|
44 |
3. For dates, try to use ISO 8601 format (YYYY-MM-DD) if possible, but retain original format if conversion is ambiguous.
|
45 |
-
4. If the image is a photo of a person without much text, the "extracted_fields" might contain a description, and "full_text_ocr" might be minimal.
|
46 |
-
5. If the document is multi-page and only one page is provided, note this if apparent.
|
47 |
|
48 |
Ensure the entire output strictly adheres to the JSON format.
|
49 |
"""
|
50 |
-
return prompt
|
51 |
|
52 |
-
def
|
53 |
if not OPENROUTER_API_KEY:
|
54 |
-
return {"error": "OpenRouter API
|
55 |
try:
|
56 |
-
with open(
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
data_url = f"data:{mime_type};base64,{
|
67 |
-
prompt_text =
|
|
|
68 |
payload = {
|
69 |
"model": IMAGE_MODEL,
|
70 |
"messages": [
|
@@ -76,192 +111,349 @@ def process_single_image_with_openrouter(image_path, doc_type):
|
|
76 |
]
|
77 |
}
|
78 |
],
|
79 |
-
"max_tokens":
|
80 |
"temperature": 0.1,
|
81 |
}
|
82 |
headers = {
|
83 |
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
|
84 |
"Content-Type": "application/json",
|
85 |
-
"HTTP-Referer": "https://huggingface.co/spaces/
|
86 |
-
"X-Title": "
|
87 |
}
|
88 |
-
|
89 |
-
response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=
|
90 |
response.raise_for_status()
|
91 |
result = response.json()
|
92 |
-
|
93 |
if "choices" in result and result["choices"]:
|
94 |
-
|
95 |
-
|
96 |
-
if clean_content.startswith("```json"):
|
97 |
-
clean_content = clean_content[7:]
|
98 |
-
if clean_content.endswith("```"):
|
99 |
-
clean_content = clean_content[:-3]
|
100 |
-
elif clean_content.startswith("`") and clean_content.endswith("`"):
|
101 |
-
clean_content = clean_content[1:-1]
|
102 |
-
try:
|
103 |
-
parsed_json = json.loads(clean_content)
|
104 |
-
if "document_type_provided" not in parsed_json:
|
105 |
-
parsed_json["document_type_provided"] = doc_type
|
106 |
-
return parsed_json
|
107 |
-
except json.JSONDecodeError as e:
|
108 |
-
print(f"JSONDecodeError: {e}. Raw content was:\n{content_text}")
|
109 |
-
return {
|
110 |
-
"error": "Failed to parse LLM output as JSON.",
|
111 |
-
"raw_content_from_llm": content_text,
|
112 |
-
"document_type_provided": doc_type
|
113 |
-
}
|
114 |
else:
|
115 |
-
|
116 |
-
|
117 |
except requests.exceptions.Timeout:
|
118 |
-
|
119 |
-
return {"error": "API request timed out.", "document_type_provided": doc_type}
|
120 |
except requests.exceptions.RequestException as e:
|
121 |
error_message = f"API Request Error: {str(e)}"
|
122 |
-
if e.response is not None:
|
123 |
error_message += f" Status: {e.response.status_code}, Response: {e.response.text}"
|
124 |
-
|
125 |
-
return {"error": error_message, "document_type_provided": doc_type}
|
126 |
except Exception as e:
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
}
|
184 |
-
|
185 |
-
|
186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
194 |
-
gr.Markdown("# π Document
|
195 |
gr.Markdown(
|
196 |
-
"**
|
197 |
-
"
|
198 |
-
"
|
199 |
-
"3. Click 'Add Document to Current Batch'. Repeat for all documents of a person or a related set.\n"
|
200 |
-
"4. Review the batch. Click 'Clear Entire Batch' to start over.\n"
|
201 |
-
"5. Click 'Process Batch and Extract Information' to send documents to the AI.\n"
|
202 |
-
"6. View the extracted information in JSON format below."
|
203 |
)
|
|
|
204 |
if not OPENROUTER_API_KEY:
|
205 |
-
gr.Markdown(
|
206 |
-
|
207 |
-
"API calls will fail. Please set it and restart this application.</h3>"
|
208 |
-
)
|
209 |
with gr.Row():
|
210 |
with gr.Column(scale=1):
|
211 |
-
gr.
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
value='passport_front',
|
227 |
-
filterable=True
|
228 |
-
)
|
229 |
-
add_button = gr.Button("β Add Document to Current Batch", variant="secondary")
|
230 |
-
with gr.Column(scale=2):
|
231 |
-
gr.Markdown("### Step 2: Review Current Batch")
|
232 |
-
batch_dataframe = gr.Dataframe(
|
233 |
-
headers=["Filename", "Document Type"],
|
234 |
-
datatype=["str", "str"],
|
235 |
-
row_count=1, # Changed: Start with 1 row, should grow dynamically
|
236 |
-
col_count=2, # Changed: Simpler integer for fixed columns
|
237 |
-
wrap=True
|
238 |
-
)
|
239 |
-
clear_batch_button = gr.Button("ποΈ Clear Entire Batch", variant="stop")
|
240 |
-
gr.Markdown("### Step 3: Process Batch")
|
241 |
-
process_button = gr.Button("π Process Batch and Extract Information", variant="primary")
|
242 |
-
status_message_textbox = gr.Textbox(label="Processing Status", interactive=False, lines=2)
|
243 |
-
gr.Markdown("### Step 4: View Results")
|
244 |
-
output_json_display = gr.JSON(label="Extracted Information (JSON Format)")
|
245 |
-
add_button.click(
|
246 |
-
fn=add_document_to_batch_ui,
|
247 |
-
inputs=[image_input, doc_type_input],
|
248 |
-
outputs=[batch_dataframe, status_message_textbox]
|
249 |
-
).then(lambda: None, outputs=image_input)
|
250 |
-
clear_batch_button.click(
|
251 |
-
fn=clear_batch_ui,
|
252 |
-
inputs=[],
|
253 |
-
outputs=[batch_dataframe, status_message_textbox]
|
254 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
process_button.click(
|
256 |
-
fn=
|
257 |
-
inputs=[],
|
258 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
259 |
)
|
260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
if __name__ == "__main__":
|
262 |
-
|
263 |
-
print("ERROR: The OPENROUTER_API_KEY environment variable is not set.")
|
264 |
-
print("Please set it before running the application, e.g.:")
|
265 |
-
print(" export OPENROUTER_API_KEY='your_openrouter_key_here'")
|
266 |
-
print("The application will launch, but API calls will fail.")
|
267 |
-
demo.launch(share=True) # Added share=True
|
|
|
1 |
import gradio as gr
|
|
|
2 |
import base64
|
3 |
+
import requests
|
4 |
import json
|
5 |
+
import re
|
6 |
+
import os
|
7 |
+
import uuid
|
8 |
+
from datetime import datetime
|
9 |
|
10 |
# --- Configuration ---
|
11 |
+
# IMPORTANT: Set your OPENROUTER_API_KEY as a Hugging Face Space Secret
# (exposed to the app as an environment variable). Never commit the key
# itself: any key that has appeared in the repository history must be
# treated as compromised and revoked in the OpenRouter dashboard.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "").strip()
IMAGE_MODEL = "opengvlab/internvl3-14b:free"  # Using the free tier model as specified
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# --- Global State (managed within Gradio's session if possible, or module-level for simplicity here) ---
# This will be reset each time the processing function is called.
# For a multi-user or more robust app, session state or a proper backend DB would be needed.
processed_files_data = []  # Stores dicts for each file's details and status
person_profiles = {}  # Stores dicts for each identified person and their documents
|
21 |
|
22 |
# --- Helper Functions ---
|
23 |
|
24 |
+
def extract_json_from_text(text):
    """Extract a JSON object from model output and always return a dict.

    Tries, in order: a fenced ```json ... ``` block, the text wrapped in
    single backticks, the stripped text as-is, and finally the substring
    between the first '{' and the last '}'.

    Args:
        text: Raw text returned by the model.

    Returns:
        The parsed JSON object as a dict. On failure, a dict with an
        "error" message (plus "original_text" when text was provided).
        Callers look up keys on the result, so valid JSON that is not an
        object (list, string, number) is reported as an error instead of
        being returned.
    """
    if not text:
        return {"error": "Empty text provided for JSON extraction."}
    if not isinstance(text, str):
        # Guard before the regex/strip calls below, which require a string.
        return {
            "error": f"Non-text content provided for JSON extraction: {type(text).__name__}",
            "original_text": text,
        }

    # Try to match a ```json ... ``` code block first.
    match_block = re.search(r"```json\s*(\{.*?\})\s*```", text, re.DOTALL | re.IGNORECASE)
    if match_block:
        json_str = match_block.group(1)
    else:
        # Otherwise assume the text is JSON, possibly wrapped in single backticks.
        json_str = text.strip()
        if json_str.startswith("`") and json_str.endswith("`"):
            json_str = json_str[1:-1]

    try:
        parsed = json.loads(json_str)
    except json.JSONDecodeError as e:
        first_error = f"Invalid JSON structure: {str(e)}"
    else:
        if isinstance(parsed, dict):
            return parsed
        first_error = (
            f"Invalid JSON structure: expected a JSON object, got {type(parsed).__name__}"
        )

    # Fallback: parse the span from the first '{' to the last '}'.
    first_brace = json_str.find("{")
    last_brace = json_str.rfind("}")
    if first_brace == -1 or last_brace <= first_brace:
        return {"error": first_error, "original_text": text}
    try:
        # A span that starts with '{' and parses successfully is an object.
        return json.loads(json_str[first_brace:last_brace + 1])
    except json.JSONDecodeError as e2:
        return {
            "error": f"Invalid JSON structure after attempting substring: {str(e2)}",
            "original_text": text,
        }
|
57 |
+
|
58 |
+
|
59 |
+
def get_ocr_prompt():
    """Return the fixed instruction prompt sent to the vision model with each image.

    The prompt asks for a single JSON object with the top-level keys
    "document_type_detected", "extracted_fields", "mrz_data" and
    "full_text_ocr".

    It is a plain string rather than an f-string: nothing is interpolated,
    and a plain literal stays valid if a literal brace is ever added.
    """
    return """You are an advanced OCR and information extraction AI.
Your task is to meticulously analyze this image and extract all relevant information.

Output Format Instructions:
Provide your response as a SINGLE, VALID JSON OBJECT. Do not include any explanatory text before or after the JSON.
The JSON object should have the following top-level keys:
- "document_type_detected": (string) Your best guess of the specific document type (e.g., "Passport", "National ID Card", "Driver's License", "Visa Sticker", "Hotel Confirmation Voucher", "Bank Statement", "Photo of a person").
- "extracted_fields": (object) A key-value map of all extracted information. Be comprehensive. Examples:
- For passports/IDs: "Surname", "Given Names", "Full Name", "Document Number", "Nationality", "Date of Birth", "Sex", "Place of Birth", "Date of Issue", "Date of Expiry", "Issuing Authority", "Country Code".
- For hotel reservations: "Guest Name", "Hotel Name", "Booking Reference", "Check-in Date", "Check-out Date".
- For bank statements: "Account Holder Name", "Account Number", "Bank Name", "Statement Period", "Ending Balance".
- For photos: "Description" (e.g., "Portrait of a person", "Group photo at a location"), "People Present" (array of strings if multiple).
- "mrz_data": (object or null) If a Machine Readable Zone (MRZ) is present:
- "raw_mrz_lines": (array of strings) Each line of the MRZ.
- "parsed_mrz": (object) Key-value pairs of parsed MRZ fields.
If no MRZ, this field should be null.
- "full_text_ocr": (string) Concatenation of all text found on the document.

Extraction Guidelines:
1. Prioritize accuracy.
2. Extract all visible text. Include "Full Name" by combining given and surnames if possible.
3. For dates, try to use ISO 8601 format (YYYY-MM-DD) if possible, but retain original format if conversion is ambiguous.

Ensure the entire output strictly adheres to the JSON format.
"""
|
|
|
85 |
|
86 |
+
def call_openrouter_ocr(image_filepath):
|
87 |
if not OPENROUTER_API_KEY:
|
88 |
+
return {"error": "OpenRouter API Key not configured."}
|
89 |
try:
|
90 |
+
with open(image_filepath, "rb") as f:
|
91 |
+
encoded_image = base64.b64encode(f.read()).decode("utf-8")
|
92 |
+
|
93 |
+
# Basic MIME type guessing, default to jpeg
|
94 |
+
mime_type = "image/jpeg"
|
95 |
+
if image_filepath.lower().endswith(".png"):
|
96 |
+
mime_type = "image/png"
|
97 |
+
elif image_filepath.lower().endswith(".webp"):
|
98 |
+
mime_type = "image/webp"
|
99 |
+
|
100 |
+
data_url = f"data:{mime_type};base64,{encoded_image}"
|
101 |
+
prompt_text = get_ocr_prompt()
|
102 |
+
|
103 |
payload = {
|
104 |
"model": IMAGE_MODEL,
|
105 |
"messages": [
|
|
|
111 |
]
|
112 |
}
|
113 |
],
|
114 |
+
"max_tokens": 3500, # Increased for detailed JSON
|
115 |
"temperature": 0.1,
|
116 |
}
|
117 |
headers = {
|
118 |
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
|
119 |
"Content-Type": "application/json",
|
120 |
+
"HTTP-Referer": "https://huggingface.co/spaces/DoClassifier", # Optional: Update with your Space URL
|
121 |
+
"X-Title": "DoClassifier Processor" # Optional
|
122 |
}
|
123 |
+
|
124 |
+
response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=180) # 3 min timeout
|
125 |
response.raise_for_status()
|
126 |
result = response.json()
|
127 |
+
|
128 |
if "choices" in result and result["choices"]:
|
129 |
+
raw_content = result["choices"][0]["message"]["content"]
|
130 |
+
return extract_json_from_text(raw_content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
else:
|
132 |
+
return {"error": "No 'choices' in API response from OpenRouter.", "details": result}
|
133 |
+
|
134 |
except requests.exceptions.Timeout:
|
135 |
+
return {"error": "API request timed out."}
|
|
|
136 |
except requests.exceptions.RequestException as e:
|
137 |
error_message = f"API Request Error: {str(e)}"
|
138 |
+
if hasattr(e, 'response') and e.response is not None:
|
139 |
error_message += f" Status: {e.response.status_code}, Response: {e.response.text}"
|
140 |
+
return {"error": error_message}
|
|
|
141 |
except Exception as e:
|
142 |
+
return {"error": f"An unexpected error occurred during OCR: {str(e)}"}
|
143 |
+
|
144 |
+
def _first_field_value(fields, candidate_keys, normalize=str):
    """Return the normalized value of the first candidate key that yields a truthy result.

    Field names are matched case-insensitively; when two fields differ only
    by case, the one appearing first in *fields* is used. Candidates are
    tried in priority order, and a candidate whose value is empty is
    skipped in favour of the next one.
    """
    first_seen = {}
    for field_key, value in fields.items():
        first_seen.setdefault(str(field_key).lower(), value)

    result = None
    for key in candidate_keys:
        if key not in first_seen:
            continue
        value = first_seen[key]
        result = normalize(value) if value else None
        if result:
            break
    return result


def extract_entities_from_ocr(ocr_json):
    """Pull the person-identifying entities out of one OCR result.

    Args:
        ocr_json: The parsed OCR JSON. ``None`` or any non-dict value is
            treated as an empty result instead of raising.

    Returns:
        A dict with keys "name", "dob", "passport_no" and "doc_type".
        The first three are ``None`` when not found. "doc_type" is the
        value of "document_type_detected", or "Unknown" when that key is
        absent.
    """
    if not isinstance(ocr_json, dict):
        ocr_json = {}

    doc_type = ocr_json.get("document_type_detected", "Unknown")
    fields = ocr_json.get("extracted_fields")
    if not isinstance(fields, dict):
        return {"name": None, "dob": None, "passport_no": None, "doc_type": doc_type}

    # Candidate field names in priority order (matched case-insensitively).
    name_keys = ["full name", "name", "account holder name", "guest name"]
    dob_keys = ["date of birth", "dob"]
    passport_keys = ["document number", "passport number"]

    return {
        "name": _first_field_value(fields, name_keys),
        "dob": _first_field_value(fields, dob_keys),
        # Normalize document numbers so "ab 123" and "AB123" compare equal.
        "passport_no": _first_field_value(
            fields, passport_keys, normalize=lambda v: str(v).replace(" ", "").upper()
        ),
        "doc_type": doc_type,
    }
|
189 |
+
|
190 |
+
def normalize_name(name):
    """Return *name* lower-cased with every non-alphanumeric character removed.

    An empty or ``None`` name normalizes to the empty string.
    """
    if not name:
        return ""
    return "".join(filter(str.isalnum, name)).lower()


def _attach_document(profile, doc_id, name, dob, passport_no):
    """Add *doc_id* to *profile* and record any identifiers the document carries."""
    profile["doc_ids"].add(doc_id)
    if name:
        profile.setdefault("names", set()).add(normalize_name(name))
        if not profile.get("canonical_name"):
            profile["canonical_name"] = name
    if dob:
        profile.setdefault("dobs", set()).add(dob)
        if not profile.get("canonical_dob"):
            profile["canonical_dob"] = dob
    if passport_no:
        profile.setdefault("passport_numbers", set()).add(passport_no)


def get_person_id_and_update_profiles(doc_id, entities, current_persons_data):
    """Assign a document to an existing person profile or create a new one.

    Matching order:
      1. passport/document number (strongest identifier);
      2. normalized name + date of birth. This is also tried for documents
         that carry a passport number no profile knows yet, so a passport
         processed after a name+DOB-only document of the same person joins
         that profile instead of creating a second one.
    On a match, the document's identifiers are recorded on the profile so
    later documents can match on them too.

    Args:
        doc_id: Unique ID of the document being classified.
        entities: Dict with optional "name", "dob" and "passport_no".
        current_persons_data: Mapping of person key to profile dict;
            updated in place.

    Returns:
        The key of the profile the document was assigned to.
    """
    passport_no = entities.get("passport_no")
    name = entities.get("name")
    dob = entities.get("dob")
    norm_name = normalize_name(name)

    matched_key = None
    # 1. Match by passport number.
    if passport_no:
        matched_key = next(
            (p_key for p_key, p_data in current_persons_data.items()
             if passport_no in p_data.get("passport_numbers", set())),
            None,
        )
    # 2. Match by normalized name + DOB.
    if matched_key is None and name and dob:
        matched_key = next(
            (p_key for p_key, p_data in current_persons_data.items()
             if norm_name in p_data.get("names", set()) and dob in p_data.get("dobs", set())),
            None,
        )
    if matched_key is not None:
        _attach_document(current_persons_data[matched_key], doc_id, name, dob, passport_no)
        return matched_key

    # 3. No match: create a profile keyed by the strongest identifier available.
    if passport_no:
        new_person_key = f"person_{passport_no}"
        display_name = name or f"Person (ID: {passport_no})"
    elif name and dob:
        new_person_key = f"person_{norm_name}_{dob}_{str(uuid.uuid4())[:4]}"
        display_name = name
    elif name:
        # Name only is too weak to match on, so it always starts a new profile.
        new_person_key = f"person_{norm_name}_{str(uuid.uuid4())[:4]}"
        display_name = name
    else:
        # Unclassifiable: give the document its own placeholder profile.
        generic_person_key = f"unidentified_person_{str(uuid.uuid4())[:6]}"
        current_persons_data[generic_person_key] = {
            "canonical_name": "Unknown", "canonical_dob": None,
            "names": set(), "dobs": set(), "passport_numbers": set(),
            "doc_ids": {doc_id}, "display_name": f"Unknown Person ({doc_id[:6]})",
        }
        return generic_person_key

    current_persons_data[new_person_key] = {
        "canonical_name": name,
        "canonical_dob": dob,
        "names": {norm_name} if name else set(),
        "dobs": {dob} if dob else set(),
        "passport_numbers": {passport_no} if passport_no else set(),
        "doc_ids": {doc_id},
        "display_name": display_name,
    }
    return new_person_key
|
269 |
+
|
270 |
+
|
271 |
+
def format_dataframe_data(current_files_data):
    """Build the rows shown in the per-document status table.

    Column order: short doc ID, filename, status, detected type, name,
    DOB, passport number, assigned person key.

    Missing values are rendered as "N/A". The fallback uses ``or`` rather
    than a ``dict.get`` default because these keys are normally present
    with the value ``None``, which a ``get`` default would pass through.

    Args:
        current_files_data: List of per-file dicts with at least
            "doc_id", "filename" and "status".

    Returns:
        A list of 8-element row lists, one per file.
    """
    df_rows = []
    for f_data in current_files_data:
        entities = f_data.get("entities") or {}
        df_rows.append([
            f_data["doc_id"][:8],  # Short ID
            f_data["filename"],
            f_data["status"],
            entities.get("doc_type") or "N/A",
            entities.get("name") or "N/A",
            entities.get("dob") or "N/A",
            entities.get("passport_no") or "N/A",
            f_data.get("assigned_person_key") or "N/A",
        ])
    return df_rows
|
288 |
+
|
289 |
+
def format_persons_markdown(current_persons_data, current_files_data):
    """Render the person profiles and their documents as Markdown.

    Documents are listed in upload order and passport numbers are sorted,
    so the output does not depend on set iteration order. A file whose
    "entities" is ``None`` is shown with "Unknown Type" instead of raising.

    Args:
        current_persons_data: Mapping of person key to profile dict.
        current_files_data: List of per-file dicts with "doc_id" and
            "filename", and optionally "entities".

    Returns:
        The Markdown text, or a placeholder sentence when there are no
        profiles.
    """
    if not current_persons_data:
        return "No persons identified yet."

    md_parts = ["## Classified Persons & Documents\n"]
    for p_key, p_data in current_persons_data.items():
        display_name = p_data.get('display_name', p_key)
        md_parts.append(f"### Person: {display_name} (Profile Key: {p_key})")
        if p_data.get("canonical_dob"):
            md_parts.append(f"* DOB: {p_data['canonical_dob']}")
        if p_data.get("passport_numbers"):
            md_parts.append(f"* Passport(s): {', '.join(sorted(p_data['passport_numbers']))}")

        md_parts.append("* Documents:")
        doc_ids_for_person = set(p_data.get("doc_ids", set()))
        if doc_ids_for_person:
            # One pass over the files keeps upload order and avoids a scan per document.
            known_docs = [f for f in current_files_data if f["doc_id"] in doc_ids_for_person]
            for doc_detail in known_docs:
                filename = doc_detail["filename"]
                doc_type = (doc_detail.get("entities") or {}).get("doc_type") or "Unknown Type"
                md_parts.append(f" - {filename} (`{doc_type}`)")
            missing_ids = doc_ids_for_person - {f["doc_id"] for f in known_docs}
            for doc_id in sorted(missing_ids):
                md_parts.append(f" - Document ID: {doc_id[:8]} (details not found, unexpected)")
        else:
            md_parts.append(" - No documents currently assigned.")
        md_parts.append("\n---\n")
    return "\n".join(md_parts)
|
316 |
+
|
317 |
+
# --- Main Gradio Processing Function (Generator) ---
|
318 |
+
def process_uploaded_files(files_list, progress=gr.Progress(track_tqdm=True)):
    """Run OCR, entity extraction and person classification over the uploads.

    This is a generator: after every stage it yields a 4-tuple
    ``(dataframe_rows, persons_markdown, ocr_json_text, status_text)``
    matching the four Gradio outputs wired to the process button.

    Args:
        files_list: The uploaded files. Each item may be a plain path
            (str / os.PathLike, which the ``type="filepath"`` upload
            component passes) or an object exposing the path as ``.name``.
        progress: Gradio progress tracker used to wrap the per-file loop.
    """
    global processed_files_data, person_profiles  # Reset global state for each run
    processed_files_data = []
    person_profiles = {}

    if not OPENROUTER_API_KEY:
        yield (
            [["N/A", "ERROR", "OpenRouter API Key not configured.", "N/A", "N/A", "N/A", "N/A", "N/A"]],
            "Error: OpenRouter API Key not configured. Please set it in Space Secrets.",
            "{}", "API Key Missing. Processing halted."
        )
        return

    if not files_list:
        yield ([], "No files uploaded.", "{}", "Upload files to begin.")
        return

    # Initialize processed_files_data
    for file_obj in files_list:
        # Plain paths have no usable ``.name`` attribute (str has none, and
        # Path.name is only the basename), so handle them before the
        # file-object case.
        if isinstance(file_obj, (str, os.PathLike)):
            filepath = os.fspath(file_obj)
        else:
            filepath = file_obj.name
        processed_files_data.append({
            "doc_id": str(uuid.uuid4()),
            "filename": os.path.basename(filepath),
            "filepath": filepath,
            "status": "Queued",
            "ocr_json": None,
            "entities": None,
            "assigned_person_key": None
        })

    total = len(processed_files_data)
    initial_df_data = format_dataframe_data(processed_files_data)
    initial_persons_md = format_persons_markdown(person_profiles, processed_files_data)
    yield (initial_df_data, initial_persons_md, "{}", f"Initialized. Found {len(files_list)} files.")

    # Iterate and process each file
    for i, file_data_item in enumerate(progress.tqdm(processed_files_data, desc="Processing Documents")):
        current_doc_id = file_data_item["doc_id"]
        current_filename = file_data_item["filename"]
        step = f"({i+1}/{total})"

        # 1. OCR Processing
        file_data_item["status"] = "OCR in Progress..."
        df_data = format_dataframe_data(processed_files_data)
        persons_md = format_persons_markdown(person_profiles, processed_files_data)  # No change yet
        yield (df_data, persons_md, "{}", f"{step} OCR for: {current_filename}")

        ocr_result = call_openrouter_ocr(file_data_item["filepath"])
        if not isinstance(ocr_result, dict):
            # The code below looks up keys on the result, so wrap anything else as an error.
            ocr_result = {"error": "OCR returned a non-object result.", "details": ocr_result}
        file_data_item["ocr_json"] = ocr_result  # Store full JSON
        ocr_text = json.dumps(ocr_result, indent=2)

        if "error" in ocr_result:
            # str() first: slicing would fail on a non-string "error" value.
            file_data_item["status"] = f"OCR Error: {str(ocr_result['error'])[:50]}..."  # Truncate long errors
            df_data = format_dataframe_data(processed_files_data)
            yield (df_data, persons_md, ocr_text, f"{step} OCR Error on {current_filename}")
            continue  # Move to next file

        file_data_item["status"] = "OCR Done. Extracting Entities..."
        df_data = format_dataframe_data(processed_files_data)
        yield (df_data, persons_md, ocr_text, f"{step} OCR Done for {current_filename}")

        # 2. Entity Extraction
        entities = extract_entities_from_ocr(ocr_result)
        file_data_item["entities"] = entities
        file_data_item["status"] = "Entities Extracted. Classifying..."
        df_data = format_dataframe_data(processed_files_data)  # Now entities will show up
        yield (df_data, persons_md, ocr_text, f"{step} Entities for {current_filename}")

        # 3. Person Classification / Linking
        person_key = get_person_id_and_update_profiles(current_doc_id, entities, person_profiles)
        file_data_item["assigned_person_key"] = person_key
        file_data_item["status"] = "Classified"

        df_data = format_dataframe_data(processed_files_data)
        persons_md = format_persons_markdown(person_profiles, processed_files_data)  # Now persons_md updates
        yield (df_data, persons_md, ocr_text, f"{step} Classified {current_filename} -> {person_key}")

    final_df_data = format_dataframe_data(processed_files_data)
    final_persons_md = format_persons_markdown(person_profiles, processed_files_data)
    yield (final_df_data, final_persons_md, "{}", f"All {total} documents processed.")
|
395 |
+
|
396 |
+
|
397 |
+
# --- Gradio UI Layout ---
|
398 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
399 |
+
gr.Markdown("# π Intelligent Document Processor & Classifier")
|
400 |
gr.Markdown(
|
401 |
+
"**Upload multiple documents (images of passports, bank statements, hotel reservations, photos, etc.). "
|
402 |
+
"The system will perform OCR, attempt to extract key entities, and classify documents by the person they belong to.**\n"
|
403 |
+
"Ensure `OPENROUTER_API_KEY` is set as a Secret in your Hugging Face Space."
|
|
|
|
|
|
|
|
|
404 |
)
|
405 |
+
|
406 |
if not OPENROUTER_API_KEY:
|
407 |
+
gr.Markdown("<h3 style='color:red;'>β οΈ ERROR: `OPENROUTER_API_KEY` is not set in Space Secrets! OCR will fail.</h3>")
|
408 |
+
|
|
|
|
|
409 |
with gr.Row():
|
410 |
with gr.Column(scale=1):
|
411 |
+
files_input = gr.Files(label="Upload Document Images (Bulk)", file_count="multiple", type="filepath")
|
412 |
+
process_button = gr.Button("Process Uploaded Documents", variant="primary")
|
413 |
+
overall_status_textbox = gr.Textbox(label="Overall Progress", interactive=False, lines=1)
|
414 |
+
|
415 |
+
gr.Markdown("---")
|
416 |
+
gr.Markdown("## Document Processing Details")
|
417 |
+
# "ID", "Filename", "Status", "Detected Type", "Extracted Name", "Extracted DOB", "Main ID", "Person Key"
|
418 |
+
dataframe_headers = ["Doc ID (short)", "Filename", "Status", "Detected Type", "Name", "DOB", "Passport No.", "Assigned Person Key"]
|
419 |
+
document_status_df = gr.Dataframe(
|
420 |
+
headers=dataframe_headers,
|
421 |
+
datatype=["str"] * len(dataframe_headers), # All as strings for display simplicity
|
422 |
+
label="Individual Document Status & Extracted Entities",
|
423 |
+
row_count=(0, "dynamic"), # Start empty, dynamically grows
|
424 |
+
col_count=(len(dataframe_headers), "fixed"),
|
425 |
+
wrap=True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
426 |
)
|
427 |
+
|
428 |
+
ocr_json_output = gr.Code(label="Selected Document OCR JSON", language="json", interactive=False)
|
429 |
+
|
430 |
+
gr.Markdown("---")
|
431 |
+
person_classification_output_md = gr.Markdown("## Classified Persons & Documents\nNo persons identified yet.")
|
432 |
+
|
433 |
+
# Event Handlers
|
434 |
process_button.click(
|
435 |
+
fn=process_uploaded_files,
|
436 |
+
inputs=[files_input],
|
437 |
+
outputs=[
|
438 |
+
document_status_df,
|
439 |
+
person_classification_output_md,
|
440 |
+
ocr_json_output, # Temporarily show last OCR here, better if select event works
|
441 |
+
overall_status_textbox
|
442 |
+
]
|
443 |
)
|
444 |
|
445 |
+
@document_status_df.select(inputs=None, outputs=ocr_json_output, show_progress="hidden")
def display_selected_ocr(evt: gr.SelectData):
    """Return the OCR JSON text for the row selected in the status table.

    ``evt.index`` is read as a ``(row, col)`` pair; only the row is used, as a
    position in the module-level ``processed_files_data`` list.

    Returns:
        ``"{}"`` when nothing is selected, the pretty-printed ``ocr_json`` of
        the selected record when it has one, otherwise a fixed JSON message.
    """
    fallback = "{ \"message\": \"No OCR data found for selected row or selection out of bounds.\" }"

    # Nothing selected or invalid selection.
    if evt.index is None or evt.index[0] is None:
        return "{}"

    row = evt.index[0]
    if row >= len(processed_files_data):
        return fallback

    record = processed_files_data[row]
    if record and record["ocr_json"]:
        return json.dumps(record["ocr_json"], indent=2)
    return fallback
|
456 |
+
|
457 |
+
|
458 |
if __name__ == "__main__":
    # queue() is enabled so longer-running processing goes through Gradio's request queue.
    # NOTE(review): share=True requests a public share link; confirm it is
    # actually needed when the app is hosted on Hugging Face Spaces.
    demo.queue().launch(debug=True, share=True) # Use queue for longer processes, share=True for Spaces
|
|
|
|
|
|
|
|
|
|
old_app.py
ADDED
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import requests
|
3 |
+
import base64
|
4 |
+
import os
|
5 |
+
import json
|
6 |
+
import mimetypes
|
7 |
+
|
8 |
+
# --- Configuration ---
# The API key is a secret and must never be committed to the repository.
# It is read from the environment (e.g. a Space secret); when it is missing
# the value is None and the "API key not set" checks below take effect.
# NOTE(review): the key that was previously hard-coded here is exposed in
# version-control history and should be revoked.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
IMAGE_MODEL = "opengvlab/internvl3-14b:free"
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# --- Application State ---
# Documents queued for processing; each entry is a dict with the keys
# "path", "type" and "filename".
current_batch = []
|
15 |
+
|
16 |
+
# --- Helper Functions ---
|
17 |
+
|
18 |
+
def generate_extraction_prompt(doc_type_provided_by_user):
    """Build the OCR/extraction instruction prompt sent alongside the image.

    Args:
        doc_type_provided_by_user: Document type label chosen by the user; it
            is interpolated verbatim into the prompt text.

    Returns:
        The prompt string, which asks the model for a single JSON object with
        the keys "document_type_provided", "document_type_detected",
        "extracted_fields", "mrz_data", "multilingual_info" and
        "full_text_ocr".
    """
    # Doubled braces ({{ }}) below are f-string escapes for literal braces.
    prompt = f"""You are an advanced OCR and information extraction AI.
The user has provided an image and identified it as a '{doc_type_provided_by_user}'.
Your task is to meticulously analyze this image and extract all relevant information.

Output Format Instructions:
Provide your response as a SINGLE, VALID JSON OBJECT. Do not include any explanatory text before or after the JSON.
The JSON object should have the following top-level keys:
- "document_type_provided": (string) The type provided by the user: "{doc_type_provided_by_user}".
- "document_type_detected": (string) Your best guess of the specific document type (e.g., "Passport", "National ID Card", "Driver's License", "Visa Sticker", "Hotel Confirmation Voucher", "Boarding Pass", "Photograph of a person").
- "extracted_fields": (object) A key-value map of all extracted information. Be comprehensive. Examples:
- For passports/IDs: "Surname", "Given Names", "Document Number", "Nationality", "Date of Birth", "Sex", "Place of Birth", "Date of Issue", "Date of Expiry", "Issuing Authority", "Country Code".
- For hotel reservations: "Guest Name", "Hotel Name", "Booking Reference", "Check-in Date", "Check-out Date", "Room Type".
- For photos: "Description" (e.g., "Portrait of a person", "Image contains text: [text if any]").
- "mrz_data": (object or null) If a Machine Readable Zone (MRZ) is present:
- "raw_mrz_lines": (array of strings) Each line of the MRZ.
- "parsed_mrz": (object) Key-value pairs of parsed MRZ fields (e.g., "passport_type", "issuing_country", "surname", "given_names", "passport_number", "nationality", "dob", "sex", "expiry_date", "personal_number").
If no MRZ, this field should be null.
- "multilingual_info": (array of objects or null) For any text segments not in English:
- Each object: {{"language_detected": "ISO 639-1 code", "original_text": "...", "english_translation_or_transliteration": "..."}}
If no non-English text, this field can be null or an empty array.
- "full_text_ocr": (string) Concatenation of all text found on the document.

Extraction Guidelines:
1. Prioritize accuracy. If unsure about a character or word, indicate uncertainty if possible, or extract the most likely interpretation.
2. Extract all visible text, including small print, stamps, and handwritten annotations if legible.
3. For dates, try to use ISO 8601 format (YYYY-MM-DD) if possible, but retain original format if conversion is ambiguous.
4. If the image is a photo of a person without much text, the "extracted_fields" might contain a description, and "full_text_ocr" might be minimal.
5. If the document is multi-page and only one page is provided, note this if apparent.

Ensure the entire output strictly adheres to the JSON format.
"""
    return prompt
|
51 |
+
|
52 |
+
def process_single_image_with_openrouter(image_path, doc_type):
    """Send one image to OpenRouter and return the extracted data as a dict.

    The image is read from disk, embedded as a base64 ``data:`` URL and posted
    together with the prompt from ``generate_extraction_prompt``. The model's
    reply is expected to be a JSON object, optionally wrapped in a Markdown
    code fence.

    Args:
        image_path: Filesystem path of the image to process.
        doc_type: Document type label chosen by the user.

    Returns:
        The parsed JSON object (with "document_type_provided" filled in when
        the model omitted it). On any failure a dict with an "error" key and
        "document_type_provided" is returned instead; this function does not
        raise.
    """
    if not OPENROUTER_API_KEY:
        return {"error": "OpenRouter API key not set.", "document_type_provided": doc_type}
    try:
        with open(image_path, "rb") as f:
            encoded_image_bytes = f.read()
        encoded_image_string = base64.b64encode(encoded_image_bytes).decode("utf-8")

        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type:
            # Fall back to the file extension; unknown types are sent as JPEG.
            ext = os.path.splitext(image_path)[1].lower()
            if ext == ".png":
                mime_type = "image/png"
            elif ext in [".jpg", ".jpeg"]:
                mime_type = "image/jpeg"
            elif ext == ".webp":
                mime_type = "image/webp"
            else:
                mime_type = "image/jpeg"
        data_url = f"data:{mime_type};base64,{encoded_image_string}"

        prompt_text = generate_extraction_prompt(doc_type)
        payload = {
            "model": IMAGE_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt_text},
                        {"type": "image_url", "image_url": {"url": data_url}}
                    ]
                }
            ],
            "max_tokens": 3000,
            "temperature": 0.1,
        }
        headers = {
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://huggingface.co/spaces/Passport_Extractor",
            "X-Title": "Document Classifier"
        }

        print(f"Sending request to OpenRouter for image: {os.path.basename(image_path)}, type: {doc_type}")
        response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
        result = response.json()
        print(f"Received response from OpenRouter. Status: {response.status_code}")

        if "choices" in result and result["choices"]:
            content_text = result["choices"][0]["message"]["content"]
            # The content may be None; treat that like an empty reply so it is
            # reported as a JSON parse failure rather than an unexpected error.
            clean_content = _strip_code_fences(content_text or "")
            try:
                parsed_json = json.loads(clean_content)
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError: {e}. Raw content was:\n{content_text}")
                return {
                    "error": "Failed to parse LLM output as JSON.",
                    "raw_content_from_llm": content_text,
                    "document_type_provided": doc_type
                }
            if not isinstance(parsed_json, dict):
                # Valid JSON that is not an object (list, string, number...)
                # cannot carry the expected keys; report it explicitly.
                print(f"LLM output is not a JSON object. Raw content was:\n{content_text}")
                return {
                    "error": "LLM output was valid JSON but not a JSON object.",
                    "raw_content_from_llm": content_text,
                    "document_type_provided": doc_type
                }
            if "document_type_provided" not in parsed_json:
                parsed_json["document_type_provided"] = doc_type
            return parsed_json
        else:
            print(f"No 'choices' in API response: {result}")
            return {"error": "No choices in API response.", "details": result, "document_type_provided": doc_type}
    except requests.exceptions.Timeout:
        print(f"API Request Timeout for {os.path.basename(image_path)}")
        return {"error": "API request timed out.", "document_type_provided": doc_type}
    except requests.exceptions.RequestException as e:
        error_message = f"API Request Error: {str(e)}"
        if e.response is not None:
            error_message += f" Status: {e.response.status_code}, Response: {e.response.text}"
        print(error_message)
        return {"error": error_message, "document_type_provided": doc_type}
    except Exception as e:
        # Top-level boundary: the UI expects an error dict, never an exception.
        print(f"An unexpected error occurred during processing {os.path.basename(image_path)}: {str(e)}")
        return {"error": f"An unexpected error: {str(e)}", "document_type_provided": doc_type}


def _strip_code_fences(text):
    """Return *text* without a surrounding Markdown code fence, if present.

    Handles a triple-backtick fence with or without a ``json`` language tag,
    as well as a single pair of enclosing backticks.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Drop an optional language tag directly after the opening fence.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    elif len(cleaned) >= 2 and cleaned.startswith("`") and cleaned.endswith("`"):
        cleaned = cleaned[1:-1]
    return cleaned.strip()
|
129 |
+
|
130 |
+
def add_document_to_batch_ui(image_filepath, doc_type_selection):
    """Queue an uploaded image in the current batch and report the outcome.

    Args:
        image_filepath: Path of the uploaded image; falsy when nothing was uploaded.
        doc_type_selection: Document type chosen in the dropdown.

    Returns:
        A pair ``(rows, status)`` where ``rows`` is a list of
        ``[filename, type]`` lists for every queued document and ``status`` is
        a message describing whether the document was added.
    """
    global current_batch
    if image_filepath and doc_type_selection:
        filename = os.path.basename(image_filepath)
        current_batch.append({"path": image_filepath, "type": doc_type_selection, "filename": filename})
        status = f"Added '{filename}' as '{doc_type_selection}'."
    else:
        status = "Failed to add: Image or document type missing."

    # The table always reflects the full batch, whether or not the add succeeded.
    rows = [[entry["filename"], entry["type"]] for entry in current_batch]
    return rows, status
|
139 |
+
|
140 |
+
def process_batch_ui():
    """Run extraction on every queued document and group the results by person.

    Returns:
        A pair ``(output, status)``. ``output`` is a dict with the keys
        "summary", "grouped_by_person" and "unidentified_documents_or_errors"
        (or a small error/message dict when the API key is missing or the
        batch is empty). ``status`` is a human-readable status line.
    """
    global current_batch
    if not OPENROUTER_API_KEY:
        return {"error": "OPENROUTER_API_KEY is not set. Please configure it."}, "API Key Missing."
    if not current_batch:
        return {"message": "Batch is empty. Add documents first."}, "Batch is empty."

    # Pass 1: extract each document and record a per-document outcome line.
    results = []
    outcomes = []
    for position, entry in enumerate(current_batch, start=1):
        print(f"Processing document {position}/{len(current_batch)}: {entry['filename']} ({entry['type']})...")
        extracted = process_single_image_with_openrouter(entry["path"], entry["type"])
        results.append(extracted)
        if "error" in extracted:
            outcomes.append(f"Error processing {entry['filename']}: {extracted['error']}")
        else:
            outcomes.append(f"Successfully processed {entry['filename']}.")

    # Pass 2: bucket results by derived person key; anything without a key
    # (including error results) is reported separately.
    persons = {}
    leftovers = []
    for extracted in results:
        key = _person_key_for(extracted)
        if key:
            bucket = persons.setdefault(key, {"person_identifier": key, "documents": []})
            bucket["documents"].append(extracted)
        else:
            leftovers.append(extracted)

    final_structured_output = {
        "summary": f"Processed {len(current_batch)} documents.",
        "grouped_by_person": list(persons.values()),
        "unidentified_documents_or_errors": leftovers
    }
    final_status = "Batch processing complete. " + " | ".join(outcomes)
    print(final_status)
    return final_structured_output, final_status


def _person_key_for(result_item):
    """Derive a grouping key from a result's "extracted_fields", or None.

    Preference order: document/passport number, then given name + surname +
    date of birth, then given name + surname.
    """
    if not isinstance(result_item, dict):
        return None
    fields = result_item.get("extracted_fields")
    if not isinstance(fields, dict):
        return None

    def compact(value):
        return str(value).replace(' ', '')

    passport_no = fields.get("Document Number") or fields.get("Passport Number") or fields.get("passport_number")
    name = fields.get("Given Names") or fields.get("Given Name") or fields.get("Name")
    surname = fields.get("Surname") or fields.get("Family Name")
    dob = fields.get("Date of Birth") or fields.get("DOB")

    if passport_no:
        return f"passport_{compact(passport_no).lower()}"
    if name and surname:
        base = f"{compact(name).lower()}_{compact(surname).lower()}"
        return f"{base}_{compact(dob)}" if dob else base
    return None
|
187 |
+
|
188 |
+
def clear_batch_ui():
    """Empty the current batch.

    Returns:
        A pair ``([], status)``: an empty row list for the batch table and a
        confirmation message.
    """
    global current_batch
    # Rebind (rather than mutate) the module-level list.
    current_batch = []
    return [], "Batch cleared successfully."
|
192 |
+
|
193 |
+
# Gradio UI: upload documents one at a time into a batch, review the batch,
# then process it and show the extracted JSON.
# NOTE(review): the nesting of the widgets below was reconstructed; confirm
# that the Step 3/Step 4 widgets belong outside the two-column row.
# NOTE(review): the emoji in several labels appear mis-encoded; the strings
# are left untouched here and should be checked against the intended text.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π Document Information Extractor (OpenGVLab/InternVL3-14B via OpenRouter)")
    gr.Markdown(
        "**Instructions:**\n"
        "1. Upload a document image (e.g., passport front/back, photo, hotel reservation).\n"
        "2. Select the correct document type.\n"
        "3. Click 'Add Document to Current Batch'. Repeat for all documents of a person or a related set.\n"
        "4. Review the batch. Click 'Clear Entire Batch' to start over.\n"
        "5. Click 'Process Batch and Extract Information' to send documents to the AI.\n"
        "6. View the extracted information in JSON format below."
    )
    # Shown only when no API key is configured.
    if not OPENROUTER_API_KEY:
        gr.Markdown(
            "<h3 style='color:red;'>β οΈ Warning: `OPENROUTER_API_KEY` environment variable is not detected. "
            "API calls will fail. Please set it and restart this application.</h3>"
        )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Step 1: Add Document")
            image_input = gr.Image(
                label="Upload Document Image",
                type="filepath",
                sources=["upload"],
                height=300
            )
            doc_type_choices = [
                'passport_front', 'passport_back', 'national_id_front', 'national_id_back',
                'drivers_license_front', 'drivers_license_back', 'visa_sticker',
                'photo', 'hotel_reservation', 'boarding_pass', 'utility_bill', 'other_document'
            ]
            doc_type_input = gr.Dropdown(
                label="Select Document Type",
                choices=doc_type_choices,
                value='passport_front',
                filterable=True
            )
            add_button = gr.Button("β Add Document to Current Batch", variant="secondary")
        with gr.Column(scale=2):
            gr.Markdown("### Step 2: Review Current Batch")
            batch_dataframe = gr.Dataframe(
                headers=["Filename", "Document Type"],
                datatype=["str", "str"],
                row_count=1, # Changed: Start with 1 row, should grow dynamically
                col_count=2, # Changed: Simpler integer for fixed columns
                wrap=True
            )
            clear_batch_button = gr.Button("ποΈ Clear Entire Batch", variant="stop")
    gr.Markdown("### Step 3: Process Batch")
    process_button = gr.Button("π Process Batch and Extract Information", variant="primary")
    status_message_textbox = gr.Textbox(label="Processing Status", interactive=False, lines=2)
    gr.Markdown("### Step 4: View Results")
    output_json_display = gr.JSON(label="Extracted Information (JSON Format)")

    # Event wiring.
    # After a document is added, the follow-up step returns None for the image
    # input, clearing it for the next upload.
    add_button.click(
        fn=add_document_to_batch_ui,
        inputs=[image_input, doc_type_input],
        outputs=[batch_dataframe, status_message_textbox]
    ).then(lambda: None, outputs=image_input)
    clear_batch_button.click(
        fn=clear_batch_ui,
        inputs=[],
        outputs=[batch_dataframe, status_message_textbox]
    )
    process_button.click(
        fn=process_batch_ui,
        inputs=[],
        outputs=[output_json_display, status_message_textbox]
    )
|
260 |
+
|
261 |
+
if __name__ == "__main__":
    # Warn on the console when no API key is configured; the app is still
    # launched so the UI can display its own warning.
    if not OPENROUTER_API_KEY:
        print("ERROR: The OPENROUTER_API_KEY environment variable is not set.")
        print("Please set it before running the application, e.g.:")
        print(" export OPENROUTER_API_KEY='your_openrouter_key_here'")
        print("The application will launch, but API calls will fail.")
    demo.launch(share=True) # Added share=True
|