File size: 17,484 Bytes
df5c908
0a8e31d
3d827ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df5c908
0a8e31d
 
 
 
 
 
3d827ec
 
 
0a8e31d
 
3d827ec
 
 
 
0a8e31d
df5c908
0a8e31d
3d827ec
 
 
 
0a8e31d
df5c908
3d827ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a8e31d
3d827ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df5c908
 
3d827ec
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
import gradio as gr
import requests
import base64
import os
import json
import mimetypes

# --- Configuration ---
# The OpenRouter API key is read from the OPENROUTER_API_KEY environment
# variable so the secret never lives in source control.
# For example, in your terminal: export OPENROUTER_API_KEY='your_key_here'
# SECURITY: never hard-code the key here. Any key that has been committed to a
# repository or public Space must be considered leaked and revoked.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "").strip()
# Vision-capable model used for every extraction request.
IMAGE_MODEL = "opengvlab/internvl3-14b:free"
# Chat-completions endpoint that requests are POSTed to.
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# --- Application State ---
# Module-level list holding the documents queued in the current batch.
# The UI callbacks in this file append to it, read it, and reset it.
# Each item: {"path": "image_file_path", "type": "document_type_string", "filename": "display_filename"}
current_batch = []

# --- Helper Functions ---

def generate_extraction_prompt(doc_type_provided_by_user):
    """Build the instruction text sent to the model alongside the image.

    The user-supplied document type is interpolated twice: once in the
    introduction and once in the description of the
    "document_type_provided" key. The rest of the text specifies the JSON
    layout the model is asked to return and the extraction guidelines.

    Args:
        doc_type_provided_by_user: Document type label chosen by the user.

    Returns:
        The complete prompt as a single string.
    """
    # Literal braces in the JSON example are doubled because this is an f-string.
    return f"""You are an advanced OCR and information extraction AI.
The user has provided an image and identified it as a '{doc_type_provided_by_user}'.
Your task is to meticulously analyze this image and extract all relevant information.

Output Format Instructions:
Provide your response as a SINGLE, VALID JSON OBJECT. Do not include any explanatory text before or after the JSON.
The JSON object should have the following top-level keys:
- "document_type_provided": (string) The type provided by the user: "{doc_type_provided_by_user}".
- "document_type_detected": (string) Your best guess of the specific document type (e.g., "Passport", "National ID Card", "Driver's License", "Visa Sticker", "Hotel Confirmation Voucher", "Boarding Pass", "Photograph of a person").
- "extracted_fields": (object) A key-value map of all extracted information. Be comprehensive. Examples:
    - For passports/IDs: "Surname", "Given Names", "Document Number", "Nationality", "Date of Birth", "Sex", "Place of Birth", "Date of Issue", "Date of Expiry", "Issuing Authority", "Country Code".
    - For hotel reservations: "Guest Name", "Hotel Name", "Booking Reference", "Check-in Date", "Check-out Date", "Room Type".
    - For photos: "Description" (e.g., "Portrait of a person", "Image contains text: [text if any]").
- "mrz_data": (object or null) If a Machine Readable Zone (MRZ) is present:
    - "raw_mrz_lines": (array of strings) Each line of the MRZ.
    - "parsed_mrz": (object) Key-value pairs of parsed MRZ fields (e.g., "passport_type", "issuing_country", "surname", "given_names", "passport_number", "nationality", "dob", "sex", "expiry_date", "personal_number").
    If no MRZ, this field should be null.
- "multilingual_info": (array of objects or null) For any text segments not in English:
    - Each object: {{"language_detected": "ISO 639-1 code", "original_text": "...", "english_translation_or_transliteration": "..."}}
    If no non-English text, this field can be null or an empty array.
- "full_text_ocr": (string) Concatenation of all text found on the document.

Extraction Guidelines:
1.  Prioritize accuracy. If unsure about a character or word, indicate uncertainty if possible, or extract the most likely interpretation.
2.  Extract all visible text, including small print, stamps, and handwritten annotations if legible.
3.  For dates, try to use ISO 8601 format (YYYY-MM-DD) if possible, but retain original format if conversion is ambiguous.
4.  If the image is a photo of a person without much text, the "extracted_fields" might contain a description, and "full_text_ocr" might be minimal.
5.  If the document is multi-page and only one page is provided, note this if apparent.

Ensure the entire output strictly adheres to the JSON format.
"""

def _strip_markdown_fence(text):
    """Return *text* with a surrounding Markdown code fence or backticks removed."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Optional language tag directly after the opening fence (```json / ```JSON).
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    elif len(cleaned) >= 2 and cleaned.startswith("`") and cleaned.endswith("`"):
        # Inline-code style: a single backtick on each side.
        cleaned = cleaned[1:-1]
    return cleaned.strip()


def _parse_llm_json(content_text):
    """Parse the model reply as JSON, tolerating fences and surrounding prose.

    Raises:
        json.JSONDecodeError: If no parseable JSON can be recovered.
    """
    candidate = _strip_markdown_fence(content_text)
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # Models sometimes add commentary around the object; retry on the
        # outermost {...} span before giving up.
        start = candidate.find("{")
        end = candidate.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(candidate[start:end + 1])


def process_single_image_with_openrouter(image_path, doc_type):
    """Send one image to OpenRouter and return the extracted data as a dict.

    The image is base64-encoded into a data URL, posted together with the
    prompt from generate_extraction_prompt(), and the model's reply is parsed
    as JSON.

    Args:
        image_path: Path of the image file to read.
        doc_type: Document type label chosen by the user.

    Returns:
        A dict. On success it is the JSON object produced by the model, with
        "document_type_provided" added if the model omitted it. On any
        failure it contains an "error" key (plus "document_type_provided"
        and, where available, "raw_content_from_llm" or "details").
        This function does not raise; all exceptions are converted to an
        error dict.
    """
    if not OPENROUTER_API_KEY:
        return {"error": "OpenRouter API key not set.", "document_type_provided": doc_type}

    try:
        with open(image_path, "rb") as f:
            encoded_image_string = base64.b64encode(f.read()).decode("utf-8")

        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type:
            # Fallback: infer from the extension, defaulting to JPEG.
            ext = os.path.splitext(image_path)[1].lower()
            mime_type = {".png": "image/png", ".webp": "image/webp"}.get(ext, "image/jpeg")

        data_url = f"data:{mime_type};base64,{encoded_image_string}"
        prompt_text = generate_extraction_prompt(doc_type)

        payload = {
            "model": IMAGE_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt_text},
                        {"type": "image_url", "image_url": {"url": data_url}}
                    ]
                }
            ],
            "max_tokens": 3000, # Increased for potentially large JSONs
            "temperature": 0.1, # Lower temperature for more deterministic output
            # "response_format": {"type": "json_object"}, # Uncomment if OpenRouter & model fully support this
                                                         # for guaranteed JSON. Prompt is primary method now.
        }

        headers = {
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://huggingface.co/spaces/YOUR_SPACE_NAME", # Optional: Replace with your app's URL
            "X-Title": "Gradio Document Extractor" # Optional: Replace with your app's name
        }

        print(f"Sending request to OpenRouter for image: {os.path.basename(image_path)}, type: {doc_type}")
        response = requests.post(OPENROUTER_API_URL, headers=headers, json=payload, timeout=120) # 120s timeout
        response.raise_for_status()  # Raises HTTPError for bad responses (4XX or 5XX)
        result = response.json()
        print(f"Received response from OpenRouter. Status: {response.status_code}")

        choices = result.get("choices") if isinstance(result, dict) else None
        if not choices:
            print(f"No 'choices' in API response: {result}")
            return {"error": "No choices in API response.", "details": result, "document_type_provided": doc_type}

        content_text = choices[0]["message"]["content"]
        if not isinstance(content_text, str):
            # e.g. null content; there is nothing to parse.
            print(f"No text content in API response: {result}")
            return {"error": "No text content in API response.", "details": result, "document_type_provided": doc_type}

        try:
            parsed_json = _parse_llm_json(content_text)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}. Raw content was:\n{content_text}")
            return {
                "error": "Failed to parse LLM output as JSON.",
                "raw_content_from_llm": content_text,
                "document_type_provided": doc_type
            }

        if not isinstance(parsed_json, dict):
            # Valid JSON but an array/scalar: callers expect an object.
            print(f"LLM output is not a JSON object. Raw content was:\n{content_text}")
            return {
                "error": "LLM output was valid JSON but not a JSON object.",
                "raw_content_from_llm": content_text,
                "document_type_provided": doc_type
            }

        # Ensure document_type_provided is in the root, even if LLM missed it
        parsed_json.setdefault("document_type_provided", doc_type)
        return parsed_json

    except requests.exceptions.Timeout:
        print(f"API Request Timeout for {os.path.basename(image_path)}")
        return {"error": "API request timed out.", "document_type_provided": doc_type}
    except requests.exceptions.RequestException as e:
        error_message = f"API Request Error: {str(e)}"
        if e.response is not None:
            error_message += f" Status: {e.response.status_code}, Response: {e.response.text}"
        print(error_message)
        return {"error": error_message, "document_type_provided": doc_type}
    except Exception as e:
        # Top-level boundary for the UI: report instead of crashing the callback.
        print(f"An unexpected error occurred during processing {os.path.basename(image_path)}: {str(e)}")
        return {"error": f"An unexpected error: {str(e)}", "document_type_provided": doc_type}

# --- Gradio Interface Callbacks ---

def add_document_to_batch_ui(image_filepath, doc_type_selection):
    """Queue an uploaded image with its document type in the current batch.

    Args:
        image_filepath: Path of the uploaded image (falsy when nothing was uploaded).
        doc_type_selection: Document type chosen in the dropdown.

    Returns:
        A pair (rows, message): rows is a list of [filename, type] lists for
        every queued document, message describes whether the add succeeded.
    """
    global current_batch

    def _table_rows():
        # Dataframe display format: one [filename, type] list per queued item.
        return [[entry["filename"], entry["type"]] for entry in current_batch]

    # Guard clause: without both inputs, leave the batch untouched.
    if not (image_filepath and doc_type_selection):
        return _table_rows(), "Failed to add: Image or document type missing."

    filename = os.path.basename(image_filepath)
    # Note: image_filepath is a temporary path from Gradio.
    # It should be used relatively quickly. For long-lived state,
    # you might copy the file or read its content.
    current_batch.append(
        {"path": image_filepath, "type": doc_type_selection, "filename": filename}
    )
    return _table_rows(), f"Added '{filename}' as '{doc_type_selection}'."


def _person_identifier(result_item):
    """Return a heuristic grouping key for one extraction result, or None."""
    if not isinstance(result_item, dict):
        return None
    fields = result_item.get("extracted_fields")
    if not isinstance(fields, dict):
        return None

    def _norm(value):
        # Remove spaces and lowercase so formatting differences do not split groups.
        return str(value).replace(' ', '').lower()

    # Try common identifiers
    passport_no = fields.get("Document Number") or fields.get("Passport Number") or fields.get("passport_number")
    name = fields.get("Given Names") or fields.get("Given Name") or fields.get("Name")
    surname = fields.get("Surname") or fields.get("Family Name")
    dob = fields.get("Date of Birth") or fields.get("DOB")

    if passport_no:
        return f"passport_{_norm(passport_no)}"
    if name and surname and dob:
        # The date keeps its case; only spaces are removed.
        return f"{_norm(name)}_{_norm(surname)}_{str(dob).replace(' ', '')}"
    if name and surname:
        return f"{_norm(name)}_{_norm(surname)}"
    return None


def process_batch_ui():
    """Process every queued document and return the combined results.

    Each document is sent through process_single_image_with_openrouter();
    results are then grouped by a heuristic person identifier (document
    number, else name + surname [+ date of birth]). Results without a usable
    identifier, including error results, are listed separately.

    Returns:
        A pair (output, status): output is a dict with the keys "summary",
        "grouped_by_person" and "unidentified_documents_or_errors" (or a
        short error/message dict when the key or the batch is missing);
        status is a human-readable status string. The batch itself is not
        cleared; the user does that explicitly.
    """
    if not OPENROUTER_API_KEY:
        return {"error": "OPENROUTER_API_KEY is not set. Please configure it."}, "API Key Missing."

    if not current_batch:
        return {"message": "Batch is empty. Add documents first."}, "Batch is empty."

    # Work on a snapshot: the global can be appended to or rebound by other
    # callbacks while the (slow) API calls run, which would otherwise make
    # the progress counter and summary disagree with what was processed.
    batch = list(current_batch)
    total = len(batch)

    all_results = []
    status_updates = []

    for i, item_to_process in enumerate(batch):
        status_msg = f"Processing document {i+1}/{total}: {item_to_process['filename']} ({item_to_process['type']})..."
        print(status_msg)

        extracted_data = process_single_image_with_openrouter(item_to_process["path"], item_to_process["type"])
        all_results.append(extracted_data)
        if "error" in extracted_data:
            status_updates.append(f"Error processing {item_to_process['filename']}: {extracted_data['error']}")
        else:
            status_updates.append(f"Successfully processed {item_to_process['filename']}.")

    # Attempt to group results by person (heuristic)
    # This is a basic grouping; more sophisticated logic could be added.
    grouped_by_person = {}
    unidentified_docs = []

    for result_item in all_results:
        doc_id = _person_identifier(result_item)
        if not doc_id:
            unidentified_docs.append(result_item)
            continue
        group = grouped_by_person.setdefault(doc_id, {"person_identifier": doc_id, "documents": []})
        group["documents"].append(result_item)

    final_structured_output = {
        "summary": f"Processed {total} documents.",
        "grouped_by_person": list(grouped_by_person.values()), # Convert dict to list for easier iteration in JSON
        "unidentified_documents_or_errors": unidentified_docs
    }

    final_status = "Batch processing complete. " + " | ".join(status_updates)
    print(final_status)
    return final_structured_output, final_status # Output JSON and status message


def clear_batch_ui():
    """Reset the module-level batch to a new empty list.

    Returns:
        A pair (rows, message): an empty row list for the batch table and a
        confirmation message for the status box.
    """
    global current_batch
    current_batch = []  # rebind to a fresh list rather than mutating in place
    empty_rows = []
    confirmation = "Batch cleared successfully."
    return empty_rows, confirmation


# --- Gradio UI Layout ---
# Builds the whole Gradio page: instructions, upload column, batch review
# column, process button, status box and JSON output, then wires the three
# buttons to their callbacks.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 Document Information Extractor (OpenGVLab/InternVL3-14B via OpenRouter)")
    gr.Markdown(
        "**Instructions:**\n"
        "1. Upload a document image (e.g., passport front/back, photo, hotel reservation).\n"
        "2. Select the correct document type.\n"
        "3. Click 'Add Document to Current Batch'. Repeat for all documents of a person or a related set.\n"
        "4. Review the batch. Click 'Clear Entire Batch' to start over.\n"
        "5. Click 'Process Batch and Extract Information' to send documents to the AI.\n"
        "6. View the extracted information in JSON format below."
    )

    # Shown only when no API key is configured at start-up.
    if not OPENROUTER_API_KEY:
        gr.Markdown(
            "<h3 style='color:red;'>⚠️ Warning: `OPENROUTER_API_KEY` environment variable is not detected. "
            "API calls will fail. Please set it and restart this application.</h3>"
        )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Step 1: Add Document")
            image_input = gr.Image(
                label="Upload Document Image",
                type="filepath", # 'filepath' gives a temporary path to the uploaded file
                sources=["upload"],
                height=300
            )
            doc_type_choices = [
                'passport_front', 'passport_back', 'national_id_front', 'national_id_back',
                'drivers_license_front', 'drivers_license_back', 'visa_sticker',
                'photo', 'hotel_reservation', 'boarding_pass', 'utility_bill', 'other_document'
            ]
            doc_type_input = gr.Dropdown(
                label="Select Document Type",
                choices=doc_type_choices,
                value='passport_front',
                filterable=True
            )
            add_button = gr.Button("➕ Add Document to Current Batch", variant="secondary")

        with gr.Column(scale=2):
            gr.Markdown("### Step 2: Review Current Batch")
            batch_dataframe = gr.Dataframe(
                headers=["Filename", "Document Type"],
                datatype=["str", "str"],
                row_count=(0, "dynamic"), # Start with 0 rows, dynamically adjusts
                col_count=(2, "fixed"),
                wrap=True,
                height=380,
            )
            clear_batch_button = gr.Button("🗑️ Clear Entire Batch", variant="stop")

    gr.Markdown("### Step 3: Process Batch")
    process_button = gr.Button("🚀 Process Batch and Extract Information", variant="primary")

    status_message_textbox = gr.Textbox(label="Processing Status", interactive=False, lines=2)

    gr.Markdown("### Step 4: View Results")
    output_json_display = gr.JSON(label="Extracted Information (JSON Format)")

    # --- Connect UI elements to functions ---
    add_button.click(
        fn=add_document_to_batch_ui,
        inputs=[image_input, doc_type_input],
        outputs=[batch_dataframe, status_message_textbox]
    ).then(lambda: None, outputs=image_input) # Clear image input after adding

    clear_batch_button.click(
        fn=clear_batch_ui,
        inputs=[],
        outputs=[batch_dataframe, status_message_textbox]
    )

    process_button.click(
        fn=process_batch_ui,
        inputs=[],
        outputs=[output_json_display, status_message_textbox]
    )

if __name__ == "__main__":
    # Warn on the console when no key is configured; the UI is launched either way.
    if not OPENROUTER_API_KEY:
        startup_warning = (
            "ERROR: The OPENROUTER_API_KEY environment variable is not set.",
            "Please set it before running the application, e.g.:",
            "  export OPENROUTER_API_KEY='your_openrouter_key_here'",
            "The application will launch, but API calls will fail.",
        )
        for warning_line in startup_warning:
            print(warning_line)

    demo.launch()