tienanh2003 committed on
Commit
7615a84
·
verified ·
1 Parent(s): f8416fc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +566 -0
app.py ADDED
@@ -0,0 +1,566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import re
4
+ import hashlib
5
+ import gc
6
+ from io import BytesIO
7
+ from collections import OrderedDict
8
+ from PIL import Image, UnidentifiedImageError
9
+ import torch
10
+ from transformers import AutoProcessor, BitsAndBytesConfig
11
+ from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
12
+ from pdf2image import convert_from_bytes
13
+ import gradio as gr
14
+ import fitz
15
+
16
# --- CONFIGURATION ---
# Hugging Face Hub id of the checkpoint loaded in section 2 below.
MODEL_ID = "prithivMLmods/Camel-Doc-OCR-062825"
# NOTE(review): CACHE_MAX_SIZE, IMAGE_MAX_DIM and JPEG_QUALITY are not
# referenced anywhere in the visible part of this file.
CACHE_MAX_SIZE = 128
DPI = 300 # Keep it just high enough, not excessive
IMAGE_MAX_DIM = None # Do not resize unless necessary
JPEG_QUALITY = 80
# Passed to torch.cuda.set_per_process_memory_fraction below.
GPU_MEMORY_FRACTION = 0.8

# --- 1. Device ---
# Prefer CUDA when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True
if device.type == 'cuda':
    # Cap this process's share of memory on GPU 0.
    torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=0)

# --- 2. Load model ---
# NOTE: the two imports below repeat imports already made at the top of the file.
from transformers import AutoProcessor, BitsAndBytesConfig
from transformers.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration

# 4-bit NF4 quantization with double quantization and float16 compute dtype.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Both the processor and the model are loaded at import time.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb,
    device_map="auto",
    trust_remote_code=True
).eval()
# Reuse the EOS token id as the padding token id.
processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
49
+
50
+ # --- 8. File handler ---
51
+ import traceback
52
+ from concurrent.futures import ThreadPoolExecutor
53
+
54
def _render_pdf_pages(file_path, max_dim=3072):
    """Rasterize every page of the PDF at ``file_path`` into RGB PIL images.

    Pages are rendered at ``DPI`` dots per inch; any page whose longest side
    exceeds ``max_dim`` pixels is downscaled in place to fit.
    """
    with open(file_path, "rb") as f:
        pdf_bytes = f.read()
    print(f"[INFO] Read PDF bytes: {len(pdf_bytes)} bytes")

    # PDF user space is 72 units per inch, so the scale factor for a target
    # resolution is DPI / 72. Passing DPI itself would magnify pages 300x.
    zoom = DPI / 72
    mat = fitz.Matrix(zoom, zoom)
    pages = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            if max(img.size) > max_dim:
                img.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
            pages.append(img)
    print(f"[INFO] Converted PDF → {len(pages)} pages")
    return pages


def handle_file(file, prompt, extra_prompt, max_new_tokens, progress=gr.Progress()):
    """Run OCR on an uploaded image or PDF.

    Args:
        file: Uploaded file; either an object exposing ``.name`` (a path) or
            a plain path string. ``None`` yields an error tuple.
        prompt: Free-form prompt text (``None`` is treated as empty).
        extra_prompt: Additional prompt text appended on a new line
            (``None`` is treated as empty).
        max_new_tokens: Generation length limit forwarded to ``run_inference``.
        progress: Gradio progress tracker, updated once per finished PDF page.

    Returns:
        A ``(name, text)`` tuple. ``name`` is the file's base name, or
        ``"error"`` when the input could not be handled at all. ``text`` is
        the model output (PDF pages joined by a page-break marker) or an
        ``[ERROR] ...`` message. This function never raises.
    """
    try:
        if file is None:
            return "error", "[ERROR] No file provided"

        file_path = file.name if hasattr(file, "name") else file
        filename = os.path.basename(file_path)
        ext = filename.lower().split('.')[-1]
        full_prompt = ((prompt or "") + "\n" + (extra_prompt or "")).strip()

        print(f"[INFO] handle_file → {filename} (.{ext})")

        if ext == "pdf":
            try:
                pages = _render_pdf_pages(file_path)
            except Exception as e:
                traceback.print_exc()
                return filename, f"[ERROR] PDF conversion failed: {e}"

            outputs = []
            # NOTE(review): up to 4 generate() calls share one model
            # concurrently; confirm this fits in GPU memory for large PDFs.
            with ThreadPoolExecutor(max_workers=4) as executor:
                futures = [
                    executor.submit(run_inference, img, full_prompt, max_new_tokens)
                    for img in pages
                ]
                # Collect in submission order so the output keeps page order.
                for idx, future in enumerate(futures):
                    try:
                        out = future.result()
                    except Exception as e:
                        traceback.print_exc()
                        out = f"[ERROR] Inference page {idx+1} failed: {e}"
                    outputs.append(out)
                    # Page idx is finished at this point, so report idx + 1.
                    progress((idx + 1) / len(pages), desc=f"Page {idx+1}/{len(pages)}")

            result = "\n\n--- Page Break ---\n\n".join(outputs)
            print("[INFO] handle_file done")
            return filename, result

        try:
            # Decode inside the context manager so the file handle is
            # released before the (slow) inference call.
            with Image.open(file_path) as opened:
                print(f"[INFO] Opened image: {opened.mode}, {opened.size}")
                img = opened.convert("RGB")
        except Exception as e:
            traceback.print_exc()
            return filename, f"[ERROR] Image open failed: {e}"

        return filename, run_inference(img, full_prompt, max_new_tokens)

    except Exception as e:
        traceback.print_exc()
        return "error", f"[ERROR] handle_file unexpected: {e}"
114
+
115
def run_inference(img: Image.Image, prompt: str = "", max_new_tokens: int = 512) -> str:
    """Generate text for a single image with the module-level model.

    Args:
        img: Input image; converted to RGB when it is in another mode.
        prompt: Instruction text sent with the image (``None`` is treated
            as an empty prompt).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded model output with the prompt tokens removed and
        surrounding whitespace stripped.
    """
    if img.mode != "RGB":
        img = img.convert("RGB")
    prompt_text = (prompt or "").strip()

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": prompt_text},
        ],
    }]

    text_prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The model is loaded with device_map="auto", so send the inputs to the
    # device the model reports rather than to a separately chosen one.
    target_device = model.device
    inputs = processor(
        text=[text_prompt], images=[img], return_tensors="pt", padding=True
    ).to(target_device)

    # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
    use_cuda = target_device.type == "cuda"
    with torch.inference_mode(), torch.autocast(device_type="cuda", enabled=use_cuda):
        gen = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            do_sample=False,
            eos_token_id=processor.tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], gen)]
    decoded = processor.tokenizer.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    return decoded[0].strip()
150
+
151
+ # --- 9. Prompt templates & JSON export ---
152
+ prompt_templates = {
153
+ "Electrolux": """Extract all structured information from the delivery order document image.
154
+ You must return the result as a valid XML block that strictly follows the structure below.
155
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
156
+ 1. Return **ONLY** the XML block – nothing before or after it.
157
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
158
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
159
+ 4. For every tag, fill in the exact value read from the image.
160
+ • NEVER copy or repeat the label/placeholder text.
161
+ • NEVER guess or invent values.
162
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
163
+ 6. DO NOT include Vietnamese text or translations inside tag values.
164
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
165
+ 8. Dates must be in YYYY-MM-DD format.
166
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
167
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
168
+ 10. **Inside each value**
169
+ • Replace every internal line-break with “, ” (comma + space).
170
+ • Trim leading/trailing whitespace.
171
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
172
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
173
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
174
+ 13. Ignore any information not represented by the tags below.
175
+ <s_electrolux_form>
176
+ <document_number>Số lệnh giao nhận hàng</document_number>
177
+ <order_number>Số đơn hàng</order_number>
178
+ <customer_code>Mã số khách hàng</customer_code>
179
+ <customer_order_code>Mã đơn khách hàng</customer_order_code>
180
+ <customer_order_date>Ngày đặt hàng của khách</customer_order_date>
181
+ <delivery_date>Ngày giao hàng</delivery_date>
182
+ <requested_delivery_date>Ngày giao hàng yêu cầu</requested_delivery_date>
183
+ <invoice_number>Số hóa đơn</invoice_number>
184
+ <shipper_company_name>Tên công ty gửi hàng</shipper_company_name>
185
+ <shipper_address>Địa chỉ gửi hàng</shipper_address>
186
+ <shipper_phone>Số điện thoại</shipper_phone>
187
+ <shipper_fax>Số fax</shipper_fax>
188
+ <shipper_tax_code>Mã số thuế</shipper_tax_code>
189
+ <consignee_customer_code>Mã khách hàng</consignee_customer_code>
190
+ <consignee_company_name>Tên công ty nhận hàng</consignee_company_name>
191
+ <shipping_address>Địa chỉ nhận hàng chi tiết</shipping_address>
192
+ <city_province>Tỉnh/Thành phố</city_province>
193
+ <postal_code>Mã bưu chính</postal_code>
194
+ <preparer_name>Họ tên người lập phiếu</preparer_name>
195
+ <preparer_date>Ngày lập phiếu</preparer_date>
196
+ <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
197
+ </s_electrolux_form>
198
+ """,
199
+
200
+ "Jotun": """Extract all structured information from the delivery order document.
201
+ You must return the result as a valid XML block that strictly follows the structure below.
202
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
203
+ 1. Return **ONLY** the XML block – nothing before or after it.
204
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
205
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
206
+ 4. For every tag, fill in the exact value read from the image.
207
+ • NEVER copy or repeat the label/placeholder text.
208
+ • NEVER guess or invent values.
209
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
210
+ 6. DO NOT include Vietnamese text or translations inside tag values.
211
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
212
+ 8. Dates must be in YYYY-MM-DD format.
213
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
214
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
215
+ 10. **Inside each value**
216
+ • Replace every internal line-break with “, ” (comma + space).
217
+ • Trim leading/trailing whitespace.
218
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
219
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
220
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
221
+ 13. Ignore any information not represented by the tags below.
222
+ <s_jotun_form>
223
+ <document_number>Số lệnh giao hàng</document_number>
224
+ <delivery_order_code>Số lệnh giao hàng số</delivery_order_code>
225
+ <customer_code>Mã khách hàng</customer_code>
226
+ <customer_name>Tên khách hàng</customer_name>
227
+ <customer_address>Địa chỉ khách hàng</customer_address>
228
+ <customer_phone>Điện thoại khách hàng</customer_phone>
229
+ <invoice_receiver_name>Tên người nhận hóa đơn</invoice_receiver_name>
230
+ <invoice_receiver_address>Địa chỉ người nhận hóa đơn</invoice_receiver_address>
231
+ <order_code>Số đơn đặt hàng</order_code>
232
+ <order_date>Ngày đặt hàng</order_date>
233
+ <order_number>Số đơn hàng</order_number>
234
+ <delivery_date>Ngày giao hàng</delivery_date>
235
+ <s_is_signed>Đã ký hay chưa (true hoặc false)</s_is_signed>
236
+ </s_jotun_form>
237
+ """,
238
+
239
+ "MAWB": """Extract all structured information from the Master Air Waybill (MAWB) document.
240
+ You must return the result as a valid XML block that strictly follows the structure below.
241
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
242
+ 1. Return **ONLY** the XML block – nothing before or after it.
243
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
244
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
245
+ 4. For every tag, fill in the exact value read from the image.
246
+ • NEVER copy or repeat the label/placeholder text.
247
+ • NEVER guess or invent values.
248
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
249
+ 6. DO NOT include Vietnamese text or translations inside tag values.
250
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
251
+ 8. Dates must be in YYYY-MM-DD format.
252
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
253
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
254
+ 10. **Inside each value**
255
+ • Replace every internal line-break with “, ” (comma + space).
256
+ • Trim leading/trailing whitespace.
257
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
258
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
259
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
260
+ 13. Ignore any information not represented by the tags below.
261
+ <s_mawb_form>
262
+ <air_waybill_number>Số MAWB</air_waybill_number>
263
+ <shipper_name>Tên người gửi hàng</shipper_name>
264
+ <shipper_address>Địa chỉ người gửi hàng</shipper_address>
265
+ <shipper_account_number>Mã tài khoản người gửi</shipper_account_number>
266
+ <consignee_name>Tên người nhận hàng</consignee_name>
267
+ <consignee_address>Địa chỉ người nhận hàng</consignee_address>
268
+ <consignee_account_number>Mã tài khoản người nhận</consignee_account_number>
269
+ <dangerous_goods_note>Ghi chú hàng nguy hiểm (true or false)</dangerous_goods_note>
270
+ <shipper_signature>Chữ ký người gửi</shipper_signature>
271
+ </s_mawb_form>
272
+ """,
273
+
274
+ "Phiếu Cân": """Extract all structured information from the document 'PHIẾU CÂN / SHIPPER’S LETTER OF INSTRUCTIONS'.
275
+ You must return the result as a valid XML block that strictly follows the structure below.
276
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
277
+ 1. Return **ONLY** the XML block – nothing before or after it.
278
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
279
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
280
+ 4. For every tag, fill in the exact value read from the image.
281
+ • NEVER copy or repeat the label/placeholder text.
282
+ • NEVER guess or invent values.
283
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
284
+ 6. DO NOT include Vietnamese text or translations inside tag values.
285
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
286
+ 8. Dates must be in YYYY-MM-DD format.
287
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
288
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
289
+ 10. **Inside each value**
290
+ • Replace every internal line-break with “, ” (comma + space).
291
+ • Trim leading/trailing whitespace.
292
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
293
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
294
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
295
+ 13. Ignore any information not represented by the tags below.
296
+ <s_weight_ticket>
297
+ <awb_number>Số AWB</awb_number>
298
+ <shipper_name>Tên người gửi hàng</shipper_name>
299
+ <shipper_address>Địa chỉ người gửi hàng</shipper_address>
300
+ <shipper_contact>Số điện thoại người gửi</shipper_contact>
301
+ <consignee_name>Tên người nhận hàng</consignee_name>
302
+ <consignee_address>Địa chỉ người nhận hàng</consignee_address>
303
+ <cargo_description>Tên hàng hóa</cargo_description>
304
+ <security_check_complete>Đã kiểm tra an ninh (true/false)</security_check_complete>
305
+ <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
306
+ <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
307
+ </s_weight_ticket>
308
+ """,
309
+
310
+ "PC 3U": """Extract all structured information from the PC 3U air cargo instruction document.
311
+ You must return the result as a valid XML block that strictly follows the structure below.
312
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
313
+ 1. Return **ONLY** the XML block – nothing before or after it.
314
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
315
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
316
+ 4. For every tag, fill in the exact value read from the image.
317
+ • NEVER copy or repeat the label/placeholder text.
318
+ • NEVER guess or invent values.
319
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
320
+ 6. DO NOT include Vietnamese text or translations inside tag values.
321
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
322
+ 8. Dates must be in YYYY-MM-DD format.
323
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
324
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
325
+ 10. **Inside each value**
326
+ • Replace every internal line-break with “, ” (comma + space).
327
+ • Trim leading/trailing whitespace.
328
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
329
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
330
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
331
+ 13. Ignore any information not represented by the tags below.
332
+ <s_pc3u_form>
333
+ <awb_number>Số AWB</awb_number>
334
+ <cargo_service_code>Mã dịch vụ</cargo_service_code>
335
+ <shipper_name>Tên người gửi</shipper_name>
336
+ <shipper_address>Địa chỉ người gửi</shipper_address>
337
+ <shipper_contact>Thông tin liên hệ người gửi</shipper_contact>
338
+ <payer_name>Người thanh toán</payer_name>
339
+ <payer_tax_code>Mã số thuế người thanh toán</payer_tax_code>
340
+ <consignee_name>Tên người nhận</consignee_name>
341
+ <consignee_address>Địa chỉ người nhận</consignee_address>
342
+ <consignee_contact>Thông tin liên hệ người nhận</consignee_contact>
343
+ <shipper_signature>Chữ ký người gửi</shipper_signature>
344
+ <acceptance_staff_signature>Chữ ký nhân viên tiếp nhận</acceptance_staff_signature>
345
+ </s_pc3u_form>
346
+ """,
347
+
348
+ "SLIS-AVS DAD": """Extract all structured information from the document 'TỜ KHAI GỬI HÀNG - SHIPPER’S LETTER OF INSTRUCTION'.
349
+ You must return the result as a valid XML block that strictly follows the structure below.
350
+ STRICT INSTRUCTIONS – read carefully and follow EXACTLY:
351
+ 1. Return **ONLY** the XML block – nothing before or after it.
352
+ 2. DO NOT add, remove, rename, or reorder any XML tags.
353
+ 3. DO NOT include explanations, markdown, notes, comments, or extra spacing outside the XML block.
354
+ 4. For every tag, fill in the exact value read from the image.
355
+ • NEVER copy or repeat the label/placeholder text.
356
+ • NEVER guess or invent values.
357
+ 5. If a value is missing or unreadable, leave the tag EMPTY (e.g. <tag></tag>).
358
+ 6. DO NOT include Vietnamese text or translations inside tag values.
359
+ 7. The output MUST start with the root tag and end with its correct closing tag; all tags must be well-formed.
360
+ 8. Dates must be in YYYY-MM-DD format.
361
+ 9. Boolean tags must be exactly true or false (lower-case, no quotes).
362
+ ✔ √ Yes Passed ⇒ true | ✘ X No Fail ⇒ false
363
+ 10. **Inside each value**
364
+ • Replace every internal line-break with “, ” (comma + space).
365
+ • Trim leading/trailing whitespace.
366
+ • Escape XML special characters: & → &amp;, < → &lt;, > → &gt;.
367
+ 11. **Phone / contact fields** – digits, “+”, “–”, spaces only; if multiple numbers, separate with “, ”.
368
+ 12. **Signature fields** – fill ONLY if the signature appears as legible text; if it is handwritten, leave the tag empty.
369
+ 13. Ignore any information not represented by the tags below.
370
+ <s_avs_dad>
371
+ <air_waybill_number>Số AWB</air_waybill_number>
372
+ <form_code>Mã biểu mẫu</form_code>
373
+ <shipper_name>Tên người gửi</shipper_name>
374
+ <shipper_address>Địa chỉ người gửi</shipper_address>
375
+ <shipper_phone>Điện thoại người gửi</shipper_phone>
376
+ <shipper_email>Email người gửi</shipper_email>
377
+ <shipper_tax_code>Mã số thuế người gửi</shipper_tax_code>
378
+ <consignee_name>Tên người nhận</consignee_name>
379
+ <consignee_address>Địa chỉ người nhận</consignee_address>
380
+ <consignee_phone>Điện thoại người nhận</consignee_phone>
381
+ <consignee_email>Email người nhận</consignee_email>
382
+ <departure_airport>Nơi đi</departure_airport>
383
+ <destination_airport>Nơi đến</destination_airport>
384
+ <acceptance_staff_name>Tên nhân viên tiếp nhận</acceptance_staff_name>
385
+ <acceptance_signature>Chữ ký nhân viên tiếp nhận</acceptance_signature>
386
+ <acceptance_time>Thời điểm tiếp nhận</acceptance_time>
387
+ <shipper_signature>Chữ ký người gửi</shipper_signature>
388
+ <shipper_signature_date>Ngày ký người gửi</shipper_signature_date>
389
+ </s_avs_dad>
390
+ """
391
+ }
392
+
393
def insert_template(name):
    """Return the prompt template stored under ``name``, or "" if there is none."""
    try:
        return prompt_templates[name]
    except KeyError:
        return ""
395
+
396
def sanitize_filename(name):
    """Return ``name`` reduced to characters that are safe in a file name.

    Every character other than ASCII letters, digits, ``_``, ``-`` and ``.``
    is replaced with ``_``. An empty or ``None`` name yields ``"output"`` so
    callers always get a usable file stem.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', str(name or ""))
    return cleaned or "output"
398
+
399
def clean_text(text):
    """Return ``text`` with leading and trailing whitespace removed.

    ``None`` is treated as empty text. Whitespace inside the text, including
    inside or between tags, is left untouched.
    """
    # Two regex passes used to run here, but each replaced a match with
    # itself: a match starts with "<" and ends with ">", so stripping it
    # changed nothing. Only the outer strip has any effect.
    if text is None:
        return ""
    return str(text).strip()
403
+
404
def export_json(image_name, result_text):
    """Write the OCR result to a JSON file and return it for display/download.

    Args:
        image_name: Name of the processed file; stored under ``"image"`` and
            used (sanitized) as the output file stem.
        result_text: Model output; stored under ``"text_sequence"`` after
            ``clean_text``.

    Returns:
        ``(path, json_string)`` on success, where ``path`` points to the
        written file in the system temp directory. On failure returns
        ``(None, "[Export JSON Failed]: ...")`` so the file output stays
        empty instead of receiving an invalid path.
    """
    import tempfile  # local import: only needed by this export helper

    try:
        clean_name = sanitize_filename(image_name)
        content = {"image": image_name, "text_sequence": clean_text(result_text)}
        # Serialize once and reuse the string for both the file and the UI.
        serialized = json.dumps(content, ensure_ascii=False, indent=2)
        path = os.path.join(tempfile.gettempdir(), f"{clean_name}.json")
        with open(path, "w", encoding="utf-8") as f:
            f.write(serialized)
        return path, serialized
    except Exception as e:
        return None, f"[Export JSON Failed]: {e}"
414
+
415
+ # --- 10. Gradio UI ---
416
+ css = """
417
+ .gradio-textbox textarea {
418
+ font-size: 13px !important;
419
+ line-height: 1.3 !important;
420
+ padding: 6px 8px !important;
421
+ }
422
+ .gradio-textbox label {
423
+ font-size: 13px !important;
424
+ font-weight: 600 !important;
425
+ margin-bottom: 4px !important;
426
+ }
427
+ .gradio-button {
428
+ font-size: 12px !important;
429
+ padding: 4px 8px !important;
430
+ height: 28px !important;
431
+ min-height: 28px !important;
432
+ margin: 2px !important;
433
+ }
434
+ .gradio-button[data-variant="primary"] {
435
+ height: 36px !important;
436
+ font-size: 13px !important;
437
+ padding: 8px 16px !important;
438
+ }
439
+ .gradio-file {
440
+ font-size: 13px !important;
441
+ }
442
+ .gradio-file .file-upload {
443
+ padding: 8px !important;
444
+ min-height: 80px !important;
445
+ }
446
+ .gradio-markdown h3 {
447
+ font-size: 14px !important;
448
+ margin: 8px 0 4px 0 !important;
449
+ }
450
+ .gradio-markdown h2 {
451
+ font-size: 18px !important;
452
+ margin: 8px 0 !important;
453
+ }
454
+ .gradio-code {
455
+ font-size: 12px !important;
456
+ }
457
+ """
458
+
459
# Two-column layout: inputs (file, prompts, token limit, template buttons) on
# the left, results and JSON export on the right.
with gr.Blocks(title="Camel-Doc-OCR", css=css) as demo:
    gr.Markdown("## 🧾 Camel-Doc-OCR (Qwen2.5-VL, 4-bit)")

    # --- Main Layout: 2 Columns ---
    with gr.Row():
        # === LEFT COLUMN: Input ===
        with gr.Column(scale=1):
            gr.Markdown("### 📥 INPUT")

            # File Input
            file_input = gr.File(
                label="📤 Tải ảnh hoặc PDF",
                file_types=[".jpg", ".jpeg", ".png", ".pdf"],
                height=100,
            )

            # Prompt Input
            prompt_input = gr.Textbox(
                label="Prompt thuần",
                lines=2,
                placeholder="Nhập prompt tùy chỉnh...",
                max_lines=3,
            )

            # JSON Config (filled by the template buttons below)
            config_input = gr.Textbox(
                label="JSON Prompt",
                lines=6,
                placeholder="Cấu hình JSON sẽ xuất hiện ở đây...",
                max_lines=8,
            )

            # Max New Tokens Radio
            max_new_tokens_input = gr.Radio(
                choices=[128, 256, 512, 1024, 1536, 2048],
                value=512,
                label="🔢 Chọn max_new_tokens (giới hạn độ dài đầu ra)",
                info="Chọn độ dài tối đa cho đầu ra của mô hình",
            )

            # Prompt Templates
            gr.Markdown("### 📑 Mẫu:")
            with gr.Row():
                for key in prompt_templates:  # All buttons in one row
                    # Bind the key as a default so each button keeps its own
                    # template name (avoids the late-binding closure pitfall).
                    gr.Button(f"{key}", size="sm", scale=1).click(
                        fn=lambda *, k=key: insert_template(k),
                        inputs=[],
                        outputs=config_input,
                    )

            # Run Button
            run_btn = gr.Button("🚀 Chạy OCR", variant="primary")

        # === RIGHT COLUMN: Output ===
        with gr.Column(scale=1):
            gr.Markdown("### 📤 OUTPUT")

            # Result Output
            result_output = gr.Textbox(
                label="Kết quả trích xuất",
                lines=10,
                placeholder="Kết quả sẽ hiển thị ở đây sau khi chạy OCR...",
                max_lines=12,
            )

            # Export Section (hidden until an OCR run has finished)
            with gr.Row():
                export_btn = gr.Button("📦 Xuất JSON", visible=False, variant="secondary", size="sm")

            # JSON Output
            json_text = gr.Code(
                label="JSON Output",
                language="json",
                lines=6,
                visible=False,
            )

            # Download File
            json_file = gr.File(
                label="File JSON để tải",
                visible=False,
                file_types=[".json"],
            )

    # --- Hidden Fields ---
    # Carries the processed file name from the OCR run to the export step.
    hidden_name = gr.Textbox(visible=False)

    # --- Event Handlers ---

    # Run inference, then reveal the export button. Without this second step
    # the button stays invisible and the export feature can never be used.
    run_btn.click(
        fn=handle_file,
        inputs=[file_input, prompt_input, config_input, max_new_tokens_input],
        outputs=[hidden_name, result_output],
    ).then(
        fn=lambda: gr.update(visible=True),
        outputs=[export_btn],
    )

    # Export JSON, then show both outputs. Chaining keeps the order fixed
    # instead of relying on three independent click handlers.
    export_btn.click(
        fn=export_json,
        inputs=[hidden_name, result_output],
        outputs=[json_file, json_text],
    ).then(
        fn=lambda: (gr.update(visible=True), gr.update(visible=True)),
        outputs=[json_file, json_text],
    )
565
# Start the Gradio server only when this file is run as a script.
# NOTE(review): handle_file takes a gr.Progress tracker; depending on the
# installed Gradio version the queue may need enabling for it — confirm.
if __name__ == "__main__":
    demo.launch()