ChintanSatva committed
Commit fd8097f · verified
Parent: 3633a7f

Create app.py

Files changed (1)
app.py +322 -0
app.py ADDED
@@ -0,0 +1,322 @@
+ from fastapi import FastAPI, File, UploadFile, HTTPException
+ import pytesseract
+ import cv2
+ import os
+ from PIL import Image
+ import json
+ import unicodedata
+ from pdf2image import convert_from_bytes
+ from pypdf import PdfReader
+ import numpy as np
+ from typing import List
+ import io
+ import logging
+ import time
+ import asyncio
+ import psutil
+ import cachetools
+ import hashlib
+ from vllm import LLM
+
+ app = FastAPI()
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Set Tesseract path
+ pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
+
+ # Initialize BitNet model
+ try:
+     llm = LLM(model="bitnet/BitNet-b1.2-3B", gpu_memory_utilization=0.0)  # CPU-only
+ except Exception as e:
+     logger.error(f"Failed to load BitNet model: {str(e)}")
+     # This runs at import time, outside any request, so raise a plain RuntimeError
+     # rather than an HTTPException (which is meant for request handlers).
+     raise RuntimeError("BitNet model initialization failed") from e
+ # In-memory caches (1-hour TTL)
+ raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
+ structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
+
+ def log_memory_usage():
+     """Log current memory usage."""
+     process = psutil.Process()
+     mem_info = process.memory_info()
+     return f"Memory usage: {mem_info.rss / 1024 / 1024:.2f} MB"
+
+ def get_file_hash(file_bytes):
+     """Generate MD5 hash of file content."""
+     return hashlib.md5(file_bytes).hexdigest()
+
+ def get_text_hash(raw_text):
+     """Generate MD5 hash of raw text."""
+     return hashlib.md5(raw_text.encode('utf-8')).hexdigest()
+
+ async def process_image(img_bytes, filename, idx):
+     """Process a single image (JPG/JPEG/PNG) with OCR."""
+     start_time = time.time()
+     logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
+     try:
+         img = Image.open(io.BytesIO(img_bytes))
+         img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+         gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
+         img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
+         custom_config = r'--oem 1 --psm 6 -l eng+ara'
+         page_text = pytesseract.image_to_string(img_pil, config=custom_config)
+         logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
+         return page_text + "\n"
+     except Exception as e:
+         logger.error(f"OCR failed for {filename} image {idx}: {str(e)}, {log_memory_usage()}")
+         return ""
+
+ async def process_pdf_page(img, page_idx):
+     """Process a single PDF page with OCR."""
+     start_time = time.time()
+     logger.info(f"Starting OCR for PDF page {page_idx}, {log_memory_usage()}")
+     try:
+         img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+         gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
+         img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
+         custom_config = r'--oem 1 --psm 6 -l eng+ara'
+         page_text = pytesseract.image_to_string(img_pil, config=custom_config)
+         logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
+         return page_text + "\n"
+     except Exception as e:
+         logger.error(f"OCR failed for PDF page {page_idx}: {str(e)}, {log_memory_usage()}")
+         return ""
+
+ async def process_with_bitnet(filename: str, raw_text: str):
+     """Process raw text with BitNet to extract structured data."""
+     start_time = time.time()
+     logger.info(f"Starting BitNet processing for {filename}, {log_memory_usage()}")
+
+     # Check structured data cache
+     text_hash = get_text_hash(raw_text)
+     if text_hash in structured_data_cache:
+         logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
+         return structured_data_cache[text_hash]
+
+     # Truncate text for BitNet
+     if len(raw_text) > 10000:
+         raw_text = raw_text[:10000]
+         logger.info(f"Truncated raw text for {filename} to 10000 characters, {log_memory_usage()}")
+
+     try:
+         prompt = f"""You are an intelligent invoice data extractor. Given raw text from an invoice (in English or other languages),
+ extract key business fields into the specified JSON format. Return each field with an estimated accuracy score between 0 and 1.
+
+ - Accuracy reflects confidence in the correctness of each field.
+ - Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS').
+ - Detect currency from symbols ($, ₹, €) or keywords (USD, INR, EUR); default to USD if unclear.
+ - The 'items' list may have multiple entries, each with detailed attributes.
+ - If a field is missing, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
+ - Convert any date to YYYY-MM-DD.
+
+ Raw text:
+ {raw_text}
+
+ Output JSON:
+ {{
+   "invoice": {{
+     "invoice_number": {{"value": "", "accuracy": 0.0}},
+     "invoice_date": {{"value": "", "accuracy": 0.0}},
+     "due_date": {{"value": "", "accuracy": 0.0}},
+     "purchase_order_number": {{"value": "", "accuracy": 0.0}},
+     "vendor": {{
+       "vendor_id": {{"value": "", "accuracy": 0.0}},
+       "name": {{"value": "", "accuracy": 0.0}},
+       "address": {{
+         "line1": {{"value": "", "accuracy": 0.0}},
+         "line2": {{"value": "", "accuracy": 0.0}},
+         "city": {{"value": "", "accuracy": 0.0}},
+         "state": {{"value": "", "accuracy": 0.0}},
+         "postal_code": {{"value": "", "accuracy": 0.0}},
+         "country": {{"value": "", "accuracy": 0.0}}
+       }},
+       "contact": {{
+         "email": {{"value": "", "accuracy": 0.0}},
+         "phone": {{"value": "", "accuracy": 0.0}}
+       }},
+       "tax_id": {{"value": "", "accuracy": 0.0}}
+     }},
+     "buyer": {{
+       "buyer_id": {{"value": "", "accuracy": 0.0}},
+       "name": {{"value": "", "accuracy": 0.0}},
+       "address": {{
+         "line1": {{"value": "", "accuracy": 0.0}},
+         "line2": {{"value": "", "accuracy": 0.0}},
+         "city": {{"value": "", "accuracy": 0.0}},
+         "state": {{"value": "", "accuracy": 0.0}},
+         "postal_code": {{"value": "", "accuracy": 0.0}},
+         "country": {{"value": "", "accuracy": 0.0}}
+       }},
+       "contact": {{
+         "email": {{"value": "", "accuracy": 0.0}},
+         "phone": {{"value": "", "accuracy": 0.0}}
+       }},
+       "tax_id": {{"value": "", "accuracy": 0.0}}
+     }},
+     "items": [
+       {{
+         "item_id": {{"value": "", "accuracy": 0.0}},
+         "description": {{"value": "", "accuracy": 0.0}},
+         "quantity": {{"value": 0, "accuracy": 0.0}},
+         "unit_of_measure": {{"value": "", "accuracy": 0.0}},
+         "unit_price": {{"value": 0, "accuracy": 0.0}},
+         "total_price": {{"value": 0, "accuracy": 0.0}},
+         "tax_rate": {{"value": 0, "accuracy": 0.0}},
+         "tax_amount": {{"value": 0, "accuracy": 0.0}},
+         "discount": {{"value": 0, "accuracy": 0.0}},
+         "net_amount": {{"value": 0, "accuracy": 0.0}}
+       }}
+     ],
+     "sub_total": {{"value": 0, "accuracy": 0.0}},
+     "tax_total": {{"value": 0, "accuracy": 0.0}},
+     "discount_total": {{"value": 0, "accuracy": 0.0}},
+     "total_amount": {{"value": 0, "accuracy": 0.0}},
+     "currency": {{"value": "", "accuracy": 0.0}}
+   }}
+ }}
+ """
+         # vLLM's LLM.generate takes plain prompt strings; the generated text is on
+         # the RequestOutput at outputs[0].outputs[0].text.
+         outputs = llm.generate([prompt])
+         json_str = outputs[0].outputs[0].text
+         json_start = json_str.find("{")
+         json_end = json_str.rfind("}") + 1
+         structured_data = json.loads(json_str[json_start:json_end])
+         structured_data_cache[text_hash] = structured_data
+         logger.info(f"BitNet processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
+         return structured_data
+     except Exception as e:
+         logger.error(f"BitNet processing failed for {filename}: {str(e)}, {log_memory_usage()}")
+         return {"error": f"BitNet processing failed: {str(e)}"}
+
+ @app.post("/ocr")
+ async def extract_and_structure(files: List[UploadFile] = File(...)):
+     output_json = {
+         "success": True,
+         "message": "",
+         "data": []
+     }
+     success_count = 0
+     fail_count = 0
+
+     logger.info(f"Starting processing for {len(files)} files, {log_memory_usage()}")
+
+     for file in files:
+         total_start_time = time.time()
+         logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")
+
+         # Validate file format
+         valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}
+         file_ext = os.path.splitext(file.filename.lower())[1]
+         if file_ext not in valid_extensions:
+             fail_count += 1
+             output_json["data"].append({
+                 "filename": file.filename,
+                 "structured_data": {"error": f"Unsupported file format: {file_ext}"},
+                 "error": f"Unsupported file format: {file_ext}"
+             })
+             logger.error(f"Unsupported file format for {file.filename}: {file_ext}")
+             continue
+
+         # Read file into memory
+         try:
+             file_start_time = time.time()
+             file_bytes = await file.read()
+             file_stream = io.BytesIO(file_bytes)
+             file_hash = get_file_hash(file_bytes)
+             logger.info(f"Read file {file.filename}, took {time.time() - file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB, {log_memory_usage()}")
+         except Exception as e:
+             fail_count += 1
+             output_json["data"].append({
+                 "filename": file.filename,
+                 "structured_data": {"error": f"Failed to read file: {str(e)}"},
+                 "error": f"Failed to read file: {str(e)}"
+             })
+             logger.error(f"Failed to read file {file.filename}: {str(e)}, {log_memory_usage()}")
+             continue
+
+         # Check raw text cache
+         raw_text = ""
+         if file_hash in raw_text_cache:
+             raw_text = raw_text_cache[file_hash]
+             logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
+         else:
+             if file_ext == '.pdf':
+                 # Try extracting embedded text
+                 try:
+                     extract_start_time = time.time()
+                     reader = PdfReader(file_stream)
+                     for page in reader.pages:
+                         text = page.extract_text()
+                         if text:
+                             raw_text += text + "\n"
+                     logger.info(f"Embedded text extraction for {file.filename}, took {time.time() - extract_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
+                 except Exception as e:
+                     logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")
+
+                 # If no embedded text, perform OCR
+                 if not raw_text.strip():
+                     try:
+                         convert_start_time = time.time()
+                         images = convert_from_bytes(file_bytes, poppler_path="/usr/local/bin", dpi=100)
+                         logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
+
+                         ocr_start_time = time.time()
+                         page_texts = []
+                         for i, img in enumerate(images):
+                             page_text = await process_pdf_page(img, i)
+                             page_texts.append(page_text)
+                         raw_text = "".join(page_texts)
+                         logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
+                     except Exception as e:
+                         fail_count += 1
+                         output_json["data"].append({
+                             "filename": file.filename,
+                             "structured_data": {"error": f"OCR failed: {str(e)}"},
+                             "error": f"OCR failed: {str(e)}"
+                         })
+                         logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
+                         continue
+             else:  # JPG/JPEG/PNG
+                 try:
+                     ocr_start_time = time.time()
+                     raw_text = await process_image(file_bytes, file.filename, 0)
+                     logger.info(f"Image OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
+                 except Exception as e:
+                     fail_count += 1
+                     output_json["data"].append({
+                         "filename": file.filename,
+                         "structured_data": {"error": f"Image OCR failed: {str(e)}"},
+                         "error": f"Image OCR failed: {str(e)}"
+                     })
+                     logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
+                     continue
+
+             # Normalize text
+             try:
+                 normalize_start_time = time.time()
+                 raw_text = unicodedata.normalize('NFKC', raw_text)
+                 raw_text = raw_text.encode().decode('utf-8')
+                 raw_text_cache[file_hash] = raw_text
+                 logger.info(f"Text normalization for {file.filename}, took {time.time() - normalize_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
+             except Exception as e:
+                 logger.warning(f"Text normalization failed for {file.filename}: {str(e)}, {log_memory_usage()}")
+
+         # Process with BitNet
+         structured_data = await process_with_bitnet(file.filename, raw_text)
+         success_count += 1
+         output_json["data"].append({
+             "filename": file.filename,
+             "structured_data": structured_data,
+             "error": ""
+         })
+
+         logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")
+
+     output_json["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
+     if fail_count > 0 and success_count == 0:
+         output_json["success"] = False
+
+     logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
+     return output_json
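A quick way to exercise the /ocr endpoint once the app is running is a small client script. The following is a minimal sketch, not part of the commit: it assumes the service is served locally (for example with uvicorn app:app --port 8000), that the requests package is installed, and the file name sample_invoice.pdf is only a placeholder.

# client_example.py - hypothetical usage sketch
import requests

# The multipart field name "files" must match the FastAPI parameter in extract_and_structure.
with open("sample_invoice.pdf", "rb") as f:  # placeholder file name
    resp = requests.post(
        "http://localhost:8000/ocr",
        files=[("files", ("sample_invoice.pdf", f, "application/pdf"))],
    )

result = resp.json()
print(result["message"])
for entry in result["data"]:
    # Each entry carries the filename, the structured invoice JSON (or an error dict),
    # and a per-file error string that is empty on success.
    print(entry["filename"], entry.get("error") or "ok")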