Update app.py
app.py
CHANGED
@@ -16,9 +16,8 @@ import asyncio
 import psutil
 import cachetools
 import hashlib
-from vllm import LLM

-app = FastAPI()
+app = FastAPI(title="Invoice OCR and Extraction API", version="1.0.0")

 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -27,19 +26,28 @@ logger = logging.getLogger(__name__)
 # Set Tesseract path
 pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

-# Initialize
+# Initialize LLM with fallback handling
+llm = None
 try:
+    # Try to import and initialize vLLM
+    from vllm import LLM
+
+    # For Hugging Face Spaces, use a smaller, more compatible model
+    model_name = "microsoft/DialoGPT-medium" # Fallback model
+
     llm = LLM(
-        model=
+        model=model_name,
         device="cpu",
-        enforce_eager=True,
-        tensor_parallel_size=1,
-        disable_custom_all_reduce=True,
-        max_model_len=
+        enforce_eager=True,
+        tensor_parallel_size=1,
+        disable_custom_all_reduce=True,
+        max_model_len=1024, # Reduced for compatibility
+        trust_remote_code=True
     )
+    logger.info("LLM model loaded successfully")
 except Exception as e:
-    logger.error(f"Failed to load
+    logger.error(f"Failed to load vLLM: {str(e)}")
+    logger.info("Will use rule-based extraction as fallback")

 # In-memory caches (1-hour TTL)
 raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
@@ -47,9 +55,12 @@ structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)

 def log_memory_usage():
     """Log current memory usage."""
+    try:
+        process = psutil.Process()
+        mem_info = process.memory_info()
+        return f"Memory usage: {mem_info.rss / 1024 / 1024:.2f} MB"
+    except:
+        return "Memory usage: N/A"

 def get_file_hash(file_bytes):
     """Generate MD5 hash of file content."""
@@ -65,140 +76,218 @@ async def process_image(img_bytes, filename, idx):
     logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
     try:
         img = Image.open(io.BytesIO(img_bytes))
+        # Convert to RGB if needed
+        if img.mode != 'RGB':
+            img = img.convert('RGB')
+
         img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
         gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
+
+        # Preprocess image for better OCR
+        gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
+
+        img_pil = Image.fromarray(gray)
+        custom_config = r'--oem 3 --psm 6 -l eng'
         page_text = pytesseract.image_to_string(img_pil, config=custom_config)
+
+        logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds")
         return page_text + "\n"
     except Exception as e:
-        logger.error(f"OCR failed for {filename} image {idx}: {str(e)}
+        logger.error(f"OCR failed for {filename} image {idx}: {str(e)}")
         return ""

 async def process_pdf_page(img, page_idx):
     """Process a single PDF page with OCR."""
     start_time = time.time()
-    logger.info(f"Starting OCR for PDF page {page_idx}
+    logger.info(f"Starting OCR for PDF page {page_idx}")
     try:
         img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
         gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
+
+        # Preprocess image for better OCR
+        gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
+
+        img_pil = Image.fromarray(gray)
+        custom_config = r'--oem 3 --psm 6 -l eng'
         page_text = pytesseract.image_to_string(img_pil, config=custom_config)
+
+        logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - start_time:.2f} seconds")
         return page_text + "\n"
     except Exception as e:
-        logger.error(f"OCR failed for PDF page {page_idx}: {str(e)}
+        logger.error(f"OCR failed for PDF page {page_idx}: {str(e)}")
         return ""

-    """
+def rule_based_extraction(raw_text: str):
+    """Rule-based fallback extraction when LLM is not available."""
+    import re
+
+    # Initialize the structure
+    structured_data = {
+        "invoice": {
+            "invoice_number": {"value": "", "accuracy": 0.0},
+            "invoice_date": {"value": "", "accuracy": 0.0},
+            "due_date": {"value": "", "accuracy": 0.0},
+            "purchase_order_number": {"value": "", "accuracy": 0.0},
+            "vendor": {
+                "vendor_id": {"value": "", "accuracy": 0.0},
+                "name": {"value": "", "accuracy": 0.0},
+                "address": {
+                    "line1": {"value": "", "accuracy": 0.0},
+                    "line2": {"value": "", "accuracy": 0.0},
+                    "city": {"value": "", "accuracy": 0.0},
+                    "state": {"value": "", "accuracy": 0.0},
+                    "postal_code": {"value": "", "accuracy": 0.0},
+                    "country": {"value": "", "accuracy": 0.0}
+                },
+                "contact": {
+                    "email": {"value": "", "accuracy": 0.0},
+                    "phone": {"value": "", "accuracy": 0.0}
+                },
+                "tax_id": {"value": "", "accuracy": 0.0}
+            },
+            "buyer": {
+                "buyer_id": {"value": "", "accuracy": 0.0},
+                "name": {"value": "", "accuracy": 0.0},
+                "address": {
+                    "line1": {"value": "", "accuracy": 0.0},
+                    "line2": {"value": "", "accuracy": 0.0},
+                    "city": {"value": "", "accuracy": 0.0},
+                    "state": {"value": "", "accuracy": 0.0},
+                    "postal_code": {"value": "", "accuracy": 0.0},
+                    "country": {"value": "", "accuracy": 0.0}
+                },
+                "contact": {
+                    "email": {"value": "", "accuracy": 0.0},
+                    "phone": {"value": "", "accuracy": 0.0}
+                },
+                "tax_id": {"value": "", "accuracy": 0.0}
+            },
+            "items": [{
+                "item_id": {"value": "", "accuracy": 0.0},
+                "description": {"value": "", "accuracy": 0.0},
+                "quantity": {"value": 0, "accuracy": 0.0},
+                "unit_of_measure": {"value": "", "accuracy": 0.0},
+                "unit_price": {"value": 0, "accuracy": 0.0},
+                "total_price": {"value": 0, "accuracy": 0.0},
+                "tax_rate": {"value": 0, "accuracy": 0.0},
+                "tax_amount": {"value": 0, "accuracy": 0.0},
+                "discount": {"value": 0, "accuracy": 0.0},
+                "net_amount": {"value": 0, "accuracy": 0.0}
+            }],
+            "sub_total": {"value": 0, "accuracy": 0.0},
+            "tax_total": {"value": 0, "accuracy": 0.0},
+            "discount_total": {"value": 0, "accuracy": 0.0},
+            "total_amount": {"value": 0, "accuracy": 0.0},
+            "currency": {"value": "USD", "accuracy": 0.5}
+        }
+    }
+
+    # Simple pattern matching
+    try:
+        # Invoice number
+        inv_pattern = r'(?:invoice|inv)(?:\s*#|\s*no\.?|\s*number)?\s*:?\s*([A-Z0-9\-/]+)'
+        inv_match = re.search(inv_pattern, raw_text, re.IGNORECASE)
+        if inv_match:
+            structured_data["invoice"]["invoice_number"]["value"] = inv_match.group(1)
+            structured_data["invoice"]["invoice_number"]["accuracy"] = 0.7
+
+        # Date patterns
+        date_pattern = r'(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2})'
+        dates = re.findall(date_pattern, raw_text)
+        if dates:
+            structured_data["invoice"]["invoice_date"]["value"] = dates[0]
+            structured_data["invoice"]["invoice_date"]["accuracy"] = 0.6
+
+        # Total amount
+        amount_pattern = r'(?:total|amount|sum)\s*:?\s*\$?(\d+\.?\d*)'
+        amount_match = re.search(amount_pattern, raw_text, re.IGNORECASE)
+        if amount_match:
+            structured_data["invoice"]["total_amount"]["value"] = float(amount_match.group(1))
+            structured_data["invoice"]["total_amount"]["accuracy"] = 0.6
+
+        # Email
+        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
+        email_match = re.search(email_pattern, raw_text)
+        if email_match:
+            structured_data["invoice"]["vendor"]["contact"]["email"]["value"] = email_match.group()
+            structured_data["invoice"]["vendor"]["contact"]["email"]["accuracy"] = 0.8
+
+        # Phone
+        phone_pattern = r'(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})'
+        phone_match = re.search(phone_pattern, raw_text)
+        if phone_match:
+            structured_data["invoice"]["vendor"]["contact"]["phone"]["value"] = phone_match.group()
+            structured_data["invoice"]["vendor"]["contact"]["phone"]["accuracy"] = 0.7

+    except Exception as e:
+        logger.error(f"Rule-based extraction error: {str(e)}")
+
+    return structured_data
+
+async def process_with_model(filename: str, raw_text: str):
+    """Process raw text with available model or fallback to rule-based."""
     start_time = time.time()
-    logger.info(f"Starting
+    logger.info(f"Starting text processing for {filename}")

     # Check structured data cache
     text_hash = get_text_hash(raw_text)
     if text_hash in structured_data_cache:
-        logger.info(f"Structured data cache hit for {filename}
+        logger.info(f"Structured data cache hit for {filename}")
         return structured_data_cache[text_hash]

-    # Truncate text
-    if len(raw_text) >
-        raw_text = raw_text[:
-        logger.info(f"Truncated raw text for {filename} to
+    # Truncate text
+    if len(raw_text) > 5000:
+        raw_text = raw_text[:5000]
+        logger.info(f"Truncated raw text for {filename} to 5000 characters")

     try:
-                    "state": {{"value": "", "accuracy": 0.0}},
-                    "postal_code": {{"value": "", "accuracy": 0.0}},
-                    "country": {{"value": "", "accuracy": 0.0}}
-                }},
-                "contact": {{
-                    "email": {{"value": "", "accuracy": 0.0}},
-                    "phone": {{"value": "", "accuracy": 0.0}}
-                }},
-                "tax_id": {{"value": "", "accuracy": 0.0}}
-            }},
-            "buyer": {{
-                "buyer_id": {{"value": "", "accuracy": 0.0}},
-                "name": {{"value": "", "accuracy": 0.0}},
-                "address": {{
-                    "line1": {{"value": "", "accuracy": 0.0}},
-                    "line2": {{"value": "", "accuracy": 0.0}},
-                    "city": {{"value": "", "accuracy": 0.0}},
-                    "state": {{"value": "", "accuracy": 0.0}},
-                    "postal_code": {{"value": "", "accuracy": 0.0}},
-                    "country": {{"value": "", "accuracy": 0.0}}
-                }},
-                "contact": {{
-                    "email": {{"value": "", "accuracy": 0.0}},
-                    "phone": {{"value": "", "accuracy": 0.0}}
-                }},
-                "tax_id": {{"value": "", "accuracy": 0.0}}
-            }},
-            "items": [
-                {{
-                    "item_id": {{"value": "", "accuracy": 0.0}},
-                    "description": {{"value": "", "accuracy": 0.0}},
-                    "quantity": {{"value": 0, "accuracy": 0.0}},
-                    "unit_of_measure": {{"value": "", "accuracy": 0.0}},
-                    "unit_price": {{"value": 0, "accuracy": 0.0}},
-                    "total_price": {{"value": 0, "accuracy": 0.0}},
-                    "tax_rate": {{"value": 0, "accuracy": 0.0}},
-                    "tax_amount": {{"value": 0, "accuracy": 0.0}},
-                    "discount": {{"value": 0, "accuracy": 0.0}},
-                    "net_amount": {{"value": 0, "accuracy": 0.0}}
-                }}
-            ],
-            "sub_total": {{"value": 0, "accuracy": 0.0}},
-            "tax_total": {{"value": 0, "accuracy": 0.0}},
-            "discount_total": {{"value": 0, "accuracy": 0.0}},
-            "total_amount": {{"value": 0, "accuracy": 0.0}},
-            "currency": {{"value": "", "accuracy": 0.0}}
-        }}
-    }}
-        """
-        outputs = llm.generate(prompts=[prompt])
-        json_str = outputs[0].outputs[0].text
-        json_start = json_str.find("{")
-        json_end = json_str.rfind("}") + 1
-        structured_data = json.loads(json_str[json_start:json_end])
+        if llm is not None:
+            # Use LLM if available
+            prompt = f"""Extract invoice data from this text and return JSON:
+
+Text: {raw_text}
+
+Return structured JSON with invoice details including vendor, amounts, dates."""
+
+            outputs = llm.generate(prompts=[prompt], sampling_params={"max_tokens": 512, "temperature": 0.1})
+            response_text = outputs[0].outputs[0].text
+
+            # Try to parse JSON from response
+            try:
+                json_start = response_text.find("{")
+                json_end = response_text.rfind("}") + 1
+                if json_start >= 0 and json_end > json_start:
+                    structured_data = json.loads(response_text[json_start:json_end])
+                else:
+                    raise ValueError("No JSON found in response")
+            except:
+                # Fallback to rule-based if JSON parsing fails
+                structured_data = rule_based_extraction(raw_text)
+        else:
+            # Use rule-based extraction
+            structured_data = rule_based_extraction(raw_text)
+
+        # Cache the result
         structured_data_cache[text_hash] = structured_data
-        logger.info(f"
+        logger.info(f"Text processing for {filename} completed in {time.time() - start_time:.2f} seconds")
         return structured_data
+
     except Exception as e:
-        logger.error(f"
-        return
+        logger.error(f"Text processing failed for {filename}: {str(e)}")
+        return rule_based_extraction(raw_text)
+
+@app.get("/")
+async def root():
+    """Health check endpoint."""
+    return {
+        "message": "Invoice OCR and Extraction API",
+        "status": "active",
+        "llm_available": llm is not None
+    }

 @app.post("/ocr")
 async def extract_and_structure(files: List[UploadFile] = File(...)):
+    """Main endpoint for OCR and data extraction."""
     output_json = {
         "success": True,
         "message": "",
@@ -207,15 +296,15 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
     success_count = 0
     fail_count = 0

-    logger.info(f"Starting processing for {len(files)} files
+    logger.info(f"Starting processing for {len(files)} files")

     for file in files:
         total_start_time = time.time()
-        logger.info(f"Processing file: {file.filename}
+        logger.info(f"Processing file: {file.filename}")

         # Validate file format
         valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}
-        file_ext = os.path.splitext(file.filename.lower())[1]
+        file_ext = os.path.splitext(file.filename.lower())[1] if file.filename else '.unknown'
         if file_ext not in valid_extensions:
             fail_count += 1
             output_json["data"].append({
@@ -232,7 +321,7 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
             file_bytes = await file.read()
             file_stream = io.BytesIO(file_bytes)
             file_hash = get_file_hash(file_bytes)
-            logger.info(f"Read file {file.filename},
+            logger.info(f"Read file {file.filename}, size: {len(file_bytes)/1024:.2f} KB")
         except Exception as e:
             fail_count += 1
             output_json["data"].append({
@@ -240,17 +329,17 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
                 "structured_data": {"error": f"Failed to read file: {str(e)}"},
                 "error": f"Failed to read file: {str(e)}"
             })
-            logger.error(f"Failed to read file {file.filename}: {str(e)}
+            logger.error(f"Failed to read file {file.filename}: {str(e)}")
             continue

         # Check raw text cache
         raw_text = ""
         if file_hash in raw_text_cache:
             raw_text = raw_text_cache[file_hash]
-            logger.info(f"Raw text cache hit for {file.filename}
+            logger.info(f"Raw text cache hit for {file.filename}")
         else:
             if file_ext == '.pdf':
-                # Try extracting embedded text
+                # Try extracting embedded text first
                 try:
                     extract_start_time = time.time()
                     reader = PdfReader(file_stream)
@@ -258,16 +347,16 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
                         text = page.extract_text()
                         if text:
                             raw_text += text + "\n"
-                    logger.info(f"Embedded text extraction for {file.filename},
+                    logger.info(f"Embedded text extraction for {file.filename}, text length: {len(raw_text)}")
                 except Exception as e:
-                    logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}
+                    logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}")

                 # If no embedded text, perform OCR
                 if not raw_text.strip():
                     try:
                         convert_start_time = time.time()
-                        images = convert_from_bytes(file_bytes, dpi=
-                        logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages
+                        images = convert_from_bytes(file_bytes, dpi=150, first_page=1, last_page=3) # Limit pages
+                        logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages")

                         ocr_start_time = time.time()
                         page_texts = []
@@ -275,7 +364,7 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
                             page_text = await process_pdf_page(img, i)
                             page_texts.append(page_text)
                         raw_text = "".join(page_texts)
-                        logger.info(f"Total OCR for {file.filename},
+                        logger.info(f"Total OCR for {file.filename}, text length: {len(raw_text)}")
                     except Exception as e:
                         fail_count += 1
                         output_json["data"].append({
@@ -283,13 +372,13 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
                             "structured_data": {"error": f"OCR failed: {str(e)}"},
                             "error": f"OCR failed: {str(e)}"
                         })
-                        logger.error(f"OCR failed for {file.filename}: {str(e)}
+                        logger.error(f"OCR failed for {file.filename}: {str(e)}")
                         continue
             else: # JPG/JPEG/PNG
                 try:
                     ocr_start_time = time.time()
                     raw_text = await process_image(file_bytes, file.filename, 0)
-                    logger.info(f"Image OCR for {file.filename},
+                    logger.info(f"Image OCR for {file.filename}, text length: {len(raw_text)}")
                 except Exception as e:
                     fail_count += 1
                     output_json["data"].append({
@@ -297,33 +386,45 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
                     "structured_data": {"error": f"Image OCR failed: {str(e)}"},
                     "error": f"Image OCR failed: {str(e)}"
                 })
-                logger.error(f"Image OCR failed for {file.filename}: {str(e)}
+                logger.error(f"Image OCR failed for {file.filename}: {str(e)}")
                 continue

         # Normalize text
         try:
-            normalize_start_time = time.time()
             raw_text = unicodedata.normalize('NFKC', raw_text)
-            raw_text = raw_text.encode().decode('utf-8')
+            raw_text = raw_text.encode('utf-8', errors='ignore').decode('utf-8')
             raw_text_cache[file_hash] = raw_text
-            logger.info(f"Text normalization for {file.filename}
+            logger.info(f"Text normalization for {file.filename} completed")
         except Exception as e:
-            logger.warning(f"Text normalization failed for {file.filename}: {str(e)}
+            logger.warning(f"Text normalization failed for {file.filename}: {str(e)}")

-        # Process with
+        # Process with model or rule-based extraction
+        if raw_text.strip():
+            structured_data = await process_with_model(file.filename, raw_text)
+            success_count += 1
+            output_json["data"].append({
+                "filename": file.filename,
+                "structured_data": structured_data,
+                "raw_text": raw_text[:500] + "..." if len(raw_text) > 500 else raw_text, # Include snippet
+                "error": ""
+            })
+        else:
+            fail_count += 1
+            output_json["data"].append({
+                "filename": file.filename,
+                "structured_data": {"error": "No text extracted from file"},
+                "error": "No text extracted from file"
+            })

-        logger.info(f"Total processing for {file.filename}
+        logger.info(f"Total processing for {file.filename} completed in {time.time() - total_start_time:.2f} seconds")

     output_json["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
     if fail_count > 0 and success_count == 0:
         output_json["success"] = False

-    logger.info(f"
-    return output_json
+    logger.info(f"Batch processing completed: {success_count} succeeded, {fail_count} failed")
+    return output_json
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)
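For reference, a minimal client sketch for exercising the endpoints defined in this revision is shown below. It is not part of the commit: it assumes the app is run locally via the `__main__` block (uvicorn on port 7860), uses the third-party `requests` library, and `sample_invoice.pdf` is a placeholder file name.

# Hypothetical client for the updated API (assumption: service reachable at localhost:7860)
import requests

BASE_URL = "http://localhost:7860"

# Health check: reports whether the vLLM model loaded or the rule-based fallback is in use.
print(requests.get(f"{BASE_URL}/").json())

# OCR + extraction: the /ocr endpoint accepts one or more uploads under the "files" form field.
with open("sample_invoice.pdf", "rb") as fh:  # placeholder input file
    response = requests.post(
        f"{BASE_URL}/ocr",
        files=[("files", ("sample_invoice.pdf", fh, "application/pdf"))],
    )

result = response.json()
print(result["message"])  # e.g. "Processed 1 files. 1 succeeded, 0 failed."
for entry in result["data"]:
    # structured_data holds the nested invoice dict on success, or an "error" key on failure
    print(entry["filename"], entry["structured_data"].get("invoice", entry["structured_data"]))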