rockerritesh committed on
Commit 95f4d99 · verified · 1 Parent(s): 13ed0ee

Update main.py

Files changed (1)
  1. main.py +205 -35
main.py CHANGED
@@ -1,65 +1,235 @@
- # main.py
- from fastapi import FastAPI, File, UploadFile
  from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration
- from transformers.image_utils import load_image
  import torch
  from io import BytesIO
  import os
  from dotenv import load_dotenv
  from PIL import Image
-
  from huggingface_hub import login

  # Load environment variables
  load_dotenv()

  # Set the cache directory to a writable path
  os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/torch_inductor_cache"
-
  token = os.getenv("huggingface_ankit")
  # Login to the Hugging Face Hub
  login(token)

  app = FastAPI()

- model_id = "google/paligemma2-3b-mix-448"
- model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).to('cuda')
- processor = PaliGemmaProcessor.from_pretrained(model_id)

  def predict(image):
      prompt = "<image> ocr"
-     model_inputs = processor(text=prompt, images=image, return_tensors="pt").to('cuda')
-     input_len = model_inputs["input_ids"].shape[-1]
      with torch.inference_mode():
          generation = model.generate(**model_inputs, max_new_tokens=200)
-         torch.cuda.empty_cache()
-         decoded = processor.decode(generation[0], skip_special_tokens=True)  # [len(prompt):].lstrip("\n")
      return decoded

  @app.post("/extract_text")
- async def extract_text(file: UploadFile = File(...)):
-     image = Image.open(BytesIO(await file.read())).convert("RGB")  # Ensure it's a valid PIL image
-     text = predict(image)
-     return {"extracted_text": text}

  @app.post("/batch_extract_text")
- async def batch_extract_text(files: list[UploadFile] = File(...)):
-     # if len(files) > 20:
-     #     return {"error": "A maximum of 20 images can be processed at a time."}
-
-     images = [Image.open(BytesIO(await file.read())).convert("RGB") for file in files]
-     prompts = ["OCR"] * len(images)
-
-     model_inputs = processor(text=prompts, images=images, return_tensors="pt").to(torch.bfloat16).to(model.device)
-     input_len = model_inputs["input_ids"].shape[-1]
-
-     with torch.inference_mode():
-         generations = model.generate(**model_inputs, max_new_tokens=200, do_sample=False)
-         torch.cuda.empty_cache()
-         extracted_texts = [processor.decode(generations[i], skip_special_tokens=True) for i in range(len(images))]
-
-     return {"extracted_texts": extracted_texts}

- if __name__ == "__main__":
-     import uvicorn
-     uvicorn.run(app, host="0.0.0.0", port=7860)
+ # # main.py
+ # from fastapi import FastAPI, File, UploadFile
+ # from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration
+ # from transformers.image_utils import load_image
+ # import torch
+ # from io import BytesIO
+ # import os
+ # from dotenv import load_dotenv
+ # from PIL import Image
+
+ # from huggingface_hub import login
+
+ # # Load environment variables
+ # load_dotenv()
+
+ # # Set the cache directory to a writable path
+ # os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/torch_inductor_cache"
+
+ # token = os.getenv("huggingface_ankit")
+ # # Login to the Hugging Face Hub
+ # login(token)
+
+ # app = FastAPI()
+
+ # model_id = "google/paligemma2-3b-mix-448"
+ # model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).to('cuda')
+ # processor = PaliGemmaProcessor.from_pretrained(model_id)
+
+ # def predict(image):
+ #     prompt = "<image> ocr"
+ #     model_inputs = processor(text=prompt, images=image, return_tensors="pt").to('cuda')
+ #     input_len = model_inputs["input_ids"].shape[-1]
+ #     with torch.inference_mode():
+ #         generation = model.generate(**model_inputs, max_new_tokens=200)
+ #         torch.cuda.empty_cache()
+ #         decoded = processor.decode(generation[0], skip_special_tokens=True)  # [len(prompt):].lstrip("\n")
+ #     return decoded
+
+ # @app.post("/extract_text")
+ # async def extract_text(file: UploadFile = File(...)):
+ #     image = Image.open(BytesIO(await file.read())).convert("RGB")  # Ensure it's a valid PIL image
+ #     text = predict(image)
+ #     return {"extracted_text": text}
+
+ # @app.post("/batch_extract_text")
+ # async def batch_extract_text(files: list[UploadFile] = File(...)):
+ #     # if len(files) > 20:
+ #     #     return {"error": "A maximum of 20 images can be processed at a time."}
+
+ #     images = [Image.open(BytesIO(await file.read())).convert("RGB") for file in files]
+ #     prompts = ["OCR"] * len(images)
+
+ #     model_inputs = processor(text=prompts, images=images, return_tensors="pt").to(torch.bfloat16).to(model.device)
+ #     input_len = model_inputs["input_ids"].shape[-1]
+
+ #     with torch.inference_mode():
+ #         generations = model.generate(**model_inputs, max_new_tokens=200, do_sample=False)
+ #         torch.cuda.empty_cache()
+ #         extracted_texts = [processor.decode(generations[i], skip_special_tokens=True) for i in range(len(images))]
+
+ #     return {"extracted_texts": extracted_texts}
+
+ # if __name__ == "__main__":
+ #     import uvicorn
+ #     uvicorn.run(app, host="0.0.0.0", port=7860)
+
+
+ from fastapi import FastAPI, File, UploadFile, BackgroundTasks
  from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration
  import torch
  from io import BytesIO
  import os
  from dotenv import load_dotenv
  from PIL import Image
  from huggingface_hub import login
+ import gc
+ import logging
+ from typing import List
+ import time
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)

  # Load environment variables
  load_dotenv()

  # Set the cache directory to a writable path
  os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/torch_inductor_cache"
  token = os.getenv("huggingface_ankit")
+
  # Login to the Hugging Face Hub
  login(token)

  app = FastAPI()

+ # Global variables for model and processor
+ model = None
+ processor = None
+
+ def load_model():
+     """Load model and processor when needed"""
+     global model, processor
+     if model is None:
+         model_id = "google/paligemma2-3b-mix-448"
+         logger.info(f"Loading model {model_id}")
+
+         # Load model with memory-efficient settings
+         model = PaliGemmaForConditionalGeneration.from_pretrained(
+             model_id,
+             device_map="auto",
+             torch_dtype=torch.bfloat16  # Use lower precision for memory efficiency
+         )
+         processor = PaliGemmaProcessor.from_pretrained(model_id)
+         logger.info("Model loaded successfully")
+
+ def clean_memory():
+     """Force garbage collection and clear CUDA cache"""
+     gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+     logger.info("Memory cleaned")

  def predict(image):
+     """Process a single image"""
+     load_model()  # Ensure model is loaded
+
+     # Process input
      prompt = "<image> ocr"
+     model_inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+     # Move to appropriate device
+     model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
+
+     # Generate with memory optimization
      with torch.inference_mode():
          generation = model.generate(**model_inputs, max_new_tokens=200)
+
+     # Decode output
+     decoded = processor.decode(generation[0], skip_special_tokens=True)
+
+     # Clean up intermediates
+     del model_inputs, generation
+     clean_memory()
+
      return decoded

  @app.post("/extract_text")
+ async def extract_text(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
+     """Extract text from a single image"""
+     try:
+         start_time = time.time()
+         image = Image.open(BytesIO(await file.read())).convert("RGB")
+         text = predict(image)
+
+         # Schedule cleanup after response
+         background_tasks.add_task(clean_memory)
+
+         logger.info(f"Processing completed in {time.time() - start_time:.2f} seconds")
+         return {"extracted_text": text}
+     except Exception as e:
+         logger.error(f"Error processing image: {str(e)}")
+         return {"error": str(e)}

  @app.post("/batch_extract_text")
+ async def batch_extract_text(background_tasks: BackgroundTasks, files: List[UploadFile] = File(...)):
+     """Extract text from multiple images with batching"""
+     try:
+         start_time = time.time()
+
+         # Limit batch size for memory management
+         max_batch_size = 5  # Adjust based on your GPU memory
+
+         if len(files) > 20:
+             return {"error": "A maximum of 20 images can be processed at a time."}
+
+         load_model()  # Ensure model is loaded
+
+         all_results = []
+
+         # Process in smaller batches
+         for i in range(0, len(files), max_batch_size):
+             batch_files = files[i:i + max_batch_size]
+
+             # Load images
+             images = []
+             for file in batch_files:
+                 image_data = await file.read()
+                 img = Image.open(BytesIO(image_data)).convert("RGB")
+                 images.append(img)
+
+             # Create batch inputs
+             prompts = ["<image> ocr"] * len(images)
+             model_inputs = processor(text=prompts, images=images, return_tensors="pt")
+
+             # Move to appropriate device
+             model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
+
+             # Generate with memory optimization
+             with torch.inference_mode():
+                 generations = model.generate(**model_inputs, max_new_tokens=200, do_sample=False)
+
+             # Decode outputs
+             batch_results = [processor.decode(generations[j], skip_special_tokens=True) for j in range(len(images))]
+             all_results.extend(batch_results)
+
+             # Clean up batch resources
+             del model_inputs, generations, images
+             clean_memory()
+
+         # Schedule cleanup after response
+         background_tasks.add_task(clean_memory)
+
+         logger.info(f"Batch processing completed in {time.time() - start_time:.2f} seconds")
+         return {"extracted_texts": all_results}
+     except Exception as e:
+         logger.error(f"Error in batch processing: {str(e)}")
+         return {"error": str(e)}
+
+ # Health check endpoint
+ @app.get("/health")
+ async def health_check():
+     return {"status": "healthy"}
+
+ # if __name__ == "__main__":
+ #     import uvicorn

+ #     # Start the server with proper worker configuration
+ #     uvicorn.run(
+ #         app,
+ #         host="0.0.0.0",
+ #         port=7860,
+ #         log_level="info",
+ #         workers=1  # Multiple workers can cause GPU memory issues
+ #     )
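
Not part of the commit, but as a reference point: a minimal client-side sketch of how the updated endpoints could be exercised once the app is serving on port 7860. The multipart field names "file" and "files" follow the FastAPI signatures above; the base URL and image paths are placeholder assumptions.

# client_example.py - hypothetical usage sketch, not included in this commit
import requests

BASE_URL = "http://localhost:7860"  # assumed host; port matches the uvicorn config above

# Single image: /extract_text expects one multipart field named "file"
with open("sample_page.png", "rb") as f:  # placeholder image path
    resp = requests.post(f"{BASE_URL}/extract_text", files={"file": f})
print(resp.json())  # {"extracted_text": "..."} or {"error": "..."}

# Batch: /batch_extract_text expects the "files" field repeated per image (max 20 per request)
paths = ["page_1.png", "page_2.png"]  # placeholder image paths
files = [("files", open(p, "rb")) for p in paths]
try:
    resp = requests.post(f"{BASE_URL}/batch_extract_text", files=files)
    print(resp.json())  # {"extracted_texts": [...]}
finally:
    for _, fh in files:
        fh.close()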