import base64
import os
import tempfile
from io import BytesIO
from typing import Union

import pytesseract
import torch
import whisper  # Requires the `openai-whisper` package
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# === DEVICE SETUP ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === LOAD WHISPER MODEL FOR AUDIO TRANSCRIPTION ===
try:
    whisper_model = whisper.load_model("base")  # Options: "tiny", "base", "small", "medium", "large"
except Exception as e:
    raise RuntimeError(f"❌ Failed to load Whisper model: {str(e)}")

# === LOAD BLIP FOR IMAGE CAPTIONING ===
try:
    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    blip_model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    ).to(device)
except Exception as e:
    raise RuntimeError(f"❌ Failed to load BLIP model: {str(e)}")


# === TEXT EXTRACTION (OCR) ===
def extract_text_from_image_base64(image_base64: str) -> str:
    """Extract text from a base64-encoded image."""
    try:
        image_data = base64.b64decode(image_base64)
        image = Image.open(BytesIO(image_data))
        return pytesseract.image_to_string(image).strip()
    except Exception as e:
        return f"❌ OCR Error (base64): {str(e)}"


def extract_text_from_image_path(image_path: str) -> str:
    """Extract text from an image file path."""
    try:
        image = Image.open(image_path)
        return pytesseract.image_to_string(image).strip()
    except Exception as e:
        return f"❌ OCR Error (path): {str(e)}"


def extract_text_from_image_bytes(image_bytes: bytes) -> str:
    """Extract text from raw image bytes (e.g., file uploads)."""
    try:
        image = Image.open(BytesIO(image_bytes))
        return pytesseract.image_to_string(image).strip()
    except Exception as e:
        return f"❌ OCR Error (bytes): {str(e)}"


def extract_text_from_image(image_base64: str) -> str:
    """API alias for default OCR from base64 input."""
    return extract_text_from_image_base64(image_base64)


# === IMAGE CAPTIONING ===
def caption_image(image: Image.Image) -> str:
    """Generate a caption from a PIL image object."""
    try:
        inputs = blip_processor(image.convert("RGB"), return_tensors="pt").to(device)
        outputs = blip_model.generate(**inputs)
        return blip_processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"❌ Captioning Error: {str(e)}"


def caption_image_path(image_path: str) -> str:
    """Generate a caption from an image file path."""
    try:
        image = Image.open(image_path)
        return caption_image(image)
    except Exception as e:
        return f"❌ Captioning Error (path): {str(e)}"


def caption_image_bytes(image_bytes: bytes) -> str:
    """Generate a caption from image bytes."""
    try:
        image = Image.open(BytesIO(image_bytes))
        return caption_image(image)
    except Exception as e:
        return f"❌ Captioning Error (bytes): {str(e)}"


def describe_image(input_data: Union[str, bytes]) -> str:
    """Unified captioning API: accepts either a file path or raw bytes."""
    try:
        if isinstance(input_data, bytes):
            return caption_image_bytes(input_data)
        elif isinstance(input_data, str):
            return caption_image_path(input_data)
        else:
            return "❌ Unsupported input type for describe_image"
    except Exception as e:
        return f"❌ Description Error: {str(e)}"


# === AUDIO TRANSCRIPTION ===
def transcribe_audio_bytes(audio_bytes: bytes) -> str:
    """Transcribe raw audio bytes using Whisper."""
    try:
        # Whisper expects a file path, so write the bytes to a temporary file
        # (portable alternative to a hardcoded /tmp path) and remove it afterwards.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp.write(audio_bytes)
            temp_path = tmp.name
        try:
            result = whisper_model.transcribe(temp_path)
        finally:
            os.remove(temp_path)
        return result.get("text", "").strip()
    except Exception as e:
        return f"❌ Transcription Error: {str(e)}"


def transcribe_audio_path(audio_path: str) -> str:
    """Transcribe an audio file using Whisper."""
    try:
        result = whisper_model.transcribe(audio_path)
        return result.get("text", "").strip()
    except Exception as e:
        return f"❌ Transcription Error (path): {str(e)}"