Spaces:

awacke1
/

Flamingo-Gradio-ImageDescribe

Runtime error

awacke1 committed on Oct 3, 2024

Commit

eeebcf5

verified ·

1 Parent(s): c91a3e7

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,21 +1,21 @@
 import os
 import gradio as gr
 import torch
-import PIL
-from transformers import AutoProcessor, AutoModelForCausalLM  # Using AutoModel classes
 EXAMPLES_DIR = 'examples'
 DEFAULT_PROMPT = "<image>"
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-# Load model using AutoModel with trust_remote_code=True
-model = AutoModelForCausalLM.from_pretrained('dhansmair/flamingo-mini', trust_remote_code=True)
 model.to(device)
 model.eval()
-# Initialize processor without the `device` argument
-processor = AutoProcessor.from_pretrained('dhansmair/flamingo-mini')
 # Setup some example images
 examples = []
@@ -28,14 +28,12 @@ if os.path.isdir(EXAMPLES_DIR):
 def predict_caption(image, prompt):
     assert isinstance(prompt, str)
-    # Process the image using the model
-    caption = model.generate(
-        processor(images=image, prompt=prompt),  # Pass processed inputs to the model
-        max_length=50
-    )
-    if isinstance(caption, list):
-        caption = caption[0]
     return caption

 import os
 import gradio as gr
 import torch
+from transformers import Blip2Processor, Blip2ForConditionalGeneration
+from PIL import Image  # PIL should be imported separately for image handling
 EXAMPLES_DIR = 'examples'
 DEFAULT_PROMPT = "<image>"
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# Load the BLIP2 model in float16 with automatic device placement (device_map="auto")
+model = Blip2ForConditionalGeneration.from_pretrained('Salesforce/blip2-flan-t5-xl', device_map="auto", torch_dtype=torch.float16)
 model.to(device)
 model.eval()
+# Initialize processor
+processor = Blip2Processor.from_pretrained('Salesforce/blip2-flan-t5-xl')
 # Setup some example images
 examples = []
 def predict_caption(image, prompt):
     assert isinstance(prompt, str)
+    # Convert the PIL image to the format expected by the processor
+    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
+    # Generate the caption
+    generated_ids = model.generate(**inputs, max_length=50)
+    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
     return caption