Ujeshhh committed on
Commit 97e1f50 · verified · 1 Parent(s): 2747657

Update app.py

Files changed (1)
  1. app.py +39 -50
app.py CHANGED
@@ -1,54 +1,43 @@
- from transformers import BlipProcessor, BlipForConditionalGeneration
- from transformers import MarianMTModel, MarianTokenizer
  import gradio as gr

- # Load BLIP model for image captioning
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
- model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-
- # Load MarianMT model for translation (English to Tamil)
- translation_model_name = "Helsinki-NLP/opus-mt-en-ta"
- translator_model = MarianMTModel.from_pretrained(translation_model_name)
- translator_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
-
- import numpy as np
- from PIL import Image
- import torch

  def generate_caption(image):
-     # Ensure the image is in PIL format (Gradio should handle this, but let's explicitly ensure it)
-     if isinstance(image, Image.Image) is False:
-         image = Image.open(image)
-
-     # Resize the image to the expected size (e.g., 384x384)
-     image = image.resize((384, 384))
-
-     # Convert image to numpy array with float32 dtype
-     image_array = np.array(image).astype(np.float32)
-
-     # Normalize the image (if needed)
-     image_array /= 255.0  # Normalize to [0, 1]
-
-     # Convert the numpy array to a tensor, explicitly specifying dtype as float32
-     image_tensor = torch.tensor(image_array, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0)  # [1, C, H, W]
-
-     # Generate caption from image
-     inputs = processor(images=image_tensor, return_tensors="pt", padding=True)  # Added padding=True
-     out = model.generate(**inputs)
-     caption = processor.decode(out[0], skip_special_tokens=True)
-
-     # Translate caption to Tamil
-     translated = translator_tokenizer(caption, return_tensors="pt", padding=True)
-     translated_text = translator_model.generate(**translated)
-     translation = translator_tokenizer.decode(translated_text[0], skip_special_tokens=True)
-
-     return caption, translation
-
- # Gradio interface
- interface = gr.Interface(fn=generate_caption,
-                          inputs=gr.Image(type="pil"),
-                          outputs=[gr.Textbox(label="Caption in English"),
-                                   gr.Textbox(label="Caption in Tamil")])
-
- # Launch the Gradio app
- interface.launch()
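For reference, here is a minimal sketch of the BLIP + MarianMT pipeline the removed code implements, assuming the same checkpoints. Passing the PIL image straight to BlipProcessor lets it handle resizing, rescaling, and normalization itself, so the manual numpy/torch preprocessing above is not needed; the helper name caption_and_translate and max_new_tokens=50 are illustrative choices, not part of the original file.

from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer
from PIL import Image

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

translator_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ta")
translator_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ta")

def caption_and_translate(image: Image.Image):
    # The processor turns the raw PIL image into pixel_values (resize, rescale, normalize).
    inputs = processor(images=image, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=50)  # generation cap is an illustrative choice
    caption = processor.decode(out[0], skip_special_tokens=True)

    # Translate the English caption to Tamil with the MarianMT checkpoint.
    batch = translator_tokenizer([caption], return_tensors="pt", padding=True)
    translated = translator_model.generate(**batch)
    tamil = translator_tokenizer.decode(translated[0], skip_special_tokens=True)
    return caption, tamil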
 
+ import openai
  import gradio as gr
+ import os
+ import base64

+ # Set your OpenAI API Key
+ openai.api_key = "YOUR_OPENAI_API_KEY"  # Replace with your API key

  def generate_caption(image):
+     """Generate a caption for the uploaded image using OpenAI's GPT-4 Vision API."""
+     # Read the uploaded file and base64-encode it so it can be sent as a data URL.
+     with open(image, "rb") as img_file:
+         base64_image = base64.b64encode(img_file.read()).decode("utf-8")
+
+     response = openai.ChatCompletion.create(
+         model="gpt-4-vision-preview",
+         messages=[
+             {
+                 "role": "system",
+                 "content": "You are an AI assistant that describes images accurately."
+             },
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": "Describe the contents of this image in detail."},
+                     {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
+                 ],
+             },
+         ],
+         max_tokens=100
+     )
+
+     caption = response["choices"][0]["message"]["content"]
+     return caption
+
+ # Gradio UI
+ iface = gr.Interface(
+     fn=generate_caption,
+     inputs=gr.Image(type="filepath"),
+     outputs="text",
+     title="Image Captioning App",
+     description="Upload an image, and the AI will generate a descriptive caption."
+ )
+
+ # Run the app
+ if __name__ == "__main__":
+     iface.launch()
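The added code targets the legacy openai<1.0 Python SDK (openai.ChatCompletion and dict-style responses). Below is a minimal sketch of the same captioning call against the current SDK (openai>=1.0); the model name "gpt-4o", the OPENAI_API_KEY environment variable, and the helper name generate_caption_v1 are assumptions for illustration, not part of this commit.

import base64
from openai import OpenAI

client = OpenAI()  # assumption: reads OPENAI_API_KEY from the environment

def generate_caption_v1(image_path):
    # Base64-encode the image so it can be sent inline as a data URL.
    with open(image_path, "rb") as img_file:
        b64 = base64.b64encode(img_file.read()).decode("utf-8")

    response = client.chat.completions.create(
        model="gpt-4o",  # assumption: any vision-capable model available to your key
        messages=[
            {"role": "system", "content": "You are an AI assistant that describes images accurately."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe the contents of this image in detail."},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
                ],
            },
        ],
        max_tokens=100,
    )
    return response.choices[0].message.content

The Gradio wiring is unchanged either way: gr.Interface(fn=generate_caption_v1, inputs=gr.Image(type="filepath"), outputs="text") passes the uploaded file path to the function and displays the returned caption.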