Spaces:

sahalhes
/

vqa

Sleeping

App Files Files Community

sahalhes committed on Jul 20

Commit

5292153

1 Parent(s): 348528a

k

Browse files

Files changed (1) hide show

app.py +315 -0

app.py ADDED Viewed

	@@ -0,0 +1,315 @@

import logging
from io import BytesIO
from typing import Any, Optional

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering
from transformers import Blip2Processor, Blip2ForConditionalGeneration
# Set up logging.
# basicConfig installs a root handler at INFO level so the progress messages
# emitted while loading models and the errors logged during inference are
# actually shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by everything in this file.
logger = logging.getLogger(__name__)
class VQAApp:
    """Answer natural-language questions about images with BLIP-2 or BLIP.

    The constructor selects CUDA when it is available and then eagerly loads
    both models and their processors into ``self.models`` and
    ``self.processors`` under the keys ``"blip2"`` and ``"blip"``.
    """

    # Seconds to wait for a remote image, so an unresponsive host cannot
    # block a request indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self) -> None:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info("Using device: %s", self.device)
        # Registries keyed by the short model name ("blip2" / "blip").
        self.models = {}
        self.processors = {}
        # Model used by batch_qa when the caller does not name one.
        self.current_model = "blip2"
        self.load_models()

    def load_models(self) -> None:
        """Load every supported model and processor onto ``self.device``.

        Raises:
            Exception: Whatever the underlying load raised. The failure is
                logged with its traceback and re-raised unchanged.
        """
        try:
            # BLIP-2 (recommended for best performance).
            logger.info("Loading BLIP-2 model...")
            self.processors["blip2"] = Blip2Processor.from_pretrained(
                "Salesforce/blip2-opt-2.7b"
            )
            # Half precision is only used on the GPU; CPU stays in float32.
            blip2_dtype = torch.float16 if self.device.type == "cuda" else torch.float32
            self.models["blip2"] = Blip2ForConditionalGeneration.from_pretrained(
                "Salesforce/blip2-opt-2.7b",
                torch_dtype=blip2_dtype,
            ).to(self.device)

            # Original BLIP (faster but less accurate).
            logger.info("Loading BLIP model...")
            self.processors["blip"] = BlipProcessor.from_pretrained(
                "Salesforce/blip-vqa-base"
            )
            self.models["blip"] = BlipForQuestionAnswering.from_pretrained(
                "Salesforce/blip-vqa-base"
            ).to(self.device)
            logger.info("All models loaded successfully!")
        except Exception as e:
            logger.exception("Error loading models: %s", e)
            # Bare raise keeps the original traceback intact.
            raise

    def _load_image(self, image: Any) -> Any:
        """Return *image* as an RGB PIL image.

        Accepts a PIL image, an http(s) URL, a local file path, or anything
        ``Image.fromarray`` understands (e.g. a NumPy array).
        """
        if isinstance(image, Image.Image):
            return image if image.mode == "RGB" else image.convert("RGB")
        if isinstance(image, str):
            # Match the full scheme so a local file named e.g. "httpfoo.png"
            # is not mistaken for a URL.
            if image.startswith(("http://", "https://")):
                response = requests.get(image, timeout=self.REQUEST_TIMEOUT)
                # Fail with a clear HTTP error instead of letting PIL choke
                # on an HTML error page.
                response.raise_for_status()
                return Image.open(BytesIO(response.content)).convert("RGB")
            # convert() returns a fully loaded copy, so the file handle can
            # be closed as soon as it is done.
            with Image.open(image) as opened:
                return opened.convert("RGB")
        return Image.fromarray(image).convert("RGB")

    def answer_question(
        self,
        image: Any,
        question: str,
        model_choice: str = "blip2",
        max_length: int = 50,
    ) -> str:
        """Answer a question about an image using the selected model.

        Args:
            image: PIL image, array, local path, or http(s) URL.
            question: Question about the image.
            model_choice: Model to use ("blip2" or "blip").
            max_length: Maximum length of the generated answer, in tokens.

        Returns:
            The answer, or a human-readable message when the input is
            invalid or inference fails. This method never raises for
            inference errors, because its result is shown directly in the UI.
        """
        # Input validation is kept outside the try block: these are user
        # mistakes, not failures, and should not be logged as errors.
        if image is None:
            return "Please upload an image first."
        if not question or not question.strip():
            return "Please ask a question about the image."
        if model_choice not in self.models:
            available = ", ".join(sorted(self.models))
            return f"Unknown model: {model_choice!r}. Choose one of: {available}."

        try:
            image = self._load_image(image)
            model = self.models[model_choice]
            processor = self.processors[model_choice]
            question = question.strip()
            # The UI may hand over a float; generate() needs an int.
            max_tokens = int(max_length)

            if model_choice == "blip2":
                # BLIP-2 (OPT) continues a text prompt; the documented VQA
                # format is "Question: ... Answer:".
                prompt = f"Question: {question} Answer:"
                # Cast floating-point inputs to the model's dtype; otherwise
                # float32 pixel values meet float16 weights on the GPU.
                inputs = processor(
                    images=image, text=prompt, return_tensors="pt"
                ).to(self.device, model.dtype)
                with torch.no_grad():
                    generated_ids = model.generate(
                        **inputs,
                        # Budget only the answer; max_length would also
                        # count the prompt tokens.
                        max_new_tokens=max_tokens,
                        num_beams=5,
                        temperature=0.7,
                        do_sample=True,
                        top_p=0.9,
                    )
                answer = processor.decode(
                    generated_ids[0], skip_special_tokens=True
                ).strip()
                # Depending on the transformers version the decoded text may
                # echo the prompt; keep only the answer part.
                if answer.startswith(prompt):
                    answer = answer[len(prompt):]
            else:  # blip
                inputs = processor(
                    images=image, text=question, return_tensors="pt"
                ).to(self.device, model.dtype)
                with torch.no_grad():
                    outputs = model.generate(
                        **inputs, max_length=max_tokens, num_beams=5
                    )
                answer = processor.decode(outputs[0], skip_special_tokens=True)
            return answer.strip()
        except Exception as e:
            # UI boundary: report the failure as text rather than crashing.
            logger.exception("Error in answer_question: %s", e)
            return f"Error processing question: {e}"

    def batch_qa(
        self,
        image: Any,
        questions_text: str,
        model_choice: Optional[str] = None,
    ) -> str:
        """Answer multiple questions about the same image.

        Args:
            image: Image in any form accepted by ``answer_question``.
            questions_text: Questions separated by newlines; blank lines are
                ignored.
            model_choice: Model to use. Defaults to ``self.current_model``
                when omitted, which keeps existing two-argument callers
                working.

        Returns:
            "Q<i>: ..." / "A<i>: ..." pairs, each pair followed by an empty
            line, or a prompt message when the input is incomplete.
        """
        if not questions_text or not questions_text.strip():
            return "Please enter questions (one per line)."
        if image is None:
            # Say it once instead of repeating it for every question.
            return "Please upload an image first."

        # Resolve the model once so every question in the batch uses the
        # same one, even if current_model changes while the batch runs.
        model_name = model_choice or self.current_model
        questions = [q.strip() for q in questions_text.split("\n") if q.strip()]

        lines = []
        for index, question in enumerate(questions, 1):
            answer = self.answer_question(image, question, model_name)
            lines += [f"Q{index}: {question}", f"A{index}: {answer}", ""]
        return "\n".join(lines)
def create_gradio_interface() -> "gr.Blocks":
    """Create the Gradio interface for the VQA app.

    Instantiates ``VQAApp`` (which loads the models), lays out three tabs
    (single question, multiple questions, sample images) and wires the
    buttons to the app.

    Returns:
        The assembled ``gr.Blocks``; the caller is expected to ``launch()`` it.
    """
    vqa_app = VQAApp()

    # Sample images for the demo tab.
    sample_images = [
        "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
        "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/lena.png"
    ]
    example_questions = [
        "What objects are in this image?",
        "What color is the main subject?",
        "How many people are in the image?",
        "What is the setting or location?",
        "What activity is taking place?",
        "What's the weather like in this image?"
    ]
    examples_per_row = 3

    def answer_batch(image, questions_text, choice):
        """Run the batch with the model chosen on the batch tab itself."""
        # batch_qa takes its model from vqa_app.current_model. Setting it
        # here, right before the call, ties the batch to this tab's
        # dropdown instead of to whichever dropdown was touched last.
        vqa_app.current_model = choice
        return vqa_app.batch_qa(image, questions_text)

    with gr.Blocks(title="Visual Question Answering App", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🔍 Visual Question Answering App
        Upload an image and ask questions about its content! This app uses state-of-the-art multimodal models
        from Hugging Face to understand and answer questions about images.
        **Models available:**
        - **BLIP-2**: Advanced model with better understanding (recommended)
        - **BLIP**: Faster model for quick answers
        """)
        with gr.Tab("Single Question"):
            with gr.Row():
                with gr.Column(scale=1):
                    image_input = gr.Image(
                        label="Upload Image",
                        type="pil",
                        height=300
                    )
                    model_choice = gr.Dropdown(
                        choices=["blip2", "blip"],
                        value="blip2",
                        label="Choose Model",
                        info="BLIP-2 is more accurate but slower"
                    )
                    max_length_slider = gr.Slider(
                        minimum=10,
                        maximum=100,
                        value=50,
                        step=5,
                        label="Max Answer Length"
                    )
                with gr.Column(scale=1):
                    question_input = gr.Textbox(
                        label="Ask a question about the image",
                        placeholder="What do you see in this image?",
                        lines=3
                    )
                    answer_button = gr.Button("Get Answer", variant="primary", size="lg")
                    answer_output = gr.Textbox(
                        label="Answer",
                        lines=5,
                        interactive=False
                    )
            # Example questions: one button each, laid out in rows. Clicking
            # a button copies its text into the question box.
            gr.Markdown("### Example Questions:")
            for start in range(0, len(example_questions), examples_per_row):
                with gr.Row():
                    for example in example_questions[start:start + examples_per_row]:
                        # Bind the text as a default argument so each button
                        # keeps its own question (avoids late binding).
                        gr.Button(example, size="sm").click(
                            lambda q=example: q, outputs=question_input
                        )
        with gr.Tab("Multiple Questions"):
            with gr.Row():
                with gr.Column(scale=1):
                    batch_image_input = gr.Image(
                        label="Upload Image",
                        type="pil",
                        height=300
                    )
                    batch_model_choice = gr.Dropdown(
                        choices=["blip2", "blip"],
                        value="blip2",
                        label="Choose Model"
                    )
                with gr.Column(scale=1):
                    batch_questions_input = gr.Textbox(
                        label="Questions (one per line)",
                        placeholder="What do you see?\nHow many objects are there?\nWhat color is dominant?",
                        lines=6
                    )
                    batch_button = gr.Button("Answer All Questions", variant="primary")
                    batch_output = gr.Textbox(
                        label="Questions & Answers",
                        lines=10,
                        interactive=False
                    )
        with gr.Tab("Sample Images"):
            gr.Markdown("### Try these sample images:")
            with gr.Row():
                for img_url in sample_images:
                    with gr.Column():
                        gr.Image(value=img_url, label="Sample Image")
                        # Sends the URL to the image box on the
                        # "Single Question" tab.
                        gr.Button("Use This Image").click(
                            lambda x=img_url: x,
                            outputs=image_input
                        )
        # Event handlers. Each tab passes its own dropdown value with the
        # request, so no dropdown "change" listeners are needed.
        answer_button.click(
            vqa_app.answer_question,
            inputs=[image_input, question_input, model_choice, max_length_slider],
            outputs=answer_output
        )
        batch_button.click(
            answer_batch,
            inputs=[batch_image_input, batch_questions_input, batch_model_choice],
            outputs=batch_output
        )
        gr.Markdown("""
        ### Tips for better results:
        - Use clear, specific questions
        - BLIP-2 works better for complex reasoning
        - Try different phrasings if you don't get good results
        - Upload high-quality images for best performance
        """)
    return demo
# Alternative standalone functions for direct usage
def simple_vqa(image_path: Any, question: str, model_name: str = "blip2") -> str:
    """Answer one question about one image without building the UI.

    Args:
        image_path: Local file path, http(s) URL, or an already loaded image.
        question: Question about the image.
        model_name: Model to use ("blip2" or "blip").

    Returns:
        The string produced by ``VQAApp.answer_question``.

    Raises:
        OSError: If a local path cannot be opened as an image (this includes
            ``FileNotFoundError``).
    """
    # Constructing VQAApp loads the models, so build it once and reuse it
    # for every later call instead of reloading per question.
    app = getattr(simple_vqa, "_app", None)
    if app is None:
        app = simple_vqa._app = VQAApp()

    is_local_path = isinstance(image_path, str) and not image_path.startswith(
        ("http://", "https://")
    )
    if is_local_path:
        # Open local files here so a bad path still raises to the caller.
        # convert() returns a loaded copy, so the handle can be closed.
        with Image.open(image_path) as opened:
            image = opened.convert("RGB")
    else:
        # URLs and in-memory images are handled by answer_question itself.
        image = image_path
    return app.answer_question(image, question, model_name)
# Script entry point: runs only when this file is executed directly, not when
# it is imported.
if __name__ == "__main__":
    # Building the interface also constructs VQAApp, which loads the models.
    demo = create_gradio_interface()
    # Start the Gradio server with its default settings.
    demo.launch()