Spaces:

sahalhes
/

vqa

Sleeping

App Files Files Community

sahalhes committed on Jul 20

Commit

a37e4d7

1 Parent(s): 5292153

k

Browse files

Files changed (1) hide show

app.py +27 -303

app.py CHANGED Viewed

@@ -1,315 +1,39 @@
 import gradio as gr
-import torch
 from PIL import Image
 from transformers import BlipProcessor, BlipForQuestionAnswering
-from transformers import Blip2Processor, Blip2ForConditionalGeneration
-import requests
-from io import BytesIO
-import logging
-# Set up logging: INFO level via basicConfig, plus a module-level logger
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)  # used by VQAApp for progress and error messages
-class VQAApp:  # Holds the BLIP-2 and BLIP models/processors and answers questions about an image
-    def __init__(self):  # Picks the device, then loads every model through load_models()
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        logger.info(f"Using device: {self.device}")
-        # Initialize the per-model registries
-        self.models = {}  # model key ("blip2" / "blip") -> loaded model
-        self.processors = {}  # model key ("blip2" / "blip") -> matching processor
-        self.current_model = "blip2"  # model key that batch_qa() passes to answer_question()
-        # Load models (runs during construction)
-        self.load_models()
-    def load_models(self):
-        """Load all available VQA models"""
-        try:
-            # BLIP-2 (Recommended for best performance)
-            logger.info("Loading BLIP-2 model...")
-            self.processors["blip2"] = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
-            self.models["blip2"] = Blip2ForConditionalGeneration.from_pretrained(
-                "Salesforce/blip2-opt-2.7b",
-                torch_dtype=torch.float16 if self.device.type == "cuda" else torch.float32  # half precision only on CUDA
-            ).to(self.device)
-            # Original BLIP (Faster but less accurate)
-            logger.info("Loading BLIP model...")
-            self.processors["blip"] = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-            self.models["blip"] = BlipForQuestionAnswering.from_pretrained(
-                "Salesforce/blip-vqa-base"
-            ).to(self.device)
-            logger.info("All models loaded successfully!")
-        except Exception as e:
-            logger.error(f"Error loading models: {str(e)}")
-            raise e  # logged above, then propagated so construction fails
-    def answer_question(self, image, question, model_choice="blip2", max_length=50):
-        """
-        Answer a question about an image using the selected model
-        Args:
-            image: PIL Image or path to image
-            question: String question about the image
-            model_choice: Model to use ("blip2" or "blip")
-            max_length: Maximum length of generated answer
-        Returns:
-            String answer to the question
-        """
-        try:
-            if image is None:
-                return "Please upload an image first."
-            if not question.strip():
-                return "Please ask a question about the image."
-            # Ensure image is a PIL Image in RGB mode
-            if isinstance(image, str):
-                if image.startswith('http'):
-                    response = requests.get(image)  # NOTE(review): no timeout and no status check on this download
-                    image = Image.open(BytesIO(response.content)).convert('RGB')
-                else:
-                    image = Image.open(image).convert('RGB')  # any other string is opened as a file path
-            elif not isinstance(image, Image.Image):
-                image = Image.fromarray(image).convert('RGB')  # assumes an array-like image (e.g. NumPy) - TODO confirm
-            # Get model and processor
-            model = self.models[model_choice]  # an unknown key raises KeyError, handled by the except below
-            processor = self.processors[model_choice]
-            if model_choice == "blip2":
-                # BLIP-2 processing
-                inputs = processor(image, question, return_tensors="pt").to(self.device)  # NOTE(review): moved to the device but not cast; model is float16 on CUDA - confirm dtypes match
-                with torch.no_grad():
-                    generated_ids = model.generate(
-                        **inputs,
-                        max_length=max_length,
-                        num_beams=5,
-                        temperature=0.7,
-                        do_sample=True,  # sampling is enabled, so answers may differ between calls
-                        top_p=0.9
-                    )
-                answer = processor.decode(generated_ids[0], skip_special_tokens=True)
-            else:  # blip
-                # Original BLIP processing
-                inputs = processor(image, question, return_tensors="pt").to(self.device)
-                with torch.no_grad():
-                    outputs = model.generate(**inputs, max_length=max_length, num_beams=5)
-                answer = processor.decode(outputs[0], skip_special_tokens=True)
-            return answer.strip()
-        except Exception as e:
-            logger.error(f"Error in answer_question: {str(e)}")
-            return f"Error processing question: {str(e)}"  # errors come back as the answer text instead of being raised
-    def batch_qa(self, image, questions_text):
-        """
-        Answer multiple questions about the same image
-        Args:
-            image: PIL Image
-            questions_text: String with questions separated by newlines
-        Returns:
-            String with questions and answers
-        """
-        if not questions_text.strip():
-            return "Please enter questions (one per line)."
-        questions = [q.strip() for q in questions_text.split('\n') if q.strip()]  # blank lines are dropped
-        results = []
-        for i, question in enumerate(questions, 1):  # numbering starts at 1
-            answer = self.answer_question(image, question, self.current_model)  # shared current_model, default max_length
-            results.append(f"Q{i}: {question}")
-            results.append(f"A{i}: {answer}")
-            results.append("")  # empty entry becomes a blank line after each Q/A pair
-        return "\n".join(results)
-def create_gradio_interface():
-    """Create the Gradio interface for the VQA app"""
-    # Initialize VQA app
-    vqa_app = VQAApp()  # constructing VQAApp loads both models up front
-    # Sample images for demo
-    sample_images = [
-        "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
-        "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/lena.png"
-    ]
-    with gr.Blocks(title="Visual Question Answering App", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("""
-        # 🔍 Visual Question Answering App
-        Upload an image and ask questions about its content! This app uses state-of-the-art multimodal models
-        from Hugging Face to understand and answer questions about images.
-        **Models available:**
-        - **BLIP-2**: Advanced model with better understanding (recommended)
-        - **BLIP**: Faster model for quick answers
-        """)
-        with gr.Tab("Single Question"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    image_input = gr.Image(
-                        label="Upload Image",
-                        type="pil",
-                        height=300
-                    )
-                    model_choice = gr.Dropdown(
-                        choices=["blip2", "blip"],
-                        value="blip2",
-                        label="Choose Model",
-                        info="BLIP-2 is more accurate but slower"
-                    )
-                    max_length_slider = gr.Slider(
-                        minimum=10,
-                        maximum=100,
-                        value=50,
-                        step=5,
-                        label="Max Answer Length"
-                    )
-                with gr.Column(scale=1):
-                    question_input = gr.Textbox(
-                        label="Ask a question about the image",
-                        placeholder="What do you see in this image?",
-                        lines=3
-                    )
-                    answer_button = gr.Button("Get Answer", variant="primary", size="lg")
-                    answer_output = gr.Textbox(
-                        label="Answer",
-                        lines=5,
-                        interactive=False
-                    )
-            # Example questions, rendered as two rows of three buttons
-            gr.Markdown("### Example Questions:")
-            example_questions = [
-                "What objects are in this image?",
-                "What color is the main subject?",
-                "How many people are in the image?",
-                "What is the setting or location?",
-                "What activity is taking place?",
-                "What's the weather like in this image?"
-            ]
-            with gr.Row():
-                for i, eq in enumerate(example_questions[:3]):  # i is unused; first three examples
-                    gr.Button(eq, size="sm").click(
-                        lambda q=eq: q, outputs=question_input  # q=eq default binds this button's own question
-                    )
-            with gr.Row():
-                for i, eq in enumerate(example_questions[3:]):  # i is unused; remaining examples
-                    gr.Button(eq, size="sm").click(
-                        lambda q=eq: q, outputs=question_input  # q=eq default binds this button's own question
-                    )
-        with gr.Tab("Multiple Questions"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    batch_image_input = gr.Image(
-                        label="Upload Image",
-                        type="pil",
-                        height=300
-                    )
-                    batch_model_choice = gr.Dropdown(
-                        choices=["blip2", "blip"],
-                        value="blip2",
-                        label="Choose Model"
-                    )
-                with gr.Column(scale=1):
-                    batch_questions_input = gr.Textbox(
-                        label="Questions (one per line)",
-                        placeholder="What do you see?\nHow many objects are there?\nWhat color is dominant?",
-                        lines=6
-                    )
-                    batch_button = gr.Button("Answer All Questions", variant="primary")
-                    batch_output = gr.Textbox(
-                        label="Questions & Answers",
-                        lines=10,
-                        interactive=False
-                    )
-        with gr.Tab("Sample Images"):
-            gr.Markdown("### Try these sample images:")
-            with gr.Row():
-                for img_url in sample_images:
-                    with gr.Column():
-                        sample_img = gr.Image(value=img_url, label="Sample Image")  # sample_img is not referenced again below
-                        gr.Button("Use This Image").click(
-                            lambda x=img_url: x,  # x=img_url default binds this button's own URL
-                            outputs=image_input  # targets the image input on the "Single Question" tab
-                        )
-        # Event handlers
-        def update_model_choice(choice):
-            vqa_app.current_model = choice  # single shared attribute, written by both dropdowns
-            return choice
-        model_choice.change(update_model_choice, inputs=model_choice)  # NOTE(review): handler returns a value but no outputs= is wired - confirm intended
-        batch_model_choice.change(update_model_choice, inputs=batch_model_choice)
-        answer_button.click(
-            vqa_app.answer_question,
-            inputs=[image_input, question_input, model_choice, max_length_slider],  # same order as answer_question's parameters
-            outputs=answer_output
-        )
-        batch_button.click(
-            vqa_app.batch_qa,
-            inputs=[batch_image_input, batch_questions_input],  # no model input: batch_qa reads vqa_app.current_model
-            outputs=batch_output
-        )
-        gr.Markdown("""
-        ### Tips for better results:
-        - Use clear, specific questions
-        - BLIP-2 works better for complex reasoning
-        - Try different phrasings if you don't get good results
-        - Upload high-quality images for best performance
-        """)
-    return demo
-# Alternative standalone function for direct (non-UI) usage
-def simple_vqa(image_path, question, model_name="blip2"):  # One-shot helper: answer a single question about one image
-    vqa = VQAApp()  # builds a new VQAApp, and therefore reloads the models, on every call
-    if isinstance(image_path, str):
-        image = Image.open(image_path).convert('RGB')  # a string is opened as a file path
-    else:
-        image = image_path  # anything else is passed through unchanged
-    return vqa.answer_question(image, question, model_name)
 if __name__ == "__main__":
-    demo = create_gradio_interface()  # UI is built (and VQAApp's models loaded) only when run as a script
     demo.launch()

 import gradio as gr
 from PIL import Image
+import torch
 from transformers import BlipProcessor, BlipForQuestionAnswering
+# Load processor and small BLIP VQA model
+processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
+# Use CPU explicitly
+device = torch.device("cpu")
+model.to(device)
+# VQA function
+def answer_question(image: Image.Image, question: str) -> str:
+    # Prepare input
+    inputs = processor(image.convert("RGB"), question, return_tensors="pt").to(device)
+    # Generate answer
+    with torch.no_grad():
+        output = model.generate(**inputs)
+    # Decode answer
+    return processor.decode(output[0], skip_special_tokens=True).strip()
+# Gradio interface
+demo = gr.Interface(
+    fn=answer_question,
+    inputs=[
+        gr.Image(type="pil", label="Upload an Image"),
+        gr.Textbox(label="Ask a Question About the Image")
+    ],
+    outputs=gr.Textbox(label="Answer"),
+    title="BLIP Visual Question Answering (CPU Friendly)",
+    description="Ask a question about an image using Salesforce's BLIP VQA Base model."
+)
 if __name__ == "__main__":
     demo.launch()