josephtran04 committed
Commit 91efc56 · verified · 1 Parent(s): ae7ad61

Upload 3 files

Files changed (3)
  1. LICENSE +21 -0
  2. app.py +41 -0
  3. requirements.txt +6 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Trần Minh Phát
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,41 @@
+ # This script creates a simple web application using Gradio to generate answers for Visual Question Answering (VQA) with the BLIP model from Hugging Face's Transformers library.
+ # Import the necessary libraries
+ import gradio as gr
+ import numpy as np
+ from PIL import Image
+ from transformers import BlipProcessor, BlipForQuestionAnswering
+
+ # Load the BLIP processor and model
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+ model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
+
+ # Define the function for Visual Question Answering
+ def VQA(input_image: np.ndarray, question: str) -> str:
+     # Convert the numpy array to a PIL Image in RGB mode
+     raw_image = Image.fromarray(input_image).convert('RGB')
+
+     # Prepare the inputs for the model
+     inputs = processor(raw_image, question, return_tensors="pt")
+
+     # Generate the answer tokens with the model
+     outputs = model.generate(**inputs, max_length=100)
+
+     # Decode the generated tokens to text
+     answer = processor.decode(outputs[0], skip_special_tokens=True)
+
+     return answer
+
+ # Create the Gradio interface
+ iface = gr.Interface(
+     fn=VQA,
+     inputs=[
+         gr.Image(label="Input image:"),
+         gr.Textbox(label="Question:", placeholder="Type your question here...")
+     ],
+     outputs="text",
+     title="Visual Question Answering",
+     description="This is a simple web app for VQA using the BLIP model from Salesforce.\nUpload the image file:"
+ )
+
+ # Launch the Gradio app
+ iface.launch()
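
For a quick local smoke test, the VQA function can also be called directly, bypassing the Gradio UI. A minimal sketch, assuming an image file named demo.jpg (a hypothetical filename, not part of this commit) sits next to app.py:

# Minimal sketch: query BLIP directly through VQA(), skipping the UI.
# "demo.jpg" is a hypothetical example file, not part of this commit.
import numpy as np
from PIL import Image

img = np.array(Image.open("demo.jpg"))  # same ndarray Gradio would pass to VQA
print(VQA(img, "What is in the picture?"))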
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ langchain==0.1.11
+ gradio==5.23.2
+ transformers==4.38.2
+ bs4==0.0.2
+ requests==2.31.0
+ torch==2.2.1
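
To reproduce the app locally, the pinned dependencies can be installed with pip install -r requirements.txt, after which python app.py launches the Gradio interface on the default local port.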