import gradio as gr
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
import os

# Determine the device to use.
# os.environ.get allows a device override from the Space hardware config if needed;
# defaults to CUDA if available, else CPU.
device_choice = os.environ.get("DEVICE", "auto")
if device_choice == "auto":
    device = "cuda" if torch.cuda.is_available() else "cpu"
else:
    device = device_choice

print(f"Using device: {device}")
# Load the model and processor.
model_id = "lusxvr/nanoVLM-222M"
try:
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForVision2Seq.from_pretrained(model_id).to(device)
    print("Model and processor loaded successfully.")
except Exception as e:
    print(f"Error loading model/processor: {e}")
    # If loading fails, the Gradio app will display an error instead of crashing.
    # This helps with debugging if the Space doesn't start correctly.
    processor = None
    model = None
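# Memory-saving variant (a sketch, not in the original code): on a GPU Space you
# could load the weights in half precision via the standard from_pretrained
# argument torch_dtype, e.g.
#   model = AutoModelForVision2Seq.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
# Whether fp16 helps for this particular model is untested here.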
def generate_text_for_image(image_input, prompt_input):
    """
    Generates text based on an image and a text prompt.
    """
    if model is None or processor is None:
        return "Error: Model or processor not loaded. Check the Space logs for details."
    if image_input is None:
        return "Please upload an image."
    if not prompt_input:
        return "Please provide a prompt (e.g., 'Describe this image' or 'What color is the car?')."
    try:
        # Ensure the image is a PIL image in RGB mode.
        if not isinstance(image_input, Image.Image):
            pil_image = Image.fromarray(image_input)
        else:
            pil_image = image_input
        if pil_image.mode != "RGB":
            pil_image = pil_image.convert("RGB")

        # Prepare inputs for the model.
        # The prompt for nanoVLM is typically a question or an instruction.
        inputs = processor(text=[prompt_input], images=[pil_image], return_tensors="pt").to(device)

        # Generate text. You can adjust max_new_tokens, temperature, top_k, etc.
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=150,      # upper bound on the response length
            num_beams=3,             # beam search for more coherent output
            no_repeat_ngram_size=2,  # avoid repeating 2-grams
            early_stopping=True,
        )
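        # Sampling-based alternative (a sketch, not from the original code): swap
        # the beam-search arguments above for something like
        #   model.generate(**inputs, max_new_tokens=150, do_sample=True,
        #                  temperature=0.7, top_k=50)
        # if you prefer more varied output over deterministic beams.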
        # Decode the generated tokens.
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # The output may include the prompt itself, depending on the model.
        # Simple heuristic: strip the prompt if it appears at the beginning.
        if generated_text.startswith(prompt_input):
            cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
        else:
            cleaned_text = generated_text
        return cleaned_text.strip()
    except Exception as e:
        print(f"Error during generation: {e}")
        return f"An error occurred: {str(e)}"
# Create the Gradio interface.
description = """
Upload an image and provide a text prompt (e.g., "What is in this image?", "Describe the animal in detail.").
The model will generate a textual response based on the visual content and your query.
This Space uses the `lusxvr/nanoVLM-222M` model.
"""

# Example image from the COCO dataset (a cat and a remote).
example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
iface = gr.Interface(
    fn=generate_text_for_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Your Prompt/Question", info="e.g., 'What is this a picture of?', 'Describe the main subject.', 'How many animals are there?'"),
    ],
    outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
    title="Interactive nanoVLM-222M Demo",
    description=description,
    examples=[
        [example_image_url, "a photo of a"],
        [example_image_url, "Describe the image in detail."],
        [example_image_url, "What objects are on the sofa?"],
    ],
    cache_examples=True,     # cache example results so they load faster
    allow_flagging="never",  # disable flagging
    # Or, to keep flagging, point it at a writable directory instead
    # (ensure GRADIO_FLAGGING_DIR is set, e.g. in the Dockerfile):
    # flagging_dir=os.environ.get("GRADIO_FLAGGING_DIR"),
)
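# Optional (not in the original code): enable Gradio's request queue so concurrent
# users don't block each other during generation, e.g. iface.queue().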
if __name__ == "__main__":
    # For Hugging Face Spaces, it's common to launch with server_name="0.0.0.0";
    # the Space infrastructure handles the public URL and port mapping.
    iface.launch(server_name="0.0.0.0", server_port=7860)