khurrameycon committed on
Commit 71c9483 · verified · 1 Parent(s): fa13ed8
Files changed (1)
  1. app.py +72 -0
app.py ADDED
@@ -0,0 +1,72 @@
+ import gradio as gr
+ import os
+ import torch
+ from transformers import AutoProcessor, MllamaForConditionalGeneration
+ from PIL import Image
+ import spaces
+
+ # Check if we're running in a Hugging Face Space and if SPACES_ZERO_GPU is enabled
+ IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
+ IS_SPACE = os.environ.get("SPACE_ID", None) is not None
+
+ # Determine the device (GPU if available, else CPU)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
+
+ print(f"Using device: {device}")
+ print(f"Low memory mode: {LOW_MEMORY}")
+
+ # Get the Hugging Face token from environment variables
+ HF_TOKEN = os.environ.get('HF_TOKEN')
+
+ # Load the model and processor
+ model_name = "ruslanmv/Llama-3.2-11B-Vision-Instruct"
+ model = MllamaForConditionalGeneration.from_pretrained(
+     model_name,
+     token=HF_TOKEN,  # `use_auth_token` is deprecated in recent transformers; `token` is the current kwarg
+     torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
+     device_map="auto" if device == "cuda" else None,  # Use device mapping if CUDA is available
+ )
+
+ # device_map="auto" already places the model on the GPU when CUDA is available;
+ # calling .to() on a dispatched model can fail, so only move it explicitly on CPU
+ if device == "cpu":
+     model.to(device)
+ processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN)
+
+ @spaces.GPU  # Use the free GPU provided by Hugging Face Spaces
+ def predict(image, text):
+     # Prepare the input messages
+     messages = [
+         {"role": "user", "content": [
+             {"type": "image"},  # Specify that an image is provided
+             {"type": "text", "text": text}  # Add the user-provided text input
+         ]}
+     ]
+
+     # Create the input text using the processor's chat template
+     input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+     # Process the inputs and move to the appropriate device
+     # add_special_tokens=False avoids duplicating tokens already added by the chat template
+     inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(device)
+
+     # Generate a response from the model
+     outputs = model.generate(**inputs, max_new_tokens=100)
+
+     # Decode only the newly generated tokens so the prompt is not echoed back in the output
+     generated_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
+     response = processor.decode(generated_tokens, skip_special_tokens=True)
+     return response
+
+ # Define the Gradio interface
+ interface = gr.Interface(
+     fn=predict,
+     inputs=[
+         gr.Image(type="pil", label="Image Input"),  # Image input with label
+         gr.Textbox(label="Text Input")  # Textbox input with label
+     ],
+     outputs=gr.Textbox(label="Generated Response"),  # Output with a more descriptive label
+     title="Llama 3.2 11B Vision Instruct Demo",  # Title of the interface
+     description="This demo uses Meta's Llama 3.2 11B Vision model to generate responses based on an image and text input.",  # Short description
+     theme="compact"  # Using a compact theme for a cleaner look
+ )
+
+ # Launch the interface
+ interface.launch(debug=True)
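Once this file is deployed as a Space, the interface can also be queried programmatically with gradio_client. A minimal sketch follows; the Space id and image path are placeholders for illustration, not values from this commit:

from gradio_client import Client, handle_file

client = Client("your-username/your-space-name")  # hypothetical Space id
result = client.predict(
    handle_file("example.jpg"),               # hypothetical local image file
    "Describe this image in one sentence.",   # text prompt
    api_name="/predict"                       # default endpoint name for a gr.Interface
)
print(result)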