File size: 3,669 Bytes
05e7387
 
 
 
 
 
 
 
106a6dd
 
 
 
 
 
 
 
05e7387
106a6dd
05e7387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106a6dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdc2a1b
 
 
106a6dd
 
 
 
 
 
 
bdc2a1b
106a6dd
 
 
bdc2a1b
106a6dd
 
 
bdc2a1b
106a6dd
 
05e7387
 
106a6dd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import gradio as gr
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
from PIL import Image
import requests
import torch
import spaces


# UI copy rendered as Markdown at the top of the demo page.
# Fixed: emoji were mojibake (UTF-8 bytes previously decoded as Latin-1) and
# the description contained a doubled comma ("capabilities, , we").
title = """  # 🙋🏻‍♂️Welcome to Tonic's🦅Falcon Vision👁️Language Model !
"""

description = """
Falcon2-11B-vlm is an 11B parameters causal decoder-only model built by TII and trained on over 5,000B tokens of RefinedWeb enhanced with curated corpora. To bring vision capabilities, we integrate the pretrained CLIP ViT-L/14 vision encoder with our Falcon2-11B chat-finetuned model and train with image-text data. For enhancing the VLM's perception of fine-grained details w.r.t small objects in images, we employ a dynamic encoding mechanism at high-resolution for image inputs. 

Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) Math 🔍 [introspector](https://huggingface.co/introspector) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [MultiTonic](https://github.com/multitonic/)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""

# Load the Falcon2-11B VLM processor and model once at import time.
# NOTE(review): the model is pinned to 'cuda:0' — this assumes a GPU is
# available at startup (e.g. a HF Spaces GPU runtime); confirm for other hosts.
processor = LlavaNextProcessor.from_pretrained("tiiuae/falcon-11B-vlm", tokenizer_class='PreTrainedTokenizerFast')
model = LlavaNextForConditionalGeneration.from_pretrained("tiiuae/falcon-11B-vlm", torch_dtype=torch.bfloat16).to('cuda:0')


@spaces.GPU
def generate_paragraph(image_url):
    """Download the image at *image_url* and generate a descriptive paragraph.

    Args:
        image_url: Publicly reachable URL of an image.

    Returns:
        The model-generated caption as a stripped string.

    Raises:
        requests.HTTPError: if the image download returns an error status.
        requests.Timeout: if the download exceeds the timeout.
    """
    # Fail fast on HTTP errors instead of handing an error page to PIL,
    # and bound the wait so a dead URL cannot hang the GPU worker.
    response = requests.get(image_url, stream=True, timeout=30)
    response.raise_for_status()
    # Normalize to RGB so palette/greyscale/RGBA images don't trip the processor.
    image = Image.open(response.raw).convert("RGB")

    instruction = 'Write a long paragraph about this picture.'
    # Prompt format expected by the Falcon VLM chat template: the <image>
    # placeholder is replaced by the processor with image tokens.
    prompt = f"User:<image>\n{instruction} Falcon:"
    inputs = processor(prompt, images=image, return_tensors="pt", padding=True).to('cuda:0')
    output = model.generate(**inputs, max_new_tokens=256)
    generated_captions = processor.decode(output[0], skip_special_tokens=True).strip()

    return generated_captions

def set_and_generate(url):
    """Caption the image at *url* and echo the URL back for display.

    Returns a ``(url, caption)`` pair so one Gradio event can update both
    the URL textbox and the output textbox.
    """
    return url, generate_paragraph(url)

# Create the Gradio Blocks interface: title/description banner, a URL input
# column with three example buttons, and an output column for the caption.
with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            # Left column: free-form URL entry plus preset example images.
            image_url_input = gr.Textbox(label="Image URL")
            generate_button = gr.Button("Generate Paragraph")

            example_1 = gr.Button("Types of Falcons")
            example_2 = gr.Button("Camel Racing - Saudi Arabia")
            example_3 = gr.Button("Urban Street Scene - India")
        
        with gr.Column():
            # Right column: the model's generated paragraph.
            generated_paragraph_output = gr.Textbox(label="Generated Paragraph")

    # Main action: caption whatever URL the user typed.
    generate_button.click(generate_paragraph, inputs=image_url_input, outputs=generated_paragraph_output)

    # Example buttons: each one fills the URL textbox AND generates the
    # caption in a single click via set_and_generate's (url, caption) return.
    example_1.click(
        lambda: set_and_generate("https://www.animalspot.net/wp-content/uploads/2020/01/Types-of-Falcons.jpg"), 
        outputs=[image_url_input, generated_paragraph_output]
    )
    example_2.click(
        lambda: set_and_generate("https://www.leaders-mena.com/leaders/uploads/2023/01/The-Traditional-Camel-Racing-In-Saudi-Arabia-Unique-Sport-Activity-1024x576.jpg"), 
        outputs=[image_url_input, generated_paragraph_output]
    )
    example_3.click(
        lambda: set_and_generate("http://embed.robertharding.com/embed/1161-4342.jpg"), 
        outputs=[image_url_input, generated_paragraph_output]
    )

# Launch the Gradio interface (blocks until the server is stopped).
demo.launch()