File size: 3,649 Bytes
8274479
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import base64

import cv2
import gradio as gr
import numpy as np
import requests

# Introductory Markdown rendered at the top of the Gradio page.
MARKDOWN = """
# HotDogGPT 💬 + 🌭

HotDogGPT is OpenAI Vision API experiment reproducing the famous 
[Hot Dog, Not Hot Dog](https://www.youtube.com/watch?v=ACmydtFDTGs) app from Silicon 
Valley.

<p align="center">
    <img width="600" src="https://miro.medium.com/v2/resize:fit:650/1*VrpXE1hE4rO1roK0laOd7g.png" alt="hotdog">
</p>

Visit [awesome-openai-vision-api-experiments](https://github.com/roboflow/awesome-openai-vision-api-experiments) 
repository to find more OpenAI Vision API experiments or contribute your own.
"""
# Chat-completions endpoint that every request is POSTed to.
API_URL = "https://api.openai.com/v1/chat/completions"
# The only labels the model is asked to choose between; they are embedded
# verbatim in the classification prompt.
CLASSES = ["🌭 Hot Dog", "❌ Not Hot Dog"]


def preprocess_image(image: np.ndarray) -> np.ndarray:
    """Mirror the image horizontally and convert it from RGB to BGR.

    Args:
        image: Image array in RGB channel order.

    Returns:
        The left-right flipped image in BGR channel order.
    """
    # NOTE(review): the flip presumably undoes webcam mirroring - confirm.
    mirrored = np.fliplr(image)
    # The result is later passed to cv2.imencode, hence the switch to BGR.
    bgr_image = cv2.cvtColor(mirrored, cv2.COLOR_RGB2BGR)
    return bgr_image


def encode_image_to_base64(image: np.ndarray) -> str:
    """Encode an image as JPEG and return it as a base64 text string.

    Args:
        image: Image array handed to ``cv2.imencode``.

    Returns:
        The base64 representation of the JPEG bytes, decoded as UTF-8 text.

    Raises:
        ValueError: If ``cv2.imencode`` reports that encoding failed.
    """
    ok, jpeg_buffer = cv2.imencode('.jpg', image)
    if ok:
        return base64.b64encode(jpeg_buffer).decode('utf-8')
    raise ValueError("Could not encode image to JPEG format.")


def compose_payload(
    image: np.ndarray,
    prompt: str,
    model: str = "gpt-4-vision-preview",
    max_tokens: int = 300,
) -> dict:
    """Build the JSON body of a chat-completions request carrying one image.

    The request contains a single user message made of two parts: the text
    prompt and the image, embedded inline as a base64 JPEG data URL.

    Args:
        image: Image array; it is JPEG-encoded via ``encode_image_to_base64``.
        prompt: Text instruction sent alongside the image.
        model: Name of the model to query. Defaults to the value that was
            previously hard-coded.
            NOTE(review): OpenAI has listed this preview model as deprecated -
            confirm it is still served and pass a different ``model`` if not.
        max_tokens: Upper bound on the number of tokens in the reply.

    Returns:
        A dict ready to be serialised as the request's JSON body.

    Raises:
        ValueError: Propagated from ``encode_image_to_base64`` when the image
            cannot be encoded.
    """
    base64_image = encode_image_to_base64(image)
    return {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": max_tokens
    }


def compose_classification_prompt(classes: list) -> str:
    """Build the prompt asking the model to pick exactly one class.

    Args:
        classes: Class labels, joined with ", " and listed in the prompt.

    Returns:
        The complete prompt text.
    """
    class_listing = ', '.join(classes)
    return (
        "What is in the image? Return the class of the object in the image. "
        f"Here are the classes: {class_listing}. "
        "You can only return one class from that list."
    )


def compose_headers(api_key: str) -> dict:
    """Build the HTTP headers for a JSON request with bearer authentication.

    Args:
        api_key: Key placed after ``Bearer`` in the ``Authorization`` header.

    Returns:
        A dict holding the ``Content-Type`` and ``Authorization`` headers.
    """
    headers = {"Content-Type": "application/json"}
    headers["Authorization"] = f"Bearer {api_key}"
    return headers


def prompt_image(
    api_key: str,
    image: np.ndarray,
    prompt: str,
    timeout: float = 120.0,
) -> str:
    """Send an image and a text prompt to the API and return the reply text.

    Args:
        api_key: Key used for the bearer ``Authorization`` header.
        image: Image array embedded in the request payload.
        prompt: Text instruction sent with the image.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            caller on a stalled connection.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        ValueError: If the response JSON contains an ``error`` entry; the
            exception carries that entry's ``message``.
        requests.exceptions.RequestException: If the HTTP request itself
            fails, including when ``timeout`` is exceeded.
    """
    headers = compose_headers(api_key=api_key)
    payload = compose_payload(image=image, prompt=prompt)
    response = requests.post(
        url=API_URL, headers=headers, json=payload, timeout=timeout
    ).json()

    # Failures are reported in the JSON body under an 'error' key.
    if 'error' in response:
        raise ValueError(response['error']['message'])
    return response['choices'][0]['message']['content']


def classify_image(api_key: str, image: np.ndarray) -> str:
    """Classify an image as one of ``CLASSES`` using the vision API.

    Args:
        api_key: API key typed in by the user; must be non-empty.
        image: RGB image array, or ``None`` when no image was supplied.

    Returns:
        The model's reply text.

    Raises:
        ValueError: If ``api_key`` is empty, if no image was supplied, or if
            the API call reports an error.
    """
    if not api_key:
        raise ValueError(
            "API_KEY is not set. "
            "Please follow the instructions in the README to set it up.")
    # An empty image input arrives as None; fail with a clear message instead
    # of letting the NumPy flip raise an unrelated-looking error.
    if image is None:
        raise ValueError(
            "No image provided. Please upload or capture an image first.")
    image = preprocess_image(image=image)
    prompt = compose_classification_prompt(classes=CLASSES)
    response = prompt_image(api_key=api_key, image=image, prompt=prompt)
    return response


# Build the page: intro text, an API-key field, and one tab holding the
# image input, the output box and the submit button.
with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    # type="password" masks the key while it is being typed.
    api_key_textbox = gr.Textbox(
        label="🔑 OpenAI API", type="password")

    with gr.TabItem("Basic"):
        with gr.Column():
            # The image reaches the callback as an RGB NumPy array.
            input_image = gr.Image(
                image_mode='RGB', type='numpy', height=500)
            output_text = gr.Textbox(
                label="Output")
            submit_button = gr.Button("Submit")

        # Clicking Submit runs classify_image(api_key, image) and writes the
        # returned text into the output box.
        submit_button.click(
            fn=classify_image,
            inputs=[api_key_textbox, input_image],
            outputs=output_text)

# show_error=True surfaces exceptions raised in the callback to the user.
demo.launch(debug=False, show_error=True)