File size: 2,896 Bytes
e4d072e
cd00a55
e4d072e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dfd8ef3
73e2c66
31447f5
73e2c66
 
31447f5
e4d072e
31447f5
 
 
 
 
 
 
 
 
e4d072e
31447f5
e4d072e
31447f5
 
e4d072e
 
 
 
73e2c66
 
e4d072e
 
73e2c66
e4d072e
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
from PIL import Image
import gradio as gr
from transformers import pipeline

# 1) Initialize the Hugging Face pipelines once at import time.
#    BLIP produces the raw English caption; CLIP picks the best-matching
#    scene label from SCENE_LABELS via zero-shot classification.
#    (Model weights are downloaded on first run.)
captioner = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)
scene_classifier = pipeline(
    task="zero-shot-image-classification",
    model="openai/clip-vit-base-patch32",
)

# 2) Scene labels & response templates.
#    SCENE_LABELS are the candidate labels handed to the CLIP zero-shot
#    classifier; each label maps to a TOEIC Part-1-style sentence template
#    in TEMPLATES, where {caption} is filled with the BLIP caption.
SCENE_LABELS = [
    "outdoor", "indoor", "beach", "office", "street",
    "restaurant", "park", "sports", "kitchen", "mountain"
]
# NOTE: keys must stay in sync with SCENE_LABELS; generate_caption falls
# back to a generic template for any label missing here.
TEMPLATES = {
    "outdoor":    "In this picture, {caption}. It looks like a pleasant outdoor setting, and the subject seems relaxed.",
    "indoor":     "In this picture, {caption}. It appears to be indoors, perhaps at home or in an office environment.",
    "beach":      "In this picture, {caption}. It seems to be on a beach, and the atmosphere looks warm and sunny.",
    "office":     "In this picture, {caption}. It looks like an office scene, with people engaged in work or discussion.",
    "street":     "In this picture, {caption}. The scene appears to be on a busy street, with vehicles and pedestrians.",
    "restaurant": "In this picture, {caption}. It looks like a restaurant setting, where people are dining together.",
    "park":       "In this picture, {caption}. The location seems to be a park, with trees and open space.",
    "sports":     "In this picture, {caption}. It appears to be a sports activity, showing movement and action.",
    "kitchen":    "In this picture, {caption}. It seems to be in a kitchen, with cooking utensils visible.",
    "mountain":   "In this picture, {caption}. The background looks like mountains, suggesting a hiking scene."
}

def generate_caption(image_path):
    """Generate a TOEIC Part-1 style English description of an image.

    Args:
        image_path: Filesystem path to the uploaded image (the Gradio
            Image component is configured with type="filepath"), or
            None when the button is clicked without an upload.

    Returns:
        A short English description built from a BLIP caption slotted
        into a CLIP-selected scene template, or an error string
        (prefixed with "🔴 Error:") that is displayed directly in the UI.
    """
    # Gradio passes None when no image has been uploaded yet.
    if not image_path:
        return "🔴 Error: please upload an image first."

    try:
        # 1) Load the image (convert to RGB so grayscale/RGBA inputs work)
        img = Image.open(image_path).convert("RGB")

        # 2) Generate the raw caption; the pipeline may return a list of
        #    dicts or a single dict depending on the transformers version.
        out = captioner(img)
        first = out[0] if isinstance(out, list) else out
        raw = (first.get("generated_text") or first.get("text") or str(first)).strip()

        # 3) Classify the scene. BUG FIX: the zero-shot-IMAGE-classification
        #    pipeline returns a list of {"label", "score"} dicts sorted by
        #    score (unlike the text zero-shot pipeline, which returns a dict
        #    with a "labels" key). The old cls["labels"][0] raised TypeError
        #    on every call, so every request hit the error branch.
        cls = scene_classifier(img, candidate_labels=SCENE_LABELS)
        if isinstance(cls, list):
            scene = cls[0]["label"]
        else:
            # Defensive fallback for dict-shaped output.
            scene = cls["labels"][0]

        # 4) Map the scene to a template and fill in the caption.
        template = TEMPLATES.get(scene, "In this picture, {caption}.")
        return template.format(caption=raw)

    except Exception as e:
        # Surface any failure as UI text rather than crashing the server.
        return f"🔴 Error: {e}"

# 5) Build the Gradio UI: one image input, a trigger button, and a
#    textbox that shows the generated TOEIC Part-1 response.
with gr.Blocks() as demo:
    gr.Markdown("## 📸 TOEIC Part 1: 상황별 사진 묘사 (Single Image)")
    image_input = gr.Image(type="filepath", label="Upload an image")
    describe_btn = gr.Button("Describe")
    result_box = gr.Textbox(label="TOEIC Part 1 Response", lines=4)
    # Clicking the button runs generate_caption on the uploaded file path.
    describe_btn.click(fn=generate_caption, inputs=image_input, outputs=result_box)

# 6) Run the app: bind to all interfaces; honor a PORT env var
#    (e.g. set by a hosting platform), defaulting to 7860.
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=port)