import os
from PIL import Image
import gradio as gr
from transformers import pipeline
# 1) Initialize the pipelines
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
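# (For a single image, the image-to-text pipeline returns a list of dicts,
# e.g. [{"generated_text": "a man riding a surfboard"}]; the fallback keys
# in generate_caption below account for minor variations in that format.)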
scene_classifier = pipeline(
    "zero-shot-image-classification",
    model="openai/clip-vit-base-patch32"
)
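# (The zero-shot image classifier returns a list of {"label", "score"} dicts
# sorted by score, e.g. [{"label": "beach", "score": 0.91}, ...].)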
# 2) Define scene labels & templates
SCENE_LABELS = [
    "outdoor", "indoor", "beach", "office", "street",
    "restaurant", "park", "sports", "kitchen", "mountain"
]
TEMPLATES = {
    "outdoor": "In this picture, {caption}. It looks like a pleasant outdoor setting, and the subject seems relaxed.",
    "indoor": "In this picture, {caption}. It appears to be indoors, perhaps at home or in an office environment.",
    "beach": "In this picture, {caption}. It seems to be on a beach, and the atmosphere looks warm and sunny.",
    "office": "In this picture, {caption}. It looks like an office scene, with people engaged in work or discussion.",
    "street": "In this picture, {caption}. The scene appears to be on a busy street, with vehicles and pedestrians.",
    "restaurant": "In this picture, {caption}. It looks like a restaurant setting, where people are dining together.",
    "park": "In this picture, {caption}. The location seems to be a park, with trees and open space.",
    "sports": "In this picture, {caption}. It appears to be a sports activity, showing movement and action.",
    "kitchen": "In this picture, {caption}. It seems to be in a kitchen, with cooking utensils visible.",
    "mountain": "In this picture, {caption}. The background looks like mountains, suggesting a hiking scene."
}
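# Example of how a template composes (illustrative caption, not real model output):
#   TEMPLATES["beach"].format(caption="a man is surfing on a wave")
#   -> "In this picture, a man is surfing on a wave. It seems to be on a beach, ..."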
def generate_caption(image_path):
    try:
        # 1) Load the image
        img = Image.open(image_path).convert("RGB")
        # 2) Generate the raw caption
        out = captioner(img)
        first = out[0] if isinstance(out, list) else out
        raw = first.get("generated_text") or first.get("text") or str(first)
        raw = raw.strip()
        # 3) Classify the scene (results are sorted by score, highest first)
        cls = scene_classifier(img, candidate_labels=SCENE_LABELS)
        scene = cls[0]["label"]
        # 4) Map to a template and return
        template = TEMPLATES.get(scene, "In this picture, {caption}.")
        return template.format(caption=raw)
    except Exception as e:
        return f"🔴 Error: {e}"
# 5) Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 📸 TOEIC Part 1: Photo Description by Scene (Single Image)")
    img_in = gr.Image(type="filepath", label="Upload an image")
    btn = gr.Button("Describe")
    output = gr.Textbox(label="TOEIC Part 1 Response", lines=4)
    btn.click(fn=generate_caption, inputs=img_in, outputs=output)
# 6) Run the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860))
    )
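# Quick smoke test without the UI ("sample.jpg" is a placeholder path,
# and this assumes the file is saved as app.py):
#   from app import generate_caption
#   print(generate_caption("sample.jpg"))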