yongyeol committed on
Commit 4b414b1 · verified · 1 Parent(s): 8606266

Update app.py

Files changed (1)
  1. app.py +40 -0
app.py CHANGED
@@ -0,0 +1,40 @@
+ import os
+
+ import gradio as gr
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+ from PIL import Image
+ import torch
+ import requests
+
+ # Load the image-captioning model (ViT encoder + GPT-2 decoder)
+ caption_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+ feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+ tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+
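+ # Authentication below assumes an HF_TOKEN environment variable (e.g., set
+ # as a Space secret); the token must have Inference API access.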
+ # ChatTTS text-to-speech (via the Hugging Face Inference API)
+ CHAT_TTS_API = "https://api-inference.huggingface.co/models/2Noise/ChatTTS"
+ headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
+
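+ # Caption an input PIL image; num_beams=4 runs beam search, which trades a
+ # little latency for better captions than greedy decoding.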
+ def generate_caption(image):
+     pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
+     output_ids = caption_model.generate(pixel_values, max_length=50, num_beams=4)
+     caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+     return caption
+
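+ # Call the hosted ChatTTS endpoint; on success the API returns raw audio
+ # bytes (a cold model may first respond 503 with a JSON body while loading).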
+ def tts_audio(text):
+     payload = {"inputs": text}
+     response = requests.post(CHAT_TTS_API, headers=headers, json=payload)
+     response.raise_for_status()
+     return response.content
+
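+ # Glue function for Gradio: caption the image, synthesize speech from the
+ # caption, and hand both back to the UI.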
+ def process(image):
+     caption = generate_caption(image)
+     audio = tts_audio(caption)
+     # gr.Audio expects a file path (or a (rate, array) tuple), so write the
+     # raw bytes to disk and return the path
+     with open("result.wav", "wb") as f:
+         f.write(audio)
+     return caption, "result.wav"
+
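+ # Wire everything into a simple image-in, caption-and-audio-out UI.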
+ demo = gr.Interface(
+     fn=process,
+     inputs=gr.Image(type="pil"),
+     outputs=[gr.Text(label="Caption"), gr.Audio(label="TTS Audio")],
+     title="🎨 AI Image Caption Reader",
+ )
+
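+ # On Hugging Face Spaces the default launch() is enough; share=True would
+ # create a temporary public link when running locally.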
+ demo.launch()