yongyeol committed on
Commit
f5e6532
·
verified ·
1 Parent(s): 8586da3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -12
app.py CHANGED
@@ -13,7 +13,7 @@ from transformers import (
13
 
14
  logging.basicConfig(level=logging.INFO)
15
 
16
- # ───────── 1. 모델 로드 ─────────
17
  processor = BlipProcessor.from_pretrained(
18
  "Salesforce/blip-image-captioning-large"
19
  )
@@ -30,7 +30,7 @@ translation_pipeline = pipeline(
30
  device=0 if torch.cuda.is_available() else -1,
31
  )
32
 
33
- # --- TTS (ko / en) ---
34
  tts_ko = VitsModel.from_pretrained("facebook/mms-tts-kor").to(
35
  "cuda" if torch.cuda.is_available() else "cpu"
36
  )
@@ -44,7 +44,6 @@ tok_en = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
44
  uroman = Uroman()
45
 
46
 
47
- # ───────── 2. 공통 함수 ─────────
48
  def tts(model, tokenizer, text: str):
49
  roman = uroman.romanize_string(text)
50
  ids = tokenizer(roman, return_tensors="pt").input_ids.long().to(model.device)
@@ -59,9 +58,9 @@ def generate(img: Image.Image, lang: str):
59
  lang == "en" → 영어 캡션+음성
60
  """
61
  if img is None:
62
- raise gr.Error("먼저 이미지를 업로드하세요 📷")
 
63
 
64
- # ① 영어 캡션
65
  pix = processor(images=img, return_tensors="pt").pixel_values.to(blip_model.device)
66
  cap_en = processor.batch_decode(
67
  blip_model.generate(pix, max_length=64), skip_special_tokens=True
@@ -71,7 +70,7 @@ def generate(img: Image.Image, lang: str):
71
  sr, wav = tts(tts_en, tok_en, cap_en)
72
  return cap_en, (sr, wav)
73
 
74
- # ② 번역(→ko)
75
  try:
76
  cap_ko = translation_pipeline(cap_en)[0]["translation_text"].strip()
77
  except Exception as e:
@@ -84,8 +83,8 @@ def generate(img: Image.Image, lang: str):
84
  return cap_ko, (sr, wav)
85
 
86
 
87
- # ───────── 3. Gradio UI ─────────
88
- with gr.Blocks(title="Image → Caption & TTS", css="footer{display:none;}") as demo:
89
  gr.Markdown(
90
  "## 이미지 → 한글 / English 캡션 & 음성 변환\n"
91
  "BLIP (caption) → NLLB (translate) → VITS (TTS)"
@@ -98,16 +97,16 @@ with gr.Blocks(title="Image → Caption & TTS", css="footer{display:none;}") as
98
  audio_play = gr.Audio(label="🔊 음성 재생", type="numpy")
99
 
100
  with gr.Row():
101
- ko_btn = gr.Button("한글 생성")
102
- en_btn = gr.Button("English")
 
103
 
104
- # 이미지 업로드 시 state 업데이트
105
  def store_img(img):
106
  return img
107
 
108
  input_img.change(store_img, inputs=input_img, outputs=img_state, queue=False)
109
 
110
- # 버튼 ↔ 생성 함수 연결
111
  ko_btn.click(fn=lambda img: generate(img, "ko"), inputs=img_state, outputs=[caption_box, audio_play])
112
  en_btn.click(fn=lambda img: generate(img, "en"), inputs=img_state, outputs=[caption_box, audio_play])
113
 
 
13
 
14
  logging.basicConfig(level=logging.INFO)
15
 
16
+
17
  processor = BlipProcessor.from_pretrained(
18
  "Salesforce/blip-image-captioning-large"
19
  )
 
30
  device=0 if torch.cuda.is_available() else -1,
31
  )
32
 
33
+
34
  tts_ko = VitsModel.from_pretrained("facebook/mms-tts-kor").to(
35
  "cuda" if torch.cuda.is_available() else "cpu"
36
  )
 
44
  uroman = Uroman()
45
 
46
 
 
47
  def tts(model, tokenizer, text: str):
48
  roman = uroman.romanize_string(text)
49
  ids = tokenizer(roman, return_tensors="pt").input_ids.long().to(model.device)
 
58
  lang == "en" → 영어 캡션+음성
59
  """
60
  if img is None:
61
+ raise gr.Error("이미지를 업로드하세요 📷")
62
+
63
 
 
64
  pix = processor(images=img, return_tensors="pt").pixel_values.to(blip_model.device)
65
  cap_en = processor.batch_decode(
66
  blip_model.generate(pix, max_length=64), skip_special_tokens=True
 
70
  sr, wav = tts(tts_en, tok_en, cap_en)
71
  return cap_en, (sr, wav)
72
 
73
+
74
  try:
75
  cap_ko = translation_pipeline(cap_en)[0]["translation_text"].strip()
76
  except Exception as e:
 
83
  return cap_ko, (sr, wav)
84
 
85
 
86
+
87
+ with gr.Blocks(title="Image → Caption & TTS", css="footer {display: none !important;}") as demo:
88
  gr.Markdown(
89
  "## 이미지 → 한글 / English 캡션 & 음성 변환\n"
90
  "BLIP (caption) → NLLB (translate) → VITS (TTS)"
 
97
  audio_play = gr.Audio(label="🔊 음성 재생", type="numpy")
98
 
99
  with gr.Row():
100
+ ko_btn = gr.Button("한글로 생성🪄")
101
+ en_btn = gr.Button("영어로 생성🪄")
102
+
103
 
 
104
  def store_img(img):
105
  return img
106
 
107
  input_img.change(store_img, inputs=input_img, outputs=img_state, queue=False)
108
 
109
+
110
  ko_btn.click(fn=lambda img: generate(img, "ko"), inputs=img_state, outputs=[caption_box, audio_play])
111
  en_btn.click(fn=lambda img: generate(img, "en"), inputs=img_state, outputs=[caption_box, audio_play])
112