imagetospeech

Sleeping

App Files Community

yongyeol committed on Jul 8

Commit

f5e6532

verified ·

1 Parent(s): 8586da3

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -12

app.py CHANGED Viewed

@@ -13,7 +13,7 @@ from transformers import (
 logging.basicConfig(level=logging.INFO)
-# ───────── 1. 모델 로드 ─────────
 processor = BlipProcessor.from_pretrained(
     "Salesforce/blip-image-captioning-large"
 )
@@ -30,7 +30,7 @@ translation_pipeline = pipeline(
     device=0 if torch.cuda.is_available() else -1,
 )
-# --- TTS (ko / en) ---
 tts_ko = VitsModel.from_pretrained("facebook/mms-tts-kor").to(
     "cuda" if torch.cuda.is_available() else "cpu"
 )
@@ -44,7 +44,6 @@ tok_en = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
 uroman = Uroman()
-# ───────── 2. 공통 함수 ─────────
 def tts(model, tokenizer, text: str):
     roman = uroman.romanize_string(text)
     ids = tokenizer(roman, return_tensors="pt").input_ids.long().to(model.device)
@@ -59,9 +58,9 @@ def generate(img: Image.Image, lang: str):
     lang == "en" → 영어  캡션+음성
     """
     if img is None:
-        raise gr.Error("먼저 이미지를 업로드하세요 📷")
-    # ① 영어 캡션
     pix = processor(images=img, return_tensors="pt").pixel_values.to(blip_model.device)
     cap_en = processor.batch_decode(
         blip_model.generate(pix, max_length=64), skip_special_tokens=True
@@ -71,7 +70,7 @@ def generate(img: Image.Image, lang: str):
         sr, wav = tts(tts_en, tok_en, cap_en)
         return cap_en, (sr, wav)
-    # ② 번역(→ko)
     try:
         cap_ko = translation_pipeline(cap_en)[0]["translation_text"].strip()
     except Exception as e:
@@ -84,8 +83,8 @@ def generate(img: Image.Image, lang: str):
     return cap_ko, (sr, wav)
-# ───────── 3. Gradio UI ─────────
-with gr.Blocks(title="Image → Caption & TTS", css="footer{display:none;}") as demo:
     gr.Markdown(
         "## 이미지 → 한글 / English 캡션 & 음성 변환\n"
         "BLIP (caption) → NLLB (translate) → VITS (TTS)"
@@ -98,16 +97,16 @@ with gr.Blocks(title="Image → Caption & TTS", css="footer{display:none;}") as
     audio_play = gr.Audio(label="🔊 음성 재생", type="numpy")
     with gr.Row():
-        ko_btn = gr.Button("한글 생성")
-        en_btn = gr.Button("English")
-    # 이미지 업로드 시 state 업데이트
     def store_img(img):
         return img
     input_img.change(store_img, inputs=input_img, outputs=img_state, queue=False)
-    # 버튼 ↔ 생성 함수 연결
     ko_btn.click(fn=lambda img: generate(img, "ko"), inputs=img_state, outputs=[caption_box, audio_play])
     en_btn.click(fn=lambda img: generate(img, "en"), inputs=img_state, outputs=[caption_box, audio_play])

 logging.basicConfig(level=logging.INFO)
 processor = BlipProcessor.from_pretrained(
     "Salesforce/blip-image-captioning-large"
 )
     device=0 if torch.cuda.is_available() else -1,
 )
 tts_ko = VitsModel.from_pretrained("facebook/mms-tts-kor").to(
     "cuda" if torch.cuda.is_available() else "cpu"
 )
 uroman = Uroman()
 def tts(model, tokenizer, text: str):
     roman = uroman.romanize_string(text)
     ids = tokenizer(roman, return_tensors="pt").input_ids.long().to(model.device)
     lang == "en" → 영어  캡션+음성
     """
     if img is None:
+        raise gr.Error("이미지를 업로드하세요 📷")
     pix = processor(images=img, return_tensors="pt").pixel_values.to(blip_model.device)
     cap_en = processor.batch_decode(
         blip_model.generate(pix, max_length=64), skip_special_tokens=True
         sr, wav = tts(tts_en, tok_en, cap_en)
         return cap_en, (sr, wav)
     try:
         cap_ko = translation_pipeline(cap_en)[0]["translation_text"].strip()
     except Exception as e:
     return cap_ko, (sr, wav)
+with gr.Blocks(title="Image → Caption & TTS", css="footer {display: none !important;}") as demo:
     gr.Markdown(
         "## 이미지 → 한글 / English 캡션 & 음성 변환\n"
         "BLIP (caption) → NLLB (translate) → VITS (TTS)"
     audio_play = gr.Audio(label="🔊 음성 재생", type="numpy")
     with gr.Row():
+        ko_btn = gr.Button("한글로 생성🪄")
+        en_btn = gr.Button("영어로 생성🪄")
     def store_img(img):
         return img
     input_img.change(store_img, inputs=input_img, outputs=img_state, queue=False)
     ko_btn.click(fn=lambda img: generate(img, "ko"), inputs=img_state, outputs=[caption_box, audio_play])
     en_btn.click(fn=lambda img: generate(img, "en"), inputs=img_state, outputs=[caption_box, audio_play])