yongyeol committed
Commit 8586da3 · verified · 1 Parent(s): 7d2e0c9

Update app.py

Files changed (1)
  1. app.py +77 -69
app.py CHANGED
@@ -1,107 +1,115 @@
  import gradio as gr
  import logging
  from PIL import Image
  from transformers import (
      BlipProcessor,
      BlipForConditionalGeneration,
      pipeline,
      AutoTokenizer,
-     VitsModel
  )
- import torch
- from uroman import Uroman

- # ─────────────── Logging setup ───────────────
  logging.basicConfig(level=logging.INFO)

- # ─────────────── 1. BLIP image captioning (English) ───────────────
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
- blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
- blip_model.to("cuda" if torch.cuda.is_available() else "cpu")

- # ─────────────── 2. English → Korean translation ───────────────
  translation_pipeline = pipeline(
      "translation",
      model="facebook/nllb-200-distilled-600M",
      src_lang="eng_Latn",
      tgt_lang="kor_Hang",
      max_length=200,
-     device=0 if torch.cuda.is_available() else -1
  )

- # ─────────────── 3. Korean TTS ───────────────
- tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
- tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
- tts_model.to("cuda" if torch.cuda.is_available() else "cpu")

  uroman = Uroman()

- def synthesize_tts(text: str):
-     """Convert a Korean sentence into a VITS-TTS waveform."""
-     romanized = uroman.romanize_string(text)
-     inputs = tts_tokenizer(romanized, return_tensors="pt")
-     input_ids = inputs["input_ids"].long().to(tts_model.device)
      with torch.no_grad():
-         output = tts_model(input_ids=input_ids)
-     waveform = output.waveform.squeeze().cpu().numpy()
-     return tts_model.config.sampling_rate, waveform

- # ─────────────── 4. Image → caption + translation + speech ───────────────
- def describe_and_speak(img: Image.Image):
-     logging.info("[DEBUG] describe_and_speak called")

      # ① English caption
-     pixel_values = processor(images=img, return_tensors="pt").pixel_values.to(blip_model.device)
-     generated_ids = blip_model.generate(pixel_values, max_length=64)
-     caption_en = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-     logging.info(f"[DEBUG] caption_en: {caption_en}")

-     # ② Translation
      try:
-         result = translation_pipeline(caption_en)
-         caption_ko = result[0]["translation_text"].strip()
      except Exception as e:
-         logging.error(f"[ERROR] translation error: {e}")
-         caption_ko = ""
-     logging.info(f"[DEBUG] caption_ko: {caption_ko}")

-     if not caption_ko:
-         return "Could not generate a description for this image.", None

-     # ③ TTS
-     try:
-         sr, wav = synthesize_tts(caption_ko)
-         return caption_ko, (sr, wav)
-     except Exception as e:
-         logging.error(f"[ERROR] TTS error: {e}")
-         return caption_ko, None
-
- # ─────────────── 5. Gradio interface ───────────────
- with gr.Blocks(
-     title="Image → Korean caption & speech",
-     css="footer {display: none !important;}"  # hide the footer
- ) as demo:
      gr.Markdown(
-         "## Image → Korean caption & speech\n"
-         "BLIP generates an English caption → NLLB translates it to Korean → VITS synthesizes speech"
      )

-     # Input/output components
-     input_img = gr.Image(
-         type="pil",
-         sources=["upload", "webcam"],
-         label="Input image"
-     )
-     caption_out = gr.Textbox(label="Korean caption")
-     audio_out = gr.Audio(label="Audio playback", type="numpy")
-
-     # Re-run automatically whenever the image changes (upload/capture)
-     input_img.change(
-         fn=describe_and_speak,
-         inputs=input_img,
-         outputs=[caption_out, audio_out],
-         queue=True  # safe for concurrent users
-     )

  if __name__ == "__main__":
-     demo.launch(debug=True)

  import gradio as gr
  import logging
  from PIL import Image
+ import torch
+ from uroman import Uroman
  from transformers import (
      BlipProcessor,
      BlipForConditionalGeneration,
      pipeline,
      AutoTokenizer,
+     VitsModel,
  )

  logging.basicConfig(level=logging.INFO)

+ # ───────── 1. Model loading ─────────
+ processor = BlipProcessor.from_pretrained(
+     "Salesforce/blip-image-captioning-large"
+ )
+ blip_model = BlipForConditionalGeneration.from_pretrained(
+     "Salesforce/blip-image-captioning-large"
+ ).to("cuda" if torch.cuda.is_available() else "cpu")

  translation_pipeline = pipeline(
      "translation",
      model="facebook/nllb-200-distilled-600M",
      src_lang="eng_Latn",
      tgt_lang="kor_Hang",
      max_length=200,
+     device=0 if torch.cuda.is_available() else -1,
  )

+ # --- TTS (ko / en) ---
+ tts_ko = VitsModel.from_pretrained("facebook/mms-tts-kor").to(
+     "cuda" if torch.cuda.is_available() else "cpu"
+ )
+ tok_ko = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
+
+ tts_en = VitsModel.from_pretrained("facebook/mms-tts-eng").to(
+     "cuda" if torch.cuda.is_available() else "cpu"
+ )
+ tok_en = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

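+ # Note: the MMS-TTS tokenizers expect romanized input for non-Latin scripts such as
+ # Korean, so text is passed through uroman before tokenization (see tts() below).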
  uroman = Uroman()

+
+ # ───────── 2. Shared helpers ─────────
+ def tts(model, tokenizer, text: str):
+     roman = uroman.romanize_string(text)
+     ids = tokenizer(roman, return_tensors="pt").input_ids.long().to(model.device)
      with torch.no_grad():
+         wav = model(input_ids=ids).waveform.squeeze().cpu().numpy()
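+     # gr.Audio(type="numpy") can play this (sampling_rate, ndarray) pair directly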
+     return model.config.sampling_rate, wav

+
+ def generate(img: Image.Image, lang: str):
+     """
+     lang == "ko" → Korean caption + speech
+     lang == "en" → English caption + speech
+     """
+     if img is None:
+         raise gr.Error("Please upload an image first 📷")

      # ① English caption
+     pix = processor(images=img, return_tensors="pt").pixel_values.to(blip_model.device)
+     cap_en = processor.batch_decode(
+         blip_model.generate(pix, max_length=64), skip_special_tokens=True
+     )[0].strip()
+
+     if lang == "en":
+         sr, wav = tts(tts_en, tok_en, cap_en)
+         return cap_en, (sr, wav)

+     # ② Translation (→ ko)
      try:
+         cap_ko = translation_pipeline(cap_en)[0]["translation_text"].strip()
      except Exception as e:
+         logging.error(f"[ERROR] translation failed: {e}")
+         cap_ko = ""
+     if not cap_ko:
+         return "A translation error occurred.", None

+     sr, wav = tts(tts_ko, tok_ko, cap_ko)
+     return cap_ko, (sr, wav)

+
+ # ───────── 3. Gradio UI ─────────
+ with gr.Blocks(title="Image → Caption & TTS", css="footer{display:none;}") as demo:
      gr.Markdown(
+         "## Image → Korean / English caption & speech\n"
+         "BLIP (caption) → NLLB (translate) → VITS (TTS)"
      )

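+     # gr.State is kept per session, so each visitor's buttons act on their own last upload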
+     img_state = gr.State()  # stores the most recent image
+
+     input_img = gr.Image(type="pil", label="📷 Upload image")
+     caption_box = gr.Textbox(label="📑 Caption result")
+     audio_play = gr.Audio(label="🔊 Audio playback", type="numpy")
+
+     with gr.Row():
+         ko_btn = gr.Button("Korean")
+         en_btn = gr.Button("English")
+
+     # Update the state whenever a new image is uploaded
+     def store_img(img):
+         return img
+
+     input_img.change(store_img, inputs=input_img, outputs=img_state, queue=False)
+
+     # Wire the buttons to the generate function
+     ko_btn.click(fn=lambda img: generate(img, "ko"), inputs=img_state, outputs=[caption_box, audio_play])
+     en_btn.click(fn=lambda img: generate(img, "en"), inputs=img_state, outputs=[caption_box, audio_play])

  if __name__ == "__main__":
+     demo.launch()