Update app.py
app.py
CHANGED
@@ -9,6 +9,7 @@ from transformers import (
     VitsModel
 )
 import torch
+from uroman import Uroman
 
 # ─────────────── Logging setup ───────────────
 logging.basicConfig(level=logging.INFO)
@@ -16,77 +17,91 @@ logging.basicConfig(level=logging.INFO)
 # ─────────────── 1. BLIP image captioning (generates English) ───────────────
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+blip_model.to("cuda" if torch.cuda.is_available() else "cpu")
 
-# ─────────────── 2. …
+# ─────────────── 2. English → Korean translation ───────────────
 translation_pipeline = pipeline(
     "translation",
     model="facebook/nllb-200-distilled-600M",
     src_lang="eng_Latn",
     tgt_lang="kor_Hang",
-    max_length=200
+    max_length=200,
+    device=0 if torch.cuda.is_available() else -1
 )
 
-# ─────────────── 3. Korean TTS
+# ─────────────── 3. Korean TTS ───────────────
 tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
 tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
 tts_model.to("cuda" if torch.cuda.is_available() else "cpu")
 
-from uroman import Uroman
 uroman = Uroman()
 
 def synthesize_tts(text: str):
-    …
+    """Convert a Korean sentence into a VITS-TTS waveform."""
+    romanized = uroman.romanize_string(text)
     inputs = tts_tokenizer(romanized, return_tensors="pt")
     input_ids = inputs["input_ids"].long().to(tts_model.device)
     with torch.no_grad():
         output = tts_model(input_ids=input_ids)
     waveform = output.waveform.squeeze().cpu().numpy()
-    return …
-    …
+    return tts_model.config.sampling_rate, waveform
 
 # ─────────────── 4. Image → caption + translation + speech output ───────────────
 def describe_and_speak(img: Image.Image):
-    logging.info("[DEBUG] describe_and_speak …")
+    logging.info("[DEBUG] describe_and_speak called")
 
     # ① English caption
-    pixel_values = processor(images=img, return_tensors="pt").pixel_values
+    pixel_values = processor(images=img, return_tensors="pt").pixel_values.to(blip_model.device)
     generated_ids = blip_model.generate(pixel_values, max_length=64)
     caption_en = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
     logging.info(f"[DEBUG] caption_en: {caption_en}")
-    print(f"[DEBUG] caption_en: {caption_en}")
 
     # ② Translation
     try:
         result = translation_pipeline(caption_en)
-        caption_ko = result[0]["translation_text"]
+        caption_ko = result[0]["translation_text"].strip()
     except Exception as e:
         logging.error(f"[ERROR] Translation error: {e}")
         caption_ko = ""
     logging.info(f"[DEBUG] caption_ko: {caption_ko}")
-    print(f"[DEBUG] caption_ko: {caption_ko}")
 
     if not caption_ko:
         return "Could not generate a description for this image.", None
 
     # ③ TTS
     try:
         sr, wav = synthesize_tts(caption_ko)
         return caption_ko, (sr, wav)
     except Exception as e:
-        logging.error(f"[ERROR] TTS …")
+        logging.error(f"[ERROR] TTS error: {e}")
         return caption_ko, None
 
 # ─────────────── 5. Gradio interface ───────────────
-demo = gr.Interface(
-    fn=describe_and_speak,
-    inputs=gr.Image(type="pil", sources=["upload", "webcam"], label="Input image"),
-    outputs=[
-        gr.Textbox(label="Korean caption"),
-        gr.Audio(label="Audio playback", type="numpy")
-    ],
+with gr.Blocks(
     title="Image → Korean caption & speech conversion",
-    …
-)
+    css="footer {display: none !important;}"  # hide the footer
+) as demo:
+    gr.Markdown(
+        "## Image → Korean caption & speech conversion\n"
+        "BLIP generates an English caption → NLLB translates it into Korean → VITS synthesizes speech"
+    )
+
+    # Input/output components
+    input_img = gr.Image(
+        type="pil",
+        sources=["upload", "webcam"],
+        label="Input image"
+    )
+    caption_out = gr.Textbox(label="Korean caption")
+    audio_out = gr.Audio(label="Audio playback", type="numpy")
+
+    # Run the function automatically whenever the image changes (upload or webcam capture)
+    input_img.change(
+        fn=describe_and_speak,
+        inputs=input_img,
+        outputs=[caption_out, audio_out],
+        queue=True  # keeps the app stable under concurrent users
+    )
 
 if __name__ == "__main__":
     demo.launch(debug=True)
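For a quick sanity check of the updated pipeline outside the Gradio UI, a smoke test along these lines should work. This is a minimal sketch, not part of the commit: it assumes app.py imports cleanly (importing it loads all three models, so the first run is slow), and the path example.jpg is a hypothetical placeholder for any local test image.

# Hypothetical local smoke test for the image → caption → translation → TTS pipeline.
from PIL import Image

from app import describe_and_speak

img = Image.open("example.jpg")  # any local test image
caption_ko, audio = describe_and_speak(img)
print("caption:", caption_ko)
if audio is not None:
    sr, wav = audio  # (sampling_rate, numpy waveform), the tuple format gr.Audio accepts
    print(f"audio: {len(wav) / sr:.2f}s at {sr} Hz")
else:
    print("TTS failed; check the [ERROR] log lines.")

One note on the design: facebook/mms-tts-kor expects romanized input rather than raw Hangul, which is why synthesize_tts runs uroman over the Korean caption before tokenizing; the commit moves the uroman import to the top of the file but keeps that romanization step.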