import gradio as gr
import logging
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    pipeline,
    AutoTokenizer,
    VitsModel
)
import torch
from uroman import Uroman

# ─────────────── Logging setup ───────────────
logging.basicConfig(level=logging.INFO)

# ─────────────── 1. BLIP image captioning (English) ───────────────
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model.to("cuda" if torch.cuda.is_available() else "cpu")
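# BlipProcessor bundles the image preprocessor and the tokenizer used to decode the generated caption.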

# ─────────────── 2. English → Korean translation ───────────────
translation_pipeline = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    src_lang="eng_Latn",
    tgt_lang="kor_Hang",
    max_length=200,
    device=0 if torch.cuda.is_available() else -1
)
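# NLLB uses FLORES-200 language codes: "eng_Latn" = English (Latin script), "kor_Hang" = Korean (Hangul).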

# ─────────────── 3. Korean TTS ───────────────
tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
tts_model.to("cuda" if torch.cuda.is_available() else "cpu")

uroman = Uroman()
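# The MMS-TTS checkpoints expect uroman-romanized input, so Hangul text is
# converted to Latin characters before tokenization.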

def synthesize_tts(text: str):
    """ν•œκΈ€ λ¬Έμž₯을 VITS‑TTS νŒŒν˜•μœΌλ‘œ λ³€ν™˜"""
    romanized = uroman.romanize_string(text)
    inputs = tts_tokenizer(romanized, return_tensors="pt")
    input_ids = inputs["input_ids"].long().to(tts_model.device)
    with torch.no_grad():
        output = tts_model(input_ids=input_ids)
    waveform = output.waveform.squeeze().cpu().numpy()
    return tts_model.config.sampling_rate, waveform
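# Quick local check (a minimal sketch; assumes the TTS model above is loaded):
#   sr, wav = synthesize_tts("안녕하세요")  # "Hello" in Korean
#   print(sr, wav.shape)                     # MMS-TTS typically outputs 16 kHz mono audio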

# ─────────────── 4. Image → caption + translation + speech output ───────────────
def describe_and_speak(img: Image.Image):
    logging.info("[DEBUG] describe_and_speak called")

    # Guard against the change event firing with a cleared image
    if img is None:
        return "", None

    # ① English caption
    pixel_values = processor(images=img, return_tensors="pt").pixel_values.to(blip_model.device)
    generated_ids = blip_model.generate(pixel_values, max_length=64)
    caption_en = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    logging.info(f"[DEBUG] caption_en: {caption_en}")

    # ② Translation
    try:
        result = translation_pipeline(caption_en)
        caption_ko = result[0]["translation_text"].strip()
    except Exception as e:
        logging.error(f"[ERROR] λ²ˆμ—­ 였λ₯˜: {e}")
        caption_ko = ""
    logging.info(f"[DEBUG] caption_ko: {caption_ko}")

    if not caption_ko:
        return "이미지에 λŒ€ν•œ μ„€λͺ…을 생성할 수 μ—†μŠ΅λ‹ˆλ‹€.", None

    # ③ TTS
    try:
        sr, wav = synthesize_tts(caption_ko)
        return caption_ko, (sr, wav)
    except Exception as e:
        logging.error(f"[ERROR] TTS 였λ₯˜: {e}")
        return caption_ko, None
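# Standalone test (hypothetical; assumes a local file "sample.jpg" exists):
#   caption, audio = describe_and_speak(Image.open("sample.jpg"))
#   print(caption)   # Korean caption; audio is (sampling_rate, waveform) or None on TTS failure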

# ─────────────── 5. Gradio interface ───────────────
with gr.Blocks(
    title="이미지 β†’ ν•œκΈ€ μΊ‘μ…˜ & μŒμ„± λ³€ν™˜",
    css="footer {display: none !important;}"   # ν‘Έν„° 숨기기
) as demo:
    gr.Markdown(
        "## Image → Korean Caption & Speech\n"
        "BLIP generates an English caption → NLLB translates it to Korean → VITS synthesizes speech"
    )

    # Input/output components
    input_img = gr.Image(
        type="pil",
        sources=["upload", "webcam"],
        label="μž…λ ₯ 이미지"
    )
    caption_out = gr.Textbox(label="Korean caption")
    audio_out = gr.Audio(label="Audio playback", type="numpy")
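    # With type="numpy", the Audio output accepts the (sampling_rate, waveform)
    # tuple returned by describe_and_speak directly.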

    # Run the pipeline automatically whenever the image changes (upload or webcam capture)
    input_img.change(
        fn=describe_and_speak,
        inputs=input_img,
        outputs=[caption_out, audio_out],
        queue=True    # safe under concurrent requests
    )

if __name__ == "__main__":
    demo.launch(debug=True)
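    # To expose a temporary public URL (e.g. when running in Colab), Gradio also
    # supports share=True: demo.launch(debug=True, share=True)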