yongyeol committed on
Commit
fd7bb60
·
verified ·
1 Parent(s): aefb32f

Upload 2 files

Files changed (2)
  1. app.py +85 -0
  2. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,85 @@
+ import gradio as gr
+ import logging
+ from PIL import Image
+ from transformers import (
+     BlipProcessor,
+     BlipForConditionalGeneration,
+     pipeline,
+     AutoTokenizer,
+     VitsModel
+ )
+ import torch
+
+ # ─────────────── Logging setup ───────────────
+ logging.basicConfig(level=logging.INFO)
+
+ # ─────────────── 1. BLIP image captioning (generates an English caption) ───────────────
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+ blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+
+ # ─────────────── 2. English→Korean translation: NLLB pipeline ───────────────
+ translation_pipeline = pipeline(
+     "translation",
+     model="facebook/nllb-200-distilled-600M",
+     src_lang="eng_Latn",
+     tgt_lang="kor_Hang",
+     max_length=200
+ )
+
+ # ─────────────── 3. Korean TTS: VITS loaded directly ───────────────
+ tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
+ tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
+
+ tts_model.to("cuda" if torch.cuda.is_available() else "cpu")  # run TTS on GPU when available
+
+ def synthesize_tts(text: str):
+     inputs = tts_tokenizer(text, return_tensors="pt").to(tts_model.device)
+     with torch.no_grad():
+         output = tts_model(**inputs)
+     waveform = output.waveform.squeeze().cpu().numpy()  # 1-D float sample array
+     return (tts_model.config.sampling_rate, waveform)  # the format gr.Audio expects for type="numpy"
+
+ # ─────────────── 4. Image → caption + translation + speech output ───────────────
+ def describe_and_speak(img: Image.Image):
+     logging.info("[DEBUG] describe_and_speak called")
+
+     # ① Generate the English caption
+     pixel_values = processor(images=img, return_tensors="pt").pixel_values
+     generated_ids = blip_model.generate(pixel_values, max_length=64)
+     caption_en = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+     logging.info(f"[DEBUG] caption_en: {caption_en}")
+
+     # ② Translate to Korean
+     try:
+         result = translation_pipeline(caption_en)
+         caption_ko = result[0]['translation_text'].strip()
+     except Exception as e:
+         logging.error(f"[ERROR] Translation error: {e}")
+         caption_ko = ""  # empty caption triggers the fallback below
+     logging.info(f"[DEBUG] caption_ko: {caption_ko}")
+
+     if not caption_ko:
+         return "Could not generate a description for this image.", None
+
+     # ③ Synthesize speech
+     try:
+         sr, wav = synthesize_tts(caption_ko)
+         return caption_ko, (sr, wav)
+     except Exception as e:
+         logging.error(f"[ERROR] TTS error: {e}")
+         return caption_ko, None  # still show the caption even if TTS fails
+
+ # ─────────────── 5. Gradio interface ───────────────
+ demo = gr.Interface(
+     fn=describe_and_speak,
+     inputs=gr.Image(type="pil", label="Input image"),
+     outputs=[
+         gr.Textbox(label="Korean caption"),
+         gr.Audio(label="Speech output", type="numpy")
+     ],
+     title="Image → Korean Caption & Speech",
+     description="Generate an English caption with BLIP → translate to Korean with NLLB → synthesize speech with VITS"
+ )
+
+ if __name__ == "__main__":
+     demo.launch(debug=True)
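
As a quick sanity check, the whole pipeline can be exercised without the Gradio UI by calling describe_and_speak directly in the same Python session as app.py. A minimal sketch, where "sample.jpg" is a hypothetical local test image:

    from PIL import Image

    img = Image.open("sample.jpg")  # hypothetical test image
    caption_ko, audio = describe_and_speak(img)
    print(caption_ko)
    if audio is not None:
        sr, wav = audio
        print(f"{sr} Hz, {wav.shape[0]} samples")  # e.g. 16000 Hz for the MMS VITS models

Since describe_and_speak returns (caption, None) on TTS failure, checking audio against None mirrors exactly what the Gradio outputs will show.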
requirements.txt ADDED
Binary file (126 Bytes).
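
The contents of requirements.txt are not rendered on this page. Judging from the imports in app.py, a plausible reconstruction (an assumption, not the actual 126-byte file) would be:

    # assumed contents, inferred from app.py imports; the real file is not shown
    gradio
    transformers
    torch
    Pillow
    sentencepiece  # assumed: used by the NLLB tokenizer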