import gradio as gr
import logging
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    pipeline,
    AutoTokenizer,
    VitsModel
)
import torch
# ─────────────── Logging setup ───────────────
logging.basicConfig(level=logging.INFO)
# ─────────────── 1. BLIP image captioning (generates English) ───────────────
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
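# NOTE: BLIP stays on the CPU in this script; only the TTS model further down
# is moved to the GPU when one is available.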
# ─────────────── 2. English → Korean translation: NLLB pipeline ───────────────
translation_pipeline = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    src_lang="eng_Latn",
    tgt_lang="kor_Hang",
    max_length=200
)
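# "eng_Latn" and "kor_Hang" are FLORES-200 codes, the language identifiers
# NLLB expects for its source and target languages.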
# ─────────────── 3. Korean TTS: VITS, loaded directly ───────────────
tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
tts_model.to("cuda" if torch.cuda.is_available() else "cpu")
from uroman import Uroman
uroman = Uroman()
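# MMS-TTS tokenizers only cover the Latin alphabet, so non-Latin scripts such
# as Hangul must be romanized with uroman first; raw Korean text would largely
# be dropped by the tokenizer and yield empty or broken input_ids.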
def synthesize_tts(text: str):
    romanized = uroman.romanize_string(text)  # ← key fix: romanize before tokenizing
    inputs = tts_tokenizer(romanized, return_tensors="pt")
    input_ids = inputs["input_ids"].long().to(tts_model.device)
    with torch.no_grad():
        output = tts_model(input_ids=input_ids)
    waveform = output.waveform.squeeze().cpu().numpy()
    return (tts_model.config.sampling_rate, waveform)
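# Usage sketch (hypothetical input): synthesize_tts("안녕하세요") returns a
# (sampling_rate, numpy waveform) tuple, the format gr.Audio(type="numpy") accepts.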
# ─────────────── 4. Image → caption + translation + speech output ───────────────
def describe_and_speak(img: Image.Image):
    logging.info("[DEBUG] describe_and_speak called")
    # ① Generate the English caption
    pixel_values = processor(images=img, return_tensors="pt").pixel_values
    generated_ids = blip_model.generate(pixel_values, max_length=64)
    caption_en = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    logging.info(f"[DEBUG] caption_en: {caption_en}")
    # ② Translate to Korean
    try:
        result = translation_pipeline(caption_en)
        caption_ko = result[0]["translation_text"].strip()
    except Exception as e:
        logging.error(f"[ERROR] Translation error: {e}")
        caption_ko = ""
    logging.info(f"[DEBUG] caption_ko: {caption_ko}")
    if not caption_ko:
        return "Could not generate a description for the image.", None
    # ③ Synthesize speech
    try:
        sr, wav = synthesize_tts(caption_ko)
        return caption_ko, (sr, wav)
    except Exception as e:
        logging.error(f"[ERROR] TTS error: {e}")
        return caption_ko, None
# ─────────────── 5. Gradio interface ───────────────
demo = gr.Interface(
    fn=describe_and_speak,
    inputs=gr.Image(type="pil", sources=["upload", "webcam"], label="Input image"),
    outputs=[
        gr.Textbox(label="Korean caption"),
        gr.Audio(label="Audio playback", type="numpy")
    ],
    title="Image → Korean caption & speech",
    description="BLIP generates an English caption → NLLB translates it into Korean → VITS synthesizes speech"
)
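# The function's two return values map positionally onto the output components:
# the Korean caption fills the Textbox, the (sampling_rate, waveform) tuple the Audio.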
if __name__ == "__main__":
    demo.launch(debug=True)
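# To run locally (assumed dependency set; the Space's requirements.txt is not shown here):
#   pip install gradio torch transformers pillow uroman
#   python app.py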