# imagetospeech / app.py
import gradio as gr
import logging
from PIL import Image
import torch
from uroman import Uroman
from transformers import (
BlipProcessor,
BlipForConditionalGeneration,
pipeline,
AutoTokenizer,
VitsModel,
)
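
# Pipeline overview (as described in the UI below): BLIP produces an English
# caption for the uploaded image, NLLB-200 translates it to Korean, and
# MMS-VITS synthesizes speech in the selected language.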
logging.basicConfig(level=logging.INFO)

# Select the compute device once and reuse it for every model below.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ───────── 1. Load models ─────────
processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-large"
)
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
).to(device)
translation_pipeline = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    src_lang="eng_Latn",
    tgt_lang="kor_Hang",
    max_length=200,
    device=0 if torch.cuda.is_available() else -1,
)
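# NLLB-200 expects FLORES-200 language codes: "eng_Latn" is English (Latin
# script) and "kor_Hang" is Korean (Hangul).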

# --- TTS (ko / en) ---
tts_ko = VitsModel.from_pretrained("facebook/mms-tts-kor").to(device)
tok_ko = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
tts_en = VitsModel.from_pretrained("facebook/mms-tts-eng").to(device)
tok_en = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
uroman = Uroman()
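# The MMS-TTS checkpoints expect romanized input for non-Latin scripts, so
# Korean text is romanized with uroman before tokenization (see tts() below).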

# ───────── 2. Shared helpers ─────────
def tts(model, tokenizer, text: str):
    """Synthesize `text` with a VITS model and return (sampling_rate, waveform)."""
    roman = uroman.romanize_string(text)
    ids = tokenizer(roman, return_tensors="pt").input_ids.long().to(model.device)
    with torch.no_grad():
        wav = model(input_ids=ids).waveform.squeeze().cpu().numpy()
    return model.config.sampling_rate, wav
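
# Hypothetical usage sketch (not part of the app flow):
#   sr, wav = tts(tts_en, tok_en, "a dog running on the beach")
#   # -> sr is an int sample rate, wav a 1-D float numpy array playable by gr.Audio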

def generate(img: Image.Image, lang: str):
    """
    lang == "ko" → Korean caption + speech
    lang == "en" → English caption + speech
    """
    if img is None:
        raise gr.Error("Please upload an image first 📷")
# β‘  μ˜μ–΄ μΊ‘μ…˜
pix = processor(images=img, return_tensors="pt").pixel_values.to(blip_model.device)
cap_en = processor.batch_decode(
blip_model.generate(pix, max_length=64), skip_special_tokens=True
)[0].strip()
if lang == "en":
sr, wav = tts(tts_en, tok_en, cap_en)
return cap_en, (sr, wav)
    # ② Translate the caption to Korean
    try:
        cap_ko = translation_pipeline(cap_en)[0]["translation_text"].strip()
    except Exception as e:
        logging.error(f"[ERROR] Translation failed: {e}")
        cap_ko = ""
    if not cap_ko:
        return "A translation error occurred.", None
    sr, wav = tts(tts_ko, tok_ko, cap_ko)
    return cap_ko, (sr, wav)
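
# generate() always returns (caption_text, audio); when translation fails the
# audio slot is None and only the error message is shown in the caption box.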

# ───────── 3. Gradio UI ─────────
with gr.Blocks(title="Image → Caption & TTS", css="footer{display:none;}") as demo:
    gr.Markdown(
        "## Image → Korean / English caption & speech\n"
        "BLIP (caption) → NLLB (translate) → VITS (TTS)"
    )
    img_state = gr.State()  # stores the most recently uploaded image
    input_img = gr.Image(type="pil", label="📷 Upload image")
    caption_box = gr.Textbox(label="📑 Caption result")
    audio_play = gr.Audio(label="🔊 Audio playback", type="numpy")
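    # gr.Audio with type="numpy" plays a (sample_rate, waveform) tuple,
    # which is exactly what tts() returns.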
    with gr.Row():
        ko_btn = gr.Button("Generate Korean")
        en_btn = gr.Button("English")
# 이미지 μ—…λ‘œλ“œ μ‹œ state μ—…λ°μ΄νŠΈ
def store_img(img):
return img
input_img.change(store_img, inputs=input_img, outputs=img_state, queue=False)
# λ²„νŠΌ ↔ 생성 ν•¨μˆ˜ μ—°κ²°
ko_btn.click(fn=lambda img: generate(img, "ko"), inputs=img_state, outputs=[caption_box, audio_play])
en_btn.click(fn=lambda img: generate(img, "en"), inputs=img_state, outputs=[caption_box, audio_play])
if __name__ == "__main__":
demo.launch()