# imagetospeech / app.py
import gradio as gr
import logging
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    pipeline,
    AutoTokenizer,
    VitsModel
)
import torch
from uroman import Uroman
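# Pipeline: BLIP generates an English caption for the image, NLLB-200 translates
# it into Korean, and MMS-TTS (a VITS model) synthesizes Korean speech from it.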
# ─────────────── Logging setup ───────────────
logging.basicConfig(level=logging.INFO)
# ─────────────── 1. BLIP image captioning (English) ───────────────
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model.to("cuda" if torch.cuda.is_available() else "cpu")
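# The same processor is reused in describe_and_speak(): it converts a PIL image
# into pixel_values and decodes the token IDs that blip_model.generate() returns.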
# ─────────────── 2. English → Korean translation ───────────────
translation_pipeline = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    src_lang="eng_Latn",
    tgt_lang="kor_Hang",
    max_length=200,
    device=0 if torch.cuda.is_available() else -1
)
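# NLLB-200 uses FLORES-200 language codes: "eng_Latn" = English (Latin script),
# "kor_Hang" = Korean (Hangul script).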
# ─────────────── 3. Korean TTS ───────────────
tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
tts_model.to("cuda" if torch.cuda.is_available() else "cpu")
uroman = Uroman()
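# The MMS-TTS checkpoints are trained on romanized text, so input in a non-Latin
# script such as Hangul has to be romanized with uroman before tokenization.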
def synthesize_tts(text: str):
    """Convert a Korean sentence into a VITS-TTS waveform."""
    romanized = uroman.romanize_string(text)
    inputs = tts_tokenizer(romanized, return_tensors="pt")
    input_ids = inputs["input_ids"].long().to(tts_model.device)
    with torch.no_grad():
        output = tts_model(input_ids=input_ids)
    waveform = output.waveform.squeeze().cpu().numpy()
    return tts_model.config.sampling_rate, waveform
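# The (sampling_rate, 1-D float ndarray) pair returned above is the tuple format
# that gr.Audio(type="numpy") can play back directly.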
# ─────────────── 4. Image → caption + translation + speech output ───────────────
def describe_and_speak(img: Image.Image):
    logging.info("[DEBUG] describe_and_speak called")
    # β‘  English caption
    pixel_values = processor(images=img, return_tensors="pt").pixel_values.to(blip_model.device)
    generated_ids = blip_model.generate(pixel_values, max_length=64)
    caption_en = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    logging.info(f"[DEBUG] caption_en: {caption_en}")
    # β‘‘ Translation
    try:
        result = translation_pipeline(caption_en)
        caption_ko = result[0]["translation_text"].strip()
    except Exception as e:
        logging.error(f"[ERROR] Translation error: {e}")
        caption_ko = ""
    logging.info(f"[DEBUG] caption_ko: {caption_ko}")
    if not caption_ko:
        return "이미지에 λŒ€ν•œ μ„€λͺ…을 생성할 수 μ—†μŠ΅λ‹ˆλ‹€.", None  # "Could not generate a description for the image."
    # β‘’ TTS
    try:
        sr, wav = synthesize_tts(caption_ko)
        return caption_ko, (sr, wav)
    except Exception as e:
        logging.error(f"[ERROR] TTS error: {e}")
        return caption_ko, None
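# describe_and_speak returns (Korean caption, (sample_rate, waveform)) on success
# and (caption or fallback message, None) on failure, matching
# outputs=[caption_out, audio_out]; a None audio value leaves the player empty.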
# ─────────────── 5. Gradio interface ───────────────
with gr.Blocks(
    title="이미지 β†’ ν•œκΈ€ μΊ‘μ…˜ & μŒμ„± λ³€ν™˜",  # "Image → Korean caption & speech"
    css="footer {display: none !important;}"  # hide the Gradio footer
) as demo:
    # "Generate English caption with BLIP → translate to Korean with NLLB → synthesize speech with VITS"
    gr.Markdown(
        "## 이미지 β†’ ν•œκΈ€ μΊ‘μ…˜ & μŒμ„± λ³€ν™˜\n"
        "BLIP으둜 μ˜μ–΄ μΊ‘μ…˜ 생성 β†’ NLLB둜 ν•œκ΅­μ–΄ λ²ˆμ—­ β†’ VITS둜 μŒμ„± 생성"
    )
    # Input/output components
    input_img = gr.Image(
        type="pil",
        sources=["upload", "webcam"],
        label="μž…λ ₯ 이미지"  # "Input image"
    )
    caption_out = gr.Textbox(label="ν•œκΈ€ μΊ‘μ…˜")  # "Korean caption"
    audio_out = gr.Audio(label="μŒμ„± μž¬μƒ", type="numpy")  # "Audio playback"
    # Run the pipeline automatically whenever the image changes (upload or webcam capture)
    input_img.change(
        fn=describe_and_speak,
        inputs=input_img,
        outputs=[caption_out, audio_out],
        queue=True  # queue requests so concurrent users are handled safely
    )
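# debug=True blocks on launch and surfaces error tracebacks in the console,
# which makes failures visible in the Space logs.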
if __name__ == "__main__":
    demo.launch(debug=True)