imagetospeech

Sleeping

yongyeol committed on Jul 6

Commit

6ee0045

verified ·

1 Parent(s): 256226a

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -31,14 +31,20 @@ tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
 # Load the tokenizer published under the "facebook/mms-tts-kor" model id.
 tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
 # Place the TTS model on the GPU when CUDA is available, otherwise on the CPU.
 tts_model.to("cuda" if torch.cuda.is_available() else "cpu")
 # Synthesize speech for `text` using the `tts_tokenizer` and `tts_model`
 # objects defined above.
 # Returns a (sampling_rate, waveform) tuple, where sampling_rate is read from
 # the model config and waveform is a NumPy array.
 def synthesize_tts(text: str):
     # The two "-" lines below tokenize the raw text into PyTorch tensors and
     # move the resulting input_ids to the device the model lives on.
-    inputs = tts_tokenizer(text, return_tensors="pt")
-    input_ids = inputs["input_ids"].to(tts_model.device)
     # Inference only: run the forward pass without gradient tracking.
     with torch.no_grad():
         output = tts_model(input_ids=input_ids)
     # Drop size-1 dimensions, move the result to the CPU, and convert to NumPy.
     waveform = output.waveform.squeeze().cpu().numpy()
     return (tts_model.config.sampling_rate, waveform)
 # ─────────────── 4. 이미지 → 캡션 + 번역 + 음성 출력 ───────────────
 def describe_and_speak(img: Image.Image):
     logging.info("[DEBUG] describe_and_speak 함수 호출됨")

 # Load the tokenizer published under the "facebook/mms-tts-kor" model id.
 tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
 # Place the TTS model on the GPU when CUDA is available, otherwise on the CPU.
 tts_model.to("cuda" if torch.cuda.is_available() else "cpu")
+from uroman import uromanize
 # Synthesize speech for `text` using the `tts_tokenizer` and `tts_model`
 # objects defined above.
 # The text is passed through `uromanize` before tokenization.
 # Returns a (sampling_rate, waveform) tuple, where sampling_rate is read from
 # the model config and waveform is a NumPy array.
 def synthesize_tts(text: str):
     # NOTE(review): presumably converts the Korean input to a Latin-script
     # romanization expected by the MMS tokenizer - confirm. Also verify that
     # the installed `uroman` package actually exports `uromanize`.
+    romanized = uromanize(text)
     # Tokenize the romanized text into PyTorch tensors.
+    inputs = tts_tokenizer(romanized, return_tensors="pt")
     # Cast the ids to int64 and move them to the device the model lives on.
     # NOTE(review): the explicit .long() presumably guards against the
     # tokenizer returning a non-integer tensor (e.g. for empty output) - confirm.
+    input_ids = inputs["input_ids"].long().to(tts_model.device)
     # Inference only: run the forward pass without gradient tracking.
     with torch.no_grad():
         output = tts_model(input_ids=input_ids)
     # Drop size-1 dimensions, move the result to the CPU, and convert to NumPy.
     waveform = output.waveform.squeeze().cpu().numpy()
     return (tts_model.config.sampling_rate, waveform)
 # ─────────────── 4. 이미지 → 캡션 + 번역 + 음성 출력 ───────────────
 def describe_and_speak(img: Image.Image):
     logging.info("[DEBUG] describe_and_speak 함수 호출됨")