yongyeol committed
Commit 7d2e0c9 · verified · 1 Parent(s): 0d621f4

Update app.py

Files changed (1)
  1. app.py +39 -24
app.py CHANGED
@@ -9,6 +9,7 @@ from transformers import (
     VitsModel
 )
 import torch
+from uroman import Uroman
 
 # ─────────────── Logging setup ───────────────
 logging.basicConfig(level=logging.INFO)
@@ -16,77 +17,91 @@ logging.basicConfig(level=logging.INFO)
 # ─────────────── 1. BLIP image captioning (generates English) ───────────────
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+blip_model.to("cuda" if torch.cuda.is_available() else "cpu")
 
-# ─────────────── 2. English → Korean translation: NLLB pipeline ───────────────
+# ─────────────── 2. English → Korean translation ───────────────
 translation_pipeline = pipeline(
     "translation",
     model="facebook/nllb-200-distilled-600M",
     src_lang="eng_Latn",
     tgt_lang="kor_Hang",
-    max_length=200
+    max_length=200,
+    device=0 if torch.cuda.is_available() else -1
 )
 
-# ─────────────── 3. Korean TTS: direct VITS loading ───────────────
+# ─────────────── 3. Korean TTS ───────────────
 tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
 tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
 tts_model.to("cuda" if torch.cuda.is_available() else "cpu")
 
-from uroman import Uroman
 uroman = Uroman()
 
 def synthesize_tts(text: str):
-    romanized = uroman.romanize_string(text)  # ← key fix
+    """Convert a Hangul sentence into a VITS-TTS waveform."""
+    romanized = uroman.romanize_string(text)
     inputs = tts_tokenizer(romanized, return_tensors="pt")
     input_ids = inputs["input_ids"].long().to(tts_model.device)
     with torch.no_grad():
         output = tts_model(input_ids=input_ids)
     waveform = output.waveform.squeeze().cpu().numpy()
-    return (tts_model.config.sampling_rate, waveform)
-
+    return tts_model.config.sampling_rate, waveform
 
 # ─────────────── 4. Image → caption + translation + speech output ───────────────
 def describe_and_speak(img: Image.Image):
-    logging.info("[DEBUG] describe_and_speak 함수 호출됨")
+    logging.info("[DEBUG] describe_and_speak 호출")
 
-    # ① Generate the English caption
-    pixel_values = processor(images=img, return_tensors="pt").pixel_values
+    # ① English caption
+    pixel_values = processor(images=img, return_tensors="pt").pixel_values.to(blip_model.device)
     generated_ids = blip_model.generate(pixel_values, max_length=64)
     caption_en = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
     logging.info(f"[DEBUG] caption_en: {caption_en}")
-    print(f"[DEBUG] caption_en: {caption_en}")
 
     # ② Translation
     try:
         result = translation_pipeline(caption_en)
-        caption_ko = result[0]['translation_text'].strip()
+        caption_ko = result[0]["translation_text"].strip()
     except Exception as e:
         logging.error(f"[ERROR] 번역 오류: {e}")
         caption_ko = ""
     logging.info(f"[DEBUG] caption_ko: {caption_ko}")
-    print(f"[DEBUG] caption_ko: {caption_ko}")
 
     if not caption_ko:
         return "이미지에 대한 설명을 생성할 수 없습니다.", None
 
-    # ③ TTS synthesis
+    # ③ TTS
     try:
         sr, wav = synthesize_tts(caption_ko)
         return caption_ko, (sr, wav)
     except Exception as e:
-        logging.error(f"[ERROR] TTS 에러: {e}")
+        logging.error(f"[ERROR] TTS 오류: {e}")
         return caption_ko, None
 
 # ─────────────── 5. Gradio interface ───────────────
-demo = gr.Interface(
-    fn=describe_and_speak,
-    inputs=gr.Image(type="pil", sources=["upload", "webcam"], label="입력 이미지"),
-    outputs=[
-        gr.Textbox(label="한글 캡션"),
-        gr.Audio(label="음성 재생", type="numpy")
-    ],
+with gr.Blocks(
     title="이미지 → 한글 캡션 & 음성 변환",
-    description="BLIP으로 영어 캡션 생성 → NLLB로 한국어 번역 → VITS로 음성 생성"
-)
+    css="footer {display: none !important;}"  # hide the footer
+) as demo:
+    gr.Markdown(
+        "## 이미지 → 한글 캡션 & 음성 변환\n"
+        "BLIP으로 영어 캡션 생성 → NLLB로 한국어 번역 → VITS로 음성 생성"
+    )
+
+    # Input / output components
+    input_img = gr.Image(
+        type="pil",
+        sources=["upload", "webcam"],
+        label="입력 이미지"
+    )
+    caption_out = gr.Textbox(label="한글 캡션")
+    audio_out = gr.Audio(label="음성 재생", type="numpy")
+
+    # Re-run the pipeline whenever the image changes (upload or webcam capture)
+    input_img.change(
+        fn=describe_and_speak,
+        inputs=input_img,
+        outputs=[caption_out, audio_out],
+        queue=True  # safe under concurrent use
+    )
 
 if __name__ == "__main__":
     demo.launch(debug=True)
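For context (not part of the commit): the facebook/mms-tts-kor VITS checkpoint is meant to be fed uroman-romanized text rather than raw Hangul, which is why synthesize_tts() romanizes the caption before tokenizing. A quick standalone check of what the tokenizer actually receives, reusing the same uroman call the app makes:

# Standalone sketch; the sample sentence below is only an illustration.
from uroman import Uroman

uroman = Uroman()
sample = "이미지에 대한 설명을 생성할 수 없습니다."
# Prints a Latin-script rendering of the Hangul input, suitable for the MMS-TTS tokenizer.
print(uroman.romanize_string(sample))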
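Also for reference (not part of the commit): a minimal sketch of exercising the updated pipeline without the Gradio UI. It assumes the file is importable as a module named app and that a local test.jpg exists; both names are placeholders, and the standard-library wave module is used here only for illustration.

import wave

import numpy as np
from PIL import Image

from app import describe_and_speak  # hypothetical module name for the file in this commit

caption_ko, audio = describe_and_speak(Image.open("test.jpg"))  # placeholder image path
print(caption_ko)

if audio is not None:
    sr, wav_f32 = audio
    # VITS emits float samples in roughly [-1, 1]; convert to 16-bit PCM for a plain WAV file.
    pcm = (np.clip(wav_f32, -1.0, 1.0) * 32767).astype(np.int16)
    with wave.open("caption.wav", "wb") as f:
        f.setnchannels(1)
        f.setsampwidth(2)
        f.setframerate(sr)
        f.writeframes(pcm.tobytes())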