yongyeol committed on
Commit
fd7bb60
·
verified ·
1 Parent(s): aefb32f

Upload 2 files

Files changed (2)
  1. app.py +85 -0
  2. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,85 @@
+ import gradio as gr
+ import logging
+ from PIL import Image
+ from transformers import (
+     BlipProcessor,
+     BlipForConditionalGeneration,
+     pipeline,
+     AutoTokenizer,
+     VitsModel
+ )
+ import torch
+
+ # ─────────────── Logging setup ───────────────
+ logging.basicConfig(level=logging.INFO)
+
+ # ─────────────── 1. BLIP image captioning (generates an English caption) ───────────────
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+ blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+
+ # ─────────────── 2. English→Korean translation: NLLB pipeline ───────────────
+ translation_pipeline = pipeline(
+     "translation",
+     model="facebook/nllb-200-distilled-600M",
+     src_lang="eng_Latn",
+     tgt_lang="kor_Hang",
+     max_length=200
+ )
+
+ # ─────────────── 3. Korean TTS: VITS loaded directly ───────────────
+ tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
+ tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
+
+ tts_model.to("cuda" if torch.cuda.is_available() else "cpu")  # run TTS on GPU when available
+
+ def synthesize_tts(text: str):
+     inputs = tts_tokenizer(text, return_tensors="pt").to(tts_model.device)
+     with torch.no_grad():
+         output = tts_model(**inputs)
+     waveform = output.waveform.squeeze().cpu().numpy()  # 1-D float sample array
+     return (tts_model.config.sampling_rate, waveform)  # the format gr.Audio expects for type="numpy"
+
+ # ─────────────── 4. Image → caption + translation + speech output ───────────────
+ def describe_and_speak(img: Image.Image):
+     logging.info("[DEBUG] describe_and_speak called")
+
+     # ① Generate the English caption
+     pixel_values = processor(images=img, return_tensors="pt").pixel_values
+     generated_ids = blip_model.generate(pixel_values, max_length=64)
+     caption_en = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+     logging.info(f"[DEBUG] caption_en: {caption_en}")
+
+     # ② Translate to Korean
+     try:
+         result = translation_pipeline(caption_en)
+         caption_ko = result[0]['translation_text'].strip()
+     except Exception as e:
+         logging.error(f"[ERROR] Translation error: {e}")
+         caption_ko = ""  # empty caption triggers the fallback below
+     logging.info(f"[DEBUG] caption_ko: {caption_ko}")
+
+     if not caption_ko:
+         return "Could not generate a description for this image.", None
+
+     # ③ Synthesize speech
+     try:
+         sr, wav = synthesize_tts(caption_ko)
+         return caption_ko, (sr, wav)
+     except Exception as e:
+         logging.error(f"[ERROR] TTS error: {e}")
+         return caption_ko, None  # still show the caption even if TTS fails
+
+ # ─────────────── 5. Gradio interface ───────────────
+ demo = gr.Interface(
+     fn=describe_and_speak,
+     inputs=gr.Image(type="pil", label="Input image"),
+     outputs=[
+         gr.Textbox(label="Korean caption"),
+         gr.Audio(label="Speech output", type="numpy")
+     ],
+     title="Image → Korean Caption & Speech",
+     description="Generate an English caption with BLIP → translate to Korean with NLLB → synthesize speech with VITS"
+ )
+
+ if __name__ == "__main__":
+     demo.launch(debug=True)
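
As a quick sanity check, the whole pipeline can be exercised without the Gradio UI by calling describe_and_speak directly in the same Python session as app.py. A minimal sketch, where "sample.jpg" is a hypothetical local test image:

    from PIL import Image

    img = Image.open("sample.jpg")  # hypothetical test image
    caption_ko, audio = describe_and_speak(img)
    print(caption_ko)
    if audio is not None:
        sr, wav = audio
        print(f"{sr} Hz, {wav.shape[0]} samples")  # e.g. 16000 Hz for the MMS VITS models

Since describe_and_speak returns (caption, None) on TTS failure, checking audio against None mirrors exactly what the Gradio outputs will show.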
requirements.txt ADDED
Binary file (126 Bytes).
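
The contents of requirements.txt are not rendered on this page. Judging from the imports in app.py, a plausible reconstruction (an assumption, not the actual 126-byte file) would be:

    # assumed contents, inferred from app.py imports; the real file is not shown
    gradio
    transformers
    torch
    Pillow
    sentencepiece  # assumed: used by the NLLB tokenizer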