yongyeol committed
Commit cb2bc8c · verified · 1 Parent(s): fd7bb60

Update app.py

Files changed (1)
  1. app.py +58 -87
app.py CHANGED
@@ -1,87 +1,58 @@
- import gradio as gr
- import logging
- from PIL import Image
- from transformers import (
-     BlipProcessor,
-     BlipForConditionalGeneration,
-     pipeline,
-     AutoTokenizer,
-     VitsModel
- )
- import torch
-
- # ─────────────── Logging setup ───────────────
- logging.basicConfig(level=logging.INFO)
-
- # ─────────────── 1. BLIP image captioning (generates English) ───────────────
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
- blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
-
- # ─────────────── 2. English→Korean translation: NLLB pipeline ───────────────
- translation_pipeline = pipeline(
-     "translation",
-     model="facebook/nllb-200-distilled-600M",
-     src_lang="eng_Latn",
-     tgt_lang="kor_Hang",
-     max_length=200
- )
-
- # ─────────────── 3. Korean TTS: direct VITS loading ───────────────
- tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
- tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
-
- tts_model.to("cuda" if torch.cuda.is_available() else "cpu")
-
- def synthesize_tts(text: str):
-     inputs = tts_tokenizer(text, return_tensors="pt").to(tts_model.device)
-     with torch.no_grad():
-         output = tts_model(**inputs)
-     waveform = output.waveform.squeeze().cpu().numpy()
-     return (tts_model.config.sampling_rate, waveform)
-
- # ─────────────── 4. Image → caption + translation + speech output ───────────────
- def describe_and_speak(img: Image.Image):
-     logging.info("[DEBUG] describe_and_speak ν•¨μˆ˜ 호좜됨")
-
-     # ① Generate the English caption
-     pixel_values = processor(images=img, return_tensors="pt").pixel_values
-     generated_ids = blip_model.generate(pixel_values, max_length=64)
-     caption_en = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-     logging.info(f"[DEBUG] caption_en: {caption_en}")
-     print(f"[DEBUG] caption_en: {caption_en}")
-
-     # ② Translate
-     try:
-         result = translation_pipeline(caption_en)
-         caption_ko = result[0]['translation_text'].strip()
-     except Exception as e:
-         logging.error(f"[ERROR] λ²ˆμ—­ 였λ₯˜: {e}")
-         caption_ko = ""
-     logging.info(f"[DEBUG] caption_ko: {caption_ko}")
-     print(f"[DEBUG] caption_ko: {caption_ko}")
-
-     if not caption_ko:
-         return "이미지에 λŒ€ν•œ μ„€λͺ…을 생성할 수 μ—†μŠ΅λ‹ˆλ‹€.", None
-
-     # ③ TTS synthesis
-     try:
-         sr, wav = synthesize_tts(caption_ko)
-         return caption_ko, (sr, wav)
-     except Exception as e:
-         logging.error(f"[ERROR] TTS μ—λŸ¬: {e}")
-         return caption_ko, None
-
- # ─────────────── 5. Gradio interface ───────────────
- demo = gr.Interface(
-     fn=describe_and_speak,
-     inputs=gr.Image(type="pil", label="μž…λ ₯ 이미지"),
-     outputs=[
-         gr.Textbox(label="ν•œκΈ€ μΊ‘μ…˜"),
-         gr.Audio(label="μŒμ„± μž¬μƒ", type="numpy")
-     ],
-     title="이미지 → ν•œκΈ€ μΊ‘μ…˜ & μŒμ„± λ³€ν™˜",
-     description="BLIP으둜 μ˜μ–΄ μΊ‘μ…˜ 생성 → NLLB둜 ν•œκ΅­μ–΄ λ²ˆμ—­ → VITS둜 μŒμ„± 생성"
- )
-
- if __name__ == "__main__":
-     demo.launch(debug=True)
 
+ import gradio as gr
+ import logging
+ from PIL import Image
+ from transformers import (
+     BlipProcessor,
+     BlipForConditionalGeneration,
+     pipeline,
+     AutoTokenizer,
+     VitsModel
+ )
+ import torch
+
+ # ─────────────── Logging setup ───────────────
+ logging.basicConfig(level=logging.INFO)
+
+ # ─────────────── 1. BLIP image captioning (generates English) ───────────────
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+ blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+
+ # ─────────────── 2. English→Korean translation: NLLB pipeline ───────────────
+ translation_pipeline = pipeline(
+     "translation",
+     model="facebook/nllb-200-distilled-600M",
+     src_lang="eng_Latn",
+     tgt_lang="kor_Hang",
+     max_length=200
+ )
+
+ # ─────────────── 3. Korean TTS: direct VITS loading ───────────────
+ tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
+ tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
+ tts_model.to("cuda" if torch.cuda.is_available() else "cpu")
+
+ def synthesize_tts(text: str):
+     inputs = tts_tokenizer(text, return_tensors="pt")
+     input_ids = inputs["input_ids"].to(tts_model.device)  # ⚠ fix: use LongTensor only
+     with torch.no_grad():
+         output = tts_model(input_ids=input_ids)
+     waveform = output.waveform.squeeze().cpu().numpy()
+     return (tts_model.config.sampling_rate, waveform)
+
+ # ─────────────── 4. Image → caption + translation + speech output ───────────────
+ def describe_and_speak(img: Image.Image):
+     logging.info("[DEBUG] describe_and_speak ν•¨μˆ˜ 호좜됨")
+
+     # ① Generate the English caption
+     pixel_values = processor(images=img, return_tensors="pt").pixel_values
+     generated_ids = blip_model.generate(pixel_values, max_length=64)
+     caption_en = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+     logging.info(f"[DEBUG] caption_en: {caption_en}")
+     print(f"[DEBUG] caption_en: {caption_en}")
+
+     # ② Translate
+     try:
+         result = translation_pipeline(caption_en)
+         caption_ko = result[0]['translation_text'].strip()
+     except Exception as e:
+         logging.error(f"[ERROR] λ²ˆμ—­ 였λ₯˜: {e}")
+         caption_ko = ""
+     logging.info(f"[DEBUG] caption_ko: {caption_ko}")
+     print(f"[DEBUG] caption_ko: {caption_ko}")
+
+     if not caption_ko:
+         return "이미지에 λŒ€ν•œ μ„€λͺ…을 생성할 수 μ—†μŠ΅λ‹ˆλ‹€.", None
+
+     # ③ TTS synthesis
+     try:
+         sr, wav = synthesize_tts(caption_ko)
+         return caption_ko, (sr, wav)
+     except Exception as e:
+         logging.error(f"[ERROR] TTS μ—λŸ¬: {e}")
+         return caption_ko, None
+
+ # ─────────────── 5. Gradio interface ───────────────
+ demo = gr.Interface(
+     fn=describe_and_speak,
+     inputs=gr.Image(type="pil", sources=["upload", "camera"], label="μž…λ ₯ 이미지"),
+     outputs=[
+         gr.Textbox(label="ν•œκΈ€ μΊ‘μ…˜"),
+         gr.Audio(label="μŒμ„± μž¬μƒ", type="numpy")
+     ],
+     title="이미지 → ν•œκΈ€ μΊ‘μ…˜ & μŒμ„± λ³€ν™˜",
+     description="BLIP으둜 μ˜μ–΄ μΊ‘μ…˜ 생성 → NLLB둜 ν•œκ΅­μ–΄ λ²ˆμ—­ → VITS둜 μŒμ„± 생성"
+ )
+
+ if __name__ == "__main__":
+     demo.launch(debug=True)
+
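
The functional changes in this commit: synthesize_tts now tokenizes first and moves only the input_ids LongTensor to the model device before calling VitsModel (instead of unpacking the whole tokenizer output into the forward call), and gr.Image now lists sources=["upload", "camera"] explicitly. A minimal standalone sketch of the updated TTS path, outside Gradio, might look like the following; the soundfile dependency, file name, and test sentence are illustrative assumptions, not part of the commit:

    # smoke_test_tts.py: hypothetical standalone check of the updated TTS path
    import soundfile as sf  # assumed helper for writing the waveform to disk
    import torch
    from transformers import AutoTokenizer, VitsModel

    tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
    tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
    tts_model.to("cuda" if torch.cuda.is_available() else "cpu")

    def synthesize_tts(text: str):
        inputs = tts_tokenizer(text, return_tensors="pt")
        input_ids = inputs["input_ids"].to(tts_model.device)  # LongTensor only, as in the commit
        with torch.no_grad():
            output = tts_model(input_ids=input_ids)
        waveform = output.waveform.squeeze().cpu().numpy()
        return tts_model.config.sampling_rate, waveform

    if __name__ == "__main__":
        sr, wav = synthesize_tts("μ•ˆλ…•ν•˜μ„Έμš”.")  # "Hello." as a test sentence
        sf.write("out.wav", wav, sr)  # mono WAV at the model's sampling rate

If the sketch holds, out.wav is written at tts_model.config.sampling_rate, nominally 16 kHz for the MMS TTS checkpoints.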