Update app.py
app.py
CHANGED
@@ -1,14 +1,15 @@
 # -*- coding: utf-8 -*-
 """
-Shared app.py for Colab & Hugging Face Spaces (
-
-- Spaces:
-- STT with Whisper (faster-whisper) + koBART summarization
+Shared app.py for Colab & Hugging Face Spaces (hardened runtime install)
+- Installs PyTorch automatically when it is missing (Colab and Spaces alike)
+- Colab: installs ffmpeg automatically / Spaces: shows a warning if ffmpeg is missing
+- STT with Whisper (faster-whisper) + koBART summarization + a polished Gradio UI
 """
 
-import os, sys, subprocess, tempfile
+import os, sys, subprocess, tempfile, shutil
 from datetime import datetime
 
+# ===== Env Detect =====
 def _in_colab() -> bool:
     try:
         import google.colab  # noqa
@@ -16,48 +17,64 @@ def _in_colab() -> bool:
     except Exception:
         return False
 
-def
+def _has_cmd(cmd: str) -> bool:
+    return shutil.which(cmd) is not None
+
+# ===== Runtime Installer =====
+def _pip_install(pkgs):
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet"] + pkgs)
+
+def _ensure_runtime():
     # ffmpeg
+    if not _has_cmd("ffmpeg"):
+        if _in_colab():
+            subprocess.run(["apt-get", "update", "-y"], check=False)
+            subprocess.run(["apt-get", "install", "-y", "ffmpeg"], check=False)
+        # Spaces should use apt.txt instead → only warn here
+
+    # python packages
+    need = []
     for mod, pkg in [
+        ("torch", "torch"),
+        ("transformers", "transformers==4.*"),
+        ("sentencepiece", "sentencepiece"),
+        ("faster_whisper", "faster-whisper==1.*"),
+        ("pydub", "pydub"),
+        ("gradio", "gradio==4.*"),
+    ]:
         try:
             __import__(mod)
         except Exception:
+            need.append(pkg)
+
+    if need:
+        _pip_install(need)
 
+_ensure_runtime()
 
-# ===== Imports =====
+# ===== Imports (after install) =====
 import gradio as gr
 from pydub import AudioSegment
 from transformers import pipeline
 from faster_whisper import WhisperModel
+import torch
 
-# ===== Models =====
+# ===== Device / Models =====
 DEFAULT_WHISPER_SIZE = os.getenv("WHISPER_MODEL_SIZE", "small")  # tiny/base/small/medium/large-v3
-device = "cuda" if os.path.exists("/proc/driver/nvidia") else "cpu"
+device = "cuda" if torch.cuda.is_available() and os.path.exists("/proc/driver/nvidia") else "cpu"
 compute_type = "float16" if device == "cuda" else "int8"
 
-# initial instance (can be changed later from the advanced settings)
 _asr = WhisperModel(DEFAULT_WHISPER_SIZE, device=device, compute_type=compute_type)
-_summarizer = pipeline("summarization", model="gogamza/kobart-summarization")
+_summarizer = pipeline("summarization", model="gogamza/kobart-summarization",
+                       device=0 if device == "cuda" else -1)
 
 # ===== Utils =====
 def convert_to_wav(src_path: str) -> str:
     if src_path.lower().endswith(".wav"):
         return src_path
+    # needs ffmpeg
+    if not _has_cmd("ffmpeg"):
+        raise RuntimeError("ffmpeg is required. (Spaces: add 'ffmpeg' to apt.txt; Colab: it is installed automatically)")
     sound = AudioSegment.from_file(src_path)
     fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
     os.close(fd)
@@ -78,6 +95,7 @@ def chunk_text(txt: str, max_chars=850):
     return parts
 
 def summarize_long(text: str) -> str:
+    # koBART is weak on long inputs, so do chunked summaries plus a meta-summary
     chunks = chunk_text(text)
     partial = []
     for c in chunks:
@@ -93,23 +111,29 @@ def save_minutes_to_file(minutes_text: str) -> str:
         f.write(minutes_text)
     return path
 
-# ===== Core
+# ===== Core =====
 def transcribe_and_summarize(audio_path, model_size, auto_detect_lang):
+    # ffmpeg check (frequently missing on Spaces)
+    if not _has_cmd("ffmpeg"):
+        return ("", "", "", None,
+                gr.update(visible=True, value="⚠️ ffmpeg is missing. On Spaces, add 'ffmpeg' to apt.txt."))
+
     if not audio_path:
-        return "⚠️ No audio was provided.", "", "", None,
+        return ("⚠️ No audio was provided.", "", "", None,
+                gr.update(visible=True, value="⚠️ Please upload or record some audio."))
 
-    #
+    # reload dynamically when the model size changes
     global _asr
+    try:
+        if model_size and model_size != DEFAULT_WHISPER_SIZE:
             _asr = WhisperModel(model_size, device=device, compute_type=compute_type)
+    except Exception as e:
+        return ("", "", "", None,
+                gr.update(visible=True, value=f"⚠️ Failed to load the Whisper model: {e}"))
 
     wav_path = None
     try:
         wav_path = convert_to_wav(audio_path)
         language = None if auto_detect_lang else "ko"
         segments, info = _asr.transcribe(
             wav_path,
@@ -119,7 +143,8 @@ def transcribe_and_summarize(audio_path, model_size, auto_detect_lang):
         )
         text = "".join(seg.text for seg in segments).strip()
         if not text:
-            return "⚠️ No text was recognized.", "", "", None,
+            return ("⚠️ No text was recognized.", "", "", None,
+                    gr.update(visible=True, value="⚠️ The speech-recognition result is empty."))
 
         summary = summarize_long(text)
 
@@ -133,10 +158,12 @@ def transcribe_and_summarize(audio_path, model_size, auto_detect_lang):
 {text}
 """
         file_path = save_minutes_to_file(minutes)
-        return text, summary, minutes, file_path,
+        return (text, summary, minutes, file_path,
+                gr.update(visible=True, value="✅ Done! Check the results."))
 
     except Exception as e:
-        return "", "", "", None,
+        return ("", "", "", None,
+                gr.update(visible=True, value=f"⚠️ Error while processing: {e}"))
     finally:
         if wav_path and wav_path != audio_path and os.path.exists(wav_path):
             try: os.remove(wav_path)
@@ -218,7 +245,6 @@ with gr.Blocks(title="Automatic Meeting-Minutes Generator (Whisper)", theme=theme, css=CU
 
     gr.HTML('<div class="footer">© Whisper + KoBART · Designed for Colab & Hugging Face Spaces</div>')
 
-    # Actions
     run_button.click(
         fn=transcribe_and_summarize,
         inputs=[audio_input, model_size, auto_detect],
@@ -237,3 +263,4 @@ else:
     demo.launch()
 
 
+
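The warning strings above all point to the same durable fix on Spaces: declare dependencies at build time instead of installing them at runtime. Assuming the same pins that _ensure_runtime installs (the package list comes straight from the diff; only the split into two files is inferred from Spaces conventions), that is a one-line apt.txt plus a matching requirements.txt in the repo root.

apt.txt (system packages, one per line):

    ffmpeg

requirements.txt (mirroring the pins in _ensure_runtime):

    torch
    transformers==4.*
    sentencepiece
    faster-whisper==1.*
    pydub
    gradio==4.*

With both files present, _ensure_runtime finds nothing to install and the Space never hits the pip path at startup.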
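The collapsed hunks hide the body of summarize_long. As a minimal sketch of the chunk-then-merge pattern the new comment describes (illustrative only: the actual body is not shown in this diff, and the length parameters are placeholders):

    def summarize_long(text: str) -> str:
        # summarize each ~850-char chunk, then summarize the merged partials
        chunks = chunk_text(text)
        partial = []
        for c in chunks:
            out = _summarizer(c, max_length=160, min_length=40, do_sample=False)
            partial.append(out[0]["summary_text"])
        merged = " ".join(partial)
        if len(chunks) == 1:
            return merged
        final = _summarizer(merged[:850], max_length=160, min_length=40, do_sample=False)
        return final[0]["summary_text"]

The [{"summary_text": ...}] output shape is the standard return format of a transformers summarization pipeline.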
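Note that transcribe_and_summarize now always returns a five-tuple whose last element is a gr.update(visible=True, value=...) aimed at a status banner. The outputs list of run_button.click is collapsed in this diff, so the component names below are stand-ins, but the wiring has to bind five components in the same order as the tuple:

    status_msg = gr.Markdown(visible=False)  # revealed by gr.update on each run
    run_button.click(
        fn=transcribe_and_summarize,
        inputs=[audio_input, model_size, auto_detect],
        outputs=[transcript_box, summary_box, minutes_box, download_file, status_msg],
    )

Here transcript_box, summary_box, minutes_box, and download_file stand for whatever Textbox/File components the Blocks layout actually defines; only audio_input, model_size, and auto_detect appear verbatim in the diff.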