Update app.py
Browse files
app.py
CHANGED
@@ -4,9 +4,7 @@ import gradio as gr
|
|
4 |
from transformers import pipeline
|
5 |
from huggingface_hub import InferenceClient
|
6 |
import os
|
7 |
-
import json
|
8 |
from datetime import datetime
|
9 |
-
import time
|
10 |
|
11 |
MODEL_NAME = "openai/whisper-large-v3-turbo"
|
12 |
BATCH_SIZE = 8
|
@@ -14,10 +12,6 @@ FILE_LIMIT_MB = 1000
|
|
14 |
|
15 |
device = 0 if torch.cuda.is_available() else "cpu"
|
16 |
|
17 |
-
# νμΌ μ μ₯ κ²½λ‘ μ€μ
|
18 |
-
HISTORY_DIR = "transcription_history"
|
19 |
-
os.makedirs(HISTORY_DIR, exist_ok=True)
|
20 |
-
|
21 |
# Whisper νμ΄νλΌμΈ μ΄κΈ°ν
|
22 |
pipe = pipeline(
|
23 |
task="automatic-speech-recognition",
|
@@ -32,151 +26,191 @@ hf_client = InferenceClient(
|
|
32 |
token=os.getenv("HF_TOKEN")
|
33 |
)
|
34 |
|
35 |
-
def save_transcription(transcribed_text, summary_text):
|
36 |
-
"""λ³ν κ²°κ³Όλ₯Ό JSON νμΌλ‘ μ μ₯"""
|
37 |
-
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
38 |
-
filename = f"{HISTORY_DIR}/transcription_{timestamp}.json"
|
39 |
-
|
40 |
-
data = {
|
41 |
-
"timestamp": timestamp,
|
42 |
-
"transcribed_text": transcribed_text,
|
43 |
-
"summary": summary_text
|
44 |
-
}
|
45 |
-
|
46 |
-
with open(filename, "w", encoding="utf-8") as f:
|
47 |
-
json.dump(data, f, ensure_ascii=False, indent=2)
|
48 |
-
|
49 |
-
return filename
|
50 |
-
|
51 |
-
def process_long_audio(audio_input, chunk_duration=30):
|
52 |
-
"""κΈ΄ μ€λμ€ νμΌμ μ²ν¬λ‘ λλμ΄ μ²λ¦¬"""
|
53 |
-
# μ€λμ€ μ²λ¦¬ λ‘μ§ κ΅¬ν
|
54 |
-
pass
|
55 |
-
|
56 |
-
def detect_language(text):
|
57 |
-
"""ν
μ€νΈμ μΈμ΄ κ°μ§"""
|
58 |
-
# μΈμ΄ κ°μ§ λ‘μ§ κ΅¬ν
|
59 |
-
pass
|
60 |
-
|
61 |
def get_word_count(text):
|
62 |
"""ν
μ€νΈμ λ¨μ΄ μ κ³μ°"""
|
|
|
|
|
63 |
return len(text.split())
|
64 |
|
65 |
-
def
|
66 |
-
"""
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
@spaces.GPU
|
70 |
-
def transcribe_summarize(audio_input, task
|
71 |
if audio_input is None:
|
72 |
raise gr.Error("μ€λμ€ νμΌμ΄ μ μΆλμ§ μμμ΅λλ€!")
|
73 |
|
74 |
-
start_time = time.time()
|
75 |
-
|
76 |
-
# μμ±μ ν
μ€νΈλ‘ λ³ν
|
77 |
-
result = pipe(
|
78 |
-
audio_input,
|
79 |
-
batch_size=BATCH_SIZE,
|
80 |
-
generate_kwargs={"task": task},
|
81 |
-
return_timestamps=True
|
82 |
-
)
|
83 |
-
transcribed_text = result["text"]
|
84 |
-
|
85 |
-
# λΆμ μ 보 μμ§
|
86 |
-
stats = {
|
87 |
-
"word_count": get_word_count(transcribed_text),
|
88 |
-
"processing_time": f"{time.time() - start_time:.2f}μ΄",
|
89 |
-
"audio_duration": get_speaking_time(result.get("duration", 0)),
|
90 |
-
"language": detect_language(transcribed_text)
|
91 |
-
}
|
92 |
-
|
93 |
-
# ν
μ€νΈ μμ½
|
94 |
try:
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
prompt=prompt,
|
102 |
-
max_new_tokens=150,
|
103 |
-
temperature=0.3,
|
104 |
-
top_p=0.9,
|
105 |
-
repetition_penalty=1.2,
|
106 |
-
stop_sequences=["\n", "ν
μ€νΈ:", "μμ½:"]
|
107 |
)
|
|
|
108 |
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
-
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
-
|
118 |
-
|
|
|
119 |
|
120 |
-
except Exception as e:
|
121 |
-
print(f"μμ½ μμ± μ€ μ€λ₯ λ°μ: {str(e)}")
|
122 |
-
summary_text = "μμ½μ μμ±ν μ μμ΅λλ€. μ μ ν λ€μ μλν΄μ£ΌμΈμ."
|
123 |
-
|
124 |
-
# κ²°κ³Ό μ μ₯
|
125 |
-
if save_result:
|
126 |
-
saved_file = save_transcription(transcribed_text, summary_text)
|
127 |
-
print(f"κ²°κ³Όκ° μ μ₯λμμ΅λλ€: {saved_file}")
|
128 |
-
|
129 |
-
# λ²μ κΈ°λ₯ (μ΅μ
)
|
130 |
-
translated_text = ""
|
131 |
-
if enable_translation and task != "translate":
|
132 |
-
try:
|
133 |
-
# λ²μ λ‘μ§ κ΅¬ν
|
134 |
-
pass
|
135 |
except Exception as e:
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
gr.update(value=f"""
|
142 |
π λΆμ μ 보:
|
143 |
-
- λ¨μ΄ μ: {
|
144 |
-
-
|
145 |
-
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
|
|
|
|
|
|
150 |
|
151 |
# CSS μ€νμΌ
|
152 |
css = """
|
153 |
footer { visibility: hidden; }
|
154 |
-
.gradio-container {
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
"""
|
157 |
|
158 |
# νμΌ μ
λ‘λ μΈν°νμ΄μ€
|
159 |
file_transcribe = gr.Interface(
|
160 |
fn=transcribe_summarize,
|
161 |
inputs=[
|
162 |
-
gr.Audio(
|
|
|
|
|
|
|
|
|
163 |
gr.Radio(
|
164 |
choices=["transcribe", "translate"],
|
165 |
-
label="μμ
",
|
166 |
value="transcribe"
|
167 |
-
)
|
168 |
-
gr.Checkbox(label="κ²°κ³Ό μ μ₯νκΈ°", value=False),
|
169 |
-
gr.Checkbox(label="λ²μ νμ±ν", value=False)
|
170 |
],
|
171 |
outputs=[
|
172 |
-
gr.Textbox(
|
173 |
-
|
174 |
-
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
],
|
177 |
-
title="λ°μμ°κΈ° AI
|
178 |
-
description="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
flagging_mode="never"
|
180 |
)
|
181 |
|
182 |
-
# λ§μ΄ν¬ λ
Ήμ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
from transformers import pipeline
|
5 |
from huggingface_hub import InferenceClient
|
6 |
import os
|
|
|
7 |
from datetime import datetime
|
|
|
8 |
|
9 |
MODEL_NAME = "openai/whisper-large-v3-turbo"
|
10 |
BATCH_SIZE = 8
|
|
|
12 |
|
13 |
device = 0 if torch.cuda.is_available() else "cpu"
|
14 |
|
|
|
|
|
|
|
|
|
15 |
# Initialize the Whisper pipeline
|
16 |
pipe = pipeline(
|
17 |
task="automatic-speech-recognition",
|
|
|
26 |
token=os.getenv("HF_TOKEN")
|
27 |
)
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
def get_word_count(text):
    """Count the whitespace-separated words in *text*.

    Args:
        text: The string to measure; may be empty or ``None``.

    Returns:
        The number of whitespace-delimited tokens, or 0 when *text* is falsy.
    """
    if not text:
        return 0
    return len(text.split())
|
34 |
|
35 |
+
def format_duration(seconds):
    """Format a duration given in seconds as ``mm:ss``.

    Args:
        seconds: A non-negative number of seconds (int or float).

    Returns:
        The zero-padded ``mm:ss`` string. Minutes are not wrapped at 60, so
        one hour is rendered as ``60:00``. Anything that cannot be interpreted
        as a non-negative number (``None``, text, NaN, negatives) yields
        ``"00:00"``.
    """
    try:
        # A negative length is meaningless for audio; show it as zero instead
        # of the confusing "-1:55" that floor division would produce.
        if seconds < 0:
            return "00:00"
        whole_minutes, remainder = divmod(seconds, 60)
        return f"{int(whole_minutes):02d}:{int(remainder):02d}"
    except (TypeError, ValueError, OverflowError):
        # Only conversion problems are treated as "unknown duration";
        # unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
        return "00:00"
|
43 |
|
44 |
def _transcript_duration(result):
    """Return the audio length in seconds implied by an ASR pipeline result.

    The pipeline result carries no total-duration field; with
    ``return_timestamps=True`` it exposes per-chunk ``(start, end)``
    timestamps, so the largest known end time is used as the length.
    This is the end of the last transcribed segment, which may be slightly
    shorter than the actual file.
    """
    explicit = result.get("duration")
    if explicit:
        return explicit
    end_times = [
        chunk["timestamp"][1]
        for chunk in result.get("chunks") or []
        # The final chunk's end time can be None when the model emits no
        # closing timestamp token.
        if chunk.get("timestamp") and chunk["timestamp"][1] is not None
    ]
    return max(end_times, default=0)


@spaces.GPU
def transcribe_summarize(audio_input, task):
    """Transcribe (or translate) an audio file and summarize the transcript.

    Args:
        audio_input: Path to the audio file supplied by the Gradio component.
        task: Whisper generation task, ``"transcribe"`` or ``"translate"``.

    Returns:
        A three-item list ``[transcribed_text, summary_text, stats]`` matching
        the three output textboxes. If transcription itself fails, the error
        message is returned in the summary slot and the other two are empty.

    Raises:
        gr.Error: If no audio was provided.
    """
    # NOTE(review): the Korean user-facing strings below were reconstructed
    # from a mis-decoded (UTF-8 read as ISO-8859-7) copy of this file.
    # Please confirm them against the repository, in particular the stats
    # emoji and the "생성 시간" label, which could not be recovered exactly.
    if audio_input is None:
        raise gr.Error("오디오 파일이 제출되지 않았습니다!")

    try:
        # Speech -> text
        result = pipe(
            audio_input,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )
        transcribed_text = result["text"]

        # Basic statistics
        word_count = get_word_count(transcribed_text)
        duration = format_duration(_transcript_duration(result))

        # Summarization is best-effort: a failure here must not discard the
        # transcript, so it has its own handler.
        summary_marker = "요약:"
        try:
            prompt = (
                "다음 텍스트를 한국어로 간단히 요약해주세요:\n\n"
                f"텍스트: {transcribed_text}\n"
                f"{summary_marker}"
            )
            response = hf_client.text_generation(
                prompt=prompt,
                max_new_tokens=150,
                temperature=0.3,
                top_p=0.9,
                repetition_penalty=1.2,
            )
            summary_text = str(response)
            if summary_marker in summary_text:
                # Keep everything after the first marker; splitting without a
                # limit would drop text following any later marker.
                summary_text = summary_text.split(summary_marker, 1)[1].strip()
        except Exception as e:
            print(f"요약 생성 중 오류: {str(e)}")
            summary_text = "요약을 생성할 수 없습니다."

        # Human-readable statistics block
        created_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        stats = (
            "\n"
            "📊 분석 정보:\n"
            f"- 단어 수: {word_count}개\n"
            f"- 음성 길이: {duration}\n"
            f"- 생성 시간: {created_at}\n"
        )

        return [transcribed_text, summary_text, stats]

    except Exception as e:
        error_msg = f"처리 중 오류가 발생했습니다: {str(e)}"
        return ["", error_msg, ""]
|
100 |
|
101 |
# CSS styles
|
102 |
# Custom stylesheet: hides the Gradio footer, centers and pads the main
# container, and styles the `.output-stats` class.
css = """
footer { visibility: hidden; }
.gradio-container {
    max-width: 1000px;
    margin: auto;
    padding: 20px;
}
.output-stats {
    background-color: #f5f5f5;
    padding: 10px;
    border-radius: 5px;
    font-family: monospace;
}
"""
|
116 |
|
117 |
# File upload interface
|
118 |
# Tab 1: transcribe/summarize an uploaded audio file.
# NOTE(review): the Korean UI strings were reconstructed from a mis-decoded
# (UTF-8 read as ISO-8859-7) copy of this file. The task label is ambiguous
# between "작업 유형" and "작업 선택" - confirm against the repository.
file_transcribe = gr.Interface(
    fn=transcribe_summarize,
    inputs=[
        gr.Audio(
            sources="upload",
            type="filepath",
            label="오디오 파일",
        ),
        gr.Radio(
            choices=["transcribe", "translate"],
            label="작업 유형",
            value="transcribe",
        ),
    ],
    outputs=[
        gr.Textbox(
            label="변환된 텍스트",
            lines=5,
            placeholder="음성이 텍스트로 변환되어 여기에 표시됩니다...",
        ),
        gr.Textbox(
            label="요약",
            lines=3,
            placeholder="텍스트 요약이 여기에 표시됩니다...",
        ),
        gr.Textbox(
            label="분석 정보",
            lines=4,
            placeholder="분석 정보가 여기에 표시됩니다...",
        ),
    ],
    title="🎤 받아쓰기 AI",
    description="""
음성 파일을 업로드하거나 직접 녹음하여 텍스트로 변환하고 요약할 수 있습니다.

사용 방법:
1. 오디오 파일을 업로드하거나 마이크로 녹음하세요
2. 작업 유형을 선택하세요 (변환 또는 번역)
3. 변환 버튼을 클릭하세요
""",
    article="developed by Claude",
    examples=[],
    cache_examples=False,
    flagging_mode="never",
)
|
163 |
|
164 |
+
# Microphone recording interface
|
165 |
+
# Tab 2: transcribe/summarize audio recorded from the microphone.
# NOTE(review): the Korean UI strings were reconstructed from a mis-decoded
# (UTF-8 read as ISO-8859-7) copy of this file. The task label is ambiguous
# between "작업 유형" and "작업 선택" - confirm against the repository.
mic_transcribe = gr.Interface(
    fn=transcribe_summarize,
    inputs=[
        gr.Audio(
            sources="microphone",
            type="filepath",
            label="마이크 녹음",
        ),
        gr.Radio(
            choices=["transcribe", "translate"],
            label="작업 유형",
            value="transcribe",
        ),
    ],
    outputs=[
        gr.Textbox(
            label="변환된 텍스트",
            lines=5,
            placeholder="음성이 텍스트로 변환되어 여기에 표시됩니다...",
        ),
        gr.Textbox(
            label="요약",
            lines=3,
            placeholder="텍스트 요약이 여기에 표시됩니다...",
        ),
        gr.Textbox(
            label="분석 정보",
            lines=4,
            placeholder="분석 정보가 여기에 표시됩니다...",
        ),
    ],
    title="🎤 받아쓰기 AI",
    description="마이크로 음성을 녹음하여 텍스트로 변환하고 요약할 수 있습니다.",
    flagging_mode="never",
    css=css,
)
|
201 |
+
|
202 |
+
# Main application
|
203 |
+
# Top-level app: both interfaces are shown as tabs inside one Blocks container.
# NOTE(review): tab titles were reconstructed from a mis-decoded copy of this
# file - confirm against the repository.
demo = gr.Blocks(theme="gradio/soft", css=css)
with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["오디오 파일", "마이크 녹음"],
    )

# Run the application
demo.queue().launch(
    share=False,
    debug=True,
    show_error=True,
    ssr_mode=False,
)
|