Create app.py
app.py
ADDED
@@ -0,0 +1,182 @@
import spaces
import torch
import gradio as gr
from transformers import pipeline
from huggingface_hub import InferenceClient
import os
import json
from datetime import datetime
import time

MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000

device = 0 if torch.cuda.is_available() else "cpu"

# Directory for saved transcription results
HISTORY_DIR = "transcription_history"
os.makedirs(HISTORY_DIR, exist_ok=True)

# Initialize the Whisper pipeline
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Set up the Hugging Face inference client
hf_client = InferenceClient(
    "CohereForAI/c4ai-command-r-plus-08-2024",
    token=os.getenv("HF_TOKEN")
)

def save_transcription(transcribed_text, summary_text):
    """Save the transcription result as a JSON file"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{HISTORY_DIR}/transcription_{timestamp}.json"

    data = {
        "timestamp": timestamp,
        "transcribed_text": transcribed_text,
        "summary": summary_text
    }

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    return filename

def process_long_audio(audio_input, chunk_duration=30):
    """Process long audio files in chunks"""
    # Audio chunking logic to be implemented
    pass
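
# A minimal sketch of how the chunking above could be done by hand, assuming the
# audio has already been decoded into a sample array with a known sampling rate.
# The helper name and its (samples, sampling_rate) signature are illustrative only;
# the Whisper pipeline defined above already chunks internally via chunk_length_s=30.
def _split_audio_sketch(samples, sampling_rate, chunk_duration=30):
    chunk_size = int(chunk_duration * sampling_rate)
    # Yield consecutive slices of at most chunk_duration seconds each.
    for start in range(0, len(samples), chunk_size):
        yield samples[start:start + chunk_size]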

def detect_language(text):
    """Detect the language of the text"""
    # Language detection logic to be implemented
    pass
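
# A minimal sketch of how detect_language could be filled in without extra
# dependencies: treat the text as Korean when Hangul syllables make up a
# noticeable share of its letters, otherwise fall back to English. The helper
# name and the two-way ko/en heuristic are illustrative assumptions only.
def _detect_language_sketch(text):
    hangul = sum(1 for ch in text if "\uac00" <= ch <= "\ud7a3")
    letters = sum(1 for ch in text if ch.isalpha())
    return "ko" if letters and hangul / letters > 0.3 else "en"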

def get_word_count(text):
    """Count the number of words in the text"""
    return len(text.split())

def get_speaking_time(audio_duration):
    """Convert the audio duration to HH:MM:SS format"""
    return time.strftime("%H:%M:%S", time.gmtime(audio_duration))

@spaces.GPU
def transcribe_summarize(audio_input, task, save_result=False, enable_translation=False):
    if audio_input is None:
        raise gr.Error("No audio file was submitted!")

    start_time = time.time()

    # Convert speech to text
    result = pipe(
        audio_input,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True
    )
    transcribed_text = result["text"]

    # Collect analysis information
    stats = {
        "word_count": get_word_count(transcribed_text),
        "processing_time": f"{time.time() - start_time:.2f}s",
        "audio_duration": get_speaking_time(result.get("duration", 0)),
        "language": detect_language(transcribed_text)
    }

    # Summarize the text
    try:
        prompt = f"""Please give a brief summary of the text below:
Text: {transcribed_text}
Summary:"""

        response = hf_client.text_generation(
            model="CohereForAI/c4ai-command-r-plus-08-2024",
            prompt=prompt,
            max_new_tokens=150,
            temperature=0.3,
            top_p=0.9,
            repetition_penalty=1.2,
            stop_sequences=["\n", "Text:", "Summary:"]
        )

        if isinstance(response, str):
            summary_text = response
        else:
            summary_text = response.generated_text if hasattr(response, 'generated_text') else str(response)

        if "Summary:" in summary_text:
            summary_text = summary_text.split("Summary:")[1].strip()

        if not summary_text:
            summary_text = "Could not generate a summary."

    except Exception as e:
        print(f"Error while generating summary: {str(e)}")
        summary_text = "Could not generate a summary. Please try again in a moment."

    # Save the results
    if save_result:
        saved_file = save_transcription(transcribed_text, summary_text)
        print(f"Results saved to: {saved_file}")

    # Translation feature (optional)
    translated_text = ""
    if enable_translation and task != "translate":
        try:
            # Translation logic to be implemented
            pass
        except Exception as e:
            translated_text = "An error occurred during translation."

    return [
        transcribed_text,
        summary_text,
        gr.update(value=f"""
Analysis information:
- Word count: {stats['word_count']}
- Processing time: {stats['processing_time']}
- Audio duration: {stats['audio_duration']}
- Detected language: {stats['language']}
"""),
        translated_text if enable_translation else None
    ]

# CSS styles
css = """
footer { visibility: hidden; }
.gradio-container { max-width: 1200px; margin: auto; }
.audio-stats { background-color: #f0f0f0; padding: 10px; border-radius: 5px; }
"""

# File upload interface
file_transcribe = gr.Interface(
    fn=transcribe_summarize,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
        gr.Radio(
            choices=["transcribe", "translate"],
            label="Task",
            value="transcribe"
        ),
        gr.Checkbox(label="Save results", value=False),
        gr.Checkbox(label="Enable translation", value=False)
    ],
    outputs=[
        gr.Textbox(label="Transcribed text", lines=5),
        gr.Textbox(label="Summary", lines=3),
        gr.Textbox(label="Analysis information", lines=4),
        gr.Textbox(label="Translation result", lines=5, visible=False)
    ],
    title="Dictation AI: Transcribe and Summarize Speech",
    description="Upload an audio file or record one directly to transcribe it to text and get a summary.",
    flagging_mode="never"
)

# The microphone recording interface and main application code remain the same...