openfree committed on
Commit
d883298
·
verified ·
1 Parent(s): 629129d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -118
app.py CHANGED
@@ -4,9 +4,7 @@ import gradio as gr
4
  from transformers import pipeline
5
  from huggingface_hub import InferenceClient
6
  import os
7
- import json
8
  from datetime import datetime
9
- import time
10
 
11
  MODEL_NAME = "openai/whisper-large-v3-turbo"
12
  BATCH_SIZE = 8
@@ -14,10 +12,6 @@ FILE_LIMIT_MB = 1000
14
 
15
  device = 0 if torch.cuda.is_available() else "cpu"
16
 
17
- # 파일 저장 경로 설정
18
- HISTORY_DIR = "transcription_history"
19
- os.makedirs(HISTORY_DIR, exist_ok=True)
20
-
21
  # Whisper 파이프라인 초기화
22
  pipe = pipeline(
23
  task="automatic-speech-recognition",
@@ -32,151 +26,191 @@ hf_client = InferenceClient(
32
  token=os.getenv("HF_TOKEN")
33
  )
34
 
35
- def save_transcription(transcribed_text, summary_text):
36
- """λ³€ν™˜ κ²°κ³Όλ₯Ό JSON 파일둜 μ €μž₯"""
37
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
38
- filename = f"{HISTORY_DIR}/transcription_{timestamp}.json"
39
-
40
- data = {
41
- "timestamp": timestamp,
42
- "transcribed_text": transcribed_text,
43
- "summary": summary_text
44
- }
45
-
46
- with open(filename, "w", encoding="utf-8") as f:
47
- json.dump(data, f, ensure_ascii=False, indent=2)
48
-
49
- return filename
50
-
51
- def process_long_audio(audio_input, chunk_duration=30):
52
- """κΈ΄ μ˜€λ””μ˜€ νŒŒμΌμ„ 청크둜 λ‚˜λˆ„μ–΄ 처리"""
53
- # 오디오 처리 로직 구현
54
- pass
55
-
56
- def detect_language(text):
57
- """ν…μŠ€νŠΈμ˜ μ–Έμ–΄ 감지"""
58
- # 언어 감지 로직 구현
59
- pass
60
-
61
  def get_word_count(text):
62
  """ν…μŠ€νŠΈμ˜ 단어 수 계산"""
 
 
63
  return len(text.split())
64
 
65
- def get_speaking_time(audio_duration):
66
- """μŒμ„± 길이λ₯Ό μ‹œ:λΆ„:초 ν˜•μ‹μœΌλ‘œ λ³€ν™˜"""
67
- return time.strftime("%H:%M:%S", time.gmtime(audio_duration))
 
 
 
 
 
68
 
69
  @spaces.GPU
70
- def transcribe_summarize(audio_input, task, save_result=False, enable_translation=False):
71
  if audio_input is None:
72
  raise gr.Error("μ˜€λ””μ˜€ 파일이 μ œμΆœλ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€!")
73
 
74
- start_time = time.time()
75
-
76
- # 음성을 텍스트로 변환
77
- result = pipe(
78
- audio_input,
79
- batch_size=BATCH_SIZE,
80
- generate_kwargs={"task": task},
81
- return_timestamps=True
82
- )
83
- transcribed_text = result["text"]
84
-
85
- # 분석 정보 수집
86
- stats = {
87
- "word_count": get_word_count(transcribed_text),
88
- "processing_time": f"{time.time() - start_time:.2f}초",
89
- "audio_duration": get_speaking_time(result.get("duration", 0)),
90
- "language": detect_language(transcribed_text)
91
- }
92
-
93
- # 텍스트 요약
94
  try:
95
- prompt = f"""μ•„λž˜ ν…μŠ€νŠΈλ₯Ό κ°„λ‹¨νžˆ μš”μ•½ν•΄μ£Όμ„Έμš”:
96
- ν…μŠ€νŠΈ: {transcribed_text}
97
- μš”μ•½:"""
98
-
99
- response = hf_client.text_generation(
100
- model="CohereForAI/c4ai-command-r-plus-08-2024",
101
- prompt=prompt,
102
- max_new_tokens=150,
103
- temperature=0.3,
104
- top_p=0.9,
105
- repetition_penalty=1.2,
106
- stop_sequences=["\n", "ν…μŠ€νŠΈ:", "μš”μ•½:"]
107
  )
 
108
 
109
- if isinstance(response, str):
110
- summary_text = response
111
- else:
112
- summary_text = response.generated_text if hasattr(response, 'generated_text') else str(response)
 
 
 
 
 
 
 
113
 
114
- if "μš”μ•½:" in summary_text:
115
- summary_text = summary_text.split("μš”μ•½:")[1].strip()
 
 
 
 
 
116
 
117
- if not summary_text:
118
- summary_text = "μš”μ•½μ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€."
 
119
 
120
- except Exception as e:
121
- print(f"μš”μ•½ 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
122
- summary_text = "μš”μ•½μ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€. μž μ‹œ ν›„ λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”."
123
-
124
- # 결과 저장
125
- if save_result:
126
- saved_file = save_transcription(transcribed_text, summary_text)
127
- print(f"κ²°κ³Όκ°€ μ €μž₯λ˜μ—ˆμŠ΅λ‹ˆλ‹€: {saved_file}")
128
-
129
- # 번역 기능 (옵션)
130
- translated_text = ""
131
- if enable_translation and task != "translate":
132
- try:
133
- # 번역 로직 구현
134
- pass
135
  except Exception as e:
136
- translated_text = "λ²ˆμ—­ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€."
137
-
138
- return [
139
- transcribed_text,
140
- summary_text,
141
- gr.update(value=f"""
142
  πŸ“Š 뢄석 정보:
143
- - 단어 수: {stats['word_count']}개
144
- - 처리 μ‹œκ°„: {stats['processing_time']}
145
- - μŒμ„± 길이: {stats['audio_duration']}
146
- - κ°μ§€λœ μ–Έμ–΄: {stats['language']}
147
- """),
148
- translated_text if enable_translation else None
149
- ]
 
 
 
150
 
151
  # CSS 스타일
152
  css = """
153
  footer { visibility: hidden; }
154
- .gradio-container { max-width: 1200px; margin: auto; }
155
- .audio-stats { background-color: #f0f0f0; padding: 10px; border-radius: 5px; }
 
 
 
 
 
 
 
 
 
156
  """
157
 
158
  # 파일 업로드 인터페이스
159
  file_transcribe = gr.Interface(
160
  fn=transcribe_summarize,
161
  inputs=[
162
- gr.Audio(sources="upload", type="filepath", label="μ˜€λ””μ˜€ 파일"),
 
 
 
 
163
  gr.Radio(
164
  choices=["transcribe", "translate"],
165
- label="μž‘μ—…",
166
  value="transcribe"
167
- ),
168
- gr.Checkbox(label="κ²°κ³Ό μ €μž₯ν•˜κΈ°", value=False),
169
- gr.Checkbox(label="λ²ˆμ—­ ν™œμ„±ν™”", value=False)
170
  ],
171
  outputs=[
172
- gr.Textbox(label="λ³€ν™˜λœ ν…μŠ€νŠΈ", lines=5),
173
- gr.Textbox(label="μš”μ•½", lines=3),
174
- gr.Textbox(label="뢄석 정보", lines=4),
175
- gr.Textbox(label="λ²ˆμ—­ κ²°κ³Ό", lines=5, visible=False)
 
 
 
 
 
 
 
 
 
 
 
176
  ],
177
- title="λ°›μ•„μ“°κΈ° AI: μŒμ„±μ„ ν…μŠ€νŠΈλ‘œ λ³€ν™˜ν•˜κ³  μš”μ•½ν•˜κΈ°",
178
- description="μŒμ„± νŒŒμΌμ„ μ—…λ‘œλ“œν•˜κ±°λ‚˜ 직접 λ…ΉμŒν•˜μ—¬ ν…μŠ€νŠΈλ‘œ λ³€ν™˜ν•˜κ³  μš”μ•½ν•  수 μžˆμŠ΅λ‹ˆλ‹€.",
 
 
 
 
 
 
 
 
 
 
179
  flagging_mode="never"
180
  )
181
 
182
- # 마이크 녹음 인터페이스와 메인 애플리케이션 코드는 동일하게 유지...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from transformers import pipeline
5
  from huggingface_hub import InferenceClient
6
  import os
 
7
  from datetime import datetime
 
8
 
9
  MODEL_NAME = "openai/whisper-large-v3-turbo"
10
  BATCH_SIZE = 8
 
12
 
13
  device = 0 if torch.cuda.is_available() else "cpu"
14
 
 
 
 
 
15
  # Whisper 파이프라인 초기화
16
  pipe = pipeline(
17
  task="automatic-speech-recognition",
 
26
  token=os.getenv("HF_TOKEN")
27
  )
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def get_word_count(text):
30
  """ν…μŠ€νŠΈμ˜ 단어 수 계산"""
31
+ if not text:
32
+ return 0
33
  return len(text.split())
34
 
35
+ def format_duration(seconds):
36
+ """초 λ‹¨μœ„ μ‹œκ°„μ„ mm:ss ν˜•μ‹μœΌλ‘œ λ³€ν™˜"""
37
+ try:
38
+ minutes = int(seconds // 60)
39
+ seconds = int(seconds % 60)
40
+ return f"{minutes:02d}:{seconds:02d}"
41
+ except:
42
+ return "00:00"
43
 
44
  @spaces.GPU
45
+ def transcribe_summarize(audio_input, task):
46
  if audio_input is None:
47
  raise gr.Error("μ˜€λ””μ˜€ 파일이 μ œμΆœλ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€!")
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  try:
50
+ # 음성을 텍스트로 변환
51
+ result = pipe(
52
+ audio_input,
53
+ batch_size=BATCH_SIZE,
54
+ generate_kwargs={"task": task},
55
+ return_timestamps=True
 
 
 
 
 
 
56
  )
57
+ transcribed_text = result["text"]
58
 
59
+ # 기본 분석 정보
60
+ word_count = get_word_count(transcribed_text)
61
+ duration = format_duration(result.get("duration", 0))
62
+
63
+ # 텍스트 요약
64
+ try:
65
+ prompt = (
66
+ "λ‹€μŒ ν…μŠ€νŠΈλ₯Ό ν•œκ΅­μ–΄λ‘œ κ°„λ‹¨νžˆ μš”μ•½ν•΄μ£Όμ„Έμš”:\n\n"
67
+ f"ν…μŠ€νŠΈ: {transcribed_text}\n"
68
+ "μš”μ•½:"
69
+ )
70
 
71
+ response = hf_client.text_generation(
72
+ prompt=prompt,
73
+ max_new_tokens=150,
74
+ temperature=0.3,
75
+ top_p=0.9,
76
+ repetition_penalty=1.2
77
+ )
78
 
79
+ summary_text = str(response)
80
+ if "μš”μ•½:" in summary_text:
81
+ summary_text = summary_text.split("μš”μ•½:")[1].strip()
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  except Exception as e:
84
+ print(f"μš”μ•½ 생성 쀑 였λ₯˜: {str(e)}")
85
+ summary_text = "μš”μ•½μ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€."
86
+
87
+ # 분석 정보 포맷팅
88
+ stats = f"""
 
89
  πŸ“Š 뢄석 정보:
90
+ - 단어 수: {word_count}개
91
+ - μŒμ„± 길이: {duration}
92
+ - 생성 μ‹œκ°„: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
93
+ """
94
+
95
+ return [transcribed_text, summary_text, stats]
96
+
97
+ except Exception as e:
98
+ error_msg = f"처리 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
99
+ return ["", error_msg, ""]
100
 
101
  # CSS 스타일
102
  css = """
103
  footer { visibility: hidden; }
104
+ .gradio-container {
105
+ max-width: 1000px;
106
+ margin: auto;
107
+ padding: 20px;
108
+ }
109
+ .output-stats {
110
+ background-color: #f5f5f5;
111
+ padding: 10px;
112
+ border-radius: 5px;
113
+ font-family: monospace;
114
+ }
115
  """
116
 
117
  # 파일 업로드 인터페이스
118
  file_transcribe = gr.Interface(
119
  fn=transcribe_summarize,
120
  inputs=[
121
+ gr.Audio(
122
+ sources="upload",
123
+ type="filepath",
124
+ label="μ˜€λ””μ˜€ 파일"
125
+ ),
126
  gr.Radio(
127
  choices=["transcribe", "translate"],
128
+ label="μž‘μ—… 선택",
129
  value="transcribe"
130
+ )
 
 
131
  ],
132
  outputs=[
133
+ gr.Textbox(
134
+ label="λ³€ν™˜λœ ν…μŠ€νŠΈ",
135
+ lines=5,
136
+ placeholder="μŒμ„±μ΄ ν…μŠ€νŠΈλ‘œ λ³€ν™˜λ˜μ–΄ 여기에 ν‘œμ‹œλ©λ‹ˆλ‹€..."
137
+ ),
138
+ gr.Textbox(
139
+ label="μš”μ•½",
140
+ lines=3,
141
+ placeholder="ν…μŠ€νŠΈ μš”μ•½μ΄ 여기에 ν‘œμ‹œλ©λ‹ˆλ‹€..."
142
+ ),
143
+ gr.Textbox(
144
+ label="뢄석 정보",
145
+ lines=4,
146
+ placeholder="뢄석 정보가 여기에 ν‘œμ‹œλ©λ‹ˆλ‹€..."
147
+ )
148
  ],
149
+ title="🎀 λ°›μ•„μ“°κΈ° AI",
150
+ description="""
151
+ μŒμ„± νŒŒμΌμ„ μ—…λ‘œλ“œν•˜κ±°λ‚˜ 직접 λ…ΉμŒν•˜μ—¬ ν…μŠ€νŠΈλ‘œ λ³€ν™˜ν•˜κ³  μš”μ•½ν•  수 μžˆμŠ΅λ‹ˆλ‹€.
152
+
153
+ μ‚¬μš© 방법:
154
+ 1. μ˜€λ””μ˜€ νŒŒμΌμ„ μ—…λ‘œλ“œν•˜κ±°λ‚˜ 마이크둜 λ…ΉμŒν•˜μ„Έμš”
155
+ 2. μž‘μ—… μœ ν˜•μ„ μ„ νƒν•˜μ„Έμš” (λ³€ν™˜ λ˜λŠ” λ²ˆμ—­)
156
+ 3. λ³€ν™˜ λ²„νŠΌμ„ ν΄λ¦­ν•˜μ„Έμš”
157
+ """,
158
+ article="developed by Claude",
159
+ examples=[],
160
+ cache_examples=False,
161
  flagging_mode="never"
162
  )
163
 
164
+ # 마이크 녹음 인터페이스
165
+ mic_transcribe = gr.Interface(
166
+ fn=transcribe_summarize,
167
+ inputs=[
168
+ gr.Audio(
169
+ sources="microphone",
170
+ type="filepath",
171
+ label="마이크 λ…ΉμŒ"
172
+ ),
173
+ gr.Radio(
174
+ choices=["transcribe", "translate"],
175
+ label="μž‘μ—… 선택",
176
+ value="transcribe"
177
+ )
178
+ ],
179
+ outputs=[
180
+ gr.Textbox(
181
+ label="λ³€ν™˜λœ ν…μŠ€νŠΈ",
182
+ lines=5,
183
+ placeholder="μŒμ„±μ΄ ν…μŠ€νŠΈλ‘œ λ³€ν™˜λ˜μ–΄ 여기에 ν‘œμ‹œλ©λ‹ˆλ‹€..."
184
+ ),
185
+ gr.Textbox(
186
+ label="μš”μ•½",
187
+ lines=3,
188
+ placeholder="ν…μŠ€νŠΈ μš”μ•½μ΄ 여기에 ν‘œμ‹œλ©λ‹ˆλ‹€..."
189
+ ),
190
+ gr.Textbox(
191
+ label="뢄석 정보",
192
+ lines=4,
193
+ placeholder="뢄석 정보가 여기에 ν‘œμ‹œλ©λ‹ˆλ‹€..."
194
+ )
195
+ ],
196
+ title="🎀 λ°›μ•„μ“°κΈ° AI",
197
+ description="마이크둜 μŒμ„±μ„ λ…ΉμŒν•˜μ—¬ ν…μŠ€νŠΈλ‘œ λ³€ν™˜ν•˜κ³  μš”μ•½ν•  수 μžˆμŠ΅λ‹ˆλ‹€.",
198
+ flagging_mode="never",
199
+ css=css
200
+ )
201
+
202
+ # 메인 애플리케이션
203
+ demo = gr.Blocks(theme="gradio/soft", css=css)
204
+ with demo:
205
+ gr.TabbedInterface(
206
+ [file_transcribe, mic_transcribe],
207
+ ["μ˜€λ””μ˜€ 파일", "마이크 λ…ΉμŒ"]
208
+ )
209
+
210
+ # 애플리케이션 실행
211
+ demo.queue().launch(
212
+ share=False,
213
+ debug=True,
214
+ show_error=True,
215
+ ssr_mode=False
216
+ )