DeeeeeeM
committed on
Commit
·
b4ef081
1
Parent(s):
ab5cd4f
Removed the .json checker and result_to_json; made minor changes
Browse files
app.py
CHANGED
@@ -1,14 +1,11 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import mimetypes
|
3 |
import os
|
4 |
os.environ['KMP_DUPLICATE_LIB_OK']='True'
|
5 |
-
import argparse
|
6 |
-
import stable_whisper
|
7 |
-
from stable_whisper.text_output import result_to_any, sec2srt
|
8 |
import tempfile
|
9 |
-
import
|
10 |
-
import
|
11 |
import torch
|
|
|
|
|
12 |
|
13 |
def process_media(
|
14 |
model_size, source_lang, upload, model_type,
|
@@ -20,30 +17,21 @@ def process_media(
|
|
20 |
return None, None, None, None
|
21 |
|
22 |
temp_path = upload.name
|
23 |
-
base_path = os.path.splitext(temp_path)[0]
|
24 |
-
word_transcription_path = base_path + '.json'
|
25 |
|
26 |
-
|
27 |
-
if
|
28 |
-
|
29 |
-
|
30 |
else:
|
31 |
-
|
32 |
-
|
33 |
-
#-- Check if CUDA is available or not --#
|
34 |
-
if model_type == "faster whisper":
|
35 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
36 |
-
model = stable_whisper.load_faster_whisper(model_size, device=device)
|
37 |
-
else:
|
38 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
39 |
-
model = stable_whisper.load_model(model_size, device=device)
|
40 |
-
|
41 |
-
try:
|
42 |
-
result = model.transcribe(temp_path, language=source_lang, vad=True, regroup=False, denoiser="demucs")
|
43 |
-
except Exception as e:
|
44 |
-
return None, None, None, None # Remove the 5th value
|
45 |
-
result.save_as_json(word_transcription_path)
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
# ADVANCED SETTINGS #
|
48 |
if max_chars or max_words:
|
49 |
result.split_by_length(
|
@@ -99,7 +87,7 @@ def process_media(
|
|
99 |
audio_out = temp_path if mime and mime.startswith("audio") else None
|
100 |
video_out = temp_path if mime and mime.startswith("video") else None
|
101 |
|
102 |
-
return audio_out, video_out, transcript_txt, srt_file_path
|
103 |
|
104 |
def optimize_text(text, max_lines_per_segment, line_penalty, longest_line_char_penalty):
|
105 |
text = text.strip()
|
@@ -257,7 +245,8 @@ with gr.Blocks() as interface:
|
|
257 |
"""
|
258 |
<style>.html-container.svelte-phx28p.padding { padding: 0 !important; }</style>
|
259 |
<div class='custom-container'>
|
260 |
-
<h1 style='text-align: left;'>Speech Solutions
|
|
|
261 |
"""
|
262 |
)
|
263 |
gr.Markdown(
|
@@ -266,8 +255,8 @@ with gr.Blocks() as interface:
|
|
266 |
|
267 |
- Speech-to-text (WhisperAI)
|
268 |
- Language translation (GPT-4) (In progress)
|
269 |
-
-
|
270 |
-
-
|
271 |
|
272 |
<b>NOTE: This app is currently in the process of applying other AI-solutions for other use cases.</b>
|
273 |
"""
|
@@ -292,7 +281,7 @@ with gr.Blocks() as interface:
|
|
292 |
source_lang = gr.Dropdown(
|
293 |
choices=WHISPER_LANGUAGES,
|
294 |
label="Source Language",
|
295 |
-
value="tl",
|
296 |
interactive=True
|
297 |
)
|
298 |
model_type = gr.Dropdown(
|
@@ -320,6 +309,7 @@ with gr.Blocks() as interface:
|
|
320 |
with gr.Accordion("Advanced Settings", open=False):
|
321 |
gr.Markdown(
|
322 |
"""
|
|
|
323 |
These settings allow you to customize the segmentation of the audio or video file. Adjust these parameters to control how the segments are created based on characters, words, and lines.
|
324 |
|
325 |
<b><i>Note: The values currently set are the default values. You can adjust them to your needs, but be aware that changing these values may affect the segmentation of the audio or video file.</i></b>
|
@@ -386,7 +376,7 @@ with gr.Blocks() as interface:
|
|
386 |
precision=2,
|
387 |
interactive=True
|
388 |
)
|
389 |
-
submit_btn = gr.Button("PROCESS
|
390 |
with gr.Row():
|
391 |
with gr.Column():
|
392 |
transcript_output = gr.Textbox(label="Transcript", lines=8, interactive=False)
|
|
|
|
|
|
|
1 |
import os
|
2 |
os.environ['KMP_DUPLICATE_LIB_OK']='True'
|
|
|
|
|
|
|
3 |
import tempfile
|
4 |
+
import mimetypes
|
5 |
+
import gradio as gr
|
6 |
import torch
|
7 |
+
import stable_whisper
|
8 |
+
from stable_whisper.text_output import result_to_any, sec2srt
|
9 |
|
10 |
def process_media(
|
11 |
model_size, source_lang, upload, model_type,
|
|
|
17 |
return None, None, None, None
|
18 |
|
19 |
temp_path = upload.name
|
|
|
|
|
20 |
|
21 |
+
#-- Check if CUDA is available or not --#
|
22 |
+
if model_type == "faster whisper":
|
23 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
24 |
+
model = stable_whisper.load_faster_whisper(model_size, device=device)
|
25 |
else:
|
26 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
27 |
+
model = stable_whisper.load_model(model_size, device=device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
+
try:
|
30 |
+
result = model.transcribe(temp_path, language=source_lang, vad=True, regroup=False, denoiser="demucs")
|
31 |
+
#result.save_as_json(word_transcription_path)
|
32 |
+
except Exception as e:
|
33 |
+
return None, None, None, None
|
34 |
+
|
35 |
# ADVANCED SETTINGS #
|
36 |
if max_chars or max_words:
|
37 |
result.split_by_length(
|
|
|
87 |
audio_out = temp_path if mime and mime.startswith("audio") else None
|
88 |
video_out = temp_path if mime and mime.startswith("video") else None
|
89 |
|
90 |
+
return audio_out, video_out, transcript_txt, srt_file_path
|
91 |
|
92 |
def optimize_text(text, max_lines_per_segment, line_penalty, longest_line_char_penalty):
|
93 |
text = text.strip()
|
|
|
245 |
"""
|
246 |
<style>.html-container.svelte-phx28p.padding { padding: 0 !important; }</style>
|
247 |
<div class='custom-container'>
|
248 |
+
<h1 style='text-align: left;'>Speech Solutions✨</h1>
|
249 |
+
<p style='text-align: left;'>Hosted on 🤗 <b>Hugging Face Spaces</b></p>
|
250 |
"""
|
251 |
)
|
252 |
gr.Markdown(
|
|
|
255 |
|
256 |
- Speech-to-text (WhisperAI)
|
257 |
- Language translation (GPT-4) (In progress)
|
258 |
+
- Improved transcription (GPT-4) (In progress)
|
259 |
+
- Text to Speech (In progress)
|
260 |
|
261 |
<b>NOTE: This app is currently in the process of applying other AI-solutions for other use cases.</b>
|
262 |
"""
|
|
|
281 |
source_lang = gr.Dropdown(
|
282 |
choices=WHISPER_LANGUAGES,
|
283 |
label="Source Language",
|
284 |
+
value="tl",
|
285 |
interactive=True
|
286 |
)
|
287 |
model_type = gr.Dropdown(
|
|
|
309 |
with gr.Accordion("Advanced Settings", open=False):
|
310 |
gr.Markdown(
|
311 |
"""
|
312 |
+
|
313 |
These settings allow you to customize the segmentation of the audio or video file. Adjust these parameters to control how the segments are created based on characters, words, and lines.
|
314 |
|
315 |
<b><i>Note: The values currently set are the default values. You can adjust them to your needs, but be aware that changing these values may affect the segmentation of the audio or video file.</i></b>
|
|
|
376 |
precision=2,
|
377 |
interactive=True
|
378 |
)
|
379 |
+
submit_btn = gr.Button("- PROCESS -")
|
380 |
with gr.Row():
|
381 |
with gr.Column():
|
382 |
transcript_output = gr.Textbox(label="Transcript", lines=8, interactive=False)
|