DeeeeeeM committed
Commit b4ef081 · 1 Parent(s): ab5cd4f

Removed .json checker, result_to_json, and added minor changes

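The ".json checker" referred to here was a sidecar-cache step: before transcribing, app.py looked for a saved word-level transcription next to the upload and reused it instead of re-running the model. A minimal sketch of the removed pattern, reconstructed from the deleted lines below (the model object is assumed to be already loaded):

import os
import stable_whisper

def load_or_transcribe(model, temp_path, source_lang):
    # Sidecar cache: a word-level transcription saved next to the media file
    word_transcription_path = os.path.splitext(temp_path)[0] + '.json'
    if os.path.exists(word_transcription_path):
        # Reuse the saved result instead of re-running the model
        return stable_whisper.WhisperResult(word_transcription_path)
    # Otherwise transcribe and cache the result for next time
    result = model.transcribe(temp_path, language=source_lang,
                              vad=True, regroup=False, denoiser="demucs")
    result.save_as_json(word_transcription_path)
    return result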
Files changed (1): app.py (+24 -34)
app.py CHANGED
@@ -1,14 +1,11 @@
- import gradio as gr
- import mimetypes
  import os
  os.environ['KMP_DUPLICATE_LIB_OK']='True'
- import argparse
- import stable_whisper
- from stable_whisper.text_output import result_to_any, sec2srt
  import tempfile
- import re
- import textwrap
+ import mimetypes
+ import gradio as gr
  import torch
+ import stable_whisper
+ from stable_whisper.text_output import result_to_any, sec2srt

  def process_media(
      model_size, source_lang, upload, model_type,
@@ -20,30 +17,21 @@ def process_media(
          return None, None, None, None

      temp_path = upload.name
-     base_path = os.path.splitext(temp_path)[0]
-     word_transcription_path = base_path + '.json'

-     # ---- Load .json or transcribe ---- #
-     if os.path.exists(word_transcription_path):
-         print(f"Transcription data file found at {word_transcription_path}")
-         result = stable_whisper.WhisperResult(word_transcription_path)
+     #-- Check if CUDA is available or not --#
+     if model_type == "faster whisper":
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         model = stable_whisper.load_faster_whisper(model_size, device=device)
      else:
-         print(f"Can't find transcription data file at {word_transcription_path}. Starting transcribing ...")
-
-         #-- Check if CUDA is available or not --#
-         if model_type == "faster whisper":
-             device = "cuda" if torch.cuda.is_available() else "cpu"
-             model = stable_whisper.load_faster_whisper(model_size, device=device)
-         else:
-             device = "cuda" if torch.cuda.is_available() else "cpu"
-             model = stable_whisper.load_model(model_size, device=device)
-
-         try:
-             result = model.transcribe(temp_path, language=source_lang, vad=True, regroup=False, denoiser="demucs")
-         except Exception as e:
-             return None, None, None, None # Remove the 5th value
-         result.save_as_json(word_transcription_path)
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         model = stable_whisper.load_model(model_size, device=device)

+     try:
+         result = model.transcribe(temp_path, language=source_lang, vad=True, regroup=False, denoiser="demucs")
+         #result.save_as_json(word_transcription_path)
+     except Exception as e:
+         return None, None, None, None
+
      # ADVANCED SETTINGS #
      if max_chars or max_words:
          result.split_by_length(
@@ -99,7 +87,7 @@ def process_media(
      audio_out = temp_path if mime and mime.startswith("audio") else None
      video_out = temp_path if mime and mime.startswith("video") else None

-     return audio_out, video_out, transcript_txt, srt_file_path # Only 4 values
+     return audio_out, video_out, transcript_txt, srt_file_path

  def optimize_text(text, max_lines_per_segment, line_penalty, longest_line_char_penalty):
      text = text.strip()
@@ -257,7 +245,8 @@ with gr.Blocks() as interface:
          """
          <style>.html-container.svelte-phx28p.padding { padding: 0 !important; }</style>
          <div class='custom-container'>
-             <h1 style='text-align: left;'>Speech Solutions</h1>
+             <h1 style='text-align: left;'>Speech Solutions✨</h1>
+             <p style='text-align: left;'>Hosted on 🤗 <b>Hugging Face Spaces</b></p>
          """
      )
      gr.Markdown(
@@ -266,8 +255,8 @@ with gr.Blocks() as interface:

          - Speech-to-text (WhisperAI)
          - Language translation (GPT-4) (In progress)
-         - Youtube video / playlist integration (In progress)
-         - Batched processing (In progress)
+         - Improved transcription (GPT-4) (In progress)
+         - Text to Speech (In progress)

          <b>NOTE: This app is currently in the process of applying other AI-solutions for other use cases.</b>
          """
@@ -292,7 +281,7 @@ with gr.Blocks() as interface:
              source_lang = gr.Dropdown(
                  choices=WHISPER_LANGUAGES,
                  label="Source Language",
-                 value="tl", # default to Tagalog
+                 value="tl",
                  interactive=True
              )
              model_type = gr.Dropdown(
@@ -320,6 +309,7 @@ with gr.Blocks() as interface:
          with gr.Accordion("Advanced Settings", open=False):
              gr.Markdown(
                  """
+
                  These settings allow you to customize the segmentation of the audio or video file. Adjust these parameters to control how the segments are created based on characters, words, and lines.

                  <b><i>Note: The values currently set are the default values. You can adjust them to your needs, but be aware that changing these values may affect the segmentation of the audio or video file.</i></b>
@@ -386,7 +376,7 @@ with gr.Blocks() as interface:
                  precision=2,
                  interactive=True
              )
-         submit_btn = gr.Button("PROCESS", elem_id="orange-process-btn")
+         submit_btn = gr.Button("- PROCESS -")
          with gr.Row():
              with gr.Column():
                  transcript_output = gr.Textbox(label="Transcript", lines=8, interactive=False)
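After this commit the app always transcribes from scratch. Pulled out of the Gradio handler, the remaining flow is roughly the following sketch, not the exact app.py code; the device check and transcribe settings mirror the diff above, while to_srt_vtt is stable_whisper's usual SRT export helper, assumed here in place of the app's own SRT writer:

import torch
import stable_whisper

def transcribe_media(media_path, model_size, source_lang, model_type="whisper"):
    # Pick GPU when available, as both branches of the commit do
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if model_type == "faster whisper":
        model = stable_whisper.load_faster_whisper(model_size, device=device)
    else:
        model = stable_whisper.load_model(model_size, device=device)
    # Same settings as app.py: VAD on, no regrouping, demucs denoiser
    result = model.transcribe(media_path, language=source_lang,
                              vad=True, regroup=False, denoiser="demucs")
    # Write subtitles next to the input file
    srt_path = media_path.rsplit('.', 1)[0] + '.srt'
    result.to_srt_vtt(srt_path)
    return result, srt_path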