flozi00 committed
Commit 26832b7 · Parent: 5f6cbd7

Update app.py

Files changed (1): app.py (+3, -29)
app.py CHANGED
@@ -13,14 +13,6 @@ decoder = BeamSearchDecoderCTC.load_from_hf_hub(lmID)
 p = pipeline("automatic-speech-recognition", model="aware-ai/robust-wav2vec2-base-german-lowercase", decoder=decoder)
 ttp = pipeline("text2text-generation", model="aware-ai/marian-german-grammar")
 
-vadmodel, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
-                                 model='silero_vad',
-                                 force_reload=False)
-
-(get_speech_timestamps,
- _, read_audio,
- *_) = utils
-
 #model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B")
 #tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
 
@@ -36,28 +28,11 @@ def translate(src, tgt, text):
     return result
 
 def transcribe(audio):
-    log = ""
-    sampling_rate = 16000
-    start_time = time.time()
-    audio, sr = librosa.load(audio, sr=sampling_rate)
-    log += "--- %s seconds audio loading ---" + str(time.time() - start_time)
-    start_time = time.time()
-    speech_timestamps = get_speech_timestamps(audio, vadmodel, sampling_rate=sampling_rate)
-    log += "\n--- %s seconds audio timestamps---" + str(time.time() - start_time)
-    start_time = time.time()
-    chunks = [audio[i["start"]:i["end"]] for i in speech_timestamps]
-    log += "\n--- %s seconds audio chunking---" + str(time.time() - start_time)
-    start_time = time.time()
-    transcribed = " ".join([text["text"] for text in p(chunks, chunk_length_s=20, stride_length_s=(0, 0))])
-    log += "\n--- %s seconds audio transcription ---" + str(time.time() - start_time)
-    start_time = time.time()
+    transcribed = p(audio, chunk_length_s=20, stride_length_s=(0, 0))["text"]
+
     punctuated = ttp(transcribed, max_length = 512)[0]["generated_text"]
-    log += "\n--- %s seconds audio formatting ---" + str(time.time() - start_time)
-    start_time = time.time()
-    p(audio, chunk_length_s=20, stride_length_s=(0, 0))
-    log += "\n--- %s seconds full asr ---" + str(time.time() - start_time)
 
-    return transcribed, punctuated, log
+    return transcribed, punctuated
 
 def get_asr_interface():
     return gr.Interface(
@@ -66,7 +41,6 @@ def get_asr_interface():
         gr.inputs.Audio(source="microphone", type="filepath")
     ],
     outputs=[
-        "textbox",
         "textbox",
         "textbox"
     ])
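For context, a minimal sketch of how app.py fits together after this commit. It is a reconstruction under assumptions, not the committed file: the imports, the value of lmID, the first argument to gr.Interface, and the launch call are not shown in the diff and are guesses; the two pipelines, the body of transcribe(), and the interface inputs/outputs come from the committed hunks.

    # Hedged sketch of app.py after commit 26832b7.
    # Assumptions (not in the diff): imports, lmID value, Interface fn, launch().
    import gradio as gr
    from transformers import pipeline
    from pyctcdecode import BeamSearchDecoderCTC

    lmID = "..."  # elided in the diff; the LM decoder repo id is not shown

    # Wav2Vec2 CTC model with a beam-search LM decoder, plus a seq2seq
    # grammar model that restores punctuation/casing on the raw transcript.
    decoder = BeamSearchDecoderCTC.load_from_hf_hub(lmID)
    p = pipeline("automatic-speech-recognition",
                 model="aware-ai/robust-wav2vec2-base-german-lowercase",
                 decoder=decoder)
    ttp = pipeline("text2text-generation", model="aware-ai/marian-german-grammar")

    def transcribe(audio):
        # The ASR pipeline now splits long audio into 20 s chunks itself,
        # replacing the Silero VAD pre-segmentation this commit removes.
        transcribed = p(audio, chunk_length_s=20, stride_length_s=(0, 0))["text"]
        punctuated = ttp(transcribed, max_length=512)[0]["generated_text"]
        return transcribed, punctuated

    def get_asr_interface():
        return gr.Interface(
            transcribe,  # assumed to be the wrapped fn; not shown in the diff
            inputs=[gr.inputs.Audio(source="microphone", type="filepath")],
            outputs=["textbox", "textbox"])

    if __name__ == "__main__":
        get_asr_interface().launch()  # assumed entry point; not in the diff

The apparent design point: with chunk_length_s=20 the pipeline segments long recordings internally, so the torch.hub Silero VAD download, the librosa load it required, and the per-stage timing log all become redundant, which accounts for most of the 29 removed lines. The third output textbox disappears along with the log string, so the interface drops from three outputs to two.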