Update app.py
app.py
CHANGED
@@ -13,14 +13,6 @@ decoder = BeamSearchDecoderCTC.load_from_hf_hub(lmID)
 p = pipeline("automatic-speech-recognition", model="aware-ai/robust-wav2vec2-base-german-lowercase", decoder=decoder)
 ttp = pipeline("text2text-generation", model="aware-ai/marian-german-grammar")
 
-vadmodel, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
-                                 model='silero_vad',
-                                 force_reload=False)
-
-(get_speech_timestamps,
- _, read_audio,
- *_) = utils
-
 #model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B")
 #tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
 
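For reference, the VAD block removed above follows the standard Silero VAD torch.hub recipe. A self-contained sketch of that pattern (the sample.wav path and the 16 kHz rate are illustrative assumptions, not from this repo):

    import torch

    # Load the Silero VAD model plus its helper utilities from torch.hub.
    vadmodel, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                     model='silero_vad',
                                     force_reload=False)
    (get_speech_timestamps, _, read_audio, *_) = utils

    sampling_rate = 16000  # assumption; Silero VAD supports 8 kHz and 16 kHz

    # read_audio returns a mono float tensor at the requested rate.
    wav = read_audio('sample.wav', sampling_rate=sampling_rate)

    # Each entry holds 'start' and 'end' sample indices of one speech region.
    speech_timestamps = get_speech_timestamps(wav, vadmodel, sampling_rate=sampling_rate)
    chunks = [wav[ts['start']:ts['end']] for ts in speech_timestamps]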
@@ -36,28 +28,11 @@ def translate(src, tgt, text):
     return result
 
 def transcribe(audio):
-
-
-    start_time = time.time()
-    audio, sr = librosa.load(audio, sr=sampling_rate)
-    log += "--- %s seconds audio loading ---" + str(time.time() - start_time)
-    start_time = time.time()
-    speech_timestamps = get_speech_timestamps(audio, vadmodel, sampling_rate=sampling_rate)
-    log += "\n--- %s seconds audio timestamps---" + str(time.time() - start_time)
-    start_time = time.time()
-    chunks = [audio[i["start"]:i["end"]] for i in speech_timestamps]
-    log += "\n--- %s seconds audio chunking---" + str(time.time() - start_time)
-    start_time = time.time()
-    transcribed = " ".join([text["text"] for text in p(chunks, chunk_length_s=20, stride_length_s=(0, 0))])
-    log += "\n--- %s seconds audio transcription ---" + str(time.time() - start_time)
-    start_time = time.time()
+    transcribed = p(audio, chunk_length_s=20, stride_length_s=(0, 0))["text"]
+
     punctuated = ttp(transcribed, max_length = 512)[0]["generated_text"]
-    log += "\n--- %s seconds audio formatting ---" + str(time.time() - start_time)
-    start_time = time.time()
-    p(audio, chunk_length_s=20, stride_length_s=(0, 0))
-    log += "\n--- %s seconds full asr ---" + str(time.time() - start_time)
 
-    return transcribed, punctuated
+    return transcribed, punctuated
 
 def get_asr_interface():
     return gr.Interface(
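With the VAD path and timing logs gone, transcribe() reduces to two pipeline calls: chunked ASR, then grammar/punctuation restoration. A standalone sketch using the model names from this file (the beam-search decoder argument is dropped here for brevity):

    from transformers import pipeline

    # app.py additionally passes decoder=BeamSearchDecoderCTC.load_from_hf_hub(lmID).
    p = pipeline("automatic-speech-recognition",
                 model="aware-ai/robust-wav2vec2-base-german-lowercase")
    ttp = pipeline("text2text-generation", model="aware-ai/marian-german-grammar")

    def transcribe(audio_path):
        # Long-form inference in 20 s chunks; stride (0, 0) disables chunk overlap.
        transcribed = p(audio_path, chunk_length_s=20, stride_length_s=(0, 0))["text"]
        # The Marian grammar model produces the punctuated text.
        punctuated = ttp(transcribed, max_length=512)[0]["generated_text"]
        return transcribed, punctuated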
@@ -66,7 +41,6 @@ def get_asr_interface():
         gr.inputs.Audio(source="microphone", type="filepath")
     ],
     outputs=[
-        "textbox",
         "textbox",
         "textbox"
     ])
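The last hunk drops one of the three output textboxes so the interface matches the two values transcribe() now returns. A minimal sketch of the resulting interface, assuming the legacy pre-3.0 gr.inputs API seen above and inferring the fn wiring (not shown in this diff):

    import gradio as gr

    def get_asr_interface():
        # One output textbox per value returned by transcribe().
        return gr.Interface(
            fn=transcribe,
            inputs=[gr.inputs.Audio(source="microphone", type="filepath")],
            outputs=["textbox", "textbox"])

    get_asr_interface().launch()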