KevinGeng committed on
Commit
49062a1
·
1 Parent(s): 99503cf
app.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TODO:
3
+ + [x] Load Configuration
4
+ + [ ] Multi ASR Engine
5
+ + [ ] Batch / Real Time support
6
+ """
7
+ import numpy as np
8
+ from pathlib import Path
9
+ import jiwer
10
+ import pdb
11
+ import torch.nn as nn
12
+ import torch
13
+ import torchaudio
14
+ import gradio as gr
15
+ from logging import PlaceHolder
16
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
17
+ import yaml
18
+ from transformers import pipeline
19
+ import librosa
20
+ import librosa.display
21
+ import matplotlib.pyplot as plt
22
+
23
+ # local import
24
+ import sys
25
+
26
+ sys.path.append("src")
27
+
28
# Load the sample configuration. The keys read later in this script are
# "ref_txt" (reference transcript file) and "ref_wavs" (directory of wavs).
config_yaml = "config/samples.yaml"
try:
    # Opening the file is inside the try block so that a missing or unreadable
    # file is reported the same way as malformed YAML.
    with open(config_yaml, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
except (OSError, yaml.YAMLError) as err:
    print("Config file Loading Error")
    # Exit with a non-zero status so callers can tell start-up failed.
    raise SystemExit(1) from err
37
+
38
# Auto load examples
# Each non-blank line of the reference file is "<utterance id> <transcript>".
# The file is read line by line instead of through np.loadtxt, which cannot
# be relied on for a newline delimiter, collapses a one-line file to a 0-d
# array, and drops everything after a "#".
with open(config["ref_txt"], "r", encoding="utf-8") as ref_file:
    ref_lines = [line.strip() for line in ref_file if line.strip()]
# Kept as a 1-D array of strings, the type the rest of the script received
# from np.loadtxt.
refs = np.array(ref_lines, dtype="str")
refs_ids = [x.split()[0] for x in refs]
refs_txt = [" ".join(x.split()[1:]) for x in refs]
# NOTE(review): the wav list is sorted by path while the transcripts keep
# file order; code that zips the two assumes both orders agree -- confirm.
ref_wavs = [str(x) for x in sorted(Path(config["ref_wavs"]).glob("**/*.wav"))]
43
+
44
+ with open("src/description.html", "r", encoding="utf-8") as f:
45
+ description = f.read()
46
+ # description
47
+
48
+ reference_id = gr.Textbox(
49
+ value="ID", placeholder="Utter ID", label="Reference_ID"
50
+ )
51
+ reference_textbox = gr.Textbox(
52
+ value="Input reference here",
53
+ placeholder="Input reference here",
54
+ label="Reference",
55
+ )
56
+ reference_PPM = gr.Textbox(
57
+ placeholder="Pneumatic Voice's PPM", label="Ref PPM"
58
+ )
59
+
60
+ examples = [
61
+ [x, y] for x, y in zip(ref_wavs, refs_txt)
62
+ ]
63
+
64
+ # ASR part
65
+ p = pipeline("automatic-speech-recognition")
66
+
67
+ # WER part
68
+ transformation = jiwer.Compose(
69
+ [
70
+ jiwer.RemovePunctuation(),
71
+ jiwer.ToLowerCase(),
72
+ jiwer.RemoveWhiteSpace(replace_by_space=True),
73
+ jiwer.RemoveMultipleSpaces(),
74
+ jiwer.ReduceToListOfListOfWords(word_delimiter=" "),
75
+ ]
76
+ )
77
+
78
class ChangeSampleRate(nn.Module):
    """Resample a waveform by linear interpolation between adjacent samples.

    Args:
        input_rate: Sampling rate of the waveform passed to ``forward``.
        output_rate: Sampling rate of the waveform returned by ``forward``.
    """

    def __init__(self, input_rate: int, output_rate: int):
        super().__init__()
        self.output_rate = output_rate
        self.input_rate = input_rate

    def forward(self, wav: torch.Tensor) -> torch.Tensor:
        """Return ``wav`` resampled from ``input_rate`` to ``output_rate``.

        Args:
            wav: Waveform tensor. The first dimension is kept and all
                remaining dimensions are flattened into one time axis.

        Returns:
            Tensor of shape ``(wav.size(0), new_length)`` where
            ``new_length = time * output_rate // input_rate``.
        """
        # reshape (rather than view) also accepts non-contiguous input.
        wav = wav.reshape(wav.size(0), -1)
        new_length = wav.size(-1) * self.output_rate // self.input_rate
        # Fractional source position of every output sample. Built on the
        # waveform's device so the blend below never mixes devices.
        positions = torch.arange(new_length, device=wav.device) * (
            self.input_rate / self.output_rate
        )
        lower = positions.long()
        # The last position has no right-hand neighbour; reuse the final sample.
        upper = (lower + 1).clamp(max=wav.size(-1) - 1)
        frac = positions.fmod(1.0).unsqueeze(0)
        return wav[:, lower] * (1.0 - frac) + wav[:, upper] * frac
97
+
98
+ # Flagging setup
99
+
100
def calc_wer(audio_path, ref):
    """Transcribe an audio file and score the transcript against a reference.

    Args:
        audio_path: Path of the audio file; passed unchanged to the
            module-level ASR pipeline ``p``.
        ref: Reference transcript to compare the hypothesis with.

    Returns:
        A two-item list ``[trans, wer]``: the lower-cased hypothesis text and
        its word error rate against ``ref``, computed after applying the
        module-level ``transformation`` to both strings.
    """
    # ASR: the pipeline receives the file path directly, so no waveform is
    # loaded, batched or resampled here. Those intermediate tensors were never
    # used and only cost time and memory on every request.
    trans = jiwer.ToLowerCase()(p(audio_path)["text"])

    # WER
    wer = jiwer.wer(
        ref,
        trans,
        truth_transform=transformation,
        hypothesis_transform=transformation,
    )

    return [trans, wer]
121
+
122
+ iface = gr.Interface(
123
+ fn=calc_wer,
124
+ inputs=[
125
+ gr.Audio(
126
+ source="upload",
127
+ type="filepath",
128
+ label="Audio_to_evaluate",
129
+ ),
130
+ reference_textbox,
131
+ ],
132
+ outputs=[
133
+ gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
134
+ gr.Textbox(placeholder="Word Error Rate", label="WER"),
135
+ ],
136
+ title="Laronix Automatic Speech Recognition",
137
+ description=description,
138
+ examples=examples,
139
+ css=".body {background-color: green}",
140
+ )
141
+
142
+ print("Launch examples")
143
+
144
+ iface.launch(
145
+ share=False,
146
+ )
config/samples.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ exp_id: NULL
2
+ ref_txt: data/samples/ref.txt
3
+ ref_wavs: data/samples
config/template.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exp_id: NULL
2
+ ref_txt: data/p326_split.txt
3
+ ref_feature: data/p326_split_ref.csv
4
+ ref_wavs: data/p326_split
5
+ thre:
6
+ minppm: 0
7
+ maxppm: 2000
8
+ WER: 1.0
9
+ AUTOMOS: 3.0
10
+ auth:
11
+ username: NULL
12
+ password: NULL
data/samples/John_p326_020.wav ADDED
Binary file (339 kB). View file
 
data/samples/p326_020.wav ADDED
Binary file (449 kB). View file
 
data/samples/ref.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ p326_020 Many complicated ideas about the rainbow have been formed.
2
+ John_p326_020 Many complicated ideas about the rainbow have been formed.
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ festival
2
+ espeak
requirements.txt ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==1.0.0
2
+ aiohttp==3.8.1
3
+ aiosignal==1.2.0
4
+ analytics-python==1.4.0
5
+ antlr4-python3-runtime==4.8
6
+ anyio==3.5.0
7
+ asgiref==3.5.0
8
+ async-timeout==4.0.2
9
+ attrs==21.4.0
10
+ backoff==1.10.0
11
+ bcrypt==3.2.0
12
+ bitarray==2.4.0
13
+ cachetools==5.0.0
14
+ certifi==2021.10.8
15
+ cffi==1.15.0
16
+ charset-normalizer==2.0.12
17
+ click==8.0.4
18
+ colorama==0.4.4
19
+ cryptography==36.0.1
20
+ cycler==0.11.0
21
+ Cython==0.29.28
22
+ fairseq @ git+https://github.com/pytorch/fairseq.git@d03f4e771484a433f025f47744017c2eb6e9c6bc
23
+ fastapi==0.75.0
24
+ ffmpy==0.3.0
25
+ fonttools==4.30.0
26
+ frozenlist==1.3.0
27
+ fsspec==2022.2.0
28
+ future==0.18.2
29
+ google-auth==2.6.0
30
+ google-auth-oauthlib==0.4.6
31
+ gradio==3.2
32
+ grpcio==1.44.0
33
+ h11==0.12.0
34
+ hydra-core==1.0.7
35
+ idna==3.3
36
+ importlib-metadata==4.11.3
37
+ Jinja2==3.0.3
38
+ kiwisolver==1.3.2
39
+ linkify-it-py==1.0.3
40
+ Markdown==3.3.6
41
+ markdown-it-py==2.0.1
42
+ MarkupSafe==2.1.0
43
+ matplotlib==3.5.1
44
+ mdit-py-plugins==0.3.0
45
+ mdurl==0.1.0
46
+ monotonic==1.6
47
+ multidict==6.0.2
48
+ numpy==1.22.3
49
+ oauthlib==3.2.0
50
+ omegaconf==2.0.6
51
+ orjson==3.6.7
52
+ packaging==21.3
53
+ pandas==1.4.1
54
+ paramiko==2.10.1
55
+ Pillow==9.0.1
56
+ portalocker==2.4.0
57
+ protobuf==3.19.4
58
+ pyasn1==0.4.8
59
+ pyasn1-modules==0.2.8
60
+ pycparser==2.21
61
+ pycryptodome==3.14.1
62
+ pydantic==1.9.0
63
+ pyDeprecate==0.3.1
64
+ pydub==0.25.1
65
+ PyNaCl==1.5.0
66
+ pyparsing==3.0.7
67
+ python-dateutil==2.8.2
68
+ python-multipart==0.0.5
69
+ pytorch-lightning==1.5.10
70
+ pytz==2021.3
71
+ PyYAML==6.0
72
+ regex==2022.3.2
73
+ requests==2.27.1
74
+ requests-oauthlib==1.3.1
75
+ rsa==4.8
76
+ sacrebleu==2.0.0
77
+ six==1.16.0
78
+ sniffio==1.2.0
79
+ starlette==0.17.1
80
+ tabulate==0.8.9
81
+ tensorboard==2.8.0
82
+ tensorboard-data-server==0.6.1
83
+ tensorboard-plugin-wit==1.8.1
84
+ torch==1.11.0
85
+ torchaudio==0.11.0
86
+ torchmetrics==0.7.2
87
+ tqdm==4.63.0
88
+ typing-extensions==4.1.1
89
+ uc-micro-py==1.0.1
90
+ urllib3==1.26.8
91
+ uvicorn==0.17.6
92
+ Werkzeug==2.0.3
93
+ yarl==1.7.2
94
+ zipp==3.7.0
95
+
96
+ transformers
97
+ deepspeech
98
+ tensorboardX
99
+ jiwer
100
+ phonemizer
101
+ librosa
102
+
103
+ rich
src/__pycache__/lightning_module.cpython-39.pyc ADDED
Binary file (1.86 kB). View file
 
src/__pycache__/model.cpython-39.pyc ADDED
Binary file (6.46 kB). View file
 
src/description.html ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+
3
+ <html>
4
+
5
+ <head>
6
+ <meta charset="UTF-8">
7
+ <meta name="viewport" content="width=device-width">
8
+ <title style="text-align: center;"> Laronix Naturalness Test </title>
9
+ <!-- CSS only -->
10
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.1/dist/css/bootstrap.min.css" rel="stylesheet"
11
+ integrity="sha384-F3w7mX95PdgyTmZZMECAngseQB83DfGTowi0iMjiWaeVhAn4FJkqJByhZMI3AhiU" crossorigin="anonymous">
12
+ </head>
13
+ <style>
14
+
15
+ @font-face {
16
+ font-family: Poppins;
17
+ src: url(font/Poppins-Regular.ttf);
18
+ }
19
+ @font-face {
20
+ font-family: Poppins-Bold;
21
+ src: url(font/Poppins-Bold.ttf);
22
+ }
23
+ @font-face {
24
+ font-family: Muli;
25
+ src: url(font/Muli.ttf);
26
+ }
27
+ @font-face {
28
+ font-family: Muli-Bold;
29
+ src: url(font/Muli-Bold.ttf);
30
+ }
31
+ </style>
32
+ <body>
33
+ <p style="font-family: Muli;">This is a prototype of Laronix Automatic Speech Recognition platform.</p>
34
+
35
+ <img
36
+ src="https://static.wixstatic.com/media/e7e144_93e98148d06147828031797eb4525b80~mv2.png/v1/crop/x_0,y_25,w_2606,h_882/fill/w_396,h_142,al_c,q_85,usm_0.66_1.00_0.01,enc_auto/newlogo.png"
37
+ align="right"
38
+ height="20%"
39
+ width="20%"
40
+ />
41
+ </body>
42
+ </html>
src/font/Muli-Bold.ttf ADDED
Binary file (52.8 kB). View file
 
src/font/Muli.ttf ADDED
Binary file (49 kB). View file
 
src/font/OFL.txt ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright 2020 The Poppins Project Authors (https://github.com/itfoundry/Poppins)
2
+
3
+ This Font Software is licensed under the SIL Open Font License, Version 1.1.
4
+ This license is copied below, and is also available with a FAQ at:
5
+ http://scripts.sil.org/OFL
6
+
7
+
8
+ -----------------------------------------------------------
9
+ SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
10
+ -----------------------------------------------------------
11
+
12
+ PREAMBLE
13
+ The goals of the Open Font License (OFL) are to stimulate worldwide
14
+ development of collaborative font projects, to support the font creation
15
+ efforts of academic and linguistic communities, and to provide a free and
16
+ open framework in which fonts may be shared and improved in partnership
17
+ with others.
18
+
19
+ The OFL allows the licensed fonts to be used, studied, modified and
20
+ redistributed freely as long as they are not sold by themselves. The
21
+ fonts, including any derivative works, can be bundled, embedded,
22
+ redistributed and/or sold with any software provided that any reserved
23
+ names are not used by derivative works. The fonts and derivatives,
24
+ however, cannot be released under any other type of license. The
25
+ requirement for fonts to remain under this license does not apply
26
+ to any document created using the fonts or their derivatives.
27
+
28
+ DEFINITIONS
29
+ "Font Software" refers to the set of files released by the Copyright
30
+ Holder(s) under this license and clearly marked as such. This may
31
+ include source files, build scripts and documentation.
32
+
33
+ "Reserved Font Name" refers to any names specified as such after the
34
+ copyright statement(s).
35
+
36
+ "Original Version" refers to the collection of Font Software components as
37
+ distributed by the Copyright Holder(s).
38
+
39
+ "Modified Version" refers to any derivative made by adding to, deleting,
40
+ or substituting -- in part or in whole -- any of the components of the
41
+ Original Version, by changing formats or by porting the Font Software to a
42
+ new environment.
43
+
44
+ "Author" refers to any designer, engineer, programmer, technical
45
+ writer or other person who contributed to the Font Software.
46
+
47
+ PERMISSION & CONDITIONS
48
+ Permission is hereby granted, free of charge, to any person obtaining
49
+ a copy of the Font Software, to use, study, copy, merge, embed, modify,
50
+ redistribute, and sell modified and unmodified copies of the Font
51
+ Software, subject to the following conditions:
52
+
53
+ 1) Neither the Font Software nor any of its individual components,
54
+ in Original or Modified Versions, may be sold by itself.
55
+
56
+ 2) Original or Modified Versions of the Font Software may be bundled,
57
+ redistributed and/or sold with any software, provided that each copy
58
+ contains the above copyright notice and this license. These can be
59
+ included either as stand-alone text files, human-readable headers or
60
+ in the appropriate machine-readable metadata fields within text or
61
+ binary files as long as those fields can be easily viewed by the user.
62
+
63
+ 3) No Modified Version of the Font Software may use the Reserved Font
64
+ Name(s) unless explicit written permission is granted by the corresponding
65
+ Copyright Holder. This restriction only applies to the primary font name as
66
+ presented to the users.
67
+
68
+ 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
69
+ Software shall not be used to promote, endorse or advertise any
70
+ Modified Version, except to acknowledge the contribution(s) of the
71
+ Copyright Holder(s) and the Author(s) or with their explicit written
72
+ permission.
73
+
74
+ 5) The Font Software, modified or unmodified, in part or in whole,
75
+ must be distributed entirely under this license, and must not be
76
+ distributed under any other license. The requirement for fonts to
77
+ remain under this license does not apply to any document created
78
+ using the Font Software.
79
+
80
+ TERMINATION
81
+ This license becomes null and void if any of the above conditions are
82
+ not met.
83
+
84
+ DISCLAIMER
85
+ THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
86
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
87
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
88
+ OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
89
+ COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
90
+ INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
91
+ DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92
+ FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
93
+ OTHER DEALINGS IN THE FONT SOFTWARE.
src/font/Poppins-Bold.ttf ADDED
Binary file (154 kB). View file
 
src/font/Poppins-Regular.ttf ADDED
Binary file (158 kB). View file
 
src/font/SIL Open Font License.txt ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2011 by vernon adams (vern@newtypography.co.uk),
2
+ with Reserved Font Name "Muli".
3
+
4
+ This Font Software is licensed under the SIL Open Font License, Version 1.1.
5
+ This license is copied below, and is also available with a FAQ at: http://scripts.sil.org/OFL
6
+
7
+ -----------------------------------------------------------
8
+ SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
9
+ -----------------------------------------------------------
10
+
11
+ PREAMBLE
12
+ The goals of the Open Font License (OFL) are to stimulate worldwide development of collaborative font projects, to support the font creation efforts of academic and linguistic communities, and to provide a free and open framework in which fonts may be shared and improved in partnership with others.
13
+
14
+ The OFL allows the licensed fonts to be used, studied, modified and redistributed freely as long as they are not sold by themselves. The fonts, including any derivative works, can be bundled, embedded, redistributed and/or sold with any software provided that any reserved names are not used by derivative works. The fonts and derivatives, however, cannot be released under any other type of license. The requirement for fonts to remain under this license does not apply to any document created using the fonts or their derivatives.
15
+
16
+ DEFINITIONS
17
+ "Font Software" refers to the set of files released by the Copyright Holder(s) under this license and clearly marked as such. This may include source files, build scripts and documentation.
18
+
19
+ "Reserved Font Name" refers to any names specified as such after the copyright statement(s).
20
+
21
+ "Original Version" refers to the collection of Font Software components as distributed by the Copyright Holder(s).
22
+
23
+ "Modified Version" refers to any derivative made by adding to, deleting, or substituting -- in part or in whole -- any of the components of the Original Version, by changing formats or by porting the Font Software to a new environment.
24
+
25
+ "Author" refers to any designer, engineer, programmer, technical writer or other person who contributed to the Font Software.
26
+
27
+ PERMISSION & CONDITIONS
28
+ Permission is hereby granted, free of charge, to any person obtaining a copy of the Font Software, to use, study, copy, merge, embed, modify, redistribute, and sell modified and unmodified copies of the Font Software, subject to the following conditions:
29
+
30
+ 1) Neither the Font Software nor any of its individual components, in Original or Modified Versions, may be sold by itself.
31
+
32
+ 2) Original or Modified Versions of the Font Software may be bundled, redistributed and/or sold with any software, provided that each copy contains the above copyright notice and this license. These can be included either as stand-alone text files, human-readable headers or in the appropriate machine-readable metadata fields within text or binary files as long as those fields can be easily viewed by the user.
33
+
34
+ 3) No Modified Version of the Font Software may use the Reserved Font Name(s) unless explicit written permission is granted by the corresponding Copyright Holder. This restriction only applies to the primary font name as presented to the users.
35
+
36
+ 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font Software shall not be used to promote, endorse or advertise any Modified Version, except to acknowledge the contribution(s) of the Copyright Holder(s) and the Author(s) or with their explicit written permission.
37
+
38
+ 5) The Font Software, modified or unmodified, in part or in whole, must be distributed entirely under this license, and must not be distributed under any other license. The requirement for fonts to remain under this license does not apply to any document created using the Font Software.
39
+
40
+ TERMINATION
41
+ This license becomes null and void if any of the above conditions are not met.
42
+
43
+ DISCLAIMER
44
+ THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM OTHER DEALINGS IN THE FONT SOFTWARE.