alessandro trinca tornidor committed
Commit 85b7206 · 1 Parent(s): 7810fbd

feat: port whisper and faster-whisper support from https://github.com/Thiagohgl/ai-pronunciation-trainer

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
- .coveragerc +3 -3
- .gitignore +153 -118
- .idea/inspectionProfiles/Project_Default.xml +15 -0
- .idea/vcs.xml +6 -0
- aip_trainer/models/AIModels.py → AIModels.py +31 -3
- Dockerfile +0 -37
- aip_trainer/models/ModelInterfaces.py → ModelInterfaces.py +1 -1
- README.md +2 -2
- aip_trainer/models/RuleBasedModels.py → RuleBasedModels.py +14 -6
- aip_trainer/WordMatching.py → WordMatching.py +85 -51
- aip_trainer/WordMetrics.py → WordMetrics.py +28 -4
- aip_trainer/__init__.py +0 -21
- aip_trainer/lambdas/__init__.py +0 -1
- aip_trainer/lambdas/data_de_en_with_categories.json +0 -0
- aip_trainer/lambdas/lambdaGetSample.py +0 -106
- aip_trainer/models/__init__.py +0 -0
- aip_trainer/utils/__init__.py +0 -0
- aip_trainer/utils/split_cosmic_ray_report.py +0 -33
- aip_trainer/utils/typing_hints.py +0 -19
- aip_trainer/utils/utilities.py +0 -57
- app.py +53 -24
- app_description.md +11 -0
- aip_trainer/lambdas/app_description.md → app_headline.md +3 -1
- constants.py +31 -0
- cosmic_ray_config.toml +0 -8
- tests/test_data_de_en_2.pickle → data_de_en_2.pickle +0 -0
- databases/data_de.csv +0 -0
- databases/data_en.csv +0 -0
- dockerfiles/apt_preferences +0 -9
- dockerfiles/debian.sources +0 -17
- dockerfiles/dockerfile-base +0 -72
- faster_whisper_wrapper.py +56 -0
- images/{MainScreen.png → MainScreen.jpg} +2 -2
- aip_trainer/lambdas/js.py → js.py +0 -14
- lambdaChangeModel.py +14 -0
- lambdaGetSample.py +145 -0
- aip_trainer/lambdas/lambdaSpeechToScore.py → lambdaSpeechToScore.py +176 -57
- aip_trainer/lambdas/lambdaTTS.py → lambdaTTS.py +29 -5
- aip_trainer/models/models.py → models.py +209 -88
- packages.txt +0 -1
- pre-requirements.txt +0 -1
- aip_trainer/pronunciationTrainer.py → pronunciationTrainer.py +44 -39
- requirements-dev.txt +0 -1
- requirements-flask.txt +0 -21
- requirements-gradio.txt +0 -1
- requirements.txt +13 -9
- aip_trainer/utils/session_logger.py → session_logger.py +3 -3
- static/.gitignore +0 -3
- static/.vscode/launch.json +0 -20
- static/css/{style.css → style-new.css} +142 -51
.coveragerc CHANGED

@@ -1,9 +1,9 @@
 [run]
-source =
-omit = ./venv
+source = ./*.py
+omit = ./tests/*,./tests/**/*,./*venv*/*,__version__.py,*tests*,*app.py,js.py,*manage.py,*__init__.py,*migrations*,*asgi*,*wsgi*,*admin.py,*urls.py
 
 [report]
-omit =
+omit = ./*venv*/*,*tests*,*app.py,*manage.py,*__init__.py,*migrations*,js.py,*asgi*,*wsgi*,*admin.py,*urls.py
 
 exclude_lines =
     if __name__ == .__main__.:
.gitignore CHANGED

@@ -1,63 +1,3 @@
-# Created by https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode
-
-### Linux ###
-*~
-
-# temporary files which can be created if a process still has a handle open of a deleted file
-.fuse_hidden*
-
-# KDE directory preferences
-.directory
-
-# Linux trash folder which might appear on any partition or disk
-.Trash-*
-
-# .nfs files are created when an open file is removed but is still being accessed
-.nfs*
-
-### OSX ###
-*.DS_Store
-*/*.DS_Store
-*/**/*.DS_Store
-.AppleDouble
-.LSOverride
-
-# Icon must end with two \r
-Icon
-
-# Thumbnails
-._*
-
-# Files that might appear in the root of a volume
-.DocumentRevisions-V100
-.fseventsd
-.Spotlight-V100
-.TemporaryItems
-.Trashes
-.VolumeIcon.icns
-.com.apple.timemachine.donotpresent
-
-# Directories potentially created on remote AFP share
-.AppleDB
-.AppleDesktop
-Network Trash Folder
-Temporary Items
-.apdisk
-
-# CMake
-cmake-build-debug/
-
-# Ruby plugin and RubyMine
-/.rakeTasks
-
-# Crashlytics plugin (for Android Studio and IntelliJ)
-com_crashlytics_export_strings.xml
-crashlytics.properties
-crashlytics-build.properties
-fabric.properties
-
-### Python ###
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

@@ -80,9 +20,14 @@ parts/
 sdist/
 var/
 wheels/
+pip-wheel-metadata/
+share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
+MANIFEST
+static/node_modules/*
+static/dist/*
 
 # PyInstaller
 # Usually these files are written by a python script from a template

@@ -97,19 +42,31 @@ pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
+.nox/
 .coverage
 .coverage.*
 .cache
-.pytest_cache/
 nosetests.xml
 coverage.xml
 *.cover
+*.py,cover
 .hypothesis/
+.pytest_cache/
+static/test-results/*
+cosmic-ray-results/*
+cosmic_ray.sqlite
+static/playwright-report/*
 
 # Translations
 *.mo
 *.pot
 
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
 # Flask stuff:
 instance/
 .webassets-cache

@@ -118,8 +75,7 @@ instance/
 .scrapy
 
 # Sphinx documentation
-docs/_build/
-docs/_build/html/*
+docs/_build/
 
 # PyBuilder
 target/

@@ -127,25 +83,37 @@ target/
 # Jupyter Notebook
 .ipynb_checkpoints
 
+# IPython
+profile_default/
+ipython_config.py
+
 # pyenv
 .python-version
 
-#
-
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
 
 # SageMath parsed files
 *.sage.py
 
 # Environments
 .env
-.env*
-.venv
 .venv*
 env/
-venv
+venv*
 ENV/
 env.bak/
-venv.bak/
 
 # Spyder project settings
 .spyderproject

@@ -159,55 +127,24 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+.dmypy.json
+dmypy.json
 
-.
-!.vscode/settings.json
-!.vscode/tasks.json
-!.vscode/launch.json
-!.vscode/extensions.json
-.history
+# Pyre type checker
+.pyre/
 
-# Windows thumbnail cache files
-Thumbs.db
-ehthumbs.db
-ehthumbs_vista.db
-
-# Folder config file
-Desktop.ini
-
-# Recycle Bin used on file shares
-$RECYCLE.BIN/
-
-# Windows Installer files
-*.cab
-*.msi
-*.msm
-*.msp
-
-# Windows shortcuts
-*.lnk
-
-# Build folder
-*/build/*
+tmp/*
 
 # custom
-/tests/events.tar
-function_dump_*.json
-*.yml
-
-#
+translation_model_de.pickle
+translation_tokenizer_de.pickle
+test.ogg
 
-## .idea files
-# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Created by https://www.toptal.com/developers/gitignore/api/jetbrains,windows,linux,visualstudiocode
+# Edit at https://www.toptal.com/developers/gitignore?templates=jetbrains,windows,linux,visualstudiocode
+
+### JetBrains ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
 # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
 
 # User-specific stuff

@@ -217,6 +154,9 @@ function_dump_*.json
 .idea/**/dictionaries
 .idea/**/shelf
 
+# AWS User-specific
+.idea/**/aws.xml
+
 # Generated files
 .idea/**/contentModel.xml

@@ -237,9 +177,14 @@ function_dump_*.json
 # When using Gradle or Maven with auto-import, you should exclude module files,
 # since they will be recreated, and may cause churn. Uncomment if using
 # auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
 # .idea/modules.xml
-.idea/*.iml
+# .idea/*.iml
 # .idea/modules
+# *.iml
+# *.ipr
 
 # CMake
 cmake-build-*/

@@ -262,6 +207,9 @@ atlassian-ide-plugin.xml
 # Cursive Clojure plugin
 .idea/replstate.xml
 
+# SonarLint plugin
+.idea/sonarlint/
+
 # Crashlytics plugin (for Android Studio and IntelliJ)
 com_crashlytics_export_strings.xml
 crashlytics.properties

@@ -274,11 +222,98 @@ fabric.properties
 # Android studio 3.1+ serialized cache file
 .idea/caches/build_file_checksums.ser
 
+### JetBrains Patch ###
+# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
+
+# *.iml
+# modules.xml
+# .idea/misc.xml
+# *.ipr
+
 # Sonarlint plugin
-.
-
+# https://plugins.jetbrains.com/plugin/7973-sonarlint
+.idea/**/sonarlint/
+
+# SonarQube Plugin
+# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
+.idea/**/sonarIssues.xml
+
+# Markdown Navigator plugin
+# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
+.idea/**/markdown-navigator.xml
+.idea/**/markdown-navigator-enh.xml
+.idea/**/markdown-navigator/
+
+# Cache file creation bug
+# See https://youtrack.jetbrains.com/issue/JBR-2257
+.idea/$CACHE_FILE$
+
+# CodeStream plugin
+# https://plugins.jetbrains.com/plugin/12206-codestream
+.idea/codestream.xml
+
+# Azure Toolkit for IntelliJ plugin
+# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
+.idea/**/azureSettings.xml
+
+### Linux ###
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+### Windows ###
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
 
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
 
+# End of https://www.toptal.com/developers/gitignore/api/jetbrains,windows,linux,visualstudiocode
.idea/inspectionProfiles/Project_Default.xml ADDED

@@ -0,0 +1,15 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="HtmlUnknownAttribute" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="myValues">
+        <value>
+          <list size="1">
+            <item index="0" class="java.lang.String" itemvalue="label" />
+          </list>
+        </value>
+      </option>
+      <option name="myCustomValuesEnabled" value="true" />
+    </inspection_tool>
+  </profile>
+</component>
.idea/vcs.xml ADDED

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
aip_trainer/models/AIModels.py → AIModels.py RENAMED

@@ -1,7 +1,6 @@
 import numpy as np
 import torch
-
-from aip_trainer.models import ModelInterfaces
+import ModelInterfaces
 
 
 class NeuralASR(ModelInterfaces.IASRModel):

@@ -21,7 +20,6 @@ class NeuralASR(ModelInterfaces.IASRModel):
     def getWordLocations(self) -> list:
         """Get the pair of words location from audio"""
         assert self.word_locations_in_samples is not None, 'Can get word locations without having processed the audio'
-
         return self.word_locations_in_samples
 
     def processAudio(self, audio: torch.Tensor):

@@ -32,3 +30,33 @@ class NeuralASR(ModelInterfaces.IASRModel):
 
         self.audio_transcript, self.word_locations_in_samples = self.decoder(
             nn_output[0, :, :].detach(), audio_length_in_samples, word_align=True)
+
+
+class NeuralTTS(ModelInterfaces.ITextToSpeechModel):
+    def __init__(self, model: torch.nn.Module, sampling_rate: int) -> None:
+        super().__init__()
+        self.model = model
+        self.sampling_rate = sampling_rate
+
+    def getAudioFromSentence(self, sentence: str) -> np.array:
+        with torch.inference_mode():
+            audio_transcript = self.model.apply_tts(texts=[sentence],
+                                                    sample_rate=self.sampling_rate)[0]
+        return audio_transcript
+
+
+class NeuralTranslator(ModelInterfaces.ITranslationModel):
+    def __init__(self, model: torch.nn.Module, tokenizer) -> None:
+        super().__init__()
+        self.model = model
+        self.tokenizer = tokenizer
+
+    def translateSentence(self, sentence: str) -> str:
+        """Get the transcripts of the process audio"""
+        tokenized_text = self.tokenizer(sentence, return_tensors='pt')
+        translation = self.model.generate(**tokenized_text)
+        translated_text = self.tokenizer.batch_decode(
+            translation, skip_special_tokens=True)[0]
+        return translated_text
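The new `NeuralTranslator` class wraps any seq2seq model that exposes `generate` together with a matching tokenizer. As a minimal sketch (not part of this commit), a Hugging Face MarianMT checkpoint satisfies that contract; the model name and the `transformers` dependency below are assumptions for illustration only:

```python
# Illustrative sketch only: this commit does not pin a translation checkpoint.
# "Helsinki-NLP/opus-mt-de-en" and the transformers dependency are assumptions.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import AIModels  # the root-level module introduced by this commit's rename

model_name = "Helsinki-NLP/opus-mt-de-en"  # any seq2seq model with .generate() should fit
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# NeuralTranslator only needs the tokenizer/generate/batch_decode trio shown in the diff.
translator = AIModels.NeuralTranslator(model, tokenizer)
print(translator.translateSentence("Das Wetter ist heute schön."))
```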
Dockerfile DELETED

@@ -1,37 +0,0 @@
-FROM registry.gitlab.com/aletrn/ai-pronunciation-trainer:0.5.0
-
-ARG ARCH
-ARG WORKDIR_ROOT
-ENV PYTHONPATH="${WORKDIR_ROOT}:${WORKDIR_ROOT}/.venv:${PYTHONPATH}:/usr/local/lib/python3/dist-packages"
-ENV MPLCONFIGDIR=/tmp/matplotlib
-ENV IS_DOCKER_CONTAINER="YES"
-ENV LOG_JSON_FORMAT="TRUE"
-ENV LOG_LEVEL="INFO"
-
-ENV VIRTUAL_ENV=${WORKDIR_ROOT}/.venv PATH="${WORKDIR_ROOT}/.venv/bin:$PATH"
-
-COPY --chown=python:python . ${WORKDIR_ROOT}/.
-
-RUN python --version
-RUN pip list
-RUN echo "PATH: ${PATH}."
-RUN echo "WORKDIR_ROOT: ${WORKDIR_ROOT}."
-RUN ls -l ${WORKDIR_ROOT}
-RUN ls -ld ${WORKDIR_ROOT}
-RUN python -c "import sys; print(sys.path)"
-RUN python -c "import epitran"
-RUN python -c "import flask"
-RUN python -c "import pandas"
-RUN python -c "from torch import Tensor"
-RUN python -c "import gunicorn"
-RUN df -h
-RUN ls -l ${WORKDIR_ROOT}/webApp.py
-RUN ls -l ${WORKDIR_ROOT}/static/
-
-USER 999
-ENV PATH="${WORKDIR_ROOT}:${WORKDIR_ROOT}/.venv/bin:$PATH"
-RUN echo "PATH: $PATH ..."
-RUN echo "PYTHONPATH: $PYTHONPATH ..."
-RUN echo "MPLCONFIGDIR: $MPLCONFIGDIR ..."
-
-CMD ["gunicorn", "--bind", "0.0.0.0:3000", "webApp:app"]
aip_trainer/models/ModelInterfaces.py → ModelInterfaces.py RENAMED

@@ -1,5 +1,5 @@
-import abc
 
+import abc
 import numpy as np
 
 
README.md CHANGED

@@ -89,7 +89,7 @@ find aip_trainer -name "__pycache__" -exec rm -rf {} \;
 Then execute the tests again:
 
 ```bash
-pytest --cov=
+python -m pytest tests/models/test_models_faster_whisper.py; echo "# start pytest complete test suite #"; IS_TESTING=TRUE python -m pytest tests --cov="." --cov-report=term-missing && coverage html
 ```
 
 ### Backend tests execution on Windows

@@ -106,7 +106,7 @@ Normally I use Visual Studio Code to write and execute my playwright tests, howe
 
 ```bash
 pnpm install
-pnpm playwright test
+pnpm playwright test --workers 1 --retries 4 --project=chromium
 ```
 
 ### Unused classes and functions (now removed)
aip_trainer/models/RuleBasedModels.py → RuleBasedModels.py RENAMED

@@ -1,8 +1,20 @@
+import ModelInterfaces
+import torch
+import numpy as np
+import epitran
 import eng_to_ipa
 
-from aip_trainer.models import ModelInterfaces
-from aip_trainer import app_logger
 
+def get_phonem_converter(language: str):
+    if language == 'de':
+        phonem_converter = EpitranPhonemConverter(
+            epitran.Epitran('deu-Latn'))
+    elif language == 'en':
+        phonem_converter = EngPhonemConverter()
+    else:
+        raise ValueError('Language not implemented')
+
+    return phonem_converter
 
 class EpitranPhonemConverter(ModelInterfaces.ITextToPhonemModel):
     word_locations_in_samples = None

@@ -13,9 +25,7 @@ class EpitranPhonemConverter(ModelInterfaces.ITextToPhonemModel):
         self.epitran_model = epitran_model
 
     def convertToPhonem(self, sentence: str) -> str:
-        app_logger.debug(f'starting EpitranPhonemConverter.convertToPhonem for sentence/token "{sentence}"...')
         phonem_representation = self.epitran_model.transliterate(sentence)
-        app_logger.debug(f'EpitranPhonemConverter: got phonem_representation for sentence/token "{sentence}"!')
         return phonem_representation
 
 

@@ -25,8 +35,6 @@ class EngPhonemConverter(ModelInterfaces.ITextToPhonemModel):
         super().__init__()
 
     def convertToPhonem(self, sentence: str) -> str:
-        app_logger.debug(f'starting EngPhonemConverter.convertToPhonem for sentence/token "{sentence}"...')
         phonem_representation = eng_to_ipa.convert(sentence)
         phonem_representation = phonem_representation.replace('*','')
-        app_logger.debug(f'EngPhonemConverter: got phonem_representation for sentence/token "{sentence}"!')
         return phonem_representation
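The new module-level `get_phonem_converter` factory makes the per-language converter choice explicit. A minimal usage sketch, assuming the root-level module layout of this commit and the `epitran` and `eng_to_ipa` packages being installed:

```python
# Minimal usage sketch for the factory added in this commit.
from RuleBasedModels import get_phonem_converter

converter_de = get_phonem_converter("de")  # Epitran-backed, rule-based German G2P
converter_en = get_phonem_converter("en")  # eng_to_ipa-backed English converter

print(converter_de.convertToPhonem("Hallo Welt"))
print(converter_en.convertToPhonem("hello world"))
# Any other language code raises ValueError('Language not implemented').
```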
aip_trainer/WordMatching.py → WordMatching.py RENAMED

@@ -1,22 +1,24 @@
 import time
 from string import punctuation
+from typing import List, Tuple
 
 import numpy as np
 from dtwalign import dtw_from_distance_matrix
 from ortools.sat.python import cp_model
 
-
+import WordMetrics
+from constants import app_logger
 
 offset_blank = 1
 TIME_THRESHOLD_MAPPING = 5.0
 
 
-def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.
+def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.ndarray:
     number_of_real_words = len(words_real)
     number_of_estimated_words = len(words_estimated)
 
     word_distance_matrix = np.zeros(
-        (number_of_estimated_words+offset_blank, number_of_real_words))
+        (number_of_estimated_words + offset_blank, number_of_real_words))
     for idx_estimated in range(number_of_estimated_words):
         for idx_real in range(number_of_real_words):
             word_distance_matrix[idx_estimated, idx_real] = WordMetrics.edit_distance_python(

@@ -25,7 +27,7 @@ def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.arra
     if offset_blank == 1:
         for idx_real in range(number_of_real_words):
             word_distance_matrix[number_of_estimated_words,
-
+                                 idx_real] = len(words_real[idx_real])
     return word_distance_matrix

@@ -33,37 +35,37 @@ def get_best_path_from_distance_matrix(word_distance_matrix):
     modelCpp = cp_model.CpModel()
 
     number_of_real_words = word_distance_matrix.shape[1]
-    number_of_estimated_words = word_distance_matrix.shape[0]-1
+    number_of_estimated_words = word_distance_matrix.shape[0] - 1
 
     number_words = np.maximum(number_of_real_words, number_of_estimated_words)
 
     estimated_words_order = [modelCpp.NewIntVar(0, int(
-        number_words - 1 + offset_blank), 'w%i' % i) for i in range(number_words+offset_blank)]
+        number_words - 1 + offset_blank), 'w%i' % i) for i in range(number_words + offset_blank)]
 
     # They are in ascending order
-    for word_idx in range(number_words-1):
+    for word_idx in range(number_words - 1):
         modelCpp.Add(
-            estimated_words_order[word_idx+1] >= estimated_words_order[word_idx])
+            estimated_words_order[word_idx + 1] >= estimated_words_order[word_idx])
 
     total_phoneme_distance = 0
     real_word_at_time = {}
     for idx_estimated in range(number_of_estimated_words):
         for idx_real in range(number_of_real_words):
             real_word_at_time[idx_estimated, idx_real] = modelCpp.NewBoolVar(
-                'real_word_at_time'+str(idx_real)+'-'+str(idx_estimated))
+                'real_word_at_time' + str(idx_real) + '-' + str(idx_estimated))
             modelCpp.Add(estimated_words_order[idx_estimated] == idx_real).OnlyEnforceIf(
                 real_word_at_time[idx_estimated, idx_real])
             total_phoneme_distance += word_distance_matrix[idx_estimated,
-
+                                                           idx_real] * real_word_at_time[idx_estimated, idx_real]
 
     # If no word in time, difference is calculated from empty string
     for idx_real in range(number_of_real_words):
         word_has_a_match = modelCpp.NewBoolVar(
-            'word_has_a_match'+str(idx_real))
+            'word_has_a_match' + str(idx_real))
         modelCpp.Add(sum([real_word_at_time[idx_estimated, idx_real] for idx_estimated in range(
             number_of_estimated_words)]) == 1).OnlyEnforceIf(word_has_a_match)
         total_phoneme_distance += word_distance_matrix[number_of_estimated_words,
-
+                                                       idx_real] * word_has_a_match.Not()
 
     # Loss should be minimized
     modelCpp.Minimize(total_phoneme_distance)

@@ -79,18 +81,16 @@ def get_best_path_from_distance_matrix(word_distance_matrix):
                 (solver.Value(estimated_words_order[word_idx])))
 
         return np.array(mapped_indices, dtype=int)
-    except
-        app_logger.error(f"ex:{ex}.")
+    except:
         return []
 
 
-def get_resulting_string(mapped_indices: np.
+def get_resulting_string(mapped_indices: np.ndarray, words_estimated: list, words_real: list) -> Tuple[List, List]:
     mapped_words = []
     mapped_words_indices = []
     WORD_NOT_FOUND_TOKEN = '-'
     number_of_real_words = len(words_real)
     for word_idx in range(number_of_real_words):
-        app_logger.debug(f"{word_idx} => {mapped_indices} == {word_idx}, {mapped_indices == word_idx} #")
         position_of_real_word_indices = np.where(
             mapped_indices == word_idx)[0].astype(int)

@@ -109,59 +109,93 @@ def get_resulting_string(mapped_indices: np.array, words_estimated: list, words_
             error = 99999
             best_possible_combination = ''
             best_possible_idx = -1
-
+            for single_word_idx in position_of_real_word_indices:
+                idx_above_word = single_word_idx >= len(words_estimated)
+                if idx_above_word:
+                    continue
+                error_word = WordMetrics.edit_distance_python(
+                    words_estimated[single_word_idx], words_real[word_idx])
+                if error_word < error:
+                    error = error_word * 1
+                    best_possible_combination = words_estimated[single_word_idx]
+                    best_possible_idx = single_word_idx
 
             mapped_words.append(best_possible_combination)
             mapped_words_indices.append(best_possible_idx)
-            # continue
+            continue
 
     return mapped_words, mapped_words_indices
-
-
-def inner_get_resulting_string(
-        best_possible_combination, best_possible_idx, error, position_of_real_word_indices, word_idx, words_estimated, words_real
-):
-    for single_word_idx in position_of_real_word_indices:
-        idx_above_word = single_word_idx >= len(words_estimated)
-        if idx_above_word:
-            continue
-        error_word = WordMetrics.edit_distance_python(
-            words_estimated[single_word_idx], words_real[word_idx])
-        if error_word < error:
-            error = error_word * 1
-            best_possible_combination = words_estimated[single_word_idx]
-            best_possible_idx = single_word_idx
-    return best_possible_combination, best_possible_idx
 
 
-def get_best_mapped_words(words_estimated: list, words_real: list) -> tuple[list, list]:
+def get_best_mapped_words(words_estimated: list | str, words_real: list | str, use_dtw:bool = False) -> tuple[list, list]:
+    app_logger.info(f"words_estimated: '{words_estimated}', words_real: '{words_real}', use_dtw:{use_dtw}.")
     word_distance_matrix = get_word_distance_matrix(
         words_estimated, words_real)
-
+    app_logger.debug(f"word_distance_matrix: '{word_distance_matrix}'.")
     start = time.time()
-
-    mapped_indices
-
+    app_logger.info(f"use_dtw: '{use_dtw}'.")
+    if use_dtw:
+        alignment = (dtw_from_distance_matrix(word_distance_matrix.T))
+        app_logger.debug(f"alignment: '{alignment}'.")
+        mapped_indices = alignment.get_warping_path()[:len(words_estimated)]
+        app_logger.debug(f"mapped_indices: '{mapped_indices}'.")
+        duration_of_mapping = time.time()-start
+    else:
+        mapped_indices = get_best_path_from_distance_matrix(word_distance_matrix)
+        app_logger.debug(f"mapped_indices: '{mapped_indices}'.")
+        duration_of_mapping = time.time()-start
+        # In case or-tools doesn't converge, go to a faster, low-quality solution
+        check_mapped_indices_or_duration = len(mapped_indices) == 0 or duration_of_mapping > TIME_THRESHOLD_MAPPING+0.5
+        app_logger.info(f"check_mapped_indices_or_duration: '{check_mapped_indices_or_duration}'.")
+        if check_mapped_indices_or_duration:
+            #mapped_indices = (dtw_from_distance_matrix(
+            #    word_distance_matrix)).path[:len(words_estimated), 1]
+            word_distance_matrix_transposed = word_distance_matrix.T
+            app_logger.debug(f"word_distance_matrix_transposed: '{word_distance_matrix_transposed}'.")
+            alignment = dtw_from_distance_matrix(word_distance_matrix_transposed)
+            app_logger.debug(f"check_mapped_indices_or_duration, alignment: '{alignment}'.")
+            mapped_indices = alignment.get_warping_path()
+            app_logger.debug(f"check_mapped_indices_or_duration, mapped_indices: '{mapped_indices}'.")
 
-    mapped_words, mapped_words_indices = get_resulting_string(
-        mapped_indices, words_estimated, words_real)
-
+    mapped_words, mapped_words_indices = get_resulting_string(mapped_indices, words_estimated, words_real)
+    app_logger.debug(f"mapped_words: '{mapped_words}', mapped_words_indices: '{mapped_words_indices}', duration_of_mapping:{duration_of_mapping}.")
+    return mapped_words, mapped_words_indices
 
 
+## Faster, but not optimal
+# def get_best_mapped_words_dtw(words_estimated: list, words_real: list) -> list:
+#     from dtwalign import dtw_from_distance_matrix
+#     word_distance_matrix = get_word_distance_matrix(
+#         words_estimated, words_real)
+#     mapped_indices = dtw_from_distance_matrix(
+#         word_distance_matrix).path[:-1, 0]
+#
+#     mapped_words, mapped_words_indices = get_resulting_string(
+#         mapped_indices, words_estimated, words_real)
+#     return mapped_words, mapped_words_indices
 
 
 def getWhichLettersWereTranscribedCorrectly(real_word, transcribed_word):
-    is_leter_correct = [None]*len(real_word)
+    is_leter_correct = [None] * len(real_word)
     for idx, letter in enumerate(real_word):
+        letter = letter.lower()
+        transcribed_word[idx] = transcribed_word[idx].lower()
         if letter == transcribed_word[idx] or letter in punctuation:
            is_leter_correct[idx] = 1
         else:
            is_leter_correct[idx] = 0
     return is_leter_correct
+
+
+# def parseLetterErrorsToHTML(word_real, is_leter_correct):
+#     word_colored = ''
+#     correct_color_start = '*'
+#     correct_color_end = '*'
+#     wrong_color_start = '-'
+#     wrong_color_end = '-'
+#     for idx, letter in enumerate(word_real):
+#         if is_leter_correct[idx] == 1:
+#             word_colored += correct_color_start + letter + correct_color_end
+#         else:
+#             word_colored += wrong_color_start + letter + wrong_color_end
+#     return word_colored
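Taken together, the refactored `get_best_mapped_words` aligns the words the ASR model transcribed against the reference sentence: by default it asks the CP-SAT solver for the optimal assignment and falls back to DTW when the solver returns nothing or exceeds `TIME_THRESHOLD_MAPPING`. A usage sketch under the commit's root-module layout (requires `numpy`, `ortools`, and `dtwalign`; the exact output depends on the solver, so the values shown are illustrative):

```python
# Usage sketch for the refactored alignment entry point.
from WordMatching import get_best_mapped_words, getWhichLettersWereTranscribedCorrectly

words_estimated = ["the", "kat", "sat"]     # what the ASR model transcribed
words_real = ["the", "cat", "sat", "down"]  # the reference sentence

mapped_words, mapped_idx = get_best_mapped_words(words_estimated, words_real)
# mapped_words pairs each real word with its closest transcription ('-' if unmatched),
# e.g. ['the', 'kat', 'sat', '-']; mapped_idx holds the matching estimated-word indices.

# Per-letter correctness for one aligned pair. Note that the function mutates its
# second argument in place, so it must be a mutable sequence such as a list.
flags = getWhichLettersWereTranscribedCorrectly("cat", list("kat"))
print(flags)  # [0, 1, 1]
```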
aip_trainer/WordMetrics.py → WordMetrics.py RENAMED

@@ -1,9 +1,33 @@
 import numpy as np
 
-
+# ref from https://gitlab.com/-/snippets/1948157
+# For some variants, look here https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
 
-#
+# Pure python
+def edit_distance_python2(a, b):
+    # This version is commutative, so as an optimization we force |a|>=|b|
+    if len(a) < len(b):
+        return edit_distance_python(b, a)
+    if len(b) == 0:  # Can deal with empty sequences faster
+        return len(a)
+    # Only two rows are really needed: the one currently filled in, and the previous
+    distances = []
+    distances.append([i for i in range(len(b)+1)])
+    distances.append([0 for _ in range(len(b)+1)])
+    # We can prefill the first row:
+    costs = [0 for _ in range(3)]
+    for i, a_token in enumerate(a, start=1):
+        distances[1][0] += 1  # Deals with the first column.
+        for j, b_token in enumerate(b, start=1):
+            costs[0] = distances[1][j-1] + 1
+            costs[1] = distances[0][j] + 1
+            costs[2] = distances[0][j-1] + (0 if a_token == b_token else 1)
+            distances[1][j] = min(costs)
+        # Move to the next row:
+        distances[0][:] = distances[1][:]
+    return distances[1][len(b)]
 
+#https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/
 def edit_distance_python(seq1, seq2):
     size_x = len(seq1) + 1
     size_y = len(seq2) + 1

@@ -27,5 +51,5 @@ def edit_distance_python(seq1, seq2):
                 matrix[x-1,y-1] + 1,
                 matrix[x,y-1] + 1
             )
-
-    return matrix[size_x - 1, size_y - 1]
+    #print (matrix)
+    return matrix[size_x - 1, size_y - 1]
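Both functions compute the classic Levenshtein distance; the new two-row `edit_distance_python2` variant only trades the full DP matrix for two rows of memory. A quick sanity-check sketch of the expected equivalence:

```python
# Sanity-check sketch: both implementations should agree on the Levenshtein distance.
from WordMetrics import edit_distance_python, edit_distance_python2

assert edit_distance_python("kitten", "sitting") == 3
assert edit_distance_python2("kitten", "sitting") == 3
assert edit_distance_python2("", "abc") == 3   # empty-sequence fast path
assert edit_distance_python2("abc", "abc") == 0
print("edit distance implementations agree")
```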
aip_trainer/__init__.py DELETED

@@ -1,21 +0,0 @@
-import os
-from pathlib import Path
-
-import structlog
-from dotenv import load_dotenv
-
-from aip_trainer.utils import session_logger
-
-
-load_dotenv()
-PROJECT_ROOT_FOLDER = Path(globals().get("__file__", "./_")).absolute().parent.parent
-LOG_JSON_FORMAT = bool(os.getenv("LOG_JSON_FORMAT"))
-log_level = os.getenv("LOG_LEVEL", "INFO")
-sample_rate_start = int(os.getenv('SAMPLE_RATE', 48000))
-accepted_sample_rates = [48000, 24000, 16000, 8000]
-try:
-    assert sample_rate_start in accepted_sample_rates
-except AssertionError:
-    raise ValueError(f"cannot use a sample rate of value '{sample_rate_start}', should be one of {accepted_sample_rates} ...")
-session_logger.setup_logging(json_logs=LOG_JSON_FORMAT, log_level=log_level)
-app_logger = structlog.stdlib.get_logger(__name__)
aip_trainer/lambdas/__init__.py DELETED

@@ -1 +0,0 @@
-
aip_trainer/lambdas/data_de_en_with_categories.json DELETED
The diff for this file is too large to render; see the raw diff.
aip_trainer/lambdas/lambdaGetSample.py DELETED

@@ -1,106 +0,0 @@
-import json
-import pickle
-from pathlib import Path
-
-import epitran
-import pandas as pd
-
-from aip_trainer import PROJECT_ROOT_FOLDER, app_logger
-from aip_trainer.models import RuleBasedModels
-from aip_trainer.utils.typing_hints import BodyGetSampleRequest
-
-
-class TextDataset:
-    def __init__(self, table, language='-'):
-        self.table_dataframe = table
-        self.number_of_samples = len(table)
-        self.language = language
-
-    def __getitem__(self, idx):
-        language_sentence = f"{self.language}_sentence" if self.language != '-' else 'sentence'
-        language_series = self.table_dataframe[language_sentence]
-        return [language_series.iloc[idx]]
-
-    def __len__(self):
-        return self.number_of_samples
-
-    def get_category_from_df_by_language(self, language: str, category_value:int):
-        selector = self.table_dataframe[f"{language}_category"] == category_value
-        df_by_category = self.table_dataframe[selector]
-        return df_by_category
-
-    def get_random_sample_from_df(self, language: str, category_value:int):
-        app_logger.info(f"language={language}, category_value={category_value}.")
-        choice = self.table_dataframe.sample(n=1)
-        if category_value !=0:
-            df_language_filtered_by_category_and_language = self.get_category_from_df_by_language(language, category_value)
-            choice = df_language_filtered_by_category_and_language.sample(n=1)
-        return [choice[f"{language}_sentence"].iloc[0]]
-
-
-sample_folder = Path(PROJECT_ROOT_FOLDER / "aip_trainer" / "lambdas")
-lambda_database = {}
-lambda_ipa_converter = {}
-
-with open(sample_folder / 'data_de_en_with_categories.json', 'r') as src:
-    df = pd.read_json(src)
-
-lambda_database['de'] = TextDataset(df, 'de')
-lambda_database['en'] = TextDataset(df, 'en')
-lambda_translate_new_sample = False
-lambda_ipa_converter['de'] = RuleBasedModels.EpitranPhonemConverter(
-    epitran.Epitran('deu-Latn'))
-lambda_ipa_converter['en'] = RuleBasedModels.EngPhonemConverter()
-
-
-def lambda_handler(event, context):
-    event_body = event["body"]
-    body = BodyGetSampleRequest.model_validate_json(event_body)
-    current_transcript = get_random_selection(body.language, body.category, is_gradio_output=False, transcript=body.transcript)
-    current_transcript = current_transcript[0] if isinstance(current_transcript, list) else current_transcript
-    current_ipa = lambda_ipa_converter[body.language].convertToPhonem(current_transcript)
-
-    app_logger.info(f"real_transcript='{current_transcript}', ipa_transcript='{current_ipa}'.")
-    result = {
-        'real_transcript': current_transcript,
-        'ipa_transcript': current_ipa,
-        'transcript_translation': ""
-    }
-
-    return json.dumps(result)
-
-
-def get_random_selection(language: str, category: int, is_gradio_output=True, transcript=None):
-    if transcript is not None and isinstance(transcript, str) and len(transcript) > 0:
-        return transcript
-    lambda_df_lang = lambda_database[language]
-    current_transcript = lambda_df_lang.get_random_sample_from_df(language, category)
-    app_logger.info(f"category={category}, language={language}, current_transcript={current_transcript}.")
-    return current_transcript[0] if is_gradio_output else current_transcript
-
-
-def getSentenceCategory(sentence) -> int:
-    number_of_words = len(sentence.split())
-    categories_word_limits = [0, 8, 20, 100000]
-    for category in range(len(categories_word_limits) - 1):
-        if categories_word_limits[category] < number_of_words <= categories_word_limits[category + 1]:
-            return category + 1
-
-
-def get_pickle2json_dataframe(
-        custom_pickle_filename_no_ext: Path | str = 'data_de_en_2',
-        custom_folder: Path = sample_folder
-):
-    custom_folder = Path(custom_folder)
-    with open(custom_folder / f'{custom_pickle_filename_no_ext}.pickle', 'rb') as handle:
-        df2 = pickle.load(handle)
-        pass
-    df2["de_category"] = df2["de_sentence"].apply(getSentenceCategory)
-    print("de_category added")
-    df2["en_category"] = df2["en_sentence"].apply(getSentenceCategory)
-    print("en_category added")
-    df_json = df2.to_json()
-    with open(custom_folder / f'{custom_pickle_filename_no_ext}.json', 'w') as dst:
-        dst.write(df_json)
-    print("data_de_en_with_categories.json written")
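Among the removed helpers, `getSentenceCategory` bucketed sentences by word count for the difficulty selector (category 1: up to 8 words, 2: up to 20, 3: longer); its root-level replacement in `lambdaGetSample.py` is not shown in this truncated view. A behavior sketch with the logic copied verbatim from the deleted file:

```python
# Behavior sketch of the removed getSentenceCategory (logic copied from the deleted file).
def getSentenceCategory(sentence) -> int:
    number_of_words = len(sentence.split())
    categories_word_limits = [0, 8, 20, 100000]
    for category in range(len(categories_word_limits) - 1):
        if categories_word_limits[category] < number_of_words <= categories_word_limits[category + 1]:
            return category + 1

print(getSentenceCategory("short sentence"))         # 1 (up to 8 words)
print(getSentenceCategory(" ".join(["word"] * 15)))  # 2 (9 to 20 words)
print(getSentenceCategory(" ".join(["word"] * 30)))  # 3 (more than 20 words)
```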
aip_trainer/models/__init__.py DELETED
File without changes (the file was empty).
aip_trainer/utils/__init__.py DELETED
File without changes (the file was empty).
aip_trainer/utils/split_cosmic_ray_report.py DELETED

@@ -1,33 +0,0 @@
-from pathlib import Path
-
-
-def get_cosmic_ray_report_filtered(input_filename, suffix="filtered", separator="============", filter_string_list: list = None):
-    if filter_string_list is None:
-        filter_string_list = ["test outcome: TestOutcome.KILLED"]
-    filename, ext = Path(input_filename).stem, Path(input_filename).suffix
-    working_dir = input_filename.parent
-    # Read the input file
-    with open(input_filename, 'r') as file:
-        content = file.read()
-
-    # Split the content into sections
-    sections = content.split(separator)
-    filtered_sections = [section for section in sections]
-
-    # Filter out sections containing "test outcome: TestOutcome.KILLED"
-    for filter_string in filter_string_list:
-        filtered_sections = [section for section in filtered_sections if filter_string not in section]
-
-    # Join the filtered sections back into a single string
-    filtered_content = separator.join(filtered_sections)
-
-    # Write the filtered content to a new file
-    with open(working_dir / f'{filename}_{suffix}{ext}', 'w') as file:
-        file.write(filtered_content)
-
-
-if __name__ == "__main__":
-    from aip_trainer import PROJECT_ROOT_FOLDER
-    _input_filename = "cosmic-ray-models2.txt"
-    get_cosmic_ray_report_filtered(PROJECT_ROOT_FOLDER / "tmp" / _input_filename)
aip_trainer/utils/typing_hints.py DELETED

@@ -1,19 +0,0 @@
-from typing import Annotated, Optional, TypeAlias
-from pydantic import BaseModel
-
-import annotated_types
-
-
-Category: TypeAlias = Annotated[int, annotated_types.Ge(0), annotated_types.Le(4)]
-
-
-class BodyGetSampleRequest(BaseModel):
-    category: Optional[Category] = 0
-    language: str
-    transcript: Optional[str] = ""
-
-
-class BodySpeechToScoreRequest(BaseModel):
-    base64Audio: str
-    language: str
-    title: str
aip_trainer/utils/utilities.py DELETED

@@ -1,57 +0,0 @@
-"""Various utilities (logger, time benchmark, args dump, numerical and stats info)"""
-
-from copy import deepcopy
-from aip_trainer import app_logger
-from aip_trainer.utils.serialize import serialize
-
-
-def hash_calculate(arr_or_path, is_file: bool, read_mode: str = "rb") -> str | bytes:
-    """
-    Return computed hash from input variable (typically a numpy array).
-
-    Args:
-        arr: input variable
-
-    Returns:
-        computed hash from input variable
-    """
-    from hashlib import sha256
-    from base64 import b64encode
-    from numpy import ndarray as np_ndarray
-
-    if is_file:
-        with open(arr_or_path, read_mode) as file_to_check:
-            # read contents of the file
-            arr_or_path = file_to_check.read()
-            # # pipe contents of the file through
-            # try:
-            #     return hashlib.sha256(data).hexdigest()
-            # except TypeError:
-            #     app_logger.warning(
-            #         f"TypeError, re-try encoding arg:{arr_or_path},type:{type(arr_or_path)}."
-            #     )
-            #     return hashlib.sha256(data.encode("utf-8")).hexdigest()
-
-    if isinstance(arr_or_path, np_ndarray):
-        hash_fn = sha256(arr_or_path.data)
-    elif isinstance(arr_or_path, dict):
-        import json
-
-        serialized = serialize(arr_or_path)
-        variable_to_hash = json.dumps(serialized, sort_keys=True).encode("utf-8")
-        hash_fn = sha256(variable_to_hash)
-    elif isinstance(arr_or_path, str):
-        try:
-            hash_fn = sha256(arr_or_path)
-        except TypeError:
-            app_logger.warning(
-                f"TypeError, re-try encoding arg:{arr_or_path},type:{type(arr_or_path)}."
-            )
-            hash_fn = sha256(arr_or_path.encode("utf-8"))
-    elif isinstance(arr_or_path, bytes):
-        hash_fn = sha256(arr_or_path)
-    else:
-        raise ValueError(
-            f"variable 'arr':{arr_or_path} of type '{type(arr_or_path)}' not yet handled."
-        )
-    return b64encode(hash_fn.digest())
app.py CHANGED

@@ -1,8 +1,12 @@
 from pathlib import Path
 import gradio as gr
 
-
-from
+import js
+from constants import (PROJECT_ROOT_FOLDER, app_logger, sample_rate_start, MODEL_NAME_DEFAULT, model_urls,
+                       sample_rate_resample, samplerate_tts, silero_versions_dict)
+import lambdaGetSample
+import lambdaSpeechToScore
+import lambdaTTS
 
 
 css = """

@@ -38,9 +42,34 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
     app_logger.info("start gradio app building...")
 
     project_root_folder = Path(PROJECT_ROOT_FOLDER)
-    with open(project_root_folder / "
+    with open(project_root_folder / "app_headline.md", "r", encoding="utf-8") as app_headline_src:
+        md_app_headline = app_headline_src.read()
+    gr.Markdown(md_app_headline)
+    with open(project_root_folder / "app_description.md", "r", encoding="utf-8") as app_description_src:
         md_app_description = app_description_src.read()
-
+    model_url = model_urls[MODEL_NAME_DEFAULT]
+    app_logger.info(f"model_urls:{model_urls} ...")
+    models_names_urls_list = ""
+    other_supported_models = {k: v for k, v in model_urls.items() if k != MODEL_NAME_DEFAULT}
+    for model_name, model_url in other_supported_models.items():
+        app_logger.info(f"model_name: {model_name}, model_url: {model_url} ...")
+        models_names_urls_list += """\n - [{model_name}]({model_url})""".format(model_name=model_name, model_url=model_url)
+        if model_name == "silero":

The remaining hunks of this diff are only partially rendered (the page view is truncated); the recoverable old-side fragments follow, with lines cut off exactly where the view cut them off.

@@ -108,10 +137,10 @@
             visible=False,
         )
         text_recording_ipa = gr.Textbox(
-            placeholder=
        )
        text_ideal_ipa = gr.Textbox(
-            placeholder=
        )
        text_raw_json_output_hidden = gr.Textbox(placeholder=None, label="text_raw_json_output_hidden", visible=False)
        with gr.Group(elem_classes="speech-output-group background-white"):

@@ -127,11 +156,11 @@
                elem_classes="speech-output-html background-white",
            )
            with gr.Row():
-                with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col1"):
-                    num_pronunciation_accuracy = gr.Number(label="Current score %", elem_id="number-pronunciation-accuracy-id-element")
-                with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col2"):
                    num_score_de = gr.Number(label="Global score DE %", value=0, interactive=False, elem_id="number-score-de-id-element")
-                with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3"):
                    num_score_en = gr.Number(label="Global score EN %", value=0, interactive=False, elem_id="number-score-en-id-element")
            btn_recognize_speech_accuracy = gr.Button(value="Get speech accuracy score (%)", elem_id="btn-recognize-speech-accuracy-id-element")
            with gr.Row(elem_id="id-replay-splitted-audio-by-words"):

@@ -139,17 +168,17 @@
                with gr.Column(scale=1, min_width=50):
                    num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, minimum=0, value=0, interactive=False)
                with gr.Column(scale=4, min_width=100):
-
-                    label="
                        type="filepath",
                        show_download_button=True,
-                        elem_id="audio-
                    )
                    text_selected_recognized_word_hidden = gr.Textbox(label="text_selected_recognized_word", value="placeholder", interactive=False, visible=False)

    def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
        import json
-        _transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _num_tot_recognized_word, first_audio_file, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
        new_num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, value=0)
        words_list = _transcribed_text.split()
        first_word = words_list[0]

@@ -165,7 +194,7 @@
            text_raw_json_output_hidden: _res,
            num_tot_recognized_words: _num_tot_recognized_word,
            num_selected_recognized_word: new_num_selected_recognized_word,
-
            text_selected_recognized_word_hidden: first_word,
            num_audio_duration_hidden: first_audio_duration
        }

@@ -199,7 +228,7 @@
            num_score_en,
            num_tot_recognized_words,
            num_selected_recognized_word,
-
            text_selected_recognized_word_hidden,
            num_audio_duration_hidden
        ],

@@ -229,7 +258,7 @@
        clear3,
        inputs=[],
        outputs=[
-            audio_student_recording_stt, audio_tts,
            num_pronunciation_accuracy, num_selected_recognized_word, num_pronunciation_accuracy
        ],
    )

@@ -280,18 +309,18 @@
    num_selected_recognized_word.input(
        fn=lambdaSpeechToScore.get_selected_word,
        inputs=[num_selected_recognized_word, text_raw_json_output_hidden],
-        outputs=[
    )
-
        fn=None,
        inputs=[text_selected_recognized_word_hidden, radio_language, num_audio_duration_hidden],
-        outputs=
        js=js.js_play_audio
    )

    @gradio_app.load(inputs=[local_storage], outputs=[num_score_de, num_score_en])
    def load_from_local_storage(saved_values):
-
        return saved_values[0], saved_values[1]

    @gr.on([num_score_de.change, num_score_en.change], inputs=[num_score_de, num_score_en], outputs=[local_storage])

@@ -302,6 +331,6 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
 if __name__ == "__main__":
     try:
         gradio_app.launch()
-    except Exception as
-        app_logger.error(f"Error: {
-        raise
|
57 |
+
if model_name == "silero":
|
58 |
+
models_names_urls_list += " (German version: {}, English version: {})".format(silero_versions_dict["de"], silero_versions_dict["en"])
|
59 |
+
app_logger.info(f"models_names_urls_list: '{models_names_urls_list}' ...")
|
60 |
+
with gr.Accordion(
|
61 |
+
"Click here for expand and show current env variables samplerate values, the selected model and the supported ones",
|
62 |
+
open=False,
|
63 |
+
elem_id="accordion-models-env-variables-id-element"
|
64 |
+
):
|
65 |
+
gr.Markdown(md_app_description.format(
|
66 |
+
sample_rate_start=sample_rate_start,
|
67 |
+
model_name=MODEL_NAME_DEFAULT,
|
68 |
+
model_url=model_url,
|
69 |
+
models_names_urls_list=models_names_urls_list,
|
70 |
+
sample_rate_resample=sample_rate_resample,
|
71 |
+
samplerate_tts=samplerate_tts
|
72 |
+
))
|
73 |
with gr.Row():
|
74 |
with gr.Column(scale=4, min_width=300):
|
75 |
with gr.Row(elem_id="id-choose-random-phrase-by-language-and-difficulty"):
|
|
|
137 |
visible=False,
|
138 |
)
|
139 |
text_recording_ipa = gr.Textbox(
|
140 |
+
placeholder="-", label="Student phonetic transcription", elem_id="text-student-recording-ipa-id-element", interactive=False
|
141 |
)
|
142 |
text_ideal_ipa = gr.Textbox(
|
143 |
+
placeholder="-", label="Ideal phonetic transcription", elem_id="text-ideal-ipa-id-element", interactive=False
|
144 |
)
|
145 |
text_raw_json_output_hidden = gr.Textbox(placeholder=None, label="text_raw_json_output_hidden", visible=False)
|
146 |
with gr.Group(elem_classes="speech-output-group background-white"):
|
|
|
156 |
elem_classes="speech-output-html background-white",
|
157 |
)
|
158 |
with gr.Row():
|
159 |
+
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col1", elem_id="id-current-speech-accuracy-score-container"):
|
160 |
+
num_pronunciation_accuracy = gr.Number(label="Current score %", elem_id="number-pronunciation-accuracy-id-element", interactive=False, value=0)
|
161 |
+
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col2", elem_id="id-global-speech-accuracy-score-de-container"):
|
162 |
num_score_de = gr.Number(label="Global score DE %", value=0, interactive=False, elem_id="number-score-de-id-element")
|
163 |
+
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3", elem_id="id-global-speech-accuracy-score-en-container"):
|
164 |
num_score_en = gr.Number(label="Global score EN %", value=0, interactive=False, elem_id="number-score-en-id-element")
|
165 |
btn_recognize_speech_accuracy = gr.Button(value="Get speech accuracy score (%)", elem_id="btn-recognize-speech-accuracy-id-element")
|
166 |
with gr.Row(elem_id="id-replay-splitted-audio-by-words"):
|
|
|
168 |
with gr.Column(scale=1, min_width=50):
|
169 |
num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, minimum=0, value=0, interactive=False)
|
170 |
with gr.Column(scale=4, min_width=100):
|
171 |
+
audio_sliced_student_recording_stt = gr.Audio(
|
172 |
+
label="Sliced student speech output",
|
173 |
type="filepath",
|
174 |
show_download_button=True,
|
175 |
+
elem_id="audio-sliced-student-recording-stt-id-element",
|
176 |
)
|
177 |
text_selected_recognized_word_hidden = gr.Textbox(label="text_selected_recognized_word", value="placeholder", interactive=False, visible=False)
|
178 |
|
179 |
def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
|
180 |
import json
|
181 |
+
_transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _num_tot_recognized_word, first_audio_file, _res, _ = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
|
182 |
new_num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, value=0)
|
183 |
words_list = _transcribed_text.split()
|
184 |
first_word = words_list[0]
|
|
|
194 |
text_raw_json_output_hidden: _res,
|
195 |
num_tot_recognized_words: _num_tot_recognized_word,
|
196 |
num_selected_recognized_word: new_num_selected_recognized_word,
|
197 |
+
audio_sliced_student_recording_stt: first_audio_file,
|
198 |
text_selected_recognized_word_hidden: first_word,
|
199 |
num_audio_duration_hidden: first_audio_duration
|
200 |
}
|
|
|
228 |
num_score_en,
|
229 |
num_tot_recognized_words,
|
230 |
num_selected_recognized_word,
|
231 |
+
audio_sliced_student_recording_stt,
|
232 |
text_selected_recognized_word_hidden,
|
233 |
num_audio_duration_hidden
|
234 |
],
|
|
|
258 |
clear3,
|
259 |
inputs=[],
|
260 |
outputs=[
|
261 |
+
audio_student_recording_stt, audio_tts, audio_sliced_student_recording_stt, text_recording_ipa, text_ideal_ipa, text_transcribed_hidden,
|
262 |
num_pronunciation_accuracy, num_selected_recognized_word, num_pronunciation_accuracy
|
263 |
],
|
264 |
)
|
|
|
309 |
num_selected_recognized_word.input(
|
310 |
fn=lambdaSpeechToScore.get_selected_word,
|
311 |
inputs=[num_selected_recognized_word, text_raw_json_output_hidden],
|
312 |
+
outputs=[audio_sliced_student_recording_stt, text_selected_recognized_word_hidden, num_audio_duration_hidden],
|
313 |
)
|
314 |
+
audio_sliced_student_recording_stt.play(
|
315 |
fn=None,
|
316 |
inputs=[text_selected_recognized_word_hidden, radio_language, num_audio_duration_hidden],
|
317 |
+
outputs=audio_sliced_student_recording_stt,
|
318 |
js=js.js_play_audio
|
319 |
)
|
320 |
|
321 |
@gradio_app.load(inputs=[local_storage], outputs=[num_score_de, num_score_en])
|
322 |
def load_from_local_storage(saved_values):
|
323 |
+
app_logger.info(f"loading from local storage: {saved_values} ...")
|
324 |
return saved_values[0], saved_values[1]
|
325 |
|
326 |
@gr.on([num_score_de.change, num_score_en.change], inputs=[num_score_de, num_score_en], outputs=[local_storage])
|
|
|
331 |
if __name__ == "__main__":
|
332 |
try:
|
333 |
gradio_app.launch()
|
334 |
+
except Exception as ex:
|
335 |
+
app_logger.error(f"Error: {ex}")
|
336 |
+
raise ex
|
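A note on the callback pattern used above: Gradio functions like get_updated_score_by_language return a dict keyed by output components instead of a positional tuple, so only the named components get updated. A minimal sketch of the same pattern (component names here are illustrative, not the app's real ones):

import gradio as gr

with gr.Blocks() as demo:
    txt = gr.Textbox(label="input")
    num_words = gr.Number(label="words")
    first_word = gr.Textbox(label="first word")

    def analyze(text):
        words = text.split()
        # returning a dict keyed by components updates exactly those outputs
        return {num_words: len(words), first_word: words[0] if words else ""}

    txt.submit(analyze, inputs=[txt], outputs=[num_words, first_word])

demo.launch()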
app_description.md
ADDED
@@ -0,0 +1,11 @@
|
1 |
+
## Models and variables
|
2 |
+
|
3 |
+
Right now this tool uses:
|
4 |
+
|
5 |
+
- [{model_name}]({model_url}) as STT (speech-to-text) model; other supported models are:
|
6 |
+
{models_names_urls_list}
|
7 |
+
- <u>{sample_rate_start}</u> as input samplerate value (empirical tests suggest 48000 as the best input sample rate)
|
8 |
+
- <u>{sample_rate_resample}</u> as resampled samplerate value
|
9 |
+
- <u>{samplerate_tts}</u> as TTS (text-to-speech) samplerate value
|
10 |
+
|
11 |
+
|
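The curly-brace fields in this markdown are plain str.format placeholders that app.py fills before rendering the text with gr.Markdown. A minimal sketch of the substitution (the values below are examples, not the runtime ones):

template = "- [{model_name}]({model_url}) as STT model\n- {sample_rate_start} as input samplerate value"
print(template.format(
    model_name="whisper",
    model_url="https://pypi.org/project/openai-whisper/",
    sample_rate_start=48000,
))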
aip_trainer/lambdas/app_description.md → app_headline.md
RENAMED
@@ -1,4 +1,6 @@
|
|
1 |
# AI Pronunciation Trainer
|
2 |
|
3 |
See [my fork](https://github.com/trincadev/ai-pronunciation-trainer) of [AI Pronunciation Trainer](https://github.com/Thiagohgl/ai-pronunciation-trainer) repository
|
4 |
-
for more details.
|
|
|
|
|
|
1 |
# AI Pronunciation Trainer
|
2 |
|
3 |
See [my fork](https://github.com/trincadev/ai-pronunciation-trainer) of [AI Pronunciation Trainer](https://github.com/Thiagohgl/ai-pronunciation-trainer) repository
|
4 |
+
for more details.
|
5 |
+
|
6 |
+
|
constants.py
ADDED
@@ -0,0 +1,31 @@
|
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
import structlog
|
4 |
+
import session_logger
|
5 |
+
|
6 |
+
PROJECT_ROOT_FOLDER = Path(__file__).parent
|
7 |
+
ALLOWED_ORIGIN = os.getenv('ALLOWED_ORIGIN', 'http://localhost:3000')
|
8 |
+
LOG_JSON_FORMAT = bool(os.getenv("LOG_JSON_FORMAT"))
|
9 |
+
IS_TESTING = bool(os.getenv('IS_TESTING', ""))
|
10 |
+
STSCOREAPIKEY = os.getenv('STSCOREAPIKEY', "stscore_apikey_placeholder")
|
11 |
+
log_level = os.getenv("LOG_LEVEL", "INFO")
|
12 |
+
USE_DTW = bool(os.getenv("USE_DTW"))
|
13 |
+
MODEL_NAME_TESTING = "whisper"
|
14 |
+
_MODEL_NAME_DEFAULT = os.getenv("MODEL_NAME_DEFAULT", MODEL_NAME_TESTING)
|
15 |
+
MODEL_NAME_DEFAULT = MODEL_NAME_TESTING if IS_TESTING else _MODEL_NAME_DEFAULT
|
16 |
+
DEVICE = os.getenv("DEVICE", "cpu")
|
17 |
+
tmp_audio_extension = os.getenv('TMP_AUDIO_EXTENSION', '.wav')
|
18 |
+
session_logger.setup_logging(json_logs=LOG_JSON_FORMAT, log_level=log_level)
|
19 |
+
app_logger = structlog.stdlib.get_logger(__name__)
|
20 |
+
sample_rate_start = int(os.getenv('SAMPLE_RATE', 48000))
|
21 |
+
sample_rate_resample = 16000
|
22 |
+
samplerate_tts = 16000
|
23 |
+
language_not_implemented = "Language '{}' not implemented. Supported languages: 'de', 'en'."
|
24 |
+
SILERO_VERSION_DE = "v4"
|
25 |
+
SILERO_VERSION_EN = "latest"
|
26 |
+
silero_versions_dict = {"de": SILERO_VERSION_DE, "en": SILERO_VERSION_EN}
|
27 |
+
model_urls = {
|
28 |
+
"faster_whisper": "https://pypi.org/project/faster-whisper/",
|
29 |
+
"silero": "https://pypi.org/project/silero/",
|
30 |
+
"whisper": "https://pypi.org/project/openai-whisper/",
|
31 |
+
}
|
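Because every knob in constants.py is read from os.getenv at import time, a deployment can switch backends without touching code. A hedged sketch of selecting the faster_whisper model via environment variables (names match the module above; note that a truthy IS_TESTING forces MODEL_NAME_TESTING regardless):

import os

# set before the first import of constants, e.g. via Dockerfile ENV or a shell export
os.environ["MODEL_NAME_DEFAULT"] = "faster_whisper"
os.environ["SAMPLE_RATE"] = "48000"
os.environ["LOG_LEVEL"] = "DEBUG"

import constants

print(constants.MODEL_NAME_DEFAULT, constants.sample_rate_start)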
cosmic_ray_config.toml
DELETED
@@ -1,8 +0,0 @@
|
|
1 |
-
[cosmic-ray]
|
2 |
-
module-path = "aip_trainer/models/models.py"
|
3 |
-
timeout = 30.0
|
4 |
-
excluded-modules = []
|
5 |
-
test-command = "python -m pytest tests/models/test_models.py"
|
6 |
-
|
7 |
-
[cosmic-ray.distributor]
|
8 |
-
name = "local"
|
tests/test_data_de_en_2.pickle → data_de_en_2.pickle
RENAMED
File without changes
|
databases/data_de.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
databases/data_en.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
dockerfiles/apt_preferences
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
Explanation: Uninstall or do not install any Debian-originated
|
2 |
-
Explanation: package versions other than those in the stable distro
|
3 |
-
Package: *
|
4 |
-
Pin: release a=stable
|
5 |
-
Pin-Priority: 900
|
6 |
-
|
7 |
-
Package: zlib1g
|
8 |
-
Pin: release a=trixie
|
9 |
-
Pin-Priority: -10
|
dockerfiles/debian.sources
DELETED
@@ -1,17 +0,0 @@
|
|
1 |
-
Types: deb deb-src
|
2 |
-
URIs: http://deb.debian.org/debian
|
3 |
-
Suites: bookworm bookworm-updates
|
4 |
-
Components: main
|
5 |
-
Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg
|
6 |
-
|
7 |
-
Types: deb deb-src
|
8 |
-
URIs: http://deb.debian.org/debian-security
|
9 |
-
Suites: bookworm-security
|
10 |
-
Components: main
|
11 |
-
Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg
|
12 |
-
|
13 |
-
Types: deb
|
14 |
-
URIs: http://deb.debian.org/debian
|
15 |
-
Suites: trixie
|
16 |
-
Components: main
|
17 |
-
Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg
|
dockerfiles/dockerfile-base
DELETED
@@ -1,72 +0,0 @@
|
|
1 |
-
# Include global ARGs at the dockerfile top
|
2 |
-
ARG ARCH="x86_64"
|
3 |
-
ARG WORKDIR_ROOT="/var/task"
|
4 |
-
|
5 |
-
|
6 |
-
FROM python:3.12-bookworm AS builder_global
|
7 |
-
|
8 |
-
ARG ARCH
|
9 |
-
ARG WORKDIR_ROOT
|
10 |
-
ARG POETRY_NO_INTERACTION
|
11 |
-
ARG POETRY_VIRTUALENVS_IN_PROJECT
|
12 |
-
ARG POETRY_VIRTUALENVS_CREATE
|
13 |
-
ARG POETRY_CACHE_DIR
|
14 |
-
ARG ZLIB1G="http://ftp.it.debian.org/debian/pool/main/z/zlib/zlib1g_1.3.dfsg-3+b1_amd64.deb"
|
15 |
-
ENV PYTHONPATH="${WORKDIR_ROOT}:${PYTHONPATH}:/usr/local/lib/python3/dist-packages"
|
16 |
-
ENV MPLCONFIGDIR=/tmp/matplotlib
|
17 |
-
ARG USER="999"
|
18 |
-
|
19 |
-
|
20 |
-
RUN echo "ARCH: $ARCH, ARG POETRY_CACHE_DIR: ${POETRY_CACHE_DIR}, ENV PYTHONPATH: $PYTHONPATH, USER: $USER ..."
|
21 |
-
# RUN groupadd -g 999 python && useradd -r -u 999 -g python python
|
22 |
-
|
23 |
-
# Set working directory to function root directory
|
24 |
-
WORKDIR ${WORKDIR_ROOT}
|
25 |
-
COPY --chown=python:python requirements.txt ${WORKDIR_ROOT}/
|
26 |
-
|
27 |
-
# avoid segment-geospatial exception caused by missing libGL.so.1 library
|
28 |
-
RUN echo "BUILDER: check libz.s* before start" && ls -l /usr/lib/${ARCH}-linux-gnu/libz.so*
|
29 |
-
RUN apt update && apt install -y curl ffmpeg libgl1 python3-pip && apt clean
|
30 |
-
COPY --chown=python:python ./dockerfiles/apt_preferences /etc/apt/preferences
|
31 |
-
COPY --chown=python:python ./dockerfiles/debian.sources /etc/apt/sources.list.d/debian.sources
|
32 |
-
RUN apt update && apt install -t trixie zlib1g -y && apt clean
|
33 |
-
RUN echo "BUILDER: check libz.s* after install from trixie" && ls -l /usr/lib/${ARCH}-linux-gnu/libz.so*
|
34 |
-
|
35 |
-
RUN ls -l /etc/apt/sources* /etc/apt/preferences*
|
36 |
-
|
37 |
-
# poetry installation path is NOT within ${WORKDIR_ROOT}: not needed for runtime docker image
|
38 |
-
RUN python3 -m venv ${WORKDIR_ROOT}/.venv
|
39 |
-
ENV PATH="${WORKDIR_ROOT}/.venv/bin:$PATH"
|
40 |
-
RUN . ${WORKDIR_ROOT}/.venv/bin/activate && python -m pip install -r ${WORKDIR_ROOT}/requirements.txt
|
41 |
-
|
42 |
-
# USER 999
|
43 |
-
|
44 |
-
|
45 |
-
FROM python:3.12-slim-bookworm AS runtime
|
46 |
-
|
47 |
-
RUN groupadd -g 999 python && useradd -r -u 999 -g python python
|
48 |
-
|
49 |
-
ARG ARCH
|
50 |
-
ARG WORKDIR_ROOT
|
51 |
-
ENV PYTHONPATH="${WORKDIR_ROOT}:${WORKDIR_ROOT}/.venv:${PYTHONPATH}:/usr/local/lib/python3/dist-packages"
|
52 |
-
ENV MPLCONFIGDIR=/tmp/matplotlib
|
53 |
-
|
54 |
-
ENV VIRTUAL_ENV=${WORKDIR_ROOT}/.venv PATH="${WORKDIR_ROOT}/.venv/bin:$PATH"
|
55 |
-
|
56 |
-
RUN apt update && apt install -y ffmpeg && apt clean
|
57 |
-
RUN echo "COPY --chown=python:python --from=builder_global /usr/lib/${ARCH}-linux-gnu/libGL.so* /usr/lib/${ARCH}-linux-gnu/"
|
58 |
-
RUN echo "RUNTIME: check libz.s* before upgrade" && ls -l /usr/lib/${ARCH}-linux-gnu/libz.so*
|
59 |
-
RUN echo "RUNTIME: remove libz.s* to force upgrade" && rm /usr/lib/${ARCH}-linux-gnu/libz.so*
|
60 |
-
COPY --chown=python:python --from=builder_global /usr/lib/${ARCH}-linux-gnu/libz.so* /usr/lib/${ARCH}-linux-gnu/
|
61 |
-
COPY --chown=python:python --from=builder_global /lib/${ARCH}-linux-gnu/libexpat.so* /lib/${ARCH}-linux-gnu/
|
62 |
-
RUN echo "RUNTIME: check libz.s* after copy" && ls -l /usr/lib/${ARCH}-linux-gnu/libz.so*
|
63 |
-
COPY --chown=python:python --from=builder_global ${WORKDIR_ROOT}/.venv ${WORKDIR_ROOT}/.venv
|
64 |
-
RUN echo "check ffmpeg files..."
|
65 |
-
RUN ls -ld /usr/share/ffmpeg || echo "ffpeg folder not found!"
|
66 |
-
RUN ls -l /usr/bin/ff* || echo "ffpeg bin not found!"
|
67 |
-
RUN ls -l /usr/share/ffmpeg || echo "ffpeg folder share not found!"
|
68 |
-
RUN . ${WORKDIR_ROOT}/.venv && which python && pip list
|
69 |
-
|
70 |
-
RUN echo "new WORKDIR_ROOT after hidden venv COPY --chown=python:python => ${WORKDIR_ROOT}"
|
71 |
-
RUN ls -ld ${WORKDIR_ROOT}/
|
72 |
-
RUN ls -lA ${WORKDIR_ROOT}/
|
faster_whisper_wrapper.py
ADDED
@@ -0,0 +1,56 @@
|
1 |
+
from typing import Union
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import onnxruntime
|
5 |
+
import torch
|
6 |
+
from faster_whisper import WhisperModel
|
7 |
+
|
8 |
+
from ModelInterfaces import IASRModel
|
9 |
+
from constants import sample_rate_resample, app_logger, IS_TESTING, DEVICE
|
10 |
+
|
11 |
+
device = onnxruntime.get_device()
|
12 |
+
device = "cpu" if IS_TESTING or device.lower() == DEVICE.lower() else device
|
13 |
+
app_logger.info(f"device: {device} #")
|
14 |
+
device_compute = "int8_float16" if device == "cuda" else "int8"
|
15 |
+
app_logger.info(f"device: {device}, device_compute: {device_compute} #")
|
16 |
+
|
17 |
+
|
18 |
+
def parse_word_info(word_info, sample_rate):
|
19 |
+
start_ts = float(word_info.start) * sample_rate
|
20 |
+
end_ts = float(word_info.end) * sample_rate
|
21 |
+
word = word_info.word
|
22 |
+
return {"word": word, "start_ts": start_ts, "end_ts": end_ts}
|
23 |
+
|
24 |
+
|
25 |
+
class FasterWhisperASRModel(IASRModel):
|
26 |
+
def __init__(self, model_name="base", language=None):
|
27 |
+
self.asr = WhisperModel(model_name, device=device, compute_type=device_compute)
|
28 |
+
self._transcript = ""
|
29 |
+
self._word_locations = []
|
30 |
+
self.sample_rate = sample_rate_resample
|
31 |
+
self.language = language
|
32 |
+
|
33 |
+
def processAudio(self, audio:Union[np.ndarray, torch.Tensor]):
|
34 |
+
# 'audio' can be a path to a file or a numpy array of audio samples.
|
35 |
+
if isinstance(audio, torch.Tensor):
|
36 |
+
audio = audio.detach().cpu().numpy()
|
37 |
+
segments, info = self.asr.transcribe(audio=audio[0], language=self.language, word_timestamps=True, beam_size=5, temperature=0, vad_filter=True)
|
38 |
+
app_logger.debug(f"segments: type={type(segments)}, segments complete: {segments} #")
|
39 |
+
app_logger.info(f"info: type={type(info)}, info complete: {info} #")
|
40 |
+
transcript = []
|
41 |
+
count = 0
|
42 |
+
for segment in segments:
|
43 |
+
app_logger.debug(f"single segment: {type(segment)}, segment: {segment} #")
|
44 |
+
transcript.append(segment.text)
|
45 |
+
segment_word_locations = [parse_word_info(word_info, sample_rate=self.sample_rate) for word_info in segment.words]
|
46 |
+
self._word_locations.extend(segment_word_locations)
|
47 |
+
app_logger.info(f"elaborated segment {count}: type={type(segment)}, len(words):{len(segment.words)}, text:{segment.text} #")
|
48 |
+
count += 1
|
49 |
+
app_logger.info(f"transcript: {transcript} #")
|
50 |
+
self._transcript = " ".join(transcript)
|
51 |
+
|
52 |
+
def getTranscript(self) -> str:
|
53 |
+
return self._transcript
|
54 |
+
|
55 |
+
def getWordLocations(self) -> list:
|
56 |
+
return self._word_locations
|
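The wrapper above implements the same IASRModel surface the trainer already consumes, so it can be swapped in wherever the Silero model was used. A minimal usage sketch (the 16 kHz mono tensor is synthetic; real calls pass resampled microphone audio shaped (1, n_samples)):

import torch
from faster_whisper_wrapper import FasterWhisperASRModel

model = FasterWhisperASRModel(model_name="base", language="en")
audio = torch.zeros(1, 16000)  # one second of silence at 16 kHz
model.processAudio(audio)
print(model.getTranscript())      # likely empty for silence, with vad_filter=True
print(model.getWordLocations())   # [{'word': ..., 'start_ts': ..., 'end_ts': ...}, ...]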
images/{MainScreen.png → MainScreen.jpg}
RENAMED
File without changes
|
aip_trainer/lambdas/js.py → js.py
RENAMED
@@ -1,11 +1,4 @@
|
|
1 |
js_update_ipa_output = """
|
2 |
-
/**
|
3 |
-
* Updates the CSS text of the given text based on the correctness of each letter.
|
4 |
-
*
|
5 |
-
* @param text - The text to be displayed.
|
6 |
-
* @param letters - A string representing the correctness of each letter in the text.
|
7 |
-
* @param idxSelectedWord - The index of the selected word to be underlined.
|
8 |
-
*/
|
9 |
function updateCssText(text, letters, idxSelectedWord) {
|
10 |
let wordsArr = text.split(" ")
|
11 |
let lettersWordsArr = letters.split(" ")
|
@@ -31,13 +24,6 @@ function updateCssText(text, letters, idxSelectedWord) {
|
|
31 |
"""
|
32 |
|
33 |
js_play_audio = """
|
34 |
-
/**
|
35 |
-
* Plays the given text as audio using the Web Speech API.
|
36 |
-
*
|
37 |
-
* @param text - The text to be spoken.
|
38 |
-
* @param language - The language code for the speech synthesis (e.g., 'en' for English, 'de' for German).
|
39 |
-
* @param sleepTime - Optional. The time in seconds to wait before starting the speech synthesis. Default is 0.
|
40 |
-
*/
|
41 |
function playAudio(text, language, sleepTime = 0) {
|
42 |
let voice_idx = 0;
|
43 |
let voice_synth = null;
|
|
|
1 |
js_update_ipa_output = """
|
2 |
function updateCssText(text, letters, idxSelectedWord) {
|
3 |
let wordsArr = text.split(" ")
|
4 |
let lettersWordsArr = letters.split(" ")
|
|
|
24 |
"""
|
25 |
|
26 |
js_play_audio = """
|
27 |
function playAudio(text, language, sleepTime = 0) {
|
28 |
let voice_idx = 0;
|
29 |
let voice_synth = null;
|
lambdaChangeModel.py
ADDED
@@ -0,0 +1,14 @@
|
1 |
+
import json
|
2 |
+
|
3 |
+
import pronunciationTrainer
|
4 |
+
|
5 |
+
|
6 |
+
trainer_SST_lambda = {'de': pronunciationTrainer.getTrainer("de"), 'en': pronunciationTrainer.getTrainer("en")}
|
7 |
+
|
8 |
+
|
9 |
+
def lambda_handler(event, context):
|
10 |
+
data = json.loads(event['body'])
|
11 |
+
model_name = data['modelName']
|
12 |
+
trainer_SST_lambda["de"] = pronunciationTrainer.getTrainer("de", model_name=model_name)
|
13 |
+
trainer_SST_lambda["en"] = pronunciationTrainer.getTrainer("en", model_name=model_name)
|
14 |
+
return f'Model changed to {model_name}!'
|
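lambda_handler expects an API-Gateway-style event whose body is a JSON string. A sketch of a local invocation (the modelName value should be one of the keys in constants.model_urls):

import json
import lambdaChangeModel

event = {"body": json.dumps({"modelName": "faster_whisper"})}
print(lambdaChangeModel.lambda_handler(event, context=None))
# -> "Model changed to faster_whisper!"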
lambdaGetSample.py
ADDED
@@ -0,0 +1,145 @@
|
1 |
+
import json
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
import RuleBasedModels
|
7 |
+
from constants import app_logger
|
8 |
+
|
9 |
+
|
10 |
+
class TextDataset:
|
11 |
+
def __init__(self, table, language):
|
12 |
+
self.table_dataframe = table
|
13 |
+
self.language = language
|
14 |
+
|
15 |
+
def __getitem__(self, idx):
|
16 |
+
line = [self.table_dataframe['sentence'].iloc[idx]]
|
17 |
+
return line
|
18 |
+
|
19 |
+
def __len__(self):
|
20 |
+
return len(self.table_dataframe)
|
21 |
+
|
22 |
+
def get_category_from_df(self, category_value:int):
|
23 |
+
selector = self.table_dataframe["category"] == category_value
|
24 |
+
df_by_category = self.table_dataframe[selector]
|
25 |
+
return df_by_category
|
26 |
+
|
27 |
+
def get_random_sample_from_df(self, category_value:int):
|
28 |
+
app_logger.info(f"language={self.language}, category_value={category_value}.")
|
29 |
+
choice = self.table_dataframe.sample(n=1)
|
30 |
+
if category_value != 0:
|
31 |
+
df_language_filtered_by_category = self.get_category_from_df(category_value)
|
32 |
+
choice = df_language_filtered_by_category.sample(n=1)
|
33 |
+
sentence = choice["sentence"].iloc[0]
|
34 |
+
app_logger.info(f"sentence={sentence} ...")
|
35 |
+
return [sentence]
|
36 |
+
|
37 |
+
|
38 |
+
sample_folder = Path(__file__).parent / "databases"
|
39 |
+
lambda_database = {}
|
40 |
+
lambda_ipa_converter = {}
|
41 |
+
available_languages = ['de', 'en']
|
42 |
+
|
43 |
+
for lang in available_languages:
|
44 |
+
# avoid using ";" or "," as separator because these are present within the dataframe sentences
|
45 |
+
df = pd.read_csv(sample_folder / f'data_{lang}.csv', delimiter='|')
|
46 |
+
lambda_database[lang] = TextDataset(df, lang)
|
47 |
+
lambda_ipa_converter[lang] = RuleBasedModels.get_phonem_converter(lang)
|
48 |
+
|
49 |
+
lambda_translate_new_sample = False
|
50 |
+
|
51 |
+
|
52 |
+
def lambda_handler(event, context):
|
53 |
+
"""
|
54 |
+
lambda handler to return a random text sample from the dataset.
|
55 |
+
|
56 |
+
Parameters:
|
57 |
+
event (dict): The event data passed to the Lambda function.
|
58 |
+
context (dict): The context in which the Lambda function is called.
|
59 |
+
|
60 |
+
Returns:
|
61 |
+
str: The JSON-encoded result.
|
62 |
+
"""
|
63 |
+
try:
|
64 |
+
body = json.loads(event['body'])
|
65 |
+
|
66 |
+
try:
|
67 |
+
category = int(body['category'])
|
68 |
+
except KeyError:
|
69 |
+
category = 0
|
70 |
+
language = body['language']
|
71 |
+
try:
|
72 |
+
current_transcript = str(body["transcript"])
|
73 |
+
except KeyError:
|
74 |
+
current_transcript = get_random_selection(language, category)
|
75 |
+
current_ipa = lambda_ipa_converter[language].convertToPhonem(current_transcript)
|
76 |
+
|
77 |
+
app_logger.info(f"real_transcript='{current_transcript}', ipa_transcript='{current_ipa}'.")
|
78 |
+
result = {
|
79 |
+
'real_transcript': [current_transcript],
|
80 |
+
'ipa_transcript': current_ipa,
|
81 |
+
'transcript_translation': ""
|
82 |
+
}
|
83 |
+
|
84 |
+
return json.dumps(result)
|
85 |
+
except Exception as ex:
|
86 |
+
app_logger.error(f"ex: {ex} ...")
|
87 |
+
raise ex
|
88 |
+
|
89 |
+
|
90 |
+
def get_random_selection(language: str, category: int) -> str:
|
91 |
+
"""
|
92 |
+
Get a random text sample from the dataset.
|
93 |
+
|
94 |
+
Parameters:
|
95 |
+
language (str): The language code.
|
96 |
+
category (int): The category value to filter the dataset.
|
97 |
+
|
98 |
+
Returns:
|
99 |
+
str: The selected text sample.
|
100 |
+
"""
|
101 |
+
lambda_df_lang = lambda_database[language]
|
102 |
+
current_transcript = lambda_df_lang.get_random_sample_from_df(category)
|
103 |
+
app_logger.info(f"category={category}, language={language}, current_transcript={current_transcript}.")
|
104 |
+
return current_transcript[0]
|
105 |
+
|
106 |
+
|
107 |
+
def getSentenceCategory(sentence: str) -> int:
|
108 |
+
number_of_words = len(sentence.split())
|
109 |
+
categories_word_limits = [0, 8, 20, 100000]
|
110 |
+
for category in range(len(categories_word_limits)-1):
|
111 |
+
if categories_word_limits[category] < number_of_words <= categories_word_limits[category + 1]:
|
112 |
+
return category+1
|
113 |
+
raise ValueError(f"category not assigned for sentence '{sentence}' ...")
|
114 |
+
|
115 |
+
|
116 |
+
def get_enriched_dataframe_csv(
|
117 |
+
language: str,
|
118 |
+
custom_dataframe_csv_filename_no_ext: str = "data",
|
119 |
+
custom_folder: Path = sample_folder
|
120 |
+
) -> None:
|
121 |
+
"""
|
122 |
+
Read a csv dataframe adding a 'category' column.
|
123 |
+
|
124 |
+
Parameters:
|
125 |
+
language (str): The language code (e.g. "de" for German).
|
126 |
+
custom_dataframe_csv_filename_no_ext (str): The csv dataframe without extension.
|
127 |
+
custom_folder (Path): The folder containing the csv dataframe.
|
128 |
+
|
129 |
+
Returns:
|
130 |
+
None
|
131 |
+
"""
|
132 |
+
custom_folder = Path(custom_folder).absolute()
|
133 |
+
df_filename = custom_folder / f'{custom_dataframe_csv_filename_no_ext}_{language}.csv'
|
134 |
+
with open(df_filename, 'r') as handle:
|
135 |
+
df2 = pd.read_csv(handle, sep="|")
|
136 |
+
df2["category"] = df2["sentence"].apply(getSentenceCategory)
|
137 |
+
app_logger.info("de_category added")
|
138 |
+
output_path = custom_folder / f'{custom_dataframe_csv_filename_no_ext}_{language}.csv'
|
139 |
+
df2.to_csv(output_path, index=False, sep="|")
|
140 |
+
app_logger.info(f"written {output_path} ...")
|
141 |
+
|
142 |
+
|
143 |
+
if __name__ == '__main__':
|
144 |
+
get_enriched_dataframe_csv("de")
|
145 |
+
get_enriched_dataframe_csv("en")
|
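The sample lambda follows the same body-as-JSON convention; category 2 selects sentences of 9-20 words per the limits in getSentenceCategory. A minimal local call:

import json
import lambdaGetSample

event = {"body": json.dumps({"language": "de", "category": 2})}
response = json.loads(lambdaGetSample.lambda_handler(event, context=None))
print(response["real_transcript"], response["ipa_transcript"])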
aip_trainer/lambdas/lambdaSpeechToScore.py → lambdaSpeechToScore.py
RENAMED
@@ -4,52 +4,75 @@ import os
|
|
4 |
from pathlib import Path
|
5 |
import tempfile
|
6 |
import time
|
|
|
7 |
|
8 |
import audioread
|
9 |
import numpy as np
|
10 |
import torch
|
11 |
from torchaudio.transforms import Resample
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
|
|
16 |
|
17 |
|
18 |
-
trainer_SST_lambda = {
|
19 |
-
|
20 |
-
'en': pronunciationTrainer.getTrainer("en")
|
21 |
-
}
|
22 |
-
transform = Resample(orig_freq=sample_rate_start, new_freq=16000)
|
23 |
|
24 |
|
25 |
-
def lambda_handler(event, context):
|
26 |
-
|
27 |
-
|
|
|
|
|
28 |
|
29 |
-
real_text = data
|
30 |
-
base64_audio = data
|
31 |
app_logger.debug(f"base64Audio:{base64_audio} ...")
|
32 |
file_bytes_or_audiotmpfile = base64.b64decode(base64_audio[22:].encode('utf-8'))
|
33 |
-
language = data
|
|
|
|
|
34 |
|
35 |
if len(real_text) == 0:
|
36 |
-
return {
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
'Access-Control-Allow-Origin': 'http://127.0.0.1:3000/',
|
42 |
-
'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
|
43 |
-
},
|
44 |
-
'body': ''
|
45 |
-
}
|
46 |
-
output = get_speech_to_score_dict(real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile, language=language, remove_random_file=False)
|
47 |
output = json.dumps(output)
|
48 |
app_logger.debug(f"output: {output} ...")
|
49 |
return output
|
50 |
|
51 |
|
52 |
-
def get_speech_to_score_dict(
|
|
|
|
|
53 |
from soundfile import LibsndfileError
|
54 |
app_logger.info(f"real_text:{real_text} ...")
|
55 |
app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
|
@@ -72,23 +95,19 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
|
|
72 |
app_logger.debug("writing streaming data to file on disk...")
|
73 |
with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=extension, delete=False) as f1:
|
74 |
f1.write(file_bytes_or_audiotmpfile)
|
|
|
75 |
duration = time.time() - start0
|
76 |
app_logger.info(f'Saved binary data in file in {duration}s.')
|
77 |
-
random_file_name = f1.name
|
78 |
|
79 |
start = time.time()
|
80 |
-
app_logger.info(f
|
81 |
try:
|
82 |
signal, samplerate = soundfile_load(random_file_name)
|
83 |
except LibsndfileError as sfe:
|
84 |
# https://github.com/beetbox/audioread/issues/144
|
85 |
# deprecation warnings => pip install standard-aifc standard-sunau
|
86 |
app_logger.error(f"Error reading file {random_file_name}: {sfe}, re-try with audioread...")
|
87 |
-
|
88 |
-
signal, samplerate = audioread_load(random_file_name)
|
89 |
-
except ModuleNotFoundError as mnfe:
|
90 |
-
app_logger.error(f"Error reading file {random_file_name}: {mnfe}, try read https://github.com/beetbox/audioread/issues/144")
|
91 |
-
raise mnfe
|
92 |
|
93 |
duration = time.time() - start
|
94 |
app_logger.info(f'Read {extension} file {random_file_name} in {duration}s.')
|
@@ -103,11 +122,11 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
|
|
103 |
result = language_trainer_sst_lambda.processAudioForGivenText(signal_transformed, real_text)
|
104 |
app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
|
105 |
|
106 |
-
start = time.time()
|
107 |
-
if remove_random_file:
|
108 |
-
|
109 |
-
duration = time.time() - start
|
110 |
-
app_logger.info(f'Deleted file {random_file_name} in {duration}s.')
|
111 |
|
112 |
start = time.time()
|
113 |
real_transcripts_ipa = ' '.join(
|
@@ -125,9 +144,9 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
|
|
125 |
|
126 |
is_letter_correct_all_words = ''
|
127 |
for idx, word_real in enumerate(words_real):
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
|
132 |
is_letter_correct = wm.getWhichLettersWereTranscribedCorrectly(
|
133 |
word_real, mapped_letters) # , mapped_letters_indices)
|
@@ -146,21 +165,40 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
|
|
146 |
return {
|
147 |
'real_transcript': result['recording_transcript'],
|
148 |
'ipa_transcript': ipa_transcript,
|
149 |
-
'pronunciation_accuracy':
|
150 |
'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
|
151 |
'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
|
152 |
'pair_accuracy_category': pair_accuracy_category,
|
153 |
'start_time': result['start_time'],
|
154 |
'end_time': result['end_time'],
|
155 |
-
'is_letter_correct_all_words': is_letter_correct_all_words
|
|
|
156 |
}
|
157 |
|
158 |
|
159 |
-
def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
|
160 |
-
|
|
|
|
|
|
161 |
real_transcripts = output['real_transcripts']
|
162 |
is_letter_correct_all_words = output['is_letter_correct_all_words']
|
163 |
-
pronunciation_accuracy = output[
|
|
|
164 |
ipa_transcript = output['ipa_transcript']
|
165 |
real_transcripts_ipa = output['real_transcripts_ipa']
|
166 |
end_time = [float(x) for x in output['end_time'].split(" ")]
|
@@ -169,17 +207,45 @@ def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str |
|
|
169 |
app_logger.debug(f"start splitting recorded audio into {num_words} words...")
|
170 |
|
171 |
audio_files, audio_durations = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
|
|
|
|
|
172 |
output = {'audio_files': audio_files, "audio_durations": audio_durations, **output}
|
173 |
first_audio_file = audio_files[0]
|
174 |
-
return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, num_words, first_audio_file, json.dumps(output)
|
|
|
|
|
175 |
|
|
|
|
|
176 |
|
177 |
-
|
|
|
|
|
178 |
import soundfile as sf
|
179 |
sf.write(audiofile, data, samplerate)
|
180 |
|
181 |
|
182 |
-
def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str]:
|
|
|
|
183 |
recognition_output = json.loads(raw_json_output)
|
184 |
list_audio_files = recognition_output["audio_files"]
|
185 |
real_transcripts = recognition_output["real_transcripts"]
|
@@ -194,10 +260,23 @@ def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str
|
|
194 |
|
195 |
|
196 |
def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float]) -> tuple[list[str], list[float]]:
|
|
|
|
|
197 |
import soundfile as sf
|
198 |
audio_files = []
|
199 |
audio_durations = []
|
|
|
200 |
for n, (start_nth, end_nth) in enumerate(zip(start_time, end_time)):
|
|
|
201 |
signal_nth, samplerate = soundfile_load(audiotmpfile, offset=start_nth, duration=end_nth - start_nth)
|
202 |
audiofile = get_file_with_custom_suffix(audiotmpfile, f"_part{n}_start{start_nth}_end{end_nth}")
|
203 |
soundfile_write(audiofile=audiofile, data=signal_nth, samplerate=samplerate)
|
@@ -210,20 +289,52 @@ def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], e
|
|
210 |
|
211 |
|
212 |
def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> Path:
|
|
|
|
|
213 |
pathname = Path(basefile)
|
214 |
dirname, filename_no_ext, filename_ext = pathname.parent, pathname.stem, pathname.suffix
|
215 |
-
output_file =
|
216 |
return output_file
|
217 |
|
218 |
|
219 |
# From Librosa
|
220 |
|
221 |
-
def calc_start_end(sr_native, time_position, n_channels):
|
|
|
|
|
222 |
return int(np.round(sr_native * time_position)) * n_channels
|
223 |
|
224 |
|
225 |
-
def soundfile_load(path: str | Path, offset: float = 0.0, duration: float = None, dtype=np.float32):
|
226 |
-
"""
|
|
|
|
|
227 |
import soundfile as sf
|
228 |
|
229 |
if isinstance(path, sf.SoundFile):
|
@@ -250,10 +361,18 @@ def soundfile_load(path: str | Path, offset: float = 0.0, duration: float = None
|
|
250 |
return y, sr_native
|
251 |
|
252 |
|
253 |
-
def audioread_load(path, offset=0.0, duration=None, dtype=np.float32):
|
254 |
-
"""
|
255 |
-
|
256 |
This loads one block at a time, and then concatenates the results.
|
|
|
|
257 |
"""
|
258 |
y = []
|
259 |
app_logger.debug(f"reading audio file at path:{path} ...")
|
@@ -309,7 +428,7 @@ def audioread_load(path, offset=0.0, duration=None, dtype=np.float32):
|
|
309 |
# From Librosa
|
310 |
|
311 |
|
312 |
-
def buf_to_float(x, n_bytes=2, dtype=np.float32):
|
313 |
"""Convert an integer buffer to floating point values.
|
314 |
This is primarily useful when loading integer-valued wav data
|
315 |
into numpy arrays.
|
|
|
4 |
from pathlib import Path
|
5 |
import tempfile
|
6 |
import time
|
7 |
+
from typing import Dict, Any, LiteralString
|
8 |
|
9 |
import audioread
|
10 |
import numpy as np
|
11 |
import torch
|
12 |
from torchaudio.transforms import Resample
|
13 |
|
14 |
+
import WordMatching as wm
|
15 |
+
import pronunciationTrainer
|
16 |
+
import utilsFileIO
|
17 |
+
from constants import app_logger, sample_rate_resample, sample_rate_start, USE_DTW, IS_TESTING, tmp_audio_extension
|
18 |
|
19 |
|
20 |
+
trainer_SST_lambda = {'de': pronunciationTrainer.getTrainer("de"), 'en': pronunciationTrainer.getTrainer("en")}
|
21 |
+
transform = Resample(orig_freq=sample_rate_start, new_freq=sample_rate_resample)
|
|
|
|
|
|
|
22 |
|
23 |
|
24 |
+
def lambda_handler(event: Dict[str, Any], context: Any) -> str:
|
25 |
+
"""
|
26 |
+
Lambda handler for speech-to-score.
|
27 |
+
|
28 |
+
Parameters:
|
29 |
+
event (Dict[str, Any]): The event data containing the request body.
|
30 |
+
context (Any): The context in which the lambda function is executed.
|
31 |
+
|
32 |
+
Returns:
|
33 |
+
str: The JSON-encoded speech-to-score results.
|
34 |
+
"""
|
35 |
+
body = event['body']
|
36 |
+
data = json.loads(body)
|
37 |
|
38 |
+
real_text = data['title']
|
39 |
+
base64_audio = data["base64Audio"]
|
40 |
app_logger.debug(f"base64Audio:{base64_audio} ...")
|
41 |
file_bytes_or_audiotmpfile = base64.b64decode(base64_audio[22:].encode('utf-8'))
|
42 |
+
language = data['language']
|
43 |
+
try:
|
44 |
+
use_dtw = data["useDTW"]
|
45 |
+
app_logger.info(f'use_dtw: "{type(use_dtw)}", "{use_dtw}".')
|
46 |
+
except KeyError:
|
47 |
+
use_dtw = USE_DTW
|
48 |
|
49 |
if len(real_text) == 0:
|
50 |
+
return utilsFileIO.return_response_ok('{}')
|
51 |
+
output = get_speech_to_score_dict(
|
52 |
+
real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile, language=language, use_dtw=use_dtw
|
53 |
+
)
|
54 |
+
output["pronunciation_accuracy"] = f"{int(output["pronunciation_accuracy"])}"
|
|
|
|
|
55 |
output = json.dumps(output)
|
56 |
app_logger.debug(f"output: {output} ...")
|
57 |
return output
|
58 |
|
59 |
|
60 |
+
def get_speech_to_score_dict(
|
61 |
+
real_text: str, file_bytes_or_audiotmpfile: str | bytes | dict, language: str = "en", extension: str = tmp_audio_extension, use_dtw: bool = False
|
62 |
+
) -> Dict[str | Any, float | LiteralString | str | Any]:
|
63 |
+
"""
|
64 |
+
Process the audio file and return a dictionary with speech-to-score results.
|
65 |
+
|
66 |
+
Parameters:
|
67 |
+
use_dtw (bool): Whether to use DTW (dynamic time warping) in the word-matching step.
|
68 |
+
real_text (str): The text to be matched with the audio.
|
69 |
+
file_bytes_or_audiotmpfile (str | bytes | dict): The audio file in bytes or a temporary file.
|
70 |
+
language (str): The language of the audio.
|
71 |
+
extension (str): The file extension of the audio file.
|
72 |
+
|
73 |
+
Returns:
|
74 |
+
Dict[str | Any, float | LiteralString | str | Any]: The speech-to-score results.
|
75 |
+
"""
|
76 |
from soundfile import LibsndfileError
|
77 |
app_logger.info(f"real_text:{real_text} ...")
|
78 |
app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
|
|
|
95 |
app_logger.debug("writing streaming data to file on disk...")
|
96 |
with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=extension, delete=False) as f1:
|
97 |
f1.write(file_bytes_or_audiotmpfile)
|
98 |
+
random_file_name = f1.name
|
99 |
duration = time.time() - start0
|
100 |
app_logger.info(f'Saved binary data in file in {duration}s.')
|
|
|
101 |
|
102 |
start = time.time()
|
103 |
+
app_logger.info(f"Loading temp '{random_file_name}' file...")
|
104 |
try:
|
105 |
signal, samplerate = soundfile_load(random_file_name)
|
106 |
except LibsndfileError as sfe:
|
107 |
# https://github.com/beetbox/audioread/issues/144
|
108 |
# deprecation warnings => pip install standard-aifc standard-sunau
|
109 |
app_logger.error(f"Error reading file {random_file_name}: {sfe}, re-try with audioread...")
|
110 |
+
signal, samplerate = audioread_load(random_file_name)
|
|
|
111 |
|
112 |
duration = time.time() - start
|
113 |
app_logger.info(f'Read {extension} file {random_file_name} in {duration}s.')
|
|
|
122 |
result = language_trainer_sst_lambda.processAudioForGivenText(signal_transformed, real_text)
|
123 |
app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
|
124 |
|
125 |
+
# start = time.time()
|
126 |
+
# if remove_random_file:
|
127 |
+
# os.remove(random_file_name)
|
128 |
+
# duration = time.time() - start
|
129 |
+
# app_logger.info(f'Deleted file {random_file_name} in {duration}s.')
|
130 |
|
131 |
start = time.time()
|
132 |
real_transcripts_ipa = ' '.join(
|
|
|
144 |
|
145 |
is_letter_correct_all_words = ''
|
146 |
for idx, word_real in enumerate(words_real):
|
147 |
+
|
148 |
+
mapped_letters, mapped_letters_indices = wm.get_best_mapped_words(
|
149 |
+
mapped_words[idx], word_real, use_dtw=use_dtw)
|
150 |
|
151 |
is_letter_correct = wm.getWhichLettersWereTranscribedCorrectly(
|
152 |
word_real, mapped_letters) # , mapped_letters_indices)
|
|
|
165 |
return {
|
166 |
'real_transcript': result['recording_transcript'],
|
167 |
'ipa_transcript': ipa_transcript,
|
168 |
+
'pronunciation_accuracy': pronunciation_accuracy,
|
169 |
'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
|
170 |
'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
|
171 |
'pair_accuracy_category': pair_accuracy_category,
|
172 |
'start_time': result['start_time'],
|
173 |
'end_time': result['end_time'],
|
174 |
+
'is_letter_correct_all_words': is_letter_correct_all_words,
|
175 |
+
"random_file_name": random_file_name
|
176 |
}
|
177 |
|
178 |
|
179 |
+
def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True) -> tuple:
|
180 |
+
"""
|
181 |
+
Process the audio file and return a tuple with speech-to-score results.
|
182 |
+
|
183 |
+
Parameters:
|
184 |
+
real_text (str): The text to be matched with the audio.
|
185 |
+
file_bytes_or_audiotmpfile (str | dict): The audio file in bytes or a temporary file.
|
186 |
+
language (str): The language of the audio.
|
187 |
+
remove_random_file (bool): Whether to remove the temporary file after processing.
|
188 |
+
|
189 |
+
Returns:
|
190 |
+
tuple: A tuple containing real transcripts, letter correctness, pronunciation accuracy, IPA transcript, real transcripts in IPA, number of words, first audio file, and JSON output.
|
191 |
+
"""
|
192 |
+
output = get_speech_to_score_dict(
|
193 |
+
real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile,
|
194 |
+
language=language
|
195 |
+
)
|
196 |
+
random_file_name = output["random_file_name"]
|
197 |
+
del output["random_file_name"]
|
198 |
real_transcripts = output['real_transcripts']
|
199 |
is_letter_correct_all_words = output['is_letter_correct_all_words']
|
200 |
+
pronunciation_accuracy = f"{output["pronunciation_accuracy"]:.2f}"
|
201 |
+
output["pronunciation_accuracy"] = pronunciation_accuracy
|
202 |
ipa_transcript = output['ipa_transcript']
|
203 |
real_transcripts_ipa = output['real_transcripts_ipa']
|
204 |
end_time = [float(x) for x in output['end_time'].split(" ")]
|
|
|
207 |
app_logger.debug(f"start splitting recorded audio into {num_words} words...")
|
208 |
|
209 |
audio_files, audio_durations = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
|
210 |
+
|
211 |
+
remove_random_file = not IS_TESTING and remove_random_file
|
212 |
+
if remove_random_file:
|
213 |
+
app_logger.info(f"{IS_TESTING} => remove_random_file:{remove_random_file}, removing:{random_file_name} ...")
|
214 |
+
Path(random_file_name).unlink(missing_ok=True)
|
215 |
+
app_logger.info(f"removed:{random_file_name} ...")
|
216 |
+
|
217 |
output = {'audio_files': audio_files, "audio_durations": audio_durations, **output}
|
218 |
first_audio_file = audio_files[0]
|
219 |
+
return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, num_words, first_audio_file, json.dumps(output), random_file_name
|
220 |
+
|
221 |
+
|
222 |
+
def soundfile_write(audiofile: str | Path, data: np.ndarray, samplerate: int) -> None:
|
223 |
+
"""
|
224 |
+
Write audio data to a file using soundfile.
|
225 |
|
226 |
+
Parameters:
|
227 |
+
audiofile (str | Path): The path to the audio file.
|
228 |
+
data (np.ndarray): The audio data to write.
|
229 |
+
samplerate (int): The sample rate of the audio data.
|
230 |
|
231 |
+
Returns:
|
232 |
+
None
|
233 |
+
"""
|
234 |
import soundfile as sf
|
235 |
sf.write(audiofile, data, samplerate)
|
236 |
|
237 |
|
238 |
+
def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str, str, float]:
|
239 |
+
"""
|
240 |
+
Get the selected word, its audio file, and duration from the recognition output.
|
241 |
+
|
242 |
+
Parameters:
|
243 |
+
idx_recorded_word (int): The index of the recorded word.
|
244 |
+
raw_json_output (str): The JSON output from the recognition process.
|
245 |
+
|
246 |
+
Returns:
|
247 |
+
tuple: A tuple containing the audio file, the current word, and its duration.
|
248 |
+
"""
|
249 |
recognition_output = json.loads(raw_json_output)
|
250 |
list_audio_files = recognition_output["audio_files"]
|
251 |
real_transcripts = recognition_output["real_transcripts"]
|
|
|
260 |
|
261 |
|
262 |
def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float]) -> tuple[list[str], list[float]]:
|
263 |
+
"""
|
264 |
+
Split the audio file into segments based on start and end times.
|
265 |
+
|
266 |
+
Parameters:
|
267 |
+
audiotmpfile (str | Path): The path to the audio file.
|
268 |
+
start_time (list[float]): The start times of the segments.
|
269 |
+
end_time (list[float]): The end times of the segments.
|
270 |
+
|
271 |
+
Returns:
|
272 |
+
tuple: A tuple containing a list of audio files and their durations.
|
273 |
+
"""
|
274 |
import soundfile as sf
|
275 |
audio_files = []
|
276 |
audio_durations = []
|
277 |
+
app_logger.info(f"start_time:{start_time}, end_time:{end_time} ...")
|
278 |
for n, (start_nth, end_nth) in enumerate(zip(start_time, end_time)):
|
279 |
+
# assert start_nth < end_nth, f"start_nth:{start_nth} (index {n}) should be less than end_nth:{end_nth} (start_time:{start_time}, end_time:{end_time})..."
|
280 |
signal_nth, samplerate = soundfile_load(audiotmpfile, offset=start_nth, duration=end_nth - start_nth)
|
281 |
audiofile = get_file_with_custom_suffix(audiotmpfile, f"_part{n}_start{start_nth}_end{end_nth}")
|
282 |
soundfile_write(audiofile=audiofile, data=signal_nth, samplerate=samplerate)
|
|
|
289 |
|
290 |
|
291 |
def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> Path:
|
292 |
+
"""
|
293 |
+
Generate a file path with a custom suffix.
|
294 |
+
|
295 |
+
Parameters:
|
296 |
+
basefile (str | Path): The base file path.
|
297 |
+
custom_suffix (str): The custom suffix to add to the file name.
|
298 |
+
|
299 |
+
Returns:
|
300 |
+
Path: The new file path with the custom suffix.
|
301 |
+
"""
|
302 |
pathname = Path(basefile)
|
303 |
dirname, filename_no_ext, filename_ext = pathname.parent, pathname.stem, pathname.suffix
|
304 |
+
output_file = dirname / f"{filename_no_ext}_{custom_suffix}.{filename_ext}"
|
305 |
return output_file
|
306 |
|
307 |
|
308 |
# From Librosa
|
309 |
|
310 |
+
def calc_start_end(sr_native: int, time_position: float, n_channels: int) -> int:
|
311 |
+
"""
|
312 |
+
Calculate the start or end position in samples.
|
313 |
+
|
314 |
+
Parameters:
|
315 |
+
sr_native (int): The native sample rate.
|
316 |
+
time_position (float): The time position in seconds.
|
317 |
+
n_channels (int): The number of audio channels.
|
318 |
+
|
319 |
+
Returns:
|
320 |
+
int: The start or end position in samples.
|
321 |
+
"""
|
322 |
return int(np.round(sr_native * time_position)) * n_channels
|
323 |
|
324 |
|
325 |
+
def soundfile_load(path: str | Path, offset: float = 0.0, duration: float | None = None, dtype=np.float32) -> tuple[np.ndarray, int]:
|
326 |
+
"""
|
327 |
+
Load an audio buffer using soundfile.
|
328 |
+
|
329 |
+
Parameters:
|
330 |
+
path (str | Path): The path to the audio file.
|
331 |
+
offset (float): The offset in seconds to start reading the file.
|
332 |
+
duration (float): The duration in seconds to read from the file.
|
333 |
+
dtype (np.float32): The data type of the audio buffer.
|
334 |
+
|
335 |
+
Returns:
|
336 |
+
tuple: A tuple containing the audio buffer and the sample rate.
|
337 |
+
"""
|
338 |
import soundfile as sf
|
339 |
|
340 |
if isinstance(path, sf.SoundFile):
|
|
|
361 |
return y, sr_native
|
362 |
|
363 |
|
364 |
+
def audioread_load(path: str | Path, offset: float = 0.0, duration: float | None = None, dtype=np.float32) -> tuple[np.ndarray, int]:
|
365 |
+
"""
|
|
|
366 |
This loads one block at a time, and then concatenates the results.
|
367 |
+
|
368 |
+
Parameters:
|
369 |
+
path (str | Path): The path to the audio file.
|
370 |
+
offset (float): The offset in seconds to start reading the file.
|
371 |
+
duration (float): The duration in seconds to read from the file.
|
372 |
+
dtype (np.float32): The data type of the audio buffer.
|
373 |
+
|
374 |
+
Returns:
|
375 |
+
tuple: A tuple containing the audio buffer and the sample rate.
|
376 |
"""
|
377 |
y = []
|
378 |
app_logger.debug(f"reading audio file at path:{path} ...")
|
|
|
428 |
# From Librosa
|
429 |
|
430 |
|
431 |
+
def buf_to_float(x: np.ndarray, n_bytes: int = 2, dtype=np.float32) -> np.ndarray:
|
432 |
"""Convert an integer buffer to floating point values.
|
433 |
This is primarily useful when loading integer-valued wav data
|
434 |
into numpy arrays.
|
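Putting the pieces together, get_speech_to_score_tuple is the entry point the Gradio app calls and, after this commit, returns a nine-element tuple ending with the temp file name. A hedged sketch of scoring a prerecorded WAV against a reference text (the path is a placeholder; the recording should match sample_rate_start):

import lambdaSpeechToScore

(transcript, letters_ok, accuracy, ipa_recorded, ipa_ideal,
 num_words, first_clip, raw_json, tmp_name) = lambdaSpeechToScore.get_speech_to_score_tuple(
    real_text="Hallo Welt",
    file_bytes_or_audiotmpfile="/tmp/recording.wav",  # placeholder path
    language="de",
    remove_random_file=False,
)
print(accuracy, transcript)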
aip_trainer/lambdas/lambdaTTS.py → lambdaTTS.py
RENAMED
@@ -1,12 +1,36 @@
|
|
1 |
-
import
|
|
|
|
|
2 |
import tempfile
|
3 |
from pathlib import Path
|
4 |
|
5 |
-
|
|
|
|
|
6 |
|
7 |
|
8 |
def get_tts(text: str, language: str, tmp_prefix="audio_", tmp_suffix=".wav") -> str:
|
9 |
-
|
|
|
|
|
10 |
|
11 |
if text is None or len(text) == 0:
|
12 |
raise ValueError(f"cannot read an empty/None text: '{text}'...")
|
@@ -15,11 +39,11 @@ def get_tts(text: str, language: str, tmp_prefix="audio_", tmp_suffix=".wav") ->
|
|
15 |
|
16 |
tmp_dir = Path(tempfile.gettempdir())
|
17 |
try:
|
18 |
-
model, _, speaker, sample_rate = models.
|
19 |
language, output_folder=tmp_dir
|
20 |
)
|
21 |
except ValueError:
|
22 |
-
model, _, sample_rate, _, _, speaker = models.
|
23 |
language, output_folder=tmp_dir
|
24 |
)
|
25 |
app_logger.info(f"model speaker #0: {speaker} ...")
|
|
|
1 |
+
import base64
|
2 |
+
import json
|
3 |
+
import os
|
4 |
import tempfile
|
5 |
from pathlib import Path
|
6 |
|
7 |
+
import soundfile as sf
|
8 |
+
|
9 |
+
import AIModels
|
10 |
+
import models
|
11 |
+
import utilsFileIO
|
12 |
+
from constants import app_logger, sample_rate_resample
|
13 |
|
14 |
|
15 |
def get_tts(text: str, language: str, tmp_prefix="audio_", tmp_suffix=".wav") -> str:
|
16 |
+
"""
|
17 |
+
Generate text-to-speech (TTS) audio for the given text and language.
|
18 |
+
|
19 |
+
Args:
|
20 |
+
text (str): The text to be converted to speech.
|
21 |
+
language (str): The language of the text. Supported languages are "en" (English) and "de" (German).
|
22 |
+
tmp_prefix (str, optional): The filename prefix for the temporary audio file.
|
23 |
+
tmp_suffix (str, optional): The filename suffix (extension) for the temporary audio file.
|
24 |
+
|
25 |
+
Returns:
|
26 |
+
str: The path to the generated audio file.
|
27 |
+
|
28 |
+
Raises:
|
29 |
+
NotImplementedError: If the provided language is not supported.
|
30 |
+
|
31 |
+
Notes:
|
32 |
+
This function uses the Silero TTS model to generate the audio. The model and speaker are selected based on the provided language.
|
33 |
+
"""
|
34 |
|
35 |
if text is None or len(text) == 0:
|
36 |
raise ValueError(f"cannot read an empty/None text: '{text}'...")
|
|
|
39 |
|
40 |
tmp_dir = Path(tempfile.gettempdir())
|
41 |
try:
|
42 |
+
model, _, speaker, sample_rate = models.__silero_tts(
|
43 |
language, output_folder=tmp_dir
|
44 |
)
|
45 |
except ValueError:
|
46 |
+
model, _, sample_rate, _, _, speaker = models.__silero_tts(
|
47 |
language, output_folder=tmp_dir
|
48 |
)
|
49 |
app_logger.info(f"model speaker #0: {speaker} ...")
|
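get_tts returns the path of a temporary WAV file written at samplerate_tts. A minimal sketch (the Silero weights are downloaded on first use, so this needs network access):

import lambdaTTS

audio_path = lambdaTTS.get_tts("Hello world, this is a test.", language="en")
print(audio_path)  # e.g. /tmp/audio_xxxxxxxx.wav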
aip_trainer/models/models.py → models.py
RENAMED
@@ -1,11 +1,15 @@
|
|
1 |
import os
|
2 |
from pathlib import Path
|
3 |
-
import
|
|
|
4 |
import torch
|
5 |
import torch.nn as nn
|
|
|
6 |
from silero.utils import Decoder
|
7 |
|
8 |
-
from
|
|
|
|
|
9 |
|
10 |
|
11 |
default_speaker_dict = {
|
@@ -14,11 +18,92 @@ default_speaker_dict = {
|
|
14 |
}
|
15 |
|
16 |
|
17 |
-
def
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
22 |
"""
|
23 |
output_folder = Path(output_folder)
|
24 |
current_model_lang = default_speaker_dict[language]
|
@@ -26,10 +111,10 @@ def silero_tts(language="en", version="latest", output_folder: Path | str = None
|
|
26 |
if language in default_speaker_dict:
|
27 |
model_id = current_model_lang["model_id"]
|
28 |
|
29 |
-
models =
|
30 |
available_languages = list(models.tts_models.keys())
|
31 |
assert (
|
32 |
-
|
33 |
), f"Language not in the supported list {available_languages}"
|
34 |
|
35 |
tts_models_lang = models.tts_models[language]
|
@@ -67,46 +152,95 @@ def silero_tts(language="en", version="latest", output_folder: Path | str = None
|
|
67 |
return model, symbols, sample_rate, example_text, apply_tts, model_id
|
68 |
|
69 |
|
70 |
-
def
|
71 |
-
language="en",
|
72 |
-
version="latest",
|
73 |
-
jit_model="jit",
|
74 |
-
output_folder: Path | str = None,
|
75 |
-
**kwargs,
|
76 |
-
):
|
77 |
-
"""Modified Silero Speech-To-Text Model(s) function
|
78 |
-
language (str): language of the model, now available are ['en', 'de', 'es']
|
79 |
-
version:
|
80 |
-
jit_model:
|
81 |
-
output_folder: needed in case of docker build
|
82 |
-
Returns a model, decoder object and a set of utils
|
83 |
-
Please see https://github.com/snakers4/silero-models for usage examples
|
84 |
"""
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
**kwargs,
|
99 |
)
|
100 |
-
|
101 |
-
|
102 |
-
return model, decoder, utils
|
103 |
|
104 |
|
105 |
def init_jit_model(
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
torch.set_grad_enabled(False)
|
111 |
|
112 |
app_logger.info(
|
@@ -126,62 +260,49 @@ def init_jit_model(
|
|
126 |
|
127 |
if not os.path.isfile(model_path):
|
128 |
app_logger.info(f"downloading model_path: '{model_path}' ...")
|
129 |
-
torch.hub.download_url_to_file(model_url, model_path, progress=True)
|
130 |
app_logger.info(f"model_path {model_path} downloaded!")
|
131 |
model = torch.jit.load(model_path, map_location=device)
|
132 |
model.eval()
|
133 |
return model, Decoder(model.labels)
|
134 |
|
135 |
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
raise NotImplementedError(
|
147 |
-
"currenty works only for 'de' and 'en' languages, not for '{}'.".format(
|
148 |
-
language
|
149 |
-
)
|
150 |
-
)
|
151 |
-
|
152 |
-
return model, decoder
|
153 |
-
|
154 |
|
155 |
-
|
156 |
-
|
|
|
|
|
|
|
|
|
157 |
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
|
|
|
|
|
|
|
|
162 |
)
|
163 |
-
models_list_file = output_folder / f"latest_silero_model_{language}.yml"
|
164 |
-
if not os.path.exists(models_list_file):
|
165 |
-
app_logger.info(
|
166 |
-
f"model {model_type} yml for '{language}' language, '{version}' version not found, download it in folder {output_folder}..."
|
167 |
-
)
|
168 |
-
torch.hub.download_url_to_file(
|
169 |
-
"https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml",
|
170 |
-
models_list_file,
|
171 |
-
progress=False,
|
172 |
-
)
|
173 |
-
assert os.path.exists(models_list_file)
|
174 |
-
return OmegaConf.load(models_list_file)
|
175 |
-
|
176 |
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
model_url=models[model_type].get(language).get(version).get(jit_model),
|
184 |
-
output_folder=output_folder,
|
185 |
**kwargs,
|
186 |
)
|
187 |
-
|
|
|
|
|
|
1 |
import os
|
2 |
from pathlib import Path
|
3 |
+
from typing import Union, Callable
|
4 |
+
|
5 |
import torch
|
6 |
import torch.nn as nn
|
7 |
+
from omegaconf import DictConfig, ListConfig
|
8 |
from silero.utils import Decoder
|
9 |
|
10 |
+
from AIModels import NeuralASR
|
11 |
+
from ModelInterfaces import IASRModel
|
12 |
+
from constants import MODEL_NAME_DEFAULT, language_not_implemented, app_logger, sample_rate_start, silero_versions_dict
|
13 |
|
14 |
|
15 |
default_speaker_dict = {
|
|
|
18 |
}
|
19 |
|
20 |
|
21 |
+
def getASRModel(language: str, model_name: str = MODEL_NAME_DEFAULT) -> IASRModel:
|
22 |
+
models_dict = {
|
23 |
+
"whisper": __get_model_whisper,
|
24 |
+
"faster_whisper": __get_model_faster_whisper,
|
25 |
+
"silero": __get_model_silero
|
26 |
+
}
|
27 |
+
if model_name in models_dict:
|
28 |
+
fn = models_dict[model_name]
|
29 |
+
return fn(language)
|
30 |
+
models_supported = ", ".join(models_dict.keys())
|
31 |
+
raise ValueError(f"Model '{model_name}' not implemented. Supported models: {models_supported}.")
|
32 |
+
|
33 |
+
|
34 |
+
def __get_model_whisper(language: str) -> IASRModel:
|
35 |
+
from whisper_wrapper import WhisperASRModel
|
36 |
+
return WhisperASRModel(language=language)
|
37 |
+
|
38 |
+
|
39 |
+
def __get_model_faster_whisper(language: str) -> IASRModel:
|
40 |
+
from faster_whisper_wrapper import FasterWhisperASRModel
|
41 |
+
return FasterWhisperASRModel(language=language)
|
42 |
+
|
43 |
+
|
44 |
+
def __get_model_silero(language: str) -> IASRModel:
|
45 |
+
import tempfile
|
46 |
+
tmp_dir = tempfile.gettempdir()
|
47 |
+
if language == "de":
|
48 |
+
model, decoder, _ = __silero_stt(
|
49 |
+
language="de", version="v4", jit_model="jit_large", output_folder=tmp_dir
|
50 |
+
)
|
51 |
+
return __eval_apply_neural_asr(model, decoder, language)
|
52 |
+
elif language == "en":
|
53 |
+
model, decoder, _ = __silero_stt(language="en", output_folder=tmp_dir)
|
54 |
+
return __eval_apply_neural_asr(model, decoder, language)
|
55 |
+
raise ValueError(language_not_implemented.format(language))
|
56 |
+
|
57 |
+
|
58 |
+
def __eval_apply_neural_asr(model: nn.Module, decoder: Decoder, language: str):
|
59 |
+
app_logger.info(f"LOADED silero model language: {language}, version: '{silero_versions_dict[language]}'")
|
60 |
+
model.eval()
|
61 |
+
app_logger.info(f"EVALUATED silero model language: {language}, version: '{silero_versions_dict[language]}'")
|
62 |
+
return NeuralASR(model, decoder)
|
63 |
+
|
64 |
+
|
65 |
+
def getTranslationModel(language: str) -> nn.Module:
|
66 |
+
from transformers import AutoTokenizer
|
67 |
+
from transformers import AutoModelForSeq2SeqLM
|
68 |
+
if language == 'de':
|
69 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(
|
70 |
+
"Helsinki-NLP/opus-mt-de-en")
|
71 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
72 |
+
"Helsinki-NLP/opus-mt-de-en")
|
73 |
+
# Cache models to avoid Hugging face processing (not needed now)
|
74 |
+
# with open('translation_model_de.pickle', 'wb') as handle:
|
75 |
+
# pickle.dump(model, handle)
|
76 |
+
# with open('translation_tokenizer_de.pickle', 'wb') as handle:
|
77 |
+
# pickle.dump(tokenizer, handle)
|
78 |
+
else:
|
79 |
+
raise ValueError(language_not_implemented.format(language))
|
80 |
+
|
81 |
+
return model, tokenizer
|
82 |
+
|
83 |
+
|
84 |
+
def __silero_tts(language: str = "en", version: str = "latest", output_folder: Path | str = None, **kwargs) -> tuple[nn.Module, str, int, str, dict, Callable, str]:
|
85 |
+
"""
|
86 |
+
Modified function to create instances of Silero Text-To-Speech Models.
|
87 |
+
Please see https://github.com/snakers4/silero-models?tab=readme-ov-file#text-to-speech for usage examples.
|
88 |
+
language="en", version="latest", output_folder: Path | str = None, **kwargs
|
89 |
+
|
90 |
+
Args:
|
91 |
+
language (str): Language of the model. Available options are ['ru', 'en', 'de', 'es', 'fr']. Default is 'en'.
|
92 |
+
version (str): Version of the model to use. Default is 'latest'.
|
93 |
+
output_folder (Path | str): Path to the folder where the model will be saved. Default is None.
|
94 |
+
**kwargs: Additional keyword arguments.
|
95 |
+
Returns:
|
96 |
+
tuple: Depending on the model version and the input arguments, returns a tuple containing:
|
97 |
+
- model: The loaded TTS model.
|
98 |
+
- symbols (str): The set of symbols used by the model (only for older model versions).
|
99 |
+
- sample_rate (int): The sample rate of the model.
|
100 |
+
- example_text (str): Example text for the model.
|
101 |
+
- speaker (dict):
|
102 |
+
- apply_tts (function): Function to apply TTS (only for older model versions).
|
103 |
+
- model_id (str): The model ID (only for older model versions).
|
104 |
+
|
105 |
+
Raises:
|
106 |
+
AssertionError: If the specified language is not in the supported list.
|
107 |
"""
|
108 |
output_folder = Path(output_folder)
|
109 |
current_model_lang = default_speaker_dict[language]
|
|
|
111 |
if language in default_speaker_dict:
|
112 |
model_id = current_model_lang["model_id"]
|
113 |
|
114 |
+
models = __get_models(language, output_folder, version, model_type="tts_models")
|
115 |
available_languages = list(models.tts_models.keys())
|
116 |
assert (
|
117 |
+
language in available_languages
|
118 |
), f"Language not in the supported list {available_languages}"
|
119 |
|
120 |
tts_models_lang = models.tts_models[language]
|
|
|
152 |
return model, symbols, sample_rate, example_text, apply_tts, model_id
|
153 |
|
154 |
|
155 |
+
def __get_models(language: str, output_folder: str | Path, version: str, model_type: str) -> Union[DictConfig, ListConfig]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
"""
|
157 |
+
Retrieve and load the model configuration for a specified language and model type.
|
158 |
+
|
159 |
+
Args:
|
160 |
+
language (str): The language for which the model is required.
|
161 |
+
output_folder (str or Path): The folder where the model configuration file should be saved
|
162 |
+
version (str): The version of the model.
|
163 |
+
model_type (str): The type of the model.
|
164 |
+
|
165 |
+
Returns:
|
166 |
+
OmegaConf: The loaded model configuration.
|
167 |
+
|
168 |
+
Raises:
|
169 |
+
AssertionError: If the model configuration file does not exist after attempting to download it.
|
170 |
+
|
171 |
+
Notes:
|
172 |
+
If the model configuration file does not exist in the specified output folder, it will be downloaded
|
173 |
+
from a predefined URL and saved in the output folder.
|
174 |
+
"""
|
175 |
+
from omegaconf import OmegaConf
|
176 |
+
|
177 |
+
output_folder = (
|
178 |
+
Path(output_folder)
|
179 |
+
if output_folder is not None
|
180 |
+
else Path(os.path.dirname(__file__)).parent.parent
|
181 |
)
|
182 |
+
models_list_file = output_folder / f"latest_silero_model_{language}.yml"
|
183 |
+
app_logger.info(f"models_list_file:{models_list_file}.")
|
184 |
+
if not os.path.exists(models_list_file):
|
185 |
+
app_logger.info(
|
186 |
+
f"model {model_type} yml for '{language}' language, '{version}' version not found, download it in folder {output_folder}..."
|
187 |
+
)
|
188 |
+
torch.hub.download_url_to_file(
|
189 |
+
"https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml",
|
190 |
+
str(models_list_file),
|
191 |
+
progress=False,
|
192 |
+
)
|
193 |
+
assert os.path.exists(models_list_file)
|
194 |
+
return OmegaConf.load(models_list_file)
|
195 |
|
196 |
+
|
197 |
+
def __get_latest_stt_model(language: str, output_folder: str | Path, version: str, model_type: str, jit_model: str, **kwargs) -> tuple[nn.Module, Decoder]:
|
198 |
+
"""
|
199 |
+
Retrieve the latest Speech-to-Text (STT) model for a given language and model type.
|
200 |
+
|
201 |
+
Args:
|
202 |
+
language (str): The language for which the STT model is required.
|
203 |
+
output_folder (str): The directory where the model will be saved.
|
204 |
+
version (str): The version of the model to retrieve.
|
205 |
+
model_type (str): The type of the model (e.g., 'large', 'small').
|
206 |
+
jit_model (str): The specific JIT model to use.
|
207 |
+
**kwargs: Additional keyword arguments to pass to the model initialization function.
|
208 |
+
|
209 |
+
Returns:
|
210 |
+
tuple: A tuple containing the model and the decoder.
|
211 |
+
|
212 |
+
Raises:
|
213 |
+
AssertionError: If the specified language is not available in the model type.
|
214 |
+
"""
|
215 |
+
models = __get_models(language, output_folder, version, model_type)
|
216 |
+
available_languages = list(models[model_type].keys())
|
217 |
+
assert language in available_languages
|
218 |
+
|
219 |
+
model, decoder = init_jit_model(
|
220 |
+
model_url=models[model_type].get(language).get(version).get(jit_model),
|
221 |
+
output_folder=output_folder,
|
222 |
**kwargs,
|
223 |
)
|
224 |
+
return model, decoder
|
|
|
|
|
225 |
|
226 |
|
227 |
def init_jit_model(
|
228 |
+
model_url: str,
|
229 |
+
device: torch.device = torch.device("cpu"),
|
230 |
+
output_folder: Path | str = None,
|
231 |
+
) -> tuple[torch.nn.Module, Decoder]:
|
232 |
+
"""
|
233 |
+
Initialize a JIT model from a given URL.
|
234 |
+
|
235 |
+
Args:
|
236 |
+
model_url (str): The URL to download the model from.
|
237 |
+
device (torch.device, optional): The device to load the model on. Defaults to CPU.
|
238 |
+
output_folder (Path | str, optional): The folder to save the downloaded model.
|
239 |
+
If None, defaults to a 'model' directory in the current file's directory.
|
240 |
+
|
241 |
+
Returns:
|
242 |
+
Tuple[torch.jit.ScriptModule, Decoder]: The loaded JIT model and its corresponding decoder.
|
243 |
+
"""
|
244 |
torch.set_grad_enabled(False)
|
245 |
|
246 |
app_logger.info(
|
|
|
260 |
|
261 |
if not os.path.isfile(model_path):
|
262 |
app_logger.info(f"downloading model_path: '{model_path}' ...")
|
263 |
+
torch.hub.download_url_to_file(model_url, str(model_path), progress=True)
|
264 |
app_logger.info(f"model_path {model_path} downloaded!")
|
265 |
model = torch.jit.load(model_path, map_location=device)
|
266 |
model.eval()
|
267 |
return model, Decoder(model.labels)
|
268 |
|
269 |
|
270 |
+
def __silero_stt(
|
271 |
+
language: str = "en",
|
272 |
+
version: str = "latest",
|
273 |
+
jit_model: str = "jit",
|
274 |
+
output_folder: Path | str = None,
|
275 |
+
**kwargs,
|
276 |
+
) -> tuple[nn.Module, Decoder, set[Callable, Callable, Callable, Callable]]:
|
277 |
+
"""
|
278 |
+
Modified function to create instances of Silero Speech-To-Text Model(s).
|
279 |
+
Please see https://github.com/snakers4/silero-models?tab=readme-ov-file#speech-to-text for usage examples.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
|
281 |
+
Args:
|
282 |
+
language (str): Language of the model. Available options are ['en', 'de', 'es'].
|
283 |
+
version (str): Version of the model to use. Default is "latest".
|
284 |
+
jit_model (str): Type of JIT model to use. Default is "jit".
|
285 |
+
output_folder (Path | str, optional): Output folder needed in case of docker build. Default is None.
|
286 |
+
**kwargs: Additional keyword arguments.
|
287 |
|
288 |
+
Returns:
|
289 |
+
tuple: A tuple containing the model, decoder object, and a set of utility functions.
|
290 |
+
"""
|
291 |
+
from silero.utils import (
|
292 |
+
read_audio,
|
293 |
+
read_batch,
|
294 |
+
split_into_batches,
|
295 |
+
prepare_model_input,
|
296 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
297 |
|
298 |
+
model, decoder = __get_latest_stt_model(
|
299 |
+
language,
|
300 |
+
output_folder,
|
301 |
+
version,
|
302 |
+
model_type="stt_models",
|
303 |
+
jit_model=jit_model,
|
|
|
|
|
304 |
**kwargs,
|
305 |
)
|
306 |
+
utils = (read_batch, split_into_batches, read_audio, prepare_model_input)
|
307 |
+
|
308 |
+
return model, decoder, utils
|
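
A short sketch of the new backend dispatch in models.getASRModel. It is illustrative only and assumes the optional dependencies of the chosen backend are installed; the wrapper modules are imported lazily, so e.g. openai-whisper is only needed when model_name="whisper".

# Pick an ASR backend by name; unknown names raise ValueError listing the
# supported backends ("whisper", "faster_whisper", "silero").
import torch

import models

asr = models.getASRModel("en", model_name="faster_whisper")

# One second of silent mono audio as a (batch, samples) tensor, assuming a
# 16 kHz pipeline; a real call would pass the recorded waveform instead.
silence = torch.zeros((1, 16000))
asr.processAudio(silence)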
packages.txt
DELETED
@@ -1 +0,0 @@
-ffmpeg
pre-requirements.txt
DELETED
@@ -1 +0,0 @@
-pip
aip_trainer/pronunciationTrainer.py → pronunciationTrainer.py
RENAMED
@@ -5,29 +5,22 @@ import epitran
 import numpy as np
 import torch
 
-…
+import ModelInterfaces as mi
+import RuleBasedModels
+import WordMatching as wm
+import WordMetrics
+import models as mo
+from constants import app_logger, MODEL_NAME_DEFAULT, sample_rate_resample
 
 
-def getTrainer(language: str):
-
-    device = torch.device('cpu')
-
-    model, decoder = mo.getASRModel(language)
-    model = model.to(device)
-    model.eval()
-    asr_model = AIModels.NeuralASR(model, decoder)
-
+def getTrainer(language: str, model_name: str = MODEL_NAME_DEFAULT):
+    asr_model = mo.getASRModel(language, model_name=model_name)
     if language == 'de':
-        …
-        phonem_converter = RuleBasedModels.EpitranPhonemConverter(epitran_deu_latn)
+        phonem_converter = RuleBasedModels.EpitranPhonemConverter(epitran.Epitran('deu-Latn'))
     elif language == 'en':
         phonem_converter = RuleBasedModels.EngPhonemConverter()
     else:
-        raise ValueError(
-            …
+        raise ValueError(f"Language '{language}' not implemented")
     trainer = PronunciationTrainer(asr_model, phonem_converter)
 
     return trainer
@@ -50,7 +43,7 @@ class PronunciationTrainer:
     current_words_pronunciation_accuracy = []
     categories_thresholds = np.array([80, 60, 59])
 
-    sampling_rate = …
+    sampling_rate = sample_rate_resample
 
     def __init__(self, asr_model: mi.IASRModel, word_to_ipa_coverter: mi.ITextToPhonemModel) -> None:
         self.asr_model = asr_model
@@ -67,22 +60,36 @@ class PronunciationTrainer:
 
         return audio_transcript, word_locations_in_samples
 
+    # def getWordsRelativeIntonation(self, Audio: torch.tensor, word_locations: list):
+    #     intonations = torch.zeros((len(word_locations), 1))
+    #     intonation_fade_samples = 0.3*self.sampling_rate
+    #     app_logger.info(f"intonations.shape: {intonations.shape}.")
+    #     for word in range(len(word_locations)):
+    #         intonation_start = int(np.maximum(
+    #             0, word_locations[word][0]-intonation_fade_samples))
+    #         intonation_end = int(np.minimum(
+    #             Audio.shape[1]-1, word_locations[word][1]+intonation_fade_samples))
+    #         intonations[word] = torch.sqrt(torch.mean(
+    #             Audio[0][intonation_start:intonation_end]**2))
+    #
+    #     intonations = intonations/torch.mean(intonations)
+    #     return intonations
+
     ##################### ASR Functions ###########################
 
     def processAudioForGivenText(self, recordedAudio: torch.Tensor = None, real_text=None):
 
         start = time.time()
-        …
-        app_logger.info(f'Time for NN to transcript audio: {duration}.')
+        recording_transcript, recording_ipa, word_locations = self.getAudioTranscript(
+            recordedAudio)
+        time_transcript_audio = time.time() - start
+        app_logger.info(f'Time for NN to transcript audio: {time_transcript_audio:.2f}.')
 
         start = time.time()
         real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices = self.matchSampleAndRecordedWords(
             real_text, recording_transcript)
-        …
-        app_logger.info(f'Time for matching transcripts: {…
+        time_matching_transcripts = time.time() - start
+        app_logger.info(f'Time for matching transcripts: {time_matching_transcripts:.3f}.')
 
         start_time, end_time = self.getWordLocationsFromRecordInSeconds(
             word_locations, mapped_words_indices)
@@ -104,22 +111,20 @@ class PronunciationTrainer:
     def getAudioTranscript(self, recordedAudio: torch.Tensor = None):
         current_recorded_audio = recordedAudio
 
-        …
-        app_logger.info('starting processAudio...')
+        current_recorded_audio = self.preprocessAudio(
+            current_recorded_audio)
         self.asr_model.processAudio(current_recorded_audio)
 
-        app_logger.info('starting getTranscriptAndWordsLocations...')
         current_recorded_transcript, current_recorded_word_locations = self.getTranscriptAndWordsLocations(
             current_recorded_audio.shape[1])
-        …
+        current_recorded_ipa = self.ipa_converter.convertToPhonem(
+            current_recorded_transcript)
 
-        …
+        # time.sleep(10000)
         return current_recorded_transcript, current_recorded_ipa, current_recorded_word_locations
 
-    def getWordLocationsFromRecordInSeconds(self, word_locations, mapped_words_indices) -> …
+    def getWordLocationsFromRecordInSeconds(self, word_locations, mapped_words_indices) -> list:
+        app_logger.info(f"len_list: word_locations:{len(word_locations)}, mapped_words_indices:{len(mapped_words_indices)}, {len(word_locations) == len(mapped_words_indices)}...")
         start_time = []
         end_time = []
         for word_idx in range(len(mapped_words_indices)):
@@ -135,10 +140,10 @@ class PronunciationTrainer:
     def matchSampleAndRecordedWords(self, real_text, recorded_transcript):
         words_estimated = recorded_transcript.split()
 
-        if …
-            words_real = self.current_transcript[0].split()
-        else:
-            words_real = real_text.split()
+        try:
+            words_real = real_text.split()
+        except AttributeError:
+            raise ValueError("Real text is None, but should be a string.")
 
         mapped_words, mapped_words_indices = wm.get_best_mapped_words(
             words_estimated, words_real)
@@ -154,7 +159,7 @@ class PronunciationTrainer:
             self.ipa_converter.convertToPhonem(mapped_words[word_idx])))
         return real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices
 
-    def getPronunciationAccuracy(self, real_and_transcribed_words_ipa) -> …
+    def getPronunciationAccuracy(self, real_and_transcribed_words_ipa) -> float:
         total_mismatches = 0.
         number_of_phonemes = 0.
         current_words_pronunciation_accuracy = []
@@ -191,4 +196,4 @@ class PronunciationTrainer:
         return np.argmin(abs(self.categories_thresholds-accuracy))
 
     def preprocessAudio(self, audio: torch.tensor) -> torch.tensor:
-        return preprocessAudioStandalone(audio…
+        return preprocessAudioStandalone(audio)
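
A usage sketch of the reworked trainer factory (the model_name parameter is new in this commit). The input file name is hypothetical, and the (1, n_samples) float tensor shape mirrors the recordedAudio.shape[1] access in getAudioTranscript above.

import soundfile as sf
import torch

import pronunciationTrainer

trainer = pronunciationTrainer.getTrainer("de", model_name="silero")

# Load a recording as a mono float tensor with a leading batch dimension.
signal, _ = sf.read("my_recording.wav")  # hypothetical input file
audio = torch.tensor(signal, dtype=torch.float32).unsqueeze(0)

# Transcribes the audio, aligns it with the expected text and scores it.
result = trainer.processAudioForGivenText(audio, "Hallo, wie geht es dir?")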
requirements-dev.txt
CHANGED
@@ -1,3 +1,2 @@
-bson
 pytest
 pytest-cov
requirements-flask.txt
DELETED
@@ -1,21 +0,0 @@
-audioread
-dtwalign
-eng_to_ipa
-epitran==1.25.1
-flask
-flask_cors
-gunicorn
-omegaconf
-ortools==9.11.4210
-pandas
-pickle-mixin
-python-dotenv
-requests
-sentencepiece
-silero==0.4.1
-soundfile==0.12.1
-sqlalchemy
-structlog
-torch
-torchaudio
-transformers
requirements-gradio.txt
DELETED
@@ -1 +0,0 @@
-gradio==5.11.0
requirements.txt
CHANGED
@@ -1,19 +1,23 @@
-asgi-correlation-id
 audioread
 dtwalign
 eng_to_ipa
-epitran…
-…
+epitran
+faster-whisper
+flask
+flask_cors
 omegaconf
-…
+openai-whisper
+ortools
 pandas
 pickle-mixin
-python-dotenv
 requests
+sacremoses # suggested by marian translation model
 sentencepiece
-silero…
-soundfile…
+silero
+soundfile
+sqlalchemy
 structlog
-…
-…
+-f https://download.pytorch.org/whl/torch_stable.html
+torch
+torchaudio
 transformers
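
Since requirements.txt now carries both Whisper backends, a quick post-install sanity check may help; note that the distribution names differ from the import names (openai-whisper installs the module "whisper", faster-whisper installs "faster_whisper"):

import importlib

for module in ("whisper", "faster_whisper"):
    importlib.import_module(module)
    print(f"{module}: OK")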
aip_trainer/utils/session_logger.py → session_logger.py
RENAMED
@@ -28,9 +28,9 @@ def drop_color_message_key(_, __, event_dict: EventDict) -> EventDict:
 
 def setup_logging(json_logs: bool = False, log_level: str = "INFO"):
     """Enhance the configuration of structlog.
-    Needed for correlation id injection with fastapi middleware …
-    After the use of logging_middleware() …
-    'asgi_correlation_id' package.
+    Needed for correlation id injection with fastapi middleware within the app.
+    After the use of logging_middleware() within the middlewares module (if present), also add the CorrelationIdMiddleware from the
+    'asgi_correlation_id' package.
     To change an input parameter like the log level, re-run the function changing the parameter
     (no need to re-instantiate the logger instance: it's a hot change)
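
The docstring above emphasizes that reconfiguration is a hot change; a minimal sketch:

from session_logger import setup_logging

setup_logging(json_logs=False, log_level="INFO")
# ...later, switch to JSON output at DEBUG level without re-creating loggers:
setup_logging(json_logs=True, log_level="DEBUG")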
static/.gitignore
DELETED
@@ -1,3 +0,0 @@
-playwright-report/*
-node_modules
-test-results/*
static/.vscode/launch.json
DELETED
@@ -1,20 +0,0 @@
-{
-    // Use IntelliSense to learn about possible attributes.
-    // Hover to view descriptions of existing attributes.
-    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-    "version": "0.2.0",
-    "configurations": [
-        {
-            "type": "node",
-            "request": "launch",
-            "name": "Launch Program",
-            "skipFiles": [
-                "<node_internals>/**"
-            ],
-            "program": "${workspaceFolder}/tests/test-1.spec.ts",
-            "outFiles": [
-                "${workspaceFolder}/**/*.js"
-            ]
-        }
-    ]
-}
static/css/{style.css → style-new.css}
RENAMED
@@ -2,6 +2,21 @@ body {
     background: #f2f2f2;
 }
 
+.flex {
+    display: flex;
+}
+
+.text-align-center {
+    text-align: center;
+}
+
+.display-block {
+    display: block;
+}
+
+.display-inline-block {
+    display: inline-block;
+}
 
 .expanded {
     margin: auto;
@@ -18,7 +33,13 @@ h1 {
 
 a.disabled {
     pointer-events: none;
-    color: …;
+    color: black;
+    background-color: #ccc;
+}
+
+.color-disabled {
+    pointer-events: none;
+    color: black;
     background-color: #ccc;
 }
 
@@ -29,6 +50,31 @@ a.disabled {
     display: flex;
 }
 
+.darkgreen {
+    color: white;
+    background-color: #467387;
+}
+
+/* text button */
+.text-button {
+    border: none;
+    text-align: center;
+    text-decoration: none;
+    display: inline-block;
+    font-size: 16px;
+    margin: 4px 2px;
+    height: fit-content;
+    width: 4em;
+}
+.text-button-div {
+    position: absolute;
+    top: 38%;
+}
+#input-uploader-audio-file {
+    width: 100px;
+    white-space: normal;
+}
+
 /* ############## Next button ##### */
 .button-next {
     border-radius: 4px;
@@ -40,11 +86,10 @@ a.disabled {
     box-sizing: border-box;
     position: absolute;
     top: 0;
-    left: …;
-    right: 2%;
-    bottom: 2%;
+    left: 94%;
     background-color: #58636d;
-    width: …;
+    width: 6%;
+    height: 100%;
 
     transition: all 0.5s;
     cursor: pointer;
@@ -127,41 +172,24 @@ a.disabled {
     display: block;
     position: absolute;
     left: 2%;
-    top: …;
-    transform: translate(-0%, -0%);
-    height: 45%;
-    width: 96%;
-    max-width: 96%;
-    background: #ffff;
-    overflow: hidden;
-    border-radius: 20px;
-    box-shadow: 0 0 20px 8px #d0d0d0;
-}
-
-.container2 {
-    display: block;
-    position: absolute;
-    left: 2%;
-    top: 63%;
+    top: 18%;
     transform: translate(-0%, -0%);
-    height: …;
+    height: 59%;
     width: 96%;
     max-width: 96%;
     background: #ffff;
     overflow: hidden;
     border-radius: 20px;
     box-shadow: 0 0 20px 8px #d0d0d0;
-    overflow: scroll;
-    max-height: 15%;
 }
 
 .container-small {
     position: fixed;
-    left: …;
-    top: …;
+    left: 68%;
+    top: 79%;
     transform: translate(-0%, -0%);
-    height: …;
-    width: …;
+    height: 7%;
+    width: 30%;
     background: #ffff;
     overflow: hidden;
     border-radius: 20px;
@@ -238,6 +266,17 @@ a.disabled {
     font-size: 3.5em !important;
 }
 
+.form-audio-file {
+    position: fixed;
+    left: 25%;
+    top: 82%;
+}
+.form-audio-file-label {
+    position: fixed;
+    left: 25%;
+    top: 86%;
+}
+
 .mic-button-div {
     position: fixed;
     left: 50%;
@@ -349,6 +388,75 @@ a.disabled {
     width: 100%;
 }
 
+/* ############ checkbox for using DTW */
+.container-dtw-div {
+    position: absolute;
+    top: 60%;
+}
+.container-label-dtw {
+    padding-left: 35px;
+    cursor: pointer;
+    font-size: 2em;
+    -webkit-user-select: none;
+    -moz-user-select: none;
+    -ms-user-select: none;
+    user-select: none;
+}
+
+/* Hide the browser's default checkbox */
+.container-label-dtw input {
+    position: absolute;
+    opacity: 0;
+    cursor: pointer;
+    height: 0;
+    width: 0;
+}
+
+/* Create a custom checkbox */
+.checkmark {
+    position: absolute;
+    margin-top: 0.4em;
+    left: 0;
+    height: 25px;
+    width: 25px;
+    background-color: #eee;
+}
+
+/* On mouse-over, add a grey background color */
+.container:hover input ~ .checkmark {
+    background-color: #ccc;
+}
+
+/* When the checkbox is checked, add a blue background */
+.container input:checked ~ .checkmark {
+    background-color: #467387;
+}
+
+/* Create the checkmark/indicator (hidden when not checked) */
+.checkmark:after {
+    content: "";
+    position: absolute;
+    display: none;
+}
+
+/* Show the checkmark when checked */
+.container input:checked ~ .checkmark:after {
+    display: block;
+}
+
+/* Style the checkmark/indicator */
+.container .checkmark:after {
+    left: 9px;
+    top: 5px;
+    width: 5px;
+    height: 10px;
+    border: solid white;
+    border-width: 0 3px 3px 0;
+    -webkit-transform: rotate(45deg);
+    -ms-transform: rotate(45deg);
+    transform: rotate(45deg);
+}
+
 /* ############ Links and credits ####*/
 
 .link-icon-div {
@@ -362,7 +470,7 @@ a.disabled {
 .credits-icon-div {
     position: fixed;
     left: 90.5%;
-    top: …;
+    top: 95%;
     font-size: x-small;
 }
 
@@ -401,9 +509,9 @@ a.disabled {
     display: block;
     position: absolute;
     left: 2%;
-    top: …;
+    top: 22%;
     transform: translate(-0%, -0%);
-    height: …;
+    height: 55%;
     width: 96%;
     max-width: 96%;
     background: #ffff;
@@ -412,23 +520,6 @@ a.disabled {
     box-shadow: 0 0 20px 8px #d0d0d0;
 }
 
-.container2 {
-    display: block;
-    position: absolute;
-    left: 2%;
-    top: 63%;
-    transform: translate(-0%, -0%);
-    height: 10%;
-    width: 96%;
-    max-width: 96%;
-    background: #ffff;
-    overflow: hidden;
-    border-radius: 20px;
-    box-shadow: 0 0 20px 8px #d0d0d0;
-    overflow: scroll;
-    max-height: 15%;
-}
-
 .icon-text {
     font-size: 0.8em !important;
     text-align: center;
@@ -445,7 +536,7 @@ a.disabled {
     /* 80px */
     height: 3.5em;
     padding-top: 0.4em;
-    left: …;
+    left: 40%;
     line-height: 0px;
     border: 6px solid #fff;
     border-radius: 50%;
@@ -460,7 +551,7 @@ a.disabled {
 
 .mic-button-div {
     position: fixed;
-    left: …;
+    left: 40%;
     top: 80%
 }
 
@@ -502,4 +593,4 @@ a.disabled {
     font-size: 0.8em;
 }
 
-}
+}