alessandro trinca tornidor committed on
Commit
85b7206
·
1 Parent(s): 7810fbd

feat: port whisper and faster-whisper support from https://github.com/Thiagohgl/ai-pronunciation-trainer

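The commit message above says it ports whisper and faster-whisper support; the new `faster_whisper_wrapper.py` in the file list below suggests the speech-recognition path now goes through the `faster-whisper` package. Its contents are not visible in this truncated view, so the following is only a minimal sketch of what a faster-whisper transcription helper with word-level timestamps generally looks like; the model size, device settings and helper name are assumptions, not the commit's actual code:

```python
# Hedged sketch only: this is NOT the faster_whisper_wrapper.py added by this commit.
# Assumes the faster-whisper package is installed and a local audio file path is given.
from faster_whisper import WhisperModel


def transcribe_with_word_timestamps(audio_path: str, language: str = "en"):
    # a small model on CPU with int8 quantization keeps the sketch lightweight (assumption)
    model = WhisperModel("base", device="cpu", compute_type="int8")
    segments, _info = model.transcribe(audio_path, language=language, word_timestamps=True)
    words = []
    for segment in segments:
        for word in segment.words:
            # each recognized word carries its text plus start/end offsets in seconds
            words.append({"word": word.word.strip(), "start": word.start, "end": word.end})
    transcript = " ".join(item["word"] for item in words)
    return transcript, words
```

Word-level timestamps matter here because the trainer replays the student recording sliced per word (see `lambdaSpeechToScore.py` below), but how the real wrapper maps them is not shown in this view.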
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .coveragerc +3 -3
  2. .gitignore +153 -118
  3. .idea/inspectionProfiles/Project_Default.xml +15 -0
  4. .idea/vcs.xml +6 -0
  5. aip_trainer/models/AIModels.py → AIModels.py +31 -3
  6. Dockerfile +0 -37
  7. aip_trainer/models/ModelInterfaces.py → ModelInterfaces.py +1 -1
  8. README.md +2 -2
  9. aip_trainer/models/RuleBasedModels.py → RuleBasedModels.py +14 -6
  10. aip_trainer/WordMatching.py → WordMatching.py +85 -51
  11. aip_trainer/WordMetrics.py → WordMetrics.py +28 -4
  12. aip_trainer/__init__.py +0 -21
  13. aip_trainer/lambdas/__init__.py +0 -1
  14. aip_trainer/lambdas/data_de_en_with_categories.json +0 -0
  15. aip_trainer/lambdas/lambdaGetSample.py +0 -106
  16. aip_trainer/models/__init__.py +0 -0
  17. aip_trainer/utils/__init__.py +0 -0
  18. aip_trainer/utils/split_cosmic_ray_report.py +0 -33
  19. aip_trainer/utils/typing_hints.py +0 -19
  20. aip_trainer/utils/utilities.py +0 -57
  21. app.py +53 -24
  22. app_description.md +11 -0
  23. aip_trainer/lambdas/app_description.md → app_headline.md +3 -1
  24. constants.py +31 -0
  25. cosmic_ray_config.toml +0 -8
  26. tests/test_data_de_en_2.pickle → data_de_en_2.pickle +0 -0
  27. databases/data_de.csv +0 -0
  28. databases/data_en.csv +0 -0
  29. dockerfiles/apt_preferences +0 -9
  30. dockerfiles/debian.sources +0 -17
  31. dockerfiles/dockerfile-base +0 -72
  32. faster_whisper_wrapper.py +56 -0
  33. images/{MainScreen.png → MainScreen.jpg} +2 -2
  34. aip_trainer/lambdas/js.py → js.py +0 -14
  35. lambdaChangeModel.py +14 -0
  36. lambdaGetSample.py +145 -0
  37. aip_trainer/lambdas/lambdaSpeechToScore.py → lambdaSpeechToScore.py +176 -57
  38. aip_trainer/lambdas/lambdaTTS.py → lambdaTTS.py +29 -5
  39. aip_trainer/models/models.py → models.py +209 -88
  40. packages.txt +0 -1
  41. pre-requirements.txt +0 -1
  42. aip_trainer/pronunciationTrainer.py → pronunciationTrainer.py +44 -39
  43. requirements-dev.txt +0 -1
  44. requirements-flask.txt +0 -21
  45. requirements-gradio.txt +0 -1
  46. requirements.txt +13 -9
  47. aip_trainer/utils/session_logger.py → session_logger.py +3 -3
  48. static/.gitignore +0 -3
  49. static/.vscode/launch.json +0 -20
  50. static/css/{style.css → style-new.css} +142 -51
.coveragerc CHANGED
@@ -1,9 +1,9 @@
 [run]
-source = samgis
-omit = ./venv/*,__version__.py,*tests*,*apps.py,*manage.py,*__init__.py,*migrations*,*asgi*,*wsgi*,*admin.py,*urls.py,./tests/*,aip_trainer/lambdas/js.py
+source = ./*.py
+omit = ./tests/*,./tests/**/*,./*venv*/*,__version__.py,*tests*,*app.py,js.py,*manage.py,*__init__.py,*migrations*,*asgi*,*wsgi*,*admin.py,*urls.py
 
 [report]
-omit = ./venv/*,*tests*,*apps.py,*manage.py,*__init__.py,*migrations*,*asgi*,*wsgi*,*admin.py,*urls.py,./tests/*,aip_trainer/lambdas/js.py
+omit = ./*venv*/*,*tests*,*app.py,*manage.py,*__init__.py,*migrations*,js.py,*asgi*,*wsgi*,*admin.py,*urls.py
 
 exclude_lines =
     if __name__ == .__main__.:
.gitignore CHANGED
@@ -1,63 +1,3 @@
1
-
2
- # Created by https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode
3
-
4
- ### Linux ###
5
- *~
6
-
7
- # temporary files which can be created if a process still has a handle open of a deleted file
8
- .fuse_hidden*
9
-
10
- # KDE directory preferences
11
- .directory
12
-
13
- # Linux trash folder which might appear on any partition or disk
14
- .Trash-*
15
-
16
- # .nfs files are created when an open file is removed but is still being accessed
17
- .nfs*
18
-
19
- ### OSX ###
20
- *.DS_Store
21
- */*.DS_Store
22
- */**/*.DS_Store
23
- .AppleDouble
24
- .LSOverride
25
-
26
- # Icon must end with two \r
27
- Icon
28
-
29
- # Thumbnails
30
- ._*
31
-
32
- # Files that might appear in the root of a volume
33
- .DocumentRevisions-V100
34
- .fseventsd
35
- .Spotlight-V100
36
- .TemporaryItems
37
- .Trashes
38
- .VolumeIcon.icns
39
- .com.apple.timemachine.donotpresent
40
-
41
- # Directories potentially created on remote AFP share
42
- .AppleDB
43
- .AppleDesktop
44
- Network Trash Folder
45
- Temporary Items
46
- .apdisk
47
-
48
- # CMake
49
- cmake-build-debug/
50
-
51
- # Ruby plugin and RubyMine
52
- /.rakeTasks
53
-
54
- # Crashlytics plugin (for Android Studio and IntelliJ)
55
- com_crashlytics_export_strings.xml
56
- crashlytics.properties
57
- crashlytics-build.properties
58
- fabric.properties
59
-
60
- ### Python ###
61
  # Byte-compiled / optimized / DLL files
62
  __pycache__/
63
  *.py[cod]
@@ -80,9 +20,14 @@ parts/
80
  sdist/
81
  var/
82
  wheels/
 
 
83
  *.egg-info/
84
  .installed.cfg
85
  *.egg
 
 
 
86
 
87
  # PyInstaller
88
  # Usually these files are written by a python script from a template
@@ -97,19 +42,31 @@ pip-delete-this-directory.txt
97
  # Unit test / coverage reports
98
  htmlcov/
99
  .tox/
 
100
  .coverage
101
  .coverage.*
102
  .cache
103
- .pytest_cache/
104
  nosetests.xml
105
  coverage.xml
106
  *.cover
 
107
  .hypothesis/
 
 
 
 
 
108
 
109
  # Translations
110
  *.mo
111
  *.pot
112
 
 
 
 
 
 
 
113
  # Flask stuff:
114
  instance/
115
  .webassets-cache
@@ -118,8 +75,7 @@ instance/
118
  .scrapy
119
 
120
  # Sphinx documentation
121
- docs/_build/doctrees/*
122
- docs/_build/html/*
123
 
124
  # PyBuilder
125
  target/
@@ -127,25 +83,37 @@ target/
127
  # Jupyter Notebook
128
  .ipynb_checkpoints
129
 
 
 
 
 
130
  # pyenv
131
  .python-version
132
 
133
- # celery beat schedule file
134
- celerybeat-schedule.*
 
 
 
 
 
 
 
 
 
 
 
135
 
136
  # SageMath parsed files
137
  *.sage.py
138
 
139
  # Environments
140
  .env
141
- .env*
142
- .venv
143
  .venv*
144
  env/
145
- venv/
146
  ENV/
147
  env.bak/
148
- venv.bak/
149
 
150
  # Spyder project settings
151
  .spyderproject
@@ -159,55 +127,24 @@ venv.bak/
159
 
160
  # mypy
161
  .mypy_cache/
 
 
162
 
163
- ### VisualStudioCode ###
164
- .vscode/*
165
- !.vscode/settings.json
166
- !.vscode/tasks.json
167
- !.vscode/launch.json
168
- !.vscode/extensions.json
169
- .history
170
 
171
- ### Windows ###
172
- # Windows thumbnail cache files
173
- Thumbs.db
174
- ehthumbs.db
175
- ehthumbs_vista.db
176
-
177
- # Folder config file
178
- Desktop.ini
179
-
180
- # Recycle Bin used on file shares
181
- $RECYCLE.BIN/
182
-
183
- # Windows Installer files
184
- *.cab
185
- *.msi
186
- *.msm
187
- *.msp
188
-
189
- # Windows shortcuts
190
- *.lnk
191
-
192
- # Build folder
193
-
194
- */build/*
195
 
196
  # custom
197
- *.ori
198
- tmp
199
- nohup.out
200
- /tests/events.tar
201
- function_dump_*.json
202
- *.yml
203
 
204
- # onnx models
205
- *.onnx
206
 
207
- # End of https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode
208
-
209
- ## .idea files
210
- # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
211
  # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
212
 
213
  # User-specific stuff
@@ -217,6 +154,9 @@ function_dump_*.json
217
  .idea/**/dictionaries
218
  .idea/**/shelf
219
 
 
 
 
220
  # Generated files
221
  .idea/**/contentModel.xml
222
 
@@ -237,9 +177,14 @@ function_dump_*.json
237
  # When using Gradle or Maven with auto-import, you should exclude module files,
238
  # since they will be recreated, and may cause churn. Uncomment if using
239
  # auto-import.
 
 
 
240
  # .idea/modules.xml
241
- .idea/*.iml
242
  # .idea/modules
 
 
243
 
244
  # CMake
245
  cmake-build-*/
@@ -262,6 +207,9 @@ atlassian-ide-plugin.xml
262
  # Cursive Clojure plugin
263
  .idea/replstate.xml
264
 
 
 
 
265
  # Crashlytics plugin (for Android Studio and IntelliJ)
266
  com_crashlytics_export_strings.xml
267
  crashlytics.properties
@@ -274,11 +222,98 @@ fabric.properties
274
  # Android studio 3.1+ serialized cache file
275
  .idea/caches/build_file_checksums.ser
276
 
 
 
 
 
 
 
 
 
277
  # Sonarlint plugin
278
- .idea/sonarlint
279
- /.idea/modules.xml
 
 
 
280
 
281
- # node_modules
282
- node_modules
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
 
284
- *.jit
 
 
1
  # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
  *.py[cod]
 
20
  sdist/
21
  var/
22
  wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
  *.egg-info/
26
  .installed.cfg
27
  *.egg
28
+ MANIFEST
29
+ static/node_modules/*
30
+ static/dist/*
31
 
32
  # PyInstaller
33
  # Usually these files are written by a python script from a template
 
42
  # Unit test / coverage reports
43
  htmlcov/
44
  .tox/
45
+ .nox/
46
  .coverage
47
  .coverage.*
48
  .cache
 
49
  nosetests.xml
50
  coverage.xml
51
  *.cover
52
+ *.py,cover
53
  .hypothesis/
54
+ .pytest_cache/
55
+ static/test-results/*
56
+ cosmic-ray-results/*
57
+ cosmic_ray.sqlite
58
+ static/playwright-report/*
59
 
60
  # Translations
61
  *.mo
62
  *.pot
63
 
64
+ # Django stuff:
65
+ *.log
66
+ local_settings.py
67
+ db.sqlite3
68
+ db.sqlite3-journal
69
+
70
  # Flask stuff:
71
  instance/
72
  .webassets-cache
 
75
  .scrapy
76
 
77
  # Sphinx documentation
78
+ docs/_build/
 
79
 
80
  # PyBuilder
81
  target/
 
83
  # Jupyter Notebook
84
  .ipynb_checkpoints
85
 
86
+ # IPython
87
+ profile_default/
88
+ ipython_config.py
89
+
90
  # pyenv
91
  .python-version
92
 
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101
+ __pypackages__/
102
+
103
+ # Celery stuff
104
+ celerybeat-schedule
105
+ celerybeat.pid
106
 
107
  # SageMath parsed files
108
  *.sage.py
109
 
110
  # Environments
111
  .env
 
 
112
  .venv*
113
  env/
114
+ venv*
115
  ENV/
116
  env.bak/
 
117
 
118
  # Spyder project settings
119
  .spyderproject
 
127
 
128
  # mypy
129
  .mypy_cache/
130
+ .dmypy.json
131
+ dmypy.json
132
 
133
+ # Pyre type checker
134
+ .pyre/
 
 
 
 
 
135
 
136
+ tmp/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  # custom
139
+ translation_model_de.pickle
140
+ translation_tokenizer_de.pickle
141
+ test.ogg
 
 
 
142
 
143
+ # Created by https://www.toptal.com/developers/gitignore/api/jetbrains,windows,linux,visualstudiocode
144
+ # Edit at https://www.toptal.com/developers/gitignore?templates=jetbrains,windows,linux,visualstudiocode
145
 
146
+ ### JetBrains ###
147
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
 
 
148
  # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
149
 
150
  # User-specific stuff
 
154
  .idea/**/dictionaries
155
  .idea/**/shelf
156
 
157
+ # AWS User-specific
158
+ .idea/**/aws.xml
159
+
160
  # Generated files
161
  .idea/**/contentModel.xml
162
 
 
177
  # When using Gradle or Maven with auto-import, you should exclude module files,
178
  # since they will be recreated, and may cause churn. Uncomment if using
179
  # auto-import.
180
+ # .idea/artifacts
181
+ # .idea/compiler.xml
182
+ # .idea/jarRepositories.xml
183
  # .idea/modules.xml
184
+ # .idea/*.iml
185
  # .idea/modules
186
+ # *.iml
187
+ # *.ipr
188
 
189
  # CMake
190
  cmake-build-*/
 
207
  # Cursive Clojure plugin
208
  .idea/replstate.xml
209
 
210
+ # SonarLint plugin
211
+ .idea/sonarlint/
212
+
213
  # Crashlytics plugin (for Android Studio and IntelliJ)
214
  com_crashlytics_export_strings.xml
215
  crashlytics.properties
 
222
  # Android studio 3.1+ serialized cache file
223
  .idea/caches/build_file_checksums.ser
224
 
225
+ ### JetBrains Patch ###
226
+ # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
227
+
228
+ # *.iml
229
+ # modules.xml
230
+ # .idea/misc.xml
231
+ # *.ipr
232
+
233
  # Sonarlint plugin
234
+ # https://plugins.jetbrains.com/plugin/7973-sonarlint
235
+ .idea/**/sonarlint/
236
+
237
+ # SonarQube Plugin
238
+ # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
239
+ .idea/**/sonarIssues.xml
240
+
241
+ # Markdown Navigator plugin
242
+ # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
243
+ .idea/**/markdown-navigator.xml
244
+ .idea/**/markdown-navigator-enh.xml
245
+ .idea/**/markdown-navigator/
246
+
247
+ # Cache file creation bug
248
+ # See https://youtrack.jetbrains.com/issue/JBR-2257
249
+ .idea/$CACHE_FILE$
250
+
251
+ # CodeStream plugin
252
+ # https://plugins.jetbrains.com/plugin/12206-codestream
253
+ .idea/codestream.xml
254
+
255
+ # Azure Toolkit for IntelliJ plugin
256
+ # https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
257
+ .idea/**/azureSettings.xml
258
+
259
+ ### Linux ###
260
+ *~
261
+
262
+ # temporary files which can be created if a process still has a handle open of a deleted file
263
+ .fuse_hidden*
264
+
265
+ # KDE directory preferences
266
+ .directory
267
+
268
+ # Linux trash folder which might appear on any partition or disk
269
+ .Trash-*
270
+
271
+ # .nfs files are created when an open file is removed but is still being accessed
272
+ .nfs*
273
+
274
+ ### VisualStudioCode ###
275
+ .vscode/*
276
+ !.vscode/settings.json
277
+ !.vscode/tasks.json
278
+ !.vscode/launch.json
279
+ !.vscode/extensions.json
280
+ !.vscode/*.code-snippets
281
+
282
+ # Local History for Visual Studio Code
283
+ .history/
284
+
285
+ # Built Visual Studio Code Extensions
286
+ *.vsix
287
+
288
+ ### VisualStudioCode Patch ###
289
+ # Ignore all local history of files
290
+ .history
291
+ .ionide
292
+
293
+ ### Windows ###
294
+ # Windows thumbnail cache files
295
+ Thumbs.db
296
+ Thumbs.db:encryptable
297
+ ehthumbs.db
298
+ ehthumbs_vista.db
299
 
300
+ # Dump file
301
+ *.stackdump
302
+
303
+ # Folder config file
304
+ [Dd]esktop.ini
305
+
306
+ # Recycle Bin used on file shares
307
+ $RECYCLE.BIN/
308
+
309
+ # Windows Installer files
310
+ *.cab
311
+ *.msi
312
+ *.msix
313
+ *.msm
314
+ *.msp
315
+
316
+ # Windows shortcuts
317
+ *.lnk
318
 
319
+ # End of https://www.toptal.com/developers/gitignore/api/jetbrains,windows,linux,visualstudiocode
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="HtmlUnknownAttribute" enabled="true" level="WARNING" enabled_by_default="true">
5
+ <option name="myValues">
6
+ <value>
7
+ <list size="1">
8
+ <item index="0" class="java.lang.String" itemvalue="label" />
9
+ </list>
10
+ </value>
11
+ </option>
12
+ <option name="myCustomValuesEnabled" value="true" />
13
+ </inspection_tool>
14
+ </profile>
15
+ </component>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
aip_trainer/models/AIModels.py → AIModels.py RENAMED
@@ -1,7 +1,6 @@
1
  import numpy as np
2
  import torch
3
-
4
- from aip_trainer.models import ModelInterfaces
5
 
6
 
7
  class NeuralASR(ModelInterfaces.IASRModel):
@@ -21,7 +20,6 @@ class NeuralASR(ModelInterfaces.IASRModel):
21
  def getWordLocations(self) -> list:
22
  """Get the pair of words location from audio"""
23
  assert self.word_locations_in_samples is not None, 'Can get word locations without having processed the audio'
24
-
25
  return self.word_locations_in_samples
26
 
27
  def processAudio(self, audio: torch.Tensor):
@@ -32,3 +30,33 @@ class NeuralASR(ModelInterfaces.IASRModel):
32
 
33
  self.audio_transcript, self.word_locations_in_samples = self.decoder(
34
  nn_output[0, :, :].detach(), audio_length_in_samples, word_align=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import numpy as np
2
  import torch
3
+ import ModelInterfaces
 
4
 
5
 
6
  class NeuralASR(ModelInterfaces.IASRModel):
 
20
  def getWordLocations(self) -> list:
21
  """Get the pair of words location from audio"""
22
  assert self.word_locations_in_samples is not None, 'Can get word locations without having processed the audio'
 
23
  return self.word_locations_in_samples
24
 
25
  def processAudio(self, audio: torch.Tensor):
 
30
 
31
  self.audio_transcript, self.word_locations_in_samples = self.decoder(
32
  nn_output[0, :, :].detach(), audio_length_in_samples, word_align=True)
33
+
34
+
35
+ class NeuralTTS(ModelInterfaces.ITextToSpeechModel):
36
+ def __init__(self, model: torch.nn.Module, sampling_rate: int) -> None:
37
+ super().__init__()
38
+ self.model = model
39
+ self.sampling_rate = sampling_rate
40
+
41
+ def getAudioFromSentence(self, sentence: str) -> np.array:
42
+ with torch.inference_mode():
43
+ audio_transcript = self.model.apply_tts(texts=[sentence],
44
+ sample_rate=self.sampling_rate)[0]
45
+
46
+ return audio_transcript
47
+
48
+
49
+ class NeuralTranslator(ModelInterfaces.ITranslationModel):
50
+ def __init__(self, model: torch.nn.Module, tokenizer) -> None:
51
+ super().__init__()
52
+ self.model = model
53
+ self.tokenizer = tokenizer
54
+
55
+ def translateSentence(self, sentence: str) -> str:
56
+ """Get the transcripts of the process audio"""
57
+ tokenized_text = self.tokenizer(sentence, return_tensors='pt')
58
+ translation = self.model.generate(**tokenized_text)
59
+ translated_text = self.tokenizer.batch_decode(
60
+ translation, skip_special_tokens=True)[0]
61
+
62
+ return translated_text
Dockerfile DELETED
@@ -1,37 +0,0 @@
1
- FROM registry.gitlab.com/aletrn/ai-pronunciation-trainer:0.5.0
2
-
3
- ARG ARCH
4
- ARG WORKDIR_ROOT
5
- ENV PYTHONPATH="${WORKDIR_ROOT}:${WORKDIR_ROOT}/.venv:${PYTHONPATH}:/usr/local/lib/python3/dist-packages"
6
- ENV MPLCONFIGDIR=/tmp/matplotlib
7
- ENV IS_DOCKER_CONTAINER="YES"
8
- ENV LOG_JSON_FORMAT="TRUE"
9
- ENV LOG_LEVEL="INFO"
10
-
11
- ENV VIRTUAL_ENV=${WORKDIR_ROOT}/.venv PATH="${WORKDIR_ROOT}/.venv/bin:$PATH"
12
-
13
- COPY --chown=python:python . ${WORKDIR_ROOT}/.
14
-
15
- RUN python --version
16
- RUN pip list
17
- RUN echo "PATH: ${PATH}."
18
- RUN echo "WORKDIR_ROOT: ${WORKDIR_ROOT}."
19
- RUN ls -l ${WORKDIR_ROOT}
20
- RUN ls -ld ${WORKDIR_ROOT}
21
- RUN python -c "import sys; print(sys.path)"
22
- RUN python -c "import epitran"
23
- RUN python -c "import flask"
24
- RUN python -c "import pandas"
25
- RUN python -c "from torch import Tensor"
26
- RUN python -c "import gunicorn"
27
- RUN df -h
28
- RUN ls -l ${WORKDIR_ROOT}/webApp.py
29
- RUN ls -l ${WORKDIR_ROOT}/static/
30
-
31
- USER 999
32
- ENV PATH="${WORKDIR_ROOT}:${WORKDIR_ROOT}/.venv/bin:$PATH"
33
- RUN echo "PATH: $PATH ..."
34
- RUN echo "PYTHONPATH: $PYTHONPATH ..."
35
- RUN echo "MPLCONFIGDIR: $MPLCONFIGDIR ..."
36
-
37
- CMD ["gunicorn", "--bind", "0.0.0.0:3000", "webApp:app"]
 
 
aip_trainer/models/ModelInterfaces.py → ModelInterfaces.py RENAMED
@@ -1,5 +1,5 @@
-import abc
 
+import abc
 import numpy as np
 
 
README.md CHANGED
@@ -89,7 +89,7 @@ find aip_trainer -name "__pycache__" -exec rm -rf {} \;
 Then execute the tests again:
 
 ```bash
-pytest --cov=aip_trainer --cov-report=term-missing && coverage html
+python -m pytest tests/models/test_models_faster_whisper.py; echo "# start pytest complete test suite #"; IS_TESTING=TRUE python -m pytest tests --cov="." --cov-report=term-missing && coverage html
 ```
 
 ### Backend tests execution on Windows
@@ -106,7 +106,7 @@ Normally I use Visual Studio Code to write and execute my playwright tests, howe
 
 ```bash
 pnpm install
-pnpm playwright test
+pnpm playwright test --workers 1 --retries 4 --project=chromium
 ```
 
 ### Unused classes and functions (now removed)
aip_trainer/models/RuleBasedModels.py → RuleBasedModels.py RENAMED
@@ -1,8 +1,20 @@
 
 
 
 
1
  import eng_to_ipa
2
 
3
- from aip_trainer.models import ModelInterfaces
4
- from aip_trainer import app_logger
5
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  class EpitranPhonemConverter(ModelInterfaces.ITextToPhonemModel):
8
  word_locations_in_samples = None
@@ -13,9 +25,7 @@ class EpitranPhonemConverter(ModelInterfaces.ITextToPhonemModel):
13
  self.epitran_model = epitran_model
14
 
15
  def convertToPhonem(self, sentence: str) -> str:
16
- app_logger.debug(f'starting EpitranPhonemConverter.convertToPhonem for sentence/token "{sentence}"...')
17
  phonem_representation = self.epitran_model.transliterate(sentence)
18
- app_logger.debug(f'EpitranPhonemConverter: got phonem_representation for sentence/token "{sentence}"!')
19
  return phonem_representation
20
 
21
 
@@ -25,8 +35,6 @@ class EngPhonemConverter(ModelInterfaces.ITextToPhonemModel):
25
  super().__init__()
26
 
27
  def convertToPhonem(self, sentence: str) -> str:
28
- app_logger.debug(f'starting EngPhonemConverter.convertToPhonem for sentence/token "{sentence}"...')
29
  phonem_representation = eng_to_ipa.convert(sentence)
30
  phonem_representation = phonem_representation.replace('*','')
31
- app_logger.debug(f'EngPhonemConverter: got phonem_representation for sentence/token "{sentence}"!')
32
  return phonem_representation
 
1
+ import ModelInterfaces
2
+ import torch
3
+ import numpy as np
4
+ import epitran
5
  import eng_to_ipa
6
 
 
 
7
 
8
+ def get_phonem_converter(language: str):
9
+ if language == 'de':
10
+ phonem_converter = EpitranPhonemConverter(
11
+ epitran.Epitran('deu-Latn'))
12
+ elif language == 'en':
13
+ phonem_converter = EngPhonemConverter()
14
+ else:
15
+ raise ValueError('Language not implemented')
16
+
17
+ return phonem_converter
18
 
19
  class EpitranPhonemConverter(ModelInterfaces.ITextToPhonemModel):
20
  word_locations_in_samples = None
 
25
  self.epitran_model = epitran_model
26
 
27
  def convertToPhonem(self, sentence: str) -> str:
 
28
  phonem_representation = self.epitran_model.transliterate(sentence)
 
29
  return phonem_representation
30
 
31
 
 
35
  super().__init__()
36
 
37
  def convertToPhonem(self, sentence: str) -> str:
 
38
  phonem_representation = eng_to_ipa.convert(sentence)
39
  phonem_representation = phonem_representation.replace('*','')
 
40
  return phonem_representation
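
For context, here is a short usage sketch of the phoneme converters exposed by the new module-level `get_phonem_converter` factory above. It assumes the `epitran` German data (`deu-Latn`) and the `eng_to_ipa` package are installed, and the printed IPA strings are illustrative rather than verified output:

```python
# Usage sketch for RuleBasedModels.get_phonem_converter (assumptions noted in the text above).
import RuleBasedModels

de_converter = RuleBasedModels.get_phonem_converter("de")
en_converter = RuleBasedModels.get_phonem_converter("en")

print(de_converter.convertToPhonem("Hallo Welt"))   # IPA transliteration via epitran
print(en_converter.convertToPhonem("hello world"))  # IPA conversion via eng_to_ipa
```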
aip_trainer/WordMatching.py → WordMatching.py RENAMED
@@ -1,22 +1,24 @@
1
  import time
2
  from string import punctuation
 
3
 
4
  import numpy as np
5
  from dtwalign import dtw_from_distance_matrix
6
  from ortools.sat.python import cp_model
7
 
8
- from . import WordMetrics, app_logger
 
9
 
10
  offset_blank = 1
11
  TIME_THRESHOLD_MAPPING = 5.0
12
 
13
 
14
- def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.array:
15
  number_of_real_words = len(words_real)
16
  number_of_estimated_words = len(words_estimated)
17
 
18
  word_distance_matrix = np.zeros(
19
- (number_of_estimated_words+offset_blank, number_of_real_words))
20
  for idx_estimated in range(number_of_estimated_words):
21
  for idx_real in range(number_of_real_words):
22
  word_distance_matrix[idx_estimated, idx_real] = WordMetrics.edit_distance_python(
@@ -25,7 +27,7 @@ def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.arra
25
  if offset_blank == 1:
26
  for idx_real in range(number_of_real_words):
27
  word_distance_matrix[number_of_estimated_words,
28
- idx_real] = len(words_real[idx_real])
29
  return word_distance_matrix
30
 
31
 
@@ -33,37 +35,37 @@ def get_best_path_from_distance_matrix(word_distance_matrix):
33
  modelCpp = cp_model.CpModel()
34
 
35
  number_of_real_words = word_distance_matrix.shape[1]
36
- number_of_estimated_words = word_distance_matrix.shape[0]-1
37
 
38
  number_words = np.maximum(number_of_real_words, number_of_estimated_words)
39
 
40
  estimated_words_order = [modelCpp.NewIntVar(0, int(
41
- number_words - 1 + offset_blank), 'w%i' % i) for i in range(number_words+offset_blank)]
42
 
43
  # They are in ascending order
44
- for word_idx in range(number_words-1):
45
  modelCpp.Add(
46
- estimated_words_order[word_idx+1] >= estimated_words_order[word_idx])
47
 
48
  total_phoneme_distance = 0
49
  real_word_at_time = {}
50
  for idx_estimated in range(number_of_estimated_words):
51
  for idx_real in range(number_of_real_words):
52
  real_word_at_time[idx_estimated, idx_real] = modelCpp.NewBoolVar(
53
- 'real_word_at_time'+str(idx_real)+'-'+str(idx_estimated))
54
  modelCpp.Add(estimated_words_order[idx_estimated] == idx_real).OnlyEnforceIf(
55
  real_word_at_time[idx_estimated, idx_real])
56
  total_phoneme_distance += word_distance_matrix[idx_estimated,
57
- idx_real]*real_word_at_time[idx_estimated, idx_real]
58
 
59
  # If no word in time, difference is calculated from empty string
60
  for idx_real in range(number_of_real_words):
61
  word_has_a_match = modelCpp.NewBoolVar(
62
- 'word_has_a_match'+str(idx_real))
63
  modelCpp.Add(sum([real_word_at_time[idx_estimated, idx_real] for idx_estimated in range(
64
  number_of_estimated_words)]) == 1).OnlyEnforceIf(word_has_a_match)
65
  total_phoneme_distance += word_distance_matrix[number_of_estimated_words,
66
- idx_real]*word_has_a_match.Not()
67
 
68
  # Loss should be minimized
69
  modelCpp.Minimize(total_phoneme_distance)
@@ -79,18 +81,16 @@ def get_best_path_from_distance_matrix(word_distance_matrix):
79
  (solver.Value(estimated_words_order[word_idx])))
80
 
81
  return np.array(mapped_indices, dtype=int)
82
- except Exception as ex:
83
- app_logger.error(f"ex:{ex}.")
84
  return []
85
 
86
 
87
- def get_resulting_string(mapped_indices: np.array, words_estimated: list, words_real: list) -> tuple[list, list]:
88
  mapped_words = []
89
  mapped_words_indices = []
90
  WORD_NOT_FOUND_TOKEN = '-'
91
  number_of_real_words = len(words_real)
92
  for word_idx in range(number_of_real_words):
93
- app_logger.debug(f"{word_idx} => {mapped_indices} == {word_idx}, {mapped_indices == word_idx} #")
94
  position_of_real_word_indices = np.where(
95
  mapped_indices == word_idx)[0].astype(int)
96
 
@@ -109,59 +109,93 @@ def get_resulting_string(mapped_indices: np.array, words_estimated: list, words_
109
  error = 99999
110
  best_possible_combination = ''
111
  best_possible_idx = -1
112
- best_possible_combination, best_possible_idx = inner_get_resulting_string(
113
- best_possible_combination, best_possible_idx, error, position_of_real_word_indices,
114
- word_idx, words_estimated, words_real
115
- )
 
 
 
 
 
 
116
 
117
  mapped_words.append(best_possible_combination)
118
  mapped_words_indices.append(best_possible_idx)
119
- # continue
120
-
121
- return mapped_words, mapped_words_indices
122
-
123
-
124
- def inner_get_resulting_string(
125
- best_possible_combination, best_possible_idx, error, position_of_real_word_indices, word_idx, words_estimated, words_real
126
- ):
127
- for single_word_idx in position_of_real_word_indices:
128
- idx_above_word = single_word_idx >= len(words_estimated)
129
- if idx_above_word:
130
  continue
131
- error_word = WordMetrics.edit_distance_python(
132
- words_estimated[single_word_idx], words_real[word_idx])
133
- if error_word < error:
134
- error = error_word * 1
135
- best_possible_combination = words_estimated[single_word_idx]
136
- best_possible_idx = single_word_idx
137
- return best_possible_combination, best_possible_idx
138
 
 
139
 
140
- def get_best_mapped_words(words_estimated: list, words_real: list) -> tuple[list, list]:
141
 
 
 
142
  word_distance_matrix = get_word_distance_matrix(
143
  words_estimated, words_real)
144
-
145
  start = time.time()
146
- mapped_indices = get_best_path_from_distance_matrix(word_distance_matrix)
147
-
148
- duration_of_mapping = time.time()-start
149
- # In case or-tools doesn't converge, go to a faster, low-quality solution
150
- if len(mapped_indices) == 0 or duration_of_mapping > TIME_THRESHOLD_MAPPING+0.5:
151
- mapped_indices = (dtw_from_distance_matrix(
152
- word_distance_matrix)).path[:len(words_estimated), 1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
- mapped_words, mapped_words_indices = get_resulting_string(
155
- mapped_indices, words_estimated, words_real)
156
 
157
- return mapped_words, mapped_words_indices
 
 
 
 
 
 
 
 
 
 
158
 
159
 
160
  def getWhichLettersWereTranscribedCorrectly(real_word, transcribed_word):
161
- is_leter_correct = [None]*len(real_word)
162
  for idx, letter in enumerate(real_word):
 
 
163
  if letter == transcribed_word[idx] or letter in punctuation:
164
  is_leter_correct[idx] = 1
165
  else:
166
  is_leter_correct[idx] = 0
167
  return is_leter_correct
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import time
2
  from string import punctuation
3
+ from typing import List, Tuple
4
 
5
  import numpy as np
6
  from dtwalign import dtw_from_distance_matrix
7
  from ortools.sat.python import cp_model
8
 
9
+ import WordMetrics
10
+ from constants import app_logger
11
 
12
  offset_blank = 1
13
  TIME_THRESHOLD_MAPPING = 5.0
14
 
15
 
16
+ def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.ndarray:
17
  number_of_real_words = len(words_real)
18
  number_of_estimated_words = len(words_estimated)
19
 
20
  word_distance_matrix = np.zeros(
21
+ (number_of_estimated_words + offset_blank, number_of_real_words))
22
  for idx_estimated in range(number_of_estimated_words):
23
  for idx_real in range(number_of_real_words):
24
  word_distance_matrix[idx_estimated, idx_real] = WordMetrics.edit_distance_python(
 
27
  if offset_blank == 1:
28
  for idx_real in range(number_of_real_words):
29
  word_distance_matrix[number_of_estimated_words,
30
+ idx_real] = len(words_real[idx_real])
31
  return word_distance_matrix
32
 
33
 
 
35
  modelCpp = cp_model.CpModel()
36
 
37
  number_of_real_words = word_distance_matrix.shape[1]
38
+ number_of_estimated_words = word_distance_matrix.shape[0] - 1
39
 
40
  number_words = np.maximum(number_of_real_words, number_of_estimated_words)
41
 
42
  estimated_words_order = [modelCpp.NewIntVar(0, int(
43
+ number_words - 1 + offset_blank), 'w%i' % i) for i in range(number_words + offset_blank)]
44
 
45
  # They are in ascending order
46
+ for word_idx in range(number_words - 1):
47
  modelCpp.Add(
48
+ estimated_words_order[word_idx + 1] >= estimated_words_order[word_idx])
49
 
50
  total_phoneme_distance = 0
51
  real_word_at_time = {}
52
  for idx_estimated in range(number_of_estimated_words):
53
  for idx_real in range(number_of_real_words):
54
  real_word_at_time[idx_estimated, idx_real] = modelCpp.NewBoolVar(
55
+ 'real_word_at_time' + str(idx_real) + '-' + str(idx_estimated))
56
  modelCpp.Add(estimated_words_order[idx_estimated] == idx_real).OnlyEnforceIf(
57
  real_word_at_time[idx_estimated, idx_real])
58
  total_phoneme_distance += word_distance_matrix[idx_estimated,
59
+ idx_real] * real_word_at_time[idx_estimated, idx_real]
60
 
61
  # If no word in time, difference is calculated from empty string
62
  for idx_real in range(number_of_real_words):
63
  word_has_a_match = modelCpp.NewBoolVar(
64
+ 'word_has_a_match' + str(idx_real))
65
  modelCpp.Add(sum([real_word_at_time[idx_estimated, idx_real] for idx_estimated in range(
66
  number_of_estimated_words)]) == 1).OnlyEnforceIf(word_has_a_match)
67
  total_phoneme_distance += word_distance_matrix[number_of_estimated_words,
68
+ idx_real] * word_has_a_match.Not()
69
 
70
  # Loss should be minimized
71
  modelCpp.Minimize(total_phoneme_distance)
 
81
  (solver.Value(estimated_words_order[word_idx])))
82
 
83
  return np.array(mapped_indices, dtype=int)
84
+ except:
 
85
  return []
86
 
87
 
88
+ def get_resulting_string(mapped_indices: np.ndarray, words_estimated: list, words_real: list) -> Tuple[List, List]:
89
  mapped_words = []
90
  mapped_words_indices = []
91
  WORD_NOT_FOUND_TOKEN = '-'
92
  number_of_real_words = len(words_real)
93
  for word_idx in range(number_of_real_words):
 
94
  position_of_real_word_indices = np.where(
95
  mapped_indices == word_idx)[0].astype(int)
96
 
 
109
  error = 99999
110
  best_possible_combination = ''
111
  best_possible_idx = -1
112
+ for single_word_idx in position_of_real_word_indices:
113
+ idx_above_word = single_word_idx >= len(words_estimated)
114
+ if idx_above_word:
115
+ continue
116
+ error_word = WordMetrics.edit_distance_python(
117
+ words_estimated[single_word_idx], words_real[word_idx])
118
+ if error_word < error:
119
+ error = error_word * 1
120
+ best_possible_combination = words_estimated[single_word_idx]
121
+ best_possible_idx = single_word_idx
122
 
123
  mapped_words.append(best_possible_combination)
124
  mapped_words_indices.append(best_possible_idx)
 
 
 
 
 
 
 
 
 
 
 
125
  continue
 
 
 
 
 
 
 
126
 
127
+ return mapped_words, mapped_words_indices
128
 
 
129
 
130
+ def get_best_mapped_words(words_estimated: list | str, words_real: list | str, use_dtw:bool = False) -> tuple[list, list]:
131
+ app_logger.info(f"words_estimated: '{words_estimated}', words_real: '{words_real}', use_dtw:{use_dtw}.")
132
  word_distance_matrix = get_word_distance_matrix(
133
  words_estimated, words_real)
134
+ app_logger.debug(f"word_distance_matrix: '{word_distance_matrix}'.")
135
  start = time.time()
136
+ app_logger.info(f"use_dtw: '{use_dtw}'.")
137
+ if use_dtw:
138
+ alignment = (dtw_from_distance_matrix(word_distance_matrix.T))
139
+ app_logger.debug(f"alignment: '{alignment}'.")
140
+ mapped_indices = alignment.get_warping_path()[:len(words_estimated)]
141
+ app_logger.debug(f"mapped_indices: '{mapped_indices}'.")
142
+ duration_of_mapping = time.time()-start
143
+ else:
144
+ mapped_indices = get_best_path_from_distance_matrix(word_distance_matrix)
145
+ app_logger.debug(f"mapped_indices: '{mapped_indices}'.")
146
+ duration_of_mapping = time.time()-start
147
+ # In case or-tools doesn't converge, go to a faster, low-quality solution
148
+ check_mapped_indices_or_duration = len(mapped_indices) == 0 or duration_of_mapping > TIME_THRESHOLD_MAPPING+0.5
149
+ app_logger.info(f"check_mapped_indices_or_duration: '{check_mapped_indices_or_duration}'.")
150
+ if check_mapped_indices_or_duration:
151
+ #mapped_indices = (dtw_from_distance_matrix(
152
+ # word_distance_matrix)).path[:len(words_estimated), 1]
153
+ word_distance_matrix_transposed = word_distance_matrix.T
154
+ app_logger.debug(f"word_distance_matrix_transposed: '{word_distance_matrix_transposed}'.")
155
+ alignment = dtw_from_distance_matrix(word_distance_matrix_transposed)
156
+ app_logger.debug(f"check_mapped_indices_or_duration, alignment: '{alignment}'.")
157
+ mapped_indices = alignment.get_warping_path()
158
+ app_logger.debug(f"check_mapped_indices_or_duration, mapped_indices: '{mapped_indices}'.")
159
+
160
+ mapped_words, mapped_words_indices = get_resulting_string(mapped_indices, words_estimated, words_real)
161
+ app_logger.debug(f"mapped_words: '{mapped_words}', mapped_words_indices: '{mapped_words_indices}', duration_of_mapping:{duration_of_mapping}.")
162
+ return mapped_words, mapped_words_indices
163
 
 
 
164
 
165
+ ## Faster, but not optimal
166
+ # def get_best_mapped_words_dtw(words_estimated: list, words_real: list) -> list:
167
+ # from dtwalign import dtw_from_distance_matrix
168
+ # word_distance_matrix = get_word_distance_matrix(
169
+ # words_estimated, words_real)
170
+ # mapped_indices = dtw_from_distance_matrix(
171
+ # word_distance_matrix).path[:-1, 0]
172
+ #
173
+ # mapped_words, mapped_words_indices = get_resulting_string(
174
+ # mapped_indices, words_estimated, words_real)
175
+ # return mapped_words, mapped_words_indices
176
 
177
 
178
  def getWhichLettersWereTranscribedCorrectly(real_word, transcribed_word):
179
+ is_leter_correct = [None] * len(real_word)
180
  for idx, letter in enumerate(real_word):
181
+ letter = letter.lower()
182
+ transcribed_word[idx] = transcribed_word[idx].lower()
183
  if letter == transcribed_word[idx] or letter in punctuation:
184
  is_leter_correct[idx] = 1
185
  else:
186
  is_leter_correct[idx] = 0
187
  return is_leter_correct
188
+
189
+
190
+ # def parseLetterErrorsToHTML(word_real, is_leter_correct):
191
+ # word_colored = ''
192
+ # correct_color_start = '*'
193
+ # correct_color_end = '*'
194
+ # wrong_color_start = '-'
195
+ # wrong_color_end = '-'
196
+ # for idx, letter in enumerate(word_real):
197
+ # if is_leter_correct[idx] == 1:
198
+ # word_colored += correct_color_start + letter + correct_color_end
199
+ # else:
200
+ # word_colored += wrong_color_start + letter + wrong_color_end
201
+ # return word_colored
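
The DTW fallback above relies on `dtwalign.dtw_from_distance_matrix` and `get_warping_path()`, the same calls used in `get_best_mapped_words`. A tiny self-contained illustration follows; the distance values are invented for the example and the matrix is written already transposed (rows for real words, columns for estimated words), mirroring `word_distance_matrix.T` in the code above:

```python
# Toy illustration of the dtwalign fallback; the distances below are made up.
import numpy as np
from dtwalign import dtw_from_distance_matrix

# rows: real words, columns: estimated words (same orientation as word_distance_matrix.T)
distance_matrix_transposed = np.array([
    [0.0, 3.0, 4.0],
    [3.0, 1.0, 2.0],
    [5.0, 2.0, 0.5],
])
alignment = dtw_from_distance_matrix(distance_matrix_transposed)
# indices along the lowest-cost alignment path, used here as the word-mapping fallback
print(alignment.get_warping_path())
```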
aip_trainer/WordMetrics.py → WordMetrics.py RENAMED
@@ -1,9 +1,33 @@
1
  import numpy as np
2
 
3
- from aip_trainer import app_logger
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- # https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/
7
  def edit_distance_python(seq1, seq2):
8
  size_x = len(seq1) + 1
9
  size_y = len(seq2) + 1
@@ -27,5 +51,5 @@ def edit_distance_python(seq1, seq2):
27
  matrix[x-1,y-1] + 1,
28
  matrix[x,y-1] + 1
29
  )
30
- app_logger.debug("matrix:{}\n".format(matrix))
31
- return matrix[size_x - 1, size_y - 1]
 
1
  import numpy as np
2
 
3
+ # ref from https://gitlab.com/-/snippets/1948157
4
+ # For some variants, look here https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
5
 
6
+ # Pure python
7
+ def edit_distance_python2(a, b):
8
+ # This version is commutative, so as an optimization we force |a|>=|b|
9
+ if len(a) < len(b):
10
+ return edit_distance_python(b, a)
11
+ if len(b) == 0: # Can deal with empty sequences faster
12
+ return len(a)
13
+ # Only two rows are really needed: the one currently filled in, and the previous
14
+ distances = []
15
+ distances.append([i for i in range(len(b)+1)])
16
+ distances.append([0 for _ in range(len(b)+1)])
17
+ # We can prefill the first row:
18
+ costs = [0 for _ in range(3)]
19
+ for i, a_token in enumerate(a, start=1):
20
+ distances[1][0] += 1 # Deals with the first column.
21
+ for j, b_token in enumerate(b, start=1):
22
+ costs[0] = distances[1][j-1] + 1
23
+ costs[1] = distances[0][j] + 1
24
+ costs[2] = distances[0][j-1] + (0 if a_token == b_token else 1)
25
+ distances[1][j] = min(costs)
26
+ # Move to the next row:
27
+ distances[0][:] = distances[1][:]
28
+ return distances[1][len(b)]
29
 
30
+ #https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/
31
  def edit_distance_python(seq1, seq2):
32
  size_x = len(seq1) + 1
33
  size_y = len(seq2) + 1
 
51
  matrix[x-1,y-1] + 1,
52
  matrix[x,y-1] + 1
53
  )
54
+ #print (matrix)
55
+ return matrix[size_x - 1, size_y - 1]
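
A quick worked example of the two Levenshtein implementations above (the expected values are standard edit distances; note that `edit_distance_python` returns a NumPy scalar while `edit_distance_python2` returns a plain int):

```python
# Small sanity check for the functions above, assuming this module is importable as WordMetrics.
import WordMetrics

print(WordMetrics.edit_distance_python("machen", "machten"))    # 1.0 - one insertion
print(WordMetrics.edit_distance_python2("sitting", "kitten"))   # 3 - classic example
```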
aip_trainer/__init__.py DELETED
@@ -1,21 +0,0 @@
1
- import os
2
- from pathlib import Path
3
-
4
- import structlog
5
- from dotenv import load_dotenv
6
-
7
- from aip_trainer.utils import session_logger
8
-
9
-
10
- load_dotenv()
11
- PROJECT_ROOT_FOLDER = Path(globals().get("__file__", "./_")).absolute().parent.parent
12
- LOG_JSON_FORMAT = bool(os.getenv("LOG_JSON_FORMAT"))
13
- log_level = os.getenv("LOG_LEVEL", "INFO")
14
- sample_rate_start = int(os.getenv('SAMPLE_RATE', 48000))
15
- accepted_sample_rates = [48000, 24000, 16000, 8000]
16
- try:
17
- assert sample_rate_start in accepted_sample_rates
18
- except AssertionError:
19
- raise ValueError(f"cannot use a sample rate of value '{sample_rate_start}', should be one of {accepted_sample_rates} ...")
20
- session_logger.setup_logging(json_logs=LOG_JSON_FORMAT, log_level=log_level)
21
- app_logger = structlog.stdlib.get_logger(__name__)
 
 
aip_trainer/lambdas/__init__.py DELETED
@@ -1 +0,0 @@
1
-
 
 
aip_trainer/lambdas/data_de_en_with_categories.json DELETED
The diff for this file is too large to render. See raw diff
 
aip_trainer/lambdas/lambdaGetSample.py DELETED
@@ -1,106 +0,0 @@
1
- import json
2
- import pickle
3
- from pathlib import Path
4
-
5
- import epitran
6
- import pandas as pd
7
-
8
- from aip_trainer import PROJECT_ROOT_FOLDER, app_logger
9
- from aip_trainer.models import RuleBasedModels
10
- from aip_trainer.utils.typing_hints import BodyGetSampleRequest
11
-
12
-
13
- class TextDataset:
14
- def __init__(self, table, language='-'):
15
- self.table_dataframe = table
16
- self.number_of_samples = len(table)
17
- self.language = language
18
-
19
- def __getitem__(self, idx):
20
- language_sentence = f"{self.language}_sentence" if self.language != '-' else 'sentence'
21
- language_series = self.table_dataframe[language_sentence]
22
- return [language_series.iloc[idx]]
23
-
24
- def __len__(self):
25
- return self.number_of_samples
26
-
27
- def get_category_from_df_by_language(self, language: str, category_value:int):
28
- selector = self.table_dataframe[f"{language}_category"] == category_value
29
- df_by_category = self.table_dataframe[selector]
30
- return df_by_category
31
-
32
- def get_random_sample_from_df(self, language: str, category_value:int):
33
- app_logger.info(f"language={language}, category_value={category_value}.")
34
- choice = self.table_dataframe.sample(n=1)
35
- if category_value !=0:
36
- df_language_filtered_by_category_and_language = self.get_category_from_df_by_language(language, category_value)
37
- choice = df_language_filtered_by_category_and_language.sample(n=1)
38
- return [choice[f"{language}_sentence"].iloc[0]]
39
-
40
-
41
- sample_folder = Path(PROJECT_ROOT_FOLDER / "aip_trainer" / "lambdas")
42
- lambda_database = {}
43
- lambda_ipa_converter = {}
44
-
45
- with open(sample_folder / 'data_de_en_with_categories.json', 'r') as src:
46
- df = pd.read_json(src)
47
-
48
- lambda_database['de'] = TextDataset(df, 'de')
49
- lambda_database['en'] = TextDataset(df, 'en')
50
- lambda_translate_new_sample = False
51
- lambda_ipa_converter['de'] = RuleBasedModels.EpitranPhonemConverter(
52
- epitran.Epitran('deu-Latn'))
53
- lambda_ipa_converter['en'] = RuleBasedModels.EngPhonemConverter()
54
-
55
-
56
- def lambda_handler(event, context):
57
- event_body = event["body"]
58
- body = BodyGetSampleRequest.model_validate_json(event_body)
59
- current_transcript = get_random_selection(body.language, body.category, is_gradio_output=False, transcript=body.transcript)
60
- current_transcript = current_transcript[0] if isinstance(current_transcript, list) else current_transcript
61
- current_ipa = lambda_ipa_converter[body.language].convertToPhonem(current_transcript)
62
-
63
- app_logger.info(f"real_transcript='{current_transcript}', ipa_transcript='{current_ipa}'.")
64
- result = {
65
- 'real_transcript': current_transcript,
66
- 'ipa_transcript': current_ipa,
67
- 'transcript_translation': ""
68
- }
69
-
70
- return json.dumps(result)
71
-
72
-
73
- def get_random_selection(language: str, category: int, is_gradio_output=True, transcript=None):
74
- if transcript is not None and isinstance(transcript, str) and len(transcript) > 0:
75
- return transcript
76
- lambda_df_lang = lambda_database[language]
77
- current_transcript = lambda_df_lang.get_random_sample_from_df(language, category)
78
- app_logger.info(f"category={category}, language={language}, current_transcript={current_transcript}.")
79
- return current_transcript[0] if is_gradio_output else current_transcript
80
-
81
-
82
- def getSentenceCategory(sentence) -> int:
83
- number_of_words = len(sentence.split())
84
- categories_word_limits = [0, 8, 20, 100000]
85
- for category in range(len(categories_word_limits) - 1):
86
- if categories_word_limits[category] < number_of_words <= categories_word_limits[category + 1]:
87
- return category + 1
88
-
89
-
90
- def get_pickle2json_dataframe(
91
- custom_pickle_filename_no_ext: Path | str = 'data_de_en_2',
92
- custom_folder: Path = sample_folder
93
- ):
94
- custom_folder = Path(custom_folder)
95
- with open(custom_folder / f'{custom_pickle_filename_no_ext}.pickle', 'rb') as handle:
96
- df2 = pickle.load(handle)
97
- pass
98
- df2["de_category"] = df2["de_sentence"].apply(getSentenceCategory)
99
- print("de_category added")
100
- df2["en_category"] = df2["en_sentence"].apply(getSentenceCategory)
101
- print("en_category added")
102
- df_json = df2.to_json()
103
- with open(custom_folder / f'{custom_pickle_filename_no_ext}.json', 'w') as dst:
104
- dst.write(df_json)
105
- print("data_de_en_with_categories.json written")
106
-
 
 
 
 
 
 
 
 
 
 
 
aip_trainer/models/__init__.py DELETED
File without changes
aip_trainer/utils/__init__.py DELETED
File without changes
aip_trainer/utils/split_cosmic_ray_report.py DELETED
@@ -1,33 +0,0 @@
1
- from pathlib import Path
2
-
3
-
4
- def get_cosmic_ray_report_filtered(input_filename, suffix="filtered", separator="============", filter_string_list: list = None):
5
- if filter_string_list is None:
6
- filter_string_list = ["test outcome: TestOutcome.KILLED"]
7
- filename, ext = Path(input_filename).stem, Path(input_filename).suffix
8
- working_dir = input_filename.parent
9
- # Read the input file
10
- with open(input_filename, 'r') as file:
11
- content = file.read()
12
-
13
- # Split the content into sections
14
- sections = content.split(separator)
15
- filtered_sections = [section for section in sections]
16
-
17
- # Filter out sections containing "test outcome: TestOutcome.KILLED"
18
- for filter_string in filter_string_list:
19
- filtered_sections = [section for section in filtered_sections if filter_string not in section]
20
-
21
- # Join the filtered sections back into a single string
22
- filtered_content = separator.join(filtered_sections)
23
-
24
- # Write the filtered content to a new file
25
- with open(working_dir / f'{filename}_{suffix}{ext}', 'w') as file:
26
- file.write(filtered_content)
27
-
28
-
29
-
30
- if __name__ == "__main__":
31
- from aip_trainer import PROJECT_ROOT_FOLDER
32
- _input_filename = "cosmic-ray-models2.txt"
33
- get_cosmic_ray_report_filtered(PROJECT_ROOT_FOLDER / "tmp" / _input_filename)
 
 
aip_trainer/utils/typing_hints.py DELETED
@@ -1,19 +0,0 @@
1
- from typing import Annotated, Optional, TypeAlias
2
- from pydantic import BaseModel
3
-
4
- import annotated_types
5
-
6
-
7
- Category: TypeAlias = Annotated[int, annotated_types.Ge(0), annotated_types.Le(4)]
8
-
9
-
10
- class BodyGetSampleRequest(BaseModel):
11
- category: Optional[Category] = 0
12
- language: str
13
- transcript: Optional[str] = ""
14
-
15
-
16
- class BodySpeechToScoreRequest(BaseModel):
17
- base64Audio: str
18
- language: str
19
- title: str
 
 
aip_trainer/utils/utilities.py DELETED
@@ -1,57 +0,0 @@
1
- """Various utilities (logger, time benchmark, args dump, numerical and stats info)"""
2
-
3
- from copy import deepcopy
4
- from aip_trainer import app_logger
5
- from aip_trainer.utils.serialize import serialize
6
-
7
-
8
- def hash_calculate(arr_or_path, is_file: bool, read_mode: str = "rb") -> str | bytes:
9
- """
10
- Return computed hash from input variable (typically a numpy array).
11
-
12
- Args:
13
- arr: input variable
14
-
15
- Returns:
16
- computed hash from input variable
17
- """
18
- from hashlib import sha256
19
- from base64 import b64encode
20
- from numpy import ndarray as np_ndarray
21
-
22
- if is_file:
23
- with open(arr_or_path, read_mode) as file_to_check:
24
- # read contents of the file
25
- arr_or_path = file_to_check.read()
26
- # # pipe contents of the file through
27
- # try:
28
- # return hashlib.sha256(data).hexdigest()
29
- # except TypeError:
30
- # app_logger.warning(
31
- # f"TypeError, re-try encoding arg:{arr_or_path},type:{type(arr_or_path)}."
32
- # )
33
- # return hashlib.sha256(data.encode("utf-8")).hexdigest()
34
-
35
- if isinstance(arr_or_path, np_ndarray):
36
- hash_fn = sha256(arr_or_path.data)
37
- elif isinstance(arr_or_path, dict):
38
- import json
39
-
40
- serialized = serialize(arr_or_path)
41
- variable_to_hash = json.dumps(serialized, sort_keys=True).encode("utf-8")
42
- hash_fn = sha256(variable_to_hash)
43
- elif isinstance(arr_or_path, str):
44
- try:
45
- hash_fn = sha256(arr_or_path)
46
- except TypeError:
47
- app_logger.warning(
48
- f"TypeError, re-try encoding arg:{arr_or_path},type:{type(arr_or_path)}."
49
- )
50
- hash_fn = sha256(arr_or_path.encode("utf-8"))
51
- elif isinstance(arr_or_path, bytes):
52
- hash_fn = sha256(arr_or_path)
53
- else:
54
- raise ValueError(
55
- f"variable 'arr':{arr_or_path} of type '{type(arr_or_path)}' not yet handled."
56
- )
57
- return b64encode(hash_fn.digest())
 
 
app.py CHANGED
@@ -1,8 +1,12 @@
1
  from pathlib import Path
2
  import gradio as gr
3
 
4
- from aip_trainer import PROJECT_ROOT_FOLDER, app_logger, sample_rate_start
5
- from aip_trainer.lambdas import js, lambdaGetSample, lambdaSpeechToScore, lambdaTTS
 
 
 
 
6
 
7
 
8
  css = """
@@ -38,9 +42,34 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
38
  app_logger.info("start gradio app building...")
39
 
40
  project_root_folder = Path(PROJECT_ROOT_FOLDER)
41
- with open(project_root_folder / "aip_trainer" / "lambdas" / "app_description.md", "r", encoding="utf-8") as app_description_src:
 
 
 
42
  md_app_description = app_description_src.read()
43
- gr.Markdown(md_app_description.format(sample_rate_start=sample_rate_start))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  with gr.Row():
45
  with gr.Column(scale=4, min_width=300):
46
  with gr.Row(elem_id="id-choose-random-phrase-by-language-and-difficulty"):
@@ -108,10 +137,10 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
108
  visible=False,
109
  )
110
  text_recording_ipa = gr.Textbox(
111
- placeholder=None, label="Student phonetic transcription", elem_id="text-student-recording-ipa-id-element"
112
  )
113
  text_ideal_ipa = gr.Textbox(
114
- placeholder=None, label="Ideal phonetic transcription", elem_id="text-ideal-ipa-id-element"
115
  )
116
  text_raw_json_output_hidden = gr.Textbox(placeholder=None, label="text_raw_json_output_hidden", visible=False)
117
  with gr.Group(elem_classes="speech-output-group background-white"):
@@ -127,11 +156,11 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
127
  elem_classes="speech-output-html background-white",
128
  )
129
  with gr.Row():
130
- with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col1"):
131
- num_pronunciation_accuracy = gr.Number(label="Current score %", elem_id="number-pronunciation-accuracy-id-element")
132
- with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col2"):
133
  num_score_de = gr.Number(label="Global score DE %", value=0, interactive=False, elem_id="number-score-de-id-element")
134
- with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3"):
135
  num_score_en = gr.Number(label="Global score EN %", value=0, interactive=False, elem_id="number-score-en-id-element")
136
  btn_recognize_speech_accuracy = gr.Button(value="Get speech accuracy score (%)", elem_id="btn-recognize-speech-accuracy-id-element")
137
  with gr.Row(elem_id="id-replay-splitted-audio-by-words"):
@@ -139,17 +168,17 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
139
  with gr.Column(scale=1, min_width=50):
140
  num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, minimum=0, value=0, interactive=False)
141
  with gr.Column(scale=4, min_width=100):
142
- audio_splitted_student_recording_stt = gr.Audio(
143
- label="Splitted student speech output",
144
  type="filepath",
145
  show_download_button=True,
146
- elem_id="audio-splitted-student-recording-stt-id-element",
147
  )
148
  text_selected_recognized_word_hidden = gr.Textbox(label="text_selected_recognized_word", value="placeholder", interactive=False, visible=False)
149
 
150
  def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
151
  import json
152
- _transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _num_tot_recognized_word, first_audio_file, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
153
  new_num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, value=0)
154
  words_list = _transcribed_text.split()
155
  first_word = words_list[0]
@@ -165,7 +194,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
165
  text_raw_json_output_hidden: _res,
166
  num_tot_recognized_words: _num_tot_recognized_word,
167
  num_selected_recognized_word: new_num_selected_recognized_word,
168
- audio_splitted_student_recording_stt: first_audio_file,
169
  text_selected_recognized_word_hidden: first_word,
170
  num_audio_duration_hidden: first_audio_duration
171
  }
@@ -199,7 +228,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
199
  num_score_en,
200
  num_tot_recognized_words,
201
  num_selected_recognized_word,
202
- audio_splitted_student_recording_stt,
203
  text_selected_recognized_word_hidden,
204
  num_audio_duration_hidden
205
  ],
@@ -229,7 +258,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
229
  clear3,
230
  inputs=[],
231
  outputs=[
232
- audio_student_recording_stt, audio_tts, audio_splitted_student_recording_stt, text_recording_ipa, text_ideal_ipa, text_transcribed_hidden,
233
  num_pronunciation_accuracy, num_selected_recognized_word, num_pronunciation_accuracy
234
  ],
235
  )
@@ -280,18 +309,18 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
280
  num_selected_recognized_word.input(
281
  fn=lambdaSpeechToScore.get_selected_word,
282
  inputs=[num_selected_recognized_word, text_raw_json_output_hidden],
283
- outputs=[audio_splitted_student_recording_stt, text_selected_recognized_word_hidden, num_audio_duration_hidden],
284
  )
285
- audio_splitted_student_recording_stt.play(
286
  fn=None,
287
  inputs=[text_selected_recognized_word_hidden, radio_language, num_audio_duration_hidden],
288
- outputs=audio_splitted_student_recording_stt,
289
  js=js.js_play_audio
290
  )
291
 
292
  @gradio_app.load(inputs=[local_storage], outputs=[num_score_de, num_score_en])
293
  def load_from_local_storage(saved_values):
294
- print("loading from local storage", saved_values)
295
  return saved_values[0], saved_values[1]
296
 
297
  @gr.on([num_score_de.change, num_score_en.change], inputs=[num_score_de, num_score_en], outputs=[local_storage])
@@ -302,6 +331,6 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
302
  if __name__ == "__main__":
303
  try:
304
  gradio_app.launch()
305
- except Exception as e:
306
- app_logger.error(f"Error: {e}")
307
- raise e
 
1
  from pathlib import Path
2
  import gradio as gr
3
 
4
+ import js
5
+ from constants import (PROJECT_ROOT_FOLDER, app_logger, sample_rate_start, MODEL_NAME_DEFAULT, model_urls,
6
+ sample_rate_resample, samplerate_tts, silero_versions_dict)
7
+ import lambdaGetSample
8
+ import lambdaSpeechToScore
9
+ import lambdaTTS
10
 
11
 
12
  css = """
 
42
  app_logger.info("start gradio app building...")
43
 
44
  project_root_folder = Path(PROJECT_ROOT_FOLDER)
45
+ with open(project_root_folder / "app_headline.md", "r", encoding="utf-8") as app_headline_src:
46
+ md_app_headline = app_headline_src.read()
47
+ gr.Markdown(md_app_headline)
48
+ with open(project_root_folder / "app_description.md", "r", encoding="utf-8") as app_description_src:
49
  md_app_description = app_description_src.read()
50
+ model_url = model_urls[MODEL_NAME_DEFAULT]
51
+ app_logger.info(f"model_urls:{model_urls} ...")
52
+ models_names_urls_list = ""
53
+ other_supported_models = {k: v for k, v in model_urls.items() if k != MODEL_NAME_DEFAULT}
54
+ for model_name, model_url in other_supported_models.items():
55
+ app_logger.info(f"model_name: {model_name}, model_url: {model_url} ...")
56
+ models_names_urls_list += """\n - [{model_name}]({model_url})""".format(model_name=model_name, model_url=model_url)
57
+ if model_name == "silero":
58
+ models_names_urls_list += " (German version: {}, English version: {})".format(silero_versions_dict["de"], silero_versions_dict["en"])
59
+ app_logger.info(f"models_names_urls_list: '{models_names_urls_list}' ...")
60
+ with gr.Accordion(
61
+ "Click here for expand and show current env variables samplerate values, the selected model and the supported ones",
62
+ open=False,
63
+ elem_id="accordion-models-env-variables-id-element"
64
+ ):
65
+ gr.Markdown(md_app_description.format(
66
+ sample_rate_start=sample_rate_start,
67
+ model_name=MODEL_NAME_DEFAULT,
68
+ model_url=model_url,
69
+ models_names_urls_list=models_names_urls_list,
70
+ sample_rate_resample=sample_rate_resample,
71
+ samplerate_tts=samplerate_tts
72
+ ))
73
  with gr.Row():
74
  with gr.Column(scale=4, min_width=300):
75
  with gr.Row(elem_id="id-choose-random-phrase-by-language-and-difficulty"):
 
137
  visible=False,
138
  )
139
  text_recording_ipa = gr.Textbox(
140
+ placeholder="-", label="Student phonetic transcription", elem_id="text-student-recording-ipa-id-element", interactive=False
141
  )
142
  text_ideal_ipa = gr.Textbox(
143
+ placeholder="-", label="Ideal phonetic transcription", elem_id="text-ideal-ipa-id-element", interactive=False
144
  )
145
  text_raw_json_output_hidden = gr.Textbox(placeholder=None, label="text_raw_json_output_hidden", visible=False)
146
  with gr.Group(elem_classes="speech-output-group background-white"):
 
156
  elem_classes="speech-output-html background-white",
157
  )
158
  with gr.Row():
159
+ with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col1", elem_id="id-current-speech-accuracy-score-container"):
160
+ num_pronunciation_accuracy = gr.Number(label="Current score %", elem_id="number-pronunciation-accuracy-id-element", interactive=False, value=0)
161
+ with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col2", elem_id="id-global-speech-accuracy-score-de-container"):
162
  num_score_de = gr.Number(label="Global score DE %", value=0, interactive=False, elem_id="number-score-de-id-element")
163
+ with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3", elem_id="id-global-speech-accuracy-score-en-container"):
164
  num_score_en = gr.Number(label="Global score EN %", value=0, interactive=False, elem_id="number-score-en-id-element")
165
  btn_recognize_speech_accuracy = gr.Button(value="Get speech accuracy score (%)", elem_id="btn-recognize-speech-accuracy-id-element")
166
  with gr.Row(elem_id="id-replay-splitted-audio-by-words"):
 
168
  with gr.Column(scale=1, min_width=50):
169
  num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, minimum=0, value=0, interactive=False)
170
  with gr.Column(scale=4, min_width=100):
171
+ audio_sliced_student_recording_stt = gr.Audio(
172
+ label="Sliced student speech output",
173
  type="filepath",
174
  show_download_button=True,
175
+ elem_id="audio-sliced-student-recording-stt-id-element",
176
  )
177
  text_selected_recognized_word_hidden = gr.Textbox(label="text_selected_recognized_word", value="placeholder", interactive=False, visible=False)
178
 
179
  def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
180
  import json
181
+ _transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _num_tot_recognized_word, first_audio_file, _res, _ = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
182
  new_num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, value=0)
183
  words_list = _transcribed_text.split()
184
  first_word = words_list[0]
 
194
  text_raw_json_output_hidden: _res,
195
  num_tot_recognized_words: _num_tot_recognized_word,
196
  num_selected_recognized_word: new_num_selected_recognized_word,
197
+ audio_sliced_student_recording_stt: first_audio_file,
198
  text_selected_recognized_word_hidden: first_word,
199
  num_audio_duration_hidden: first_audio_duration
200
  }
 
228
  num_score_en,
229
  num_tot_recognized_words,
230
  num_selected_recognized_word,
231
+ audio_sliced_student_recording_stt,
232
  text_selected_recognized_word_hidden,
233
  num_audio_duration_hidden
234
  ],
 
258
  clear3,
259
  inputs=[],
260
  outputs=[
261
+ audio_student_recording_stt, audio_tts, audio_sliced_student_recording_stt, text_recording_ipa, text_ideal_ipa, text_transcribed_hidden,
262
  num_pronunciation_accuracy, num_selected_recognized_word, num_pronunciation_accuracy
263
  ],
264
  )
 
309
  num_selected_recognized_word.input(
310
  fn=lambdaSpeechToScore.get_selected_word,
311
  inputs=[num_selected_recognized_word, text_raw_json_output_hidden],
312
+ outputs=[audio_sliced_student_recording_stt, text_selected_recognized_word_hidden, num_audio_duration_hidden],
313
  )
314
+ audio_sliced_student_recording_stt.play(
315
  fn=None,
316
  inputs=[text_selected_recognized_word_hidden, radio_language, num_audio_duration_hidden],
317
+ outputs=audio_sliced_student_recording_stt,
318
  js=js.js_play_audio
319
  )
320
 
321
  @gradio_app.load(inputs=[local_storage], outputs=[num_score_de, num_score_en])
322
  def load_from_local_storage(saved_values):
323
+ app_logger.info(f"loading from local storage: {saved_values} ...")
324
  return saved_values[0], saved_values[1]
325
 
326
  @gr.on([num_score_de.change, num_score_en.change], inputs=[num_score_de, num_score_en], outputs=[local_storage])
 
331
  if __name__ == "__main__":
332
  try:
333
  gradio_app.launch()
334
+ except Exception as ex:
335
+ app_logger.error(f"Error: {ex}")
336
+ raise ex
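The event handlers above (for example `get_updated_score_by_language`) return a dict keyed by output components rather than a positional tuple, so a single callback can update only the outputs it needs. A minimal, self-contained sketch of that Gradio pattern, with purely illustrative components and values:

```python
import gradio as gr

with gr.Blocks() as demo:
    audio_in = gr.Audio(type="filepath", label="recording")
    txt_out = gr.Textbox(label="transcript")
    num_out = gr.Number(label="score")

    def score_recording(audio_path):
        # returning a dict keyed by components (instead of a tuple matching
        # `outputs` positionally) updates exactly the listed components
        return {txt_out: f"got recording: {audio_path}", num_out: 87.5}

    gr.Button("score").click(score_recording, inputs=[audio_in], outputs=[txt_out, num_out])

if __name__ == "__main__":
    demo.launch()
```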
app_description.md ADDED
@@ -0,0 +1,11 @@
1
+ ## Models and variables
2
+
3
+ Right now this tool uses:
4
+
5
+ - [{model_name}]({model_url}) as the STT (speech-to-text) model; other supported models are:
6
+ {models_names_urls_list}
7
+ - <u>{sample_rate_start}</u> as the input samplerate value (empirical tests suggest 48000 as the best sample rate)
8
+ - <u>{sample_rate_resample}</u> as the resampled samplerate value
9
+ - <u>{samplerate_tts}</u> as the TTS (text-to-speech) samplerate value
10
+
11
+
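The curly-brace names above are `str.format` placeholders that `app.py` fills at startup. A rough sketch of that substitution, with example values taken from the defaults in `constants.py` (the rendered list of other models is illustrative):

```python
from pathlib import Path

template = Path("app_description.md").read_text(encoding="utf-8")
rendered = template.format(
    model_name="whisper",
    model_url="https://pypi.org/project/openai-whisper/",
    models_names_urls_list="\n - [faster_whisper](https://pypi.org/project/faster-whisper/)",
    sample_rate_start=48000,
    sample_rate_resample=16000,
    samplerate_tts=16000,
)
print(rendered)  # markdown shown inside the models/env-variables accordion
```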
aip_trainer/lambdas/app_description.md → app_headline.md RENAMED
@@ -1,4 +1,6 @@
1
  # AI Pronunciation Trainer
2
 
3
  See [my fork](https://github.com/trincadev/ai-pronunciation-trainer) of [AI Pronunciation Trainer](https://github.com/Thiagohgl/ai-pronunciation-trainer) repository
4
- for more details. Right now this tool uses {sample_rate_start} as sample rate value. From empirical tests the best sample rate value is 48000.
 
 
 
1
  # AI Pronunciation Trainer
2
 
3
  See [my fork](https://github.com/trincadev/ai-pronunciation-trainer) of [AI Pronunciation Trainer](https://github.com/Thiagohgl/ai-pronunciation-trainer) repository
4
+ for more details.
5
+
6
+
constants.py ADDED
@@ -0,0 +1,31 @@
1
+ import os
2
+ from pathlib import Path
3
+ import structlog
4
+ import session_logger
5
+
6
+ PROJECT_ROOT_FOLDER = Path(__file__).parent
7
+ ALLOWED_ORIGIN = os.getenv('ALLOWED_ORIGIN', 'http://localhost:3000')
8
+ LOG_JSON_FORMAT = bool(os.getenv("LOG_JSON_FORMAT"))
9
+ IS_TESTING = bool(os.getenv('IS_TESTING', ""))
10
+ STSCOREAPIKEY = os.getenv('STSCOREAPIKEY', "stscore_apikey_placeholder")
11
+ log_level = os.getenv("LOG_LEVEL", "INFO")
12
+ USE_DTW = bool(os.getenv("USE_DTW"))
13
+ MODEL_NAME_TESTING = "whisper"
14
+ _MODEL_NAME_DEFAULT = os.getenv("MODEL_NAME_DEFAULT", MODEL_NAME_TESTING)
15
+ MODEL_NAME_DEFAULT = MODEL_NAME_TESTING if IS_TESTING else _MODEL_NAME_DEFAULT
16
+ DEVICE = os.getenv("DEVICE", "cpu")
17
+ tmp_audio_extension = os.getenv('TMP_AUDIO_EXTENSION', '.wav')
18
+ session_logger.setup_logging(json_logs=LOG_JSON_FORMAT, log_level=log_level)
19
+ app_logger = structlog.stdlib.get_logger(__name__)
20
+ sample_rate_start = int(os.getenv('SAMPLE_RATE', 48000))
21
+ sample_rate_resample = 16000
22
+ samplerate_tts = 16000
23
+ language_not_implemented = "Language '{}' not implemented. Supported languages: 'de', 'en'."
24
+ SILERO_VERSION_DE = "v4"
25
+ SILERO_VERSION_EN = "latest"
26
+ silero_versions_dict = {"de": SILERO_VERSION_DE, "en": SILERO_VERSION_EN}
27
+ model_urls = {
28
+ "faster_whisper": "https://pypi.org/project/faster-whisper/",
29
+ "silero": "https://pypi.org/project/silero/",
30
+ "whisper": "https://pypi.org/project/openai-whisper/",
31
+ }
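Most of these values are read from environment variables once, at import time, so they have to be set before `constants` is first imported. A minimal sketch of overriding them from Python (the chosen values are examples; note that `IS_TESTING` forces the `whisper` model regardless of `MODEL_NAME_DEFAULT`):

```python
import os

# set overrides before the first import of `constants`
os.environ["MODEL_NAME_DEFAULT"] = "faster_whisper"
os.environ["SAMPLE_RATE"] = "48000"
os.environ["LOG_LEVEL"] = "DEBUG"

import constants

print(constants.MODEL_NAME_DEFAULT)   # "faster_whisper" (unless IS_TESTING is set)
print(constants.sample_rate_start)    # 48000
```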
cosmic_ray_config.toml DELETED
@@ -1,8 +0,0 @@
1
- [cosmic-ray]
2
- module-path = "aip_trainer/models/models.py"
3
- timeout = 30.0
4
- excluded-modules = []
5
- test-command = "python -m pytest tests/models/test_models.py"
6
-
7
- [cosmic-ray.distributor]
8
- name = "local"
tests/test_data_de_en_2.pickle → data_de_en_2.pickle RENAMED
File without changes
databases/data_de.csv ADDED
The diff for this file is too large to render. See raw diff
 
databases/data_en.csv ADDED
The diff for this file is too large to render. See raw diff
 
dockerfiles/apt_preferences DELETED
@@ -1,9 +0,0 @@
1
- Explanation: Uninstall or do not install any Debian-originated
2
- Explanation: package versions other than those in the stable distro
3
- Package: *
4
- Pin: release a=stable
5
- Pin-Priority: 900
6
-
7
- Package: zlib1g
8
- Pin: release a=trixie
9
- Pin-Priority: -10
dockerfiles/debian.sources DELETED
@@ -1,17 +0,0 @@
1
- Types: deb deb-src
2
- URIs: http://deb.debian.org/debian
3
- Suites: bookworm bookworm-updates
4
- Components: main
5
- Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg
6
-
7
- Types: deb deb-src
8
- URIs: http://deb.debian.org/debian-security
9
- Suites: bookworm-security
10
- Components: main
11
- Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg
12
-
13
- Types: deb
14
- URIs: http://deb.debian.org/debian
15
- Suites: trixie
16
- Components: main
17
- Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg
 
dockerfiles/dockerfile-base DELETED
@@ -1,72 +0,0 @@
1
- # Include global ARGs at the dockerfile top
2
- ARG ARCH="x86_64"
3
- ARG WORKDIR_ROOT="/var/task"
4
-
5
-
6
- FROM python:3.12-bookworm AS builder_global
7
-
8
- ARG ARCH
9
- ARG WORKDIR_ROOT
10
- ARG POETRY_NO_INTERACTION
11
- ARG POETRY_VIRTUALENVS_IN_PROJECT
12
- ARG POETRY_VIRTUALENVS_CREATE
13
- ARG POETRY_CACHE_DIR
14
- ARG ZLIB1G="http://ftp.it.debian.org/debian/pool/main/z/zlib/zlib1g_1.3.dfsg-3+b1_amd64.deb"
15
- ENV PYTHONPATH="${WORKDIR_ROOT}:${PYTHONPATH}:/usr/local/lib/python3/dist-packages"
16
- ENV MPLCONFIGDIR=/tmp/matplotlib
17
- ARG USER="999"
18
-
19
-
20
- RUN echo "ARCH: $ARCH, ARG POETRY_CACHE_DIR: ${POETRY_CACHE_DIR}, ENV PYTHONPATH: $PYTHONPATH, USER: $USER ..."
21
- # RUN groupadd -g 999 python && useradd -r -u 999 -g python python
22
-
23
- # Set working directory to function root directory
24
- WORKDIR ${WORKDIR_ROOT}
25
- COPY --chown=python:python requirements.txt ${WORKDIR_ROOT}/
26
-
27
- # avoid segment-geospatial exception caused by missing libGL.so.1 library
28
- RUN echo "BUILDER: check libz.s* before start" && ls -l /usr/lib/${ARCH}-linux-gnu/libz.so*
29
- RUN apt update && apt install -y curl ffmpeg libgl1 python3-pip && apt clean
30
- COPY --chown=python:python ./dockerfiles/apt_preferences /etc/apt/preferences
31
- COPY --chown=python:python ./dockerfiles/debian.sources /etc/apt/sources.list.d/debian.sources
32
- RUN apt update && apt install -t trixie zlib1g -y && apt clean
33
- RUN echo "BUILDER: check libz.s* after install from trixie" && ls -l /usr/lib/${ARCH}-linux-gnu/libz.so*
34
-
35
- RUN ls -l /etc/apt/sources* /etc/apt/preferences*
36
-
37
- # poetry installation path is NOT within ${WORKDIR_ROOT}: not needed for runtime docker image
38
- RUN python3 -m venv ${WORKDIR_ROOT}/.venv
39
- ENV PATH="${WORKDIR_ROOT}/.venv/bin:$PATH"
40
- RUN . ${WORKDIR_ROOT}/.venv/bin/activate && python -m pip install -r ${WORKDIR_ROOT}/requirements.txt
41
-
42
- # USER 999
43
-
44
-
45
- FROM python:3.12-slim-bookworm AS runtime
46
-
47
- RUN groupadd -g 999 python && useradd -r -u 999 -g python python
48
-
49
- ARG ARCH
50
- ARG WORKDIR_ROOT
51
- ENV PYTHONPATH="${WORKDIR_ROOT}:${WORKDIR_ROOT}/.venv:${PYTHONPATH}:/usr/local/lib/python3/dist-packages"
52
- ENV MPLCONFIGDIR=/tmp/matplotlib
53
-
54
- ENV VIRTUAL_ENV=${WORKDIR_ROOT}/.venv PATH="${WORKDIR_ROOT}/.venv/bin:$PATH"
55
-
56
- RUN apt update && apt install -y ffmpeg && apt clean
57
- RUN echo "COPY --chown=python:python --from=builder_global /usr/lib/${ARCH}-linux-gnu/libGL.so* /usr/lib/${ARCH}-linux-gnu/"
58
- RUN echo "RUNTIME: check libz.s* before upgrade" && ls -l /usr/lib/${ARCH}-linux-gnu/libz.so*
59
- RUN echo "RUNTIME: remove libz.s* to force upgrade" && rm /usr/lib/${ARCH}-linux-gnu/libz.so*
60
- COPY --chown=python:python --from=builder_global /usr/lib/${ARCH}-linux-gnu/libz.so* /usr/lib/${ARCH}-linux-gnu/
61
- COPY --chown=python:python --from=builder_global /lib/${ARCH}-linux-gnu/libexpat.so* /lib/${ARCH}-linux-gnu/
62
- RUN echo "RUNTIME: check libz.s* after copy" && ls -l /usr/lib/${ARCH}-linux-gnu/libz.so*
63
- COPY --chown=python:python --from=builder_global ${WORKDIR_ROOT}/.venv ${WORKDIR_ROOT}/.venv
64
- RUN echo "check ffmpeg files..."
65
- RUN ls -ld /usr/share/ffmpeg || echo "ffpeg folder not found!"
66
- RUN ls -l /usr/bin/ff* || echo "ffpeg bin not found!"
67
- RUN ls -l /usr/share/ffmpeg || echo "ffpeg folder share not found!"
68
- RUN . ${WORKDIR_ROOT}/.venv && which python && pip list
69
-
70
- RUN echo "new WORKDIR_ROOT after hidden venv COPY --chown=python:python => ${WORKDIR_ROOT}"
71
- RUN ls -ld ${WORKDIR_ROOT}/
72
- RUN ls -lA ${WORKDIR_ROOT}/
faster_whisper_wrapper.py ADDED
@@ -0,0 +1,56 @@
1
+ from typing import Union
2
+
3
+ import numpy as np
4
+ import onnxruntime
5
+ import torch
6
+ from faster_whisper import WhisperModel
7
+
8
+ from ModelInterfaces import IASRModel
9
+ from constants import sample_rate_resample, app_logger, IS_TESTING, DEVICE
10
+
11
+ device = onnxruntime.get_device()
12
+ device = "cpu" if IS_TESTING or device.lower() == DEVICE.lower() else device
13
+ app_logger.info(f"device: {device} #")
14
+ device_compute = "int8_float16" if device == "cuda" else "int8"
15
+ app_logger.info(f"device: {device}, device_compute: {device_compute} #")
16
+
17
+
18
+ def parse_word_info(word_info, sample_rate):
19
+ start_ts = float(word_info.start) * sample_rate
20
+ end_ts = float(word_info.end) * sample_rate
21
+ word = word_info.word
22
+ return {"word": word, "start_ts": start_ts, "end_ts": end_ts}
23
+
24
+
25
+ class FasterWhisperASRModel(IASRModel):
26
+ def __init__(self, model_name="base", language=None):
27
+ self.asr = WhisperModel(model_name, device=device, compute_type=device_compute)
28
+ self._transcript = ""
29
+ self._word_locations = []
30
+ self.sample_rate = sample_rate_resample
31
+ self.language = language
32
+
33
+ def processAudio(self, audio:Union[np.ndarray, torch.Tensor]):
34
+ # 'audio' is expected to be a numpy array or torch tensor of audio samples with shape [1, N].
35
+ if isinstance(audio, torch.Tensor):
36
+ audio = audio.detach().cpu().numpy()
37
+ segments, info = self.asr.transcribe(audio=audio[0], language=self.language, word_timestamps=True, beam_size=5, temperature=0, vad_filter=True)
38
+ app_logger.debug(f"segments: type={type(segments)}, segments complete: {segments} #")
39
+ app_logger.info(f"info: type={type(info)}, info complete: {info} #")
40
+ transcript = []
41
+ count = 0
42
+ for segment in segments:
43
+ app_logger.debug(f"single segment: {type(segment)}, segment: {segment} #")
44
+ transcript.append(segment.text)
45
+ segment_word_locations = [parse_word_info(word_info, sample_rate=self.sample_rate) for word_info in segment.words]
46
+ self._word_locations.extend(segment_word_locations)
47
+ app_logger.info(f"elaborated segment {count}: type={type(segment)}, len(words):{len(segment.words)}, text:{segment.text} #")
48
+ count += 1
49
+ app_logger.info(f"transcript: {transcript} #")
50
+ self._transcript = " ".join(transcript)
51
+
52
+ def getTranscript(self) -> str:
53
+ return self._transcript
54
+
55
+ def getWordLocations(self) -> list:
56
+ return self._word_locations
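A minimal usage sketch for this wrapper, assuming a mono recording already resampled to 16 kHz (`sample_rate_resample`); the file name is illustrative. Since `processAudio` indexes `audio[0]`, the input is expected as a `[1, N]` array or tensor:

```python
import numpy as np
import soundfile as sf

from faster_whisper_wrapper import FasterWhisperASRModel

signal, sr = sf.read("recording_16khz_mono.wav", dtype="float32")  # illustrative path
audio = np.expand_dims(signal, axis=0)  # shape [1, N]

asr = FasterWhisperASRModel(model_name="base", language="en")
asr.processAudio(audio)
print(asr.getTranscript())     # plain transcript
print(asr.getWordLocations())  # word timestamps, expressed in samples
```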
images/{MainScreen.png → MainScreen.jpg} RENAMED
File without changes
aip_trainer/lambdas/js.py → js.py RENAMED
@@ -1,11 +1,4 @@
1
  js_update_ipa_output = """
2
- /**
3
- * Updates the CSS text of the given text based on the correctness of each letter.
4
- *
5
- * @param text - The text to be displayed.
6
- * @param letters - A string representing the correctness of each letter in the text.
7
- * @param idxSelectedWord - The index of the selected word to be underlined.
8
- */
9
  function updateCssText(text, letters, idxSelectedWord) {
10
  let wordsArr = text.split(" ")
11
  let lettersWordsArr = letters.split(" ")
@@ -31,13 +24,6 @@ function updateCssText(text, letters, idxSelectedWord) {
31
  """
32
 
33
  js_play_audio = """
34
- /**
35
- * Plays the given text as audio using the Web Speech API.
36
- *
37
- * @param text - The text to be spoken.
38
- * @param language - The language code for the speech synthesis (e.g., 'en' for English, 'de' for German).
39
- * @param sleepTime - Optional. The time in seconds to wait before starting the speech synthesis. Default is 0.
40
- */
41
  function playAudio(text, language, sleepTime = 0) {
42
  let voice_idx = 0;
43
  let voice_synth = null;
 
1
  js_update_ipa_output = """
 
 
 
 
 
 
 
2
  function updateCssText(text, letters, idxSelectedWord) {
3
  let wordsArr = text.split(" ")
4
  let lettersWordsArr = letters.split(" ")
 
24
  """
25
 
26
  js_play_audio = """
 
 
 
 
 
 
 
27
  function playAudio(text, language, sleepTime = 0) {
28
  let voice_idx = 0;
29
  let voice_synth = null;
lambdaChangeModel.py ADDED
@@ -0,0 +1,14 @@
1
+ import json
2
+
3
+ import pronunciationTrainer
4
+
5
+
6
+ trainer_SST_lambda = {'de': pronunciationTrainer.getTrainer("de"), 'en': pronunciationTrainer.getTrainer("en")}
7
+
8
+
9
+ def lambda_handler(event, context):
10
+ data = json.loads(event['body'])
11
+ model_name = data['modelName']
12
+ trainer_SST_lambda["de"] = pronunciationTrainer.getTrainer("de", model_name=model_name)
13
+ trainer_SST_lambda["en"] = pronunciationTrainer.getTrainer("en", model_name=model_name)
14
+ return f'Model changed to {model_name}!'
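A short sketch of how this handler can be invoked; the event shape mirrors the other lambda handlers in the repository, and the chosen model name is one of the keys in `constants.model_urls`:

```python
import json

import lambdaChangeModel

event = {"body": json.dumps({"modelName": "faster_whisper"})}
print(lambdaChangeModel.lambda_handler(event, context=None))
# expected: "Model changed to faster_whisper!"
```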
lambdaGetSample.py ADDED
1
+ import json
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+
6
+ import RuleBasedModels
7
+ from constants import app_logger
8
+
9
+
10
+ class TextDataset:
11
+ def __init__(self, table, language):
12
+ self.table_dataframe = table
13
+ self.language = language
14
+
15
+ def __getitem__(self, idx):
16
+ line = [self.table_dataframe['sentence'].iloc[idx]]
17
+ return line
18
+
19
+ def __len__(self):
20
+ return len(self.table_dataframe)
21
+
22
+ def get_category_from_df(self, category_value:int):
23
+ selector = self.table_dataframe["category"] == category_value
24
+ df_by_category = self.table_dataframe[selector]
25
+ return df_by_category
26
+
27
+ def get_random_sample_from_df(self, category_value:int):
28
+ app_logger.info(f"language={self.language}, category_value={category_value}.")
29
+ choice = self.table_dataframe.sample(n=1)
30
+ if category_value !=0:
31
+ df_language_filtered_by_category = self.get_category_from_df(category_value)
32
+ choice = df_language_filtered_by_category.sample(n=1)
33
+ sentence = choice["sentence"].iloc[0]
34
+ app_logger.info(f"sentence={sentence} ...")
35
+ return [sentence]
36
+
37
+
38
+ sample_folder = Path(__file__).parent / "databases"
39
+ lambda_database = {}
40
+ lambda_ipa_converter = {}
41
+ available_languages = ['de', 'en']
42
+
43
+ for lang in available_languages:
44
+ # avoid using ";" or "," as separator because these are present within the dataframe sentences
45
+ df = pd.read_csv(sample_folder / f'data_{lang}.csv', delimiter='|')
46
+ lambda_database[lang] = TextDataset(df, lang)
47
+ lambda_ipa_converter[lang] = RuleBasedModels.get_phonem_converter(lang)
48
+
49
+ lambda_translate_new_sample = False
50
+
51
+
52
+ def lambda_handler(event, context):
53
+ """
54
+ lambda handler to return a random text sample from the dataset.
55
+
56
+ Parameters:
57
+ event (dict): The event data passed to the Lambda function.
58
+ context (dict): The context in which the Lambda function is called.
59
+
60
+ Returns:
61
+ str: The JSON-encoded result.
62
+ """
63
+ try:
64
+ body = json.loads(event['body'])
65
+
66
+ try:
67
+ category = int(body['category'])
68
+ except KeyError:
69
+ category = 0
70
+ language = body['language']
71
+ try:
72
+ current_transcript = str(body["transcript"])
73
+ except KeyError:
74
+ current_transcript = get_random_selection(language, category)
75
+ current_ipa = lambda_ipa_converter[language].convertToPhonem(current_transcript)
76
+
77
+ app_logger.info(f"real_transcript='{current_transcript}', ipa_transcript='{current_ipa}'.")
78
+ result = {
79
+ 'real_transcript': [current_transcript],
80
+ 'ipa_transcript': current_ipa,
81
+ 'transcript_translation': ""
82
+ }
83
+
84
+ return json.dumps(result)
85
+ except Exception as ex:
86
+ app_logger.error(f"ex: {ex} ...")
87
+ raise ex
88
+
89
+
90
+ def get_random_selection(language: str, category: int) -> str:
91
+ """
92
+ Get a random text sample from the dataset.
93
+
94
+ Parameters:
95
+ language (str): The language code.
96
+ category (int): The category value to filter the dataset.
97
+
98
+ Returns:
99
+ str: The selected text sample.
100
+ """
101
+ lambda_df_lang = lambda_database[language]
102
+ current_transcript = lambda_df_lang.get_random_sample_from_df(category)
103
+ app_logger.info(f"category={category}, language={language}, current_transcript={current_transcript}.")
104
+ return current_transcript[0]
105
+
106
+
107
+ def getSentenceCategory(sentence: str) -> int:
108
+ number_of_words = len(sentence.split())
109
+ categories_word_limits = [0, 8, 20, 100000]
110
+ for category in range(len(categories_word_limits)-1):
111
+ if categories_word_limits[category] < number_of_words <= categories_word_limits[category + 1]:
112
+ return category+1
113
+ raise ValueError(f"category not assigned for sentence '{sentence}' ...")
114
+
115
+
116
+ def get_enriched_dataframe_csv(
117
+ language: str,
118
+ custom_dataframe_csv_filename_no_ext: str = "data",
119
+ custom_folder: Path = sample_folder
120
+ ) -> None:
121
+ """
122
+ Read a csv dataframe, add a 'category' column and write the enriched csv back to disk.
123
+
124
+ Parameters:
125
+ language (str): The language code (e.g. "de" for German).
126
+ custom_dataframe_csv_filename_no_ext (str): The csv dataframe without extension.
127
+ custom_folder (Path): The folder containing the csv dataframe.
128
+
129
+ Returns:
130
+ None
131
+ """
132
+ custom_folder = Path(custom_folder).absolute()
133
+ df_filename = custom_folder / f'{custom_dataframe_csv_filename_no_ext}_{language}.csv'
134
+ with open(df_filename, 'r') as handle:
135
+ df2 = pd.read_csv(handle, sep="|")
136
+ df2["category"] = df2["sentence"].apply(getSentenceCategory)
137
+ app_logger.info("de_category added")
138
+ output_path = custom_folder / f'{custom_dataframe_csv_filename_no_ext}_{language}.csv'
139
+ df2.to_csv(output_path, index=False, sep="|")
140
+ app_logger.info(f"written {output_path} ...")
141
+
142
+
143
+ if __name__ == '__main__':
144
+ get_enriched_dataframe_csv("de")
145
+ get_enriched_dataframe_csv("en")
aip_trainer/lambdas/lambdaSpeechToScore.py → lambdaSpeechToScore.py RENAMED
@@ -4,52 +4,75 @@ import os
4
  from pathlib import Path
5
  import tempfile
6
  import time
 
7
 
8
  import audioread
9
  import numpy as np
10
  import torch
11
  from torchaudio.transforms import Resample
12
 
13
- from aip_trainer import WordMatching as wm, app_logger
14
- from aip_trainer import pronunciationTrainer, sample_rate_start
15
- from aip_trainer.utils.typing_hints import BodySpeechToScoreRequest
 
16
 
17
 
18
- trainer_SST_lambda = {
19
- 'de': pronunciationTrainer.getTrainer("de"),
20
- 'en': pronunciationTrainer.getTrainer("en")
21
- }
22
- transform = Resample(orig_freq=sample_rate_start, new_freq=16000)
23
 
24
 
25
- def lambda_handler(event, context):
26
- event_body = event['body']
27
- data = BodySpeechToScoreRequest.model_validate_json(event_body)
 
 
 
 
 
 
 
 
 
 
28
 
29
- real_text = data.title
30
- base64_audio = data.base64Audio
31
  app_logger.debug(f"base64Audio:{base64_audio} ...")
32
  file_bytes_or_audiotmpfile = base64.b64decode(base64_audio[22:].encode('utf-8'))
33
- language = data.language
 
 
 
 
 
34
 
35
  if len(real_text) == 0:
36
- return {
37
- 'statusCode': 200,
38
- 'headers': {
39
- 'Access-Control-Allow-Headers': '*',
40
- 'Access-Control-Allow-Credentials': "true",
41
- 'Access-Control-Allow-Origin': 'http://127.0.0.1:3000/',
42
- 'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
43
- },
44
- 'body': ''
45
- }
46
- output = get_speech_to_score_dict(real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile, language=language, remove_random_file=False)
47
  output = json.dumps(output)
48
  app_logger.debug(f"output: {output} ...")
49
  return output
50
 
51
 
52
- def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True, extension: str = ".ogg"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  from soundfile import LibsndfileError
54
  app_logger.info(f"real_text:{real_text} ...")
55
  app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
@@ -72,23 +95,19 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
72
  app_logger.debug("writing streaming data to file on disk...")
73
  with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=extension, delete=False) as f1:
74
  f1.write(file_bytes_or_audiotmpfile)
 
75
  duration = time.time() - start0
76
  app_logger.info(f'Saved binary data in file in {duration}s.')
77
- random_file_name = f1.name
78
 
79
  start = time.time()
80
- app_logger.info(f'Loading {extension} file file {random_file_name} ...')
81
  try:
82
  signal, samplerate = soundfile_load(random_file_name)
83
  except LibsndfileError as sfe:
84
  # https://github.com/beetbox/audioread/issues/144
85
  # deprecation warnings => pip install standard-aifc standard-sunau
86
  app_logger.error(f"Error reading file {random_file_name}: {sfe}, re-try with audioread...")
87
- try:
88
- signal, samplerate = audioread_load(random_file_name)
89
- except ModuleNotFoundError as mnfe:
90
- app_logger.error(f"Error reading file {random_file_name}: {mnfe}, try read https://github.com/beetbox/audioread/issues/144")
91
- raise mnfe
92
 
93
  duration = time.time() - start
94
  app_logger.info(f'Read {extension} file {random_file_name} in {duration}s.')
@@ -103,11 +122,11 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
103
  result = language_trainer_sst_lambda.processAudioForGivenText(signal_transformed, real_text)
104
  app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
105
 
106
- start = time.time()
107
- if remove_random_file:
108
- os.remove(random_file_name)
109
- duration = time.time() - start
110
- app_logger.info(f'Deleted file {random_file_name} in {duration}s.')
111
 
112
  start = time.time()
113
  real_transcripts_ipa = ' '.join(
@@ -125,9 +144,9 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
125
 
126
  is_letter_correct_all_words = ''
127
  for idx, word_real in enumerate(words_real):
128
- mapped_letters, _ = wm.get_best_mapped_words(
129
- mapped_words[idx], word_real
130
- )
131
 
132
  is_letter_correct = wm.getWhichLettersWereTranscribedCorrectly(
133
  word_real, mapped_letters) # , mapped_letters_indices)
@@ -146,21 +165,40 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
146
  return {
147
  'real_transcript': result['recording_transcript'],
148
  'ipa_transcript': ipa_transcript,
149
- 'pronunciation_accuracy': float(f"{pronunciation_accuracy:.2f}"),
150
  'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
151
  'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
152
  'pair_accuracy_category': pair_accuracy_category,
153
  'start_time': result['start_time'],
154
  'end_time': result['end_time'],
155
- 'is_letter_correct_all_words': is_letter_correct_all_words
 
156
  }
157
 
158
 
159
- def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
160
- output = get_speech_to_score_dict(real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile, language=language, remove_random_file=remove_random_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  real_transcripts = output['real_transcripts']
162
  is_letter_correct_all_words = output['is_letter_correct_all_words']
163
- pronunciation_accuracy = output['pronunciation_accuracy']
 
164
  ipa_transcript = output['ipa_transcript']
165
  real_transcripts_ipa = output['real_transcripts_ipa']
166
  end_time = [float(x) for x in output['end_time'].split(" ")]
@@ -169,17 +207,45 @@ def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str |
169
  app_logger.debug(f"start splitting recorded audio into {num_words} words...")
170
 
171
  audio_files, audio_durations = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
 
 
 
 
 
 
 
172
  output = {'audio_files': audio_files, "audio_durations": audio_durations, **output}
173
  first_audio_file = audio_files[0]
174
- return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, num_words, first_audio_file, json.dumps(output)
 
 
 
 
 
175
 
 
 
 
 
176
 
177
- def soundfile_write(audiofile: str | Path, data: np.ndarray, samplerate: int):
 
 
178
  import soundfile as sf
179
  sf.write(audiofile, data, samplerate)
180
 
181
 
182
- def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str]:
 
 
 
 
 
 
 
 
 
 
183
  recognition_output = json.loads(raw_json_output)
184
  list_audio_files = recognition_output["audio_files"]
185
  real_transcripts = recognition_output["real_transcripts"]
@@ -194,10 +260,23 @@ def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str
194
 
195
 
196
  def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float]) -> tuple[list[str], list[float]]:
 
 
 
 
 
 
 
 
 
 
 
197
  import soundfile as sf
198
  audio_files = []
199
  audio_durations = []
 
200
  for n, (start_nth, end_nth) in enumerate(zip(start_time, end_time)):
 
201
  signal_nth, samplerate = soundfile_load(audiotmpfile, offset=start_nth, duration=end_nth - start_nth)
202
  audiofile = get_file_with_custom_suffix(audiotmpfile, f"_part{n}_start{start_nth}_end{end_nth}")
203
  soundfile_write(audiofile=audiofile, data=signal_nth, samplerate=samplerate)
@@ -210,20 +289,52 @@ def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], e
210
 
211
 
212
  def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> Path:
 
 
 
 
 
 
 
 
 
 
213
  pathname = Path(basefile)
214
  dirname, filename_no_ext, filename_ext = pathname.parent, pathname.stem, pathname.suffix
215
- output_file = Path(dirname) / f"{filename_no_ext}_{custom_suffix}.{filename_ext}"
216
  return output_file
217
 
218
 
219
  # From Librosa
220
 
221
- def calc_start_end(sr_native, time_position, n_channels):
 
 
 
 
 
 
 
 
 
 
 
222
  return int(np.round(sr_native * time_position)) * n_channels
223
 
224
 
225
- def soundfile_load(path: str | Path, offset: float = 0.0, duration: float = None, dtype=np.float32):
226
- """Load an audio buffer using soundfile. Taken from librosa """
 
 
 
 
 
 
 
 
 
 
 
227
  import soundfile as sf
228
 
229
  if isinstance(path, sf.SoundFile):
@@ -250,10 +361,18 @@ def soundfile_load(path: str | Path, offset: float = 0.0, duration: float = None
250
  return y, sr_native
251
 
252
 
253
- def audioread_load(path, offset=0.0, duration=None, dtype=np.float32):
254
- """Load an audio buffer using audioread.
255
-
256
  This loads one block at a time, and then concatenates the results.
 
 
 
 
 
 
 
 
 
257
  """
258
  y = []
259
  app_logger.debug(f"reading audio file at path:{path} ...")
@@ -309,7 +428,7 @@ def audioread_load(path, offset=0.0, duration=None, dtype=np.float32):
309
  # From Librosa
310
 
311
 
312
- def buf_to_float(x, n_bytes=2, dtype=np.float32):
313
  """Convert an integer buffer to floating point values.
314
  This is primarily useful when loading integer-valued wav data
315
  into numpy arrays.
 
4
  from pathlib import Path
5
  import tempfile
6
  import time
7
+ from typing import Dict, Any, LiteralString
8
 
9
  import audioread
10
  import numpy as np
11
  import torch
12
  from torchaudio.transforms import Resample
13
 
14
+ import WordMatching as wm
15
+ import pronunciationTrainer
16
+ import utilsFileIO
17
+ from constants import app_logger, sample_rate_resample, sample_rate_start, USE_DTW, IS_TESTING, tmp_audio_extension
18
 
19
 
20
+ trainer_SST_lambda = {'de': pronunciationTrainer.getTrainer("de"), 'en': pronunciationTrainer.getTrainer("en")}
21
+ transform = Resample(orig_freq=sample_rate_start, new_freq=sample_rate_resample)
 
 
 
22
 
23
 
24
+ def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
25
+ """
26
+ Lambda handler for speech-to-score.
27
+
28
+ Parameters:
29
+ event (Dict[str, Any]): The event data containing the request body.
30
+ context (Any): The context in which the lambda function is executed.
31
+
32
+ Returns:
33
+ Dict[str, Any]: The response containing the speech-to-score results.
34
+ """
35
+ body = event['body']
36
+ data = json.loads(body)
37
 
38
+ real_text = data['title']
39
+ base64_audio = data["base64Audio"]
40
  app_logger.debug(f"base64Audio:{base64_audio} ...")
41
  file_bytes_or_audiotmpfile = base64.b64decode(base64_audio[22:].encode('utf-8'))
42
+ language = data['language']
43
+ try:
44
+ use_dtw = data["useDTW"]
45
+ app_logger.info(f'use_dtw: "{type(use_dtw)}", "{use_dtw}".')
46
+ except KeyError:
47
+ use_dtw = USE_DTW
48
 
49
  if len(real_text) == 0:
50
+ return utilsFileIO.return_response_ok('{}')
51
+ output = get_speech_to_score_dict(
52
+ real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile, language=language, use_dtw=use_dtw
53
+ )
54
+ output["pronunciation_accuracy"] = f"{int(output["pronunciation_accuracy"])}"
 
 
 
 
 
 
55
  output = json.dumps(output)
56
  app_logger.debug(f"output: {output} ...")
57
  return output
58
 
59
 
60
+ def get_speech_to_score_dict(
61
+ real_text: str, file_bytes_or_audiotmpfile: str | bytes | dict, language: str = "en", extension: str = tmp_audio_extension, use_dtw: bool = False
62
+ ) -> Dict[str | Any, float | LiteralString | str | Any]:
63
+ """
64
+ Process the audio file and return a dictionary with speech-to-score results.
65
+
66
+ Parameters:
67
+ use_dtw:
68
+ real_text (str): The text to be matched with the audio.
69
+ file_bytes_or_audiotmpfile (str | bytes | dict): The audio file in bytes or a temporary file.
70
+ language (str): The language of the audio.
71
+ extension (str): The file extension of the audio file.
72
+
73
+ Returns:
74
+ Dict[str | Any, float | LiteralString | str | Any]: The speech-to-score results.
75
+ """
76
  from soundfile import LibsndfileError
77
  app_logger.info(f"real_text:{real_text} ...")
78
  app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
 
95
  app_logger.debug("writing streaming data to file on disk...")
96
  with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=extension, delete=False) as f1:
97
  f1.write(file_bytes_or_audiotmpfile)
98
+ random_file_name = f1.name
99
  duration = time.time() - start0
100
  app_logger.info(f'Saved binary data in file in {duration}s.')
 
101
 
102
  start = time.time()
103
+ app_logger.info(f"Loading temp '{random_file_name}' file...")
104
  try:
105
  signal, samplerate = soundfile_load(random_file_name)
106
  except LibsndfileError as sfe:
107
  # https://github.com/beetbox/audioread/issues/144
108
  # deprecation warnings => pip install standard-aifc standard-sunau
109
  app_logger.error(f"Error reading file {random_file_name}: {sfe}, re-try with audioread...")
110
+ signal, samplerate = audioread_load(random_file_name)
 
 
 
 
111
 
112
  duration = time.time() - start
113
  app_logger.info(f'Read {extension} file {random_file_name} in {duration}s.')
 
122
  result = language_trainer_sst_lambda.processAudioForGivenText(signal_transformed, real_text)
123
  app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
124
 
125
+ # start = time.time()
126
+ # if remove_random_file:
127
+ # os.remove(random_file_name)
128
+ # duration = time.time() - start
129
+ # app_logger.info(f'Deleted file {random_file_name} in {duration}s.')
130
 
131
  start = time.time()
132
  real_transcripts_ipa = ' '.join(
 
144
 
145
  is_letter_correct_all_words = ''
146
  for idx, word_real in enumerate(words_real):
147
+
148
+ mapped_letters, mapped_letters_indices = wm.get_best_mapped_words(
149
+ mapped_words[idx], word_real, use_dtw=use_dtw)
150
 
151
  is_letter_correct = wm.getWhichLettersWereTranscribedCorrectly(
152
  word_real, mapped_letters) # , mapped_letters_indices)
 
165
  return {
166
  'real_transcript': result['recording_transcript'],
167
  'ipa_transcript': ipa_transcript,
168
+ 'pronunciation_accuracy': pronunciation_accuracy,
169
  'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
170
  'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
171
  'pair_accuracy_category': pair_accuracy_category,
172
  'start_time': result['start_time'],
173
  'end_time': result['end_time'],
174
+ 'is_letter_correct_all_words': is_letter_correct_all_words,
175
+ "random_file_name": random_file_name
176
  }
177
 
178
 
179
+ def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True) -> tuple:
180
+ """
181
+ Process the audio file and return a tuple with speech-to-score results.
182
+
183
+ Parameters:
184
+ real_text (str): The text to be matched with the audio.
185
+ file_bytes_or_audiotmpfile (str | dict): The audio file in bytes or a temporary file.
186
+ language (str): The language of the audio.
187
+ remove_random_file (bool): Whether to remove the temporary file after processing.
188
+
189
+ Returns:
190
+ tuple: A tuple containing real transcripts, letter correctness, pronunciation accuracy, IPA transcript, real transcripts in IPA, number of words, first audio file, and JSON output.
191
+ """
192
+ output = get_speech_to_score_dict(
193
+ real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile,
194
+ language=language
195
+ )
196
+ random_file_name = output["random_file_name"]
197
+ del output["random_file_name"]
198
  real_transcripts = output['real_transcripts']
199
  is_letter_correct_all_words = output['is_letter_correct_all_words']
200
+ pronunciation_accuracy = f"{output['pronunciation_accuracy']:.2f}"
201
+ output["pronunciation_accuracy"] = pronunciation_accuracy
202
  ipa_transcript = output['ipa_transcript']
203
  real_transcripts_ipa = output['real_transcripts_ipa']
204
  end_time = [float(x) for x in output['end_time'].split(" ")]
 
207
  app_logger.debug(f"start splitting recorded audio into {num_words} words...")
208
 
209
  audio_files, audio_durations = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
210
+
211
+ remove_random_file = not IS_TESTING and remove_random_file
212
+ if remove_random_file:
213
+ app_logger.info(f"{IS_TESTING} => remove_random_file:{remove_random_file}, removing:{random_file_name} ...")
214
+ Path(random_file_name).unlink(missing_ok=True)
215
+ app_logger.info(f"removed:{random_file_name} ...")
216
+
217
  output = {'audio_files': audio_files, "audio_durations": audio_durations, **output}
218
  first_audio_file = audio_files[0]
219
+ return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, num_words, first_audio_file, json.dumps(output), random_file_name
220
+
221
+
222
+ def soundfile_write(audiofile: str | Path, data: np.ndarray, samplerate: int) -> None:
223
+ """
224
+ Write audio data to a file using soundfile.
225
 
226
+ Parameters:
227
+ audiofile (str | Path): The path to the audio file.
228
+ data (np.ndarray): The audio data to write.
229
+ samplerate (int): The sample rate of the audio data.
230
 
231
+ Returns:
232
+ None
233
+ """
234
  import soundfile as sf
235
  sf.write(audiofile, data, samplerate)
236
 
237
 
238
+ def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str, str, float]:
239
+ """
240
+ Get the selected word, its audio file, and duration from the recognition output.
241
+
242
+ Parameters:
243
+ idx_recorded_word (int): The index of the recorded word.
244
+ raw_json_output (str): The JSON output from the recognition process.
245
+
246
+ Returns:
247
+ tuple: A tuple containing the audio file, the current word, and its duration.
248
+ """
249
  recognition_output = json.loads(raw_json_output)
250
  list_audio_files = recognition_output["audio_files"]
251
  real_transcripts = recognition_output["real_transcripts"]
 
260
 
261
 
262
  def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float]) -> tuple[list[str], list[float]]:
263
+ """
264
+ Split the audio file into segments based on start and end times.
265
+
266
+ Parameters:
267
+ audiotmpfile (str | Path): The path to the audio file.
268
+ start_time (list[float]): The start times of the segments.
269
+ end_time (list[float]): The end times of the segments.
270
+
271
+ Returns:
272
+ tuple: A tuple containing a list of audio files and their durations.
273
+ """
274
  import soundfile as sf
275
  audio_files = []
276
  audio_durations = []
277
+ app_logger.info(f"start_time:{start_time}, end_time:{end_time} ...")
278
  for n, (start_nth, end_nth) in enumerate(zip(start_time, end_time)):
279
+ # assert start_nth < end_nth, f"start_nth:{start_nth} (index {n}) should be less than end_nth:{end_nth} (start_time:{start_time}, end_time:{end_time})..."
280
  signal_nth, samplerate = soundfile_load(audiotmpfile, offset=start_nth, duration=end_nth - start_nth)
281
  audiofile = get_file_with_custom_suffix(audiotmpfile, f"_part{n}_start{start_nth}_end{end_nth}")
282
  soundfile_write(audiofile=audiofile, data=signal_nth, samplerate=samplerate)
 
289
 
290
 
291
  def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> Path:
292
+ """
293
+ Generate a file path with a custom suffix.
294
+
295
+ Parameters:
296
+ basefile (str | Path): The base file path.
297
+ custom_suffix (str): The custom suffix to add to the file name.
298
+
299
+ Returns:
300
+ Path: The new file path with the custom suffix.
301
+ """
302
  pathname = Path(basefile)
303
  dirname, filename_no_ext, filename_ext = pathname.parent, pathname.stem, pathname.suffix
304
+ output_file = dirname / f"{filename_no_ext}_{custom_suffix}.{filename_ext}"
305
  return output_file
306
 
307
 
308
  # From Librosa
309
 
310
+ def calc_start_end(sr_native: int, time_position: float, n_channels: int) -> int:
311
+ """
312
+ Calculate the start or end position in samples.
313
+
314
+ Parameters:
315
+ sr_native (int): The native sample rate.
316
+ time_position (float): The time position in seconds.
317
+ n_channels (int): The number of audio channels.
318
+
319
+ Returns:
320
+ int: The start or end position in samples.
321
+ """
322
  return int(np.round(sr_native * time_position)) * n_channels
323
 
324
 
325
+ def soundfile_load(path: str | Path, offset: float = 0.0, duration: float = None, dtype=np.float32) -> tuple[np.ndarray, int]:
326
+ """
327
+ Load an audio buffer using soundfile.
328
+
329
+ Parameters:
330
+ path (str | Path): The path to the audio file.
331
+ offset (float): The offset in seconds to start reading the file.
332
+ duration (float): The duration in seconds to read from the file.
333
+ dtype (np.float32): The data type of the audio buffer.
334
+
335
+ Returns:
336
+ tuple: A tuple containing the audio buffer and the sample rate.
337
+ """
338
  import soundfile as sf
339
 
340
  if isinstance(path, sf.SoundFile):
 
361
  return y, sr_native
362
 
363
 
364
+ def audioread_load(path: str | Path, offset: float = 0.0, duration: float = None, dtype=np.float32) -> tuple[np.ndarray, int]:
365
+ """
 
366
  This loads one block at a time, and then concatenates the results.
367
+
368
+ Parameters:
369
+ path (str | Path): The path to the audio file.
370
+ offset (float): The offset in seconds to start reading the file.
371
+ duration (float): The duration in seconds to read from the file.
372
+ dtype (np.float32): The data type of the audio buffer.
373
+
374
+ Returns:
375
+ tuple: A tuple containing the audio buffer and the sample rate.
376
  """
377
  y = []
378
  app_logger.debug(f"reading audio file at path:{path} ...")
 
428
  # From Librosa
429
 
430
 
431
+ def buf_to_float(x: np.ndarray, n_bytes: int = 2, dtype: np.float32 = np.float32) -> np.ndarray:
432
  """Convert an integer buffer to floating point values.
433
  This is primarily useful when loading integer-valued wav data
434
  into numpy arrays.
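A sketch of scoring a recording with the tuple-returning helper used by the Gradio app; `student_recording.wav` is an illustrative path to an existing recording at the configured input sample rate, and the function now returns nine items, the last one being the temporary file name:

```python
import lambdaSpeechToScore

(real_transcripts, letters_correctness, accuracy, ipa_transcript,
 real_transcripts_ipa, num_words, first_word_audio, raw_json,
 tmp_file) = lambdaSpeechToScore.get_speech_to_score_tuple(
    real_text="Hello world",
    file_bytes_or_audiotmpfile="student_recording.wav",  # illustrative path
    language="en",
    remove_random_file=False,
)
print(accuracy)              # e.g. "87.50" (formatted with two decimals)
print(real_transcripts_ipa)  # target sentence in IPA
```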
aip_trainer/lambdas/lambdaTTS.py → lambdaTTS.py RENAMED
@@ -1,12 +1,36 @@
1
- import random
 
 
2
  import tempfile
3
  from pathlib import Path
4
 
5
- from aip_trainer import app_logger
 
 
 
 
 
6
 
7
 
8
  def get_tts(text: str, language: str, tmp_prefix="audio_", tmp_suffix=".wav") -> str:
9
- from aip_trainer.models import models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  if text is None or len(text) == 0:
12
  raise ValueError(f"cannot read an empty/None text: '{text}'...")
@@ -15,11 +39,11 @@ def get_tts(text: str, language: str, tmp_prefix="audio_", tmp_suffix=".wav") ->
15
 
16
  tmp_dir = Path(tempfile.gettempdir())
17
  try:
18
- model, _, speaker, sample_rate = models.silero_tts(
19
  language, output_folder=tmp_dir
20
  )
21
  except ValueError:
22
- model, _, sample_rate, _, _, speaker = models.silero_tts(
23
  language, output_folder=tmp_dir
24
  )
25
  app_logger.info(f"model speaker #0: {speaker} ...")
 
1
+ import base64
2
+ import json
3
+ import os
4
  import tempfile
5
  from pathlib import Path
6
 
7
+ import soundfile as sf
8
+
9
+ import AIModels
10
+ import models
11
+ import utilsFileIO
12
+ from constants import app_logger, sample_rate_resample
13
 
14
 
15
  def get_tts(text: str, language: str, tmp_prefix="audio_", tmp_suffix=".wav") -> str:
16
+ """
17
+ Generate text-to-speech (TTS) audio for the given text and language.
18
+
19
+ Args:
20
+ text (str): The text to be converted to speech.
21
+ language (str): The language of the text. Supported languages are "en" (English) and "de" (German).
22
+ tmp_prefix (str, optional): The filename prefix used for the temporary audio file.
23
+ tmp_suffix (str, optional): The filename suffix (extension) used for the temporary audio file.
24
+
25
+ Returns:
26
+ str: The path to the generated audio file.
27
+
28
+ Raises:
29
+ NotImplementedError: If the provided language is not supported.
30
+
31
+ Notes:
32
+ This function uses the Silero TTS model to generate the audio. The model and speaker are selected based on the provided language.
33
+ """
34
 
35
  if text is None or len(text) == 0:
36
  raise ValueError(f"cannot read an empty/None text: '{text}'...")
 
39
 
40
  tmp_dir = Path(tempfile.gettempdir())
41
  try:
42
+ model, _, speaker, sample_rate = models.__silero_tts(
43
  language, output_folder=tmp_dir
44
  )
45
  except ValueError:
46
+ model, _, sample_rate, _, _, speaker = models.__silero_tts(
47
  language, output_folder=tmp_dir
48
  )
49
  app_logger.info(f"model speaker #0: {speaker} ...")
aip_trainer/models/models.py → models.py RENAMED
@@ -1,11 +1,15 @@
1
  import os
2
  from pathlib import Path
3
- import tempfile
 
4
  import torch
5
  import torch.nn as nn
 
6
  from silero.utils import Decoder
7
 
8
- from aip_trainer import app_logger, sample_rate_start
 
 
9
 
10
 
11
  default_speaker_dict = {
@@ -14,11 +18,92 @@ default_speaker_dict = {
14
  }
15
 
16
 
17
- def silero_tts(language="en", version="latest", output_folder: Path | str = None, **kwargs):
18
- """Silero Text-To-Speech Models
19
- language (str): language of the model, now available are ['ru', 'en', 'de', 'es', 'fr']
20
- Returns a model and a set of utils
21
- Please see https://github.com/snakers4/silero-models for usage examples
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  """
23
  output_folder = Path(output_folder)
24
  current_model_lang = default_speaker_dict[language]
@@ -26,10 +111,10 @@ def silero_tts(language="en", version="latest", output_folder: Path | str = None
26
  if language in default_speaker_dict:
27
  model_id = current_model_lang["model_id"]
28
 
29
- models = get_models(language, output_folder, version, model_type="tts_models")
30
  available_languages = list(models.tts_models.keys())
31
  assert (
32
- language in available_languages
33
  ), f"Language not in the supported list {available_languages}"
34
 
35
  tts_models_lang = models.tts_models[language]
@@ -67,46 +152,95 @@ def silero_tts(language="en", version="latest", output_folder: Path | str = None
67
  return model, symbols, sample_rate, example_text, apply_tts, model_id
68
 
69
 
70
- def silero_stt(
71
- language="en",
72
- version="latest",
73
- jit_model="jit",
74
- output_folder: Path | str = None,
75
- **kwargs,
76
- ):
77
- """Modified Silero Speech-To-Text Model(s) function
78
- language (str): language of the model, now available are ['en', 'de', 'es']
79
- version:
80
- jit_model:
81
- output_folder: needed in case of docker build
82
- Returns a model, decoder object and a set of utils
83
- Please see https://github.com/snakers4/silero-models for usage examples
84
  """
85
- from silero.utils import (
86
- read_audio,
87
- read_batch,
88
- split_into_batches,
89
- prepare_model_input,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- model, decoder = get_latest_model(
93
- language,
94
- output_folder,
95
- version,
96
- model_type="stt_models",
97
- jit_model=jit_model,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  **kwargs,
99
  )
100
- utils = (read_batch, split_into_batches, read_audio, prepare_model_input)
101
-
102
- return model, decoder, utils
103
 
104
 
105
  def init_jit_model(
106
- model_url: str,
107
- device: torch.device = torch.device("cpu"),
108
- output_folder: Path | str = None,
109
- ):
 
 
 
 
 
 
 
 
 
 
 
 
110
  torch.set_grad_enabled(False)
111
 
112
  app_logger.info(
@@ -126,62 +260,49 @@ def init_jit_model(
126
 
127
  if not os.path.isfile(model_path):
128
  app_logger.info(f"downloading model_path: '{model_path}' ...")
129
- torch.hub.download_url_to_file(model_url, model_path, progress=True)
130
  app_logger.info(f"model_path {model_path} downloaded!")
131
  model = torch.jit.load(model_path, map_location=device)
132
  model.eval()
133
  return model, Decoder(model.labels)
134
 
135
 
136
- # second returned type here is the custom class src.silero.utils.Decoder from snakers4/silero-models
137
- def getASRModel(language: str) -> tuple[nn.Module, Decoder]:
138
- tmp_dir = tempfile.gettempdir()
139
- if language == "de":
140
- model, decoder, _ = silero_stt(
141
- language="de", version="v4", jit_model="jit_large", output_folder=tmp_dir
142
- )
143
- elif language == "en":
144
- model, decoder, _ = silero_stt(language="en", output_folder=tmp_dir)
145
- else:
146
- raise NotImplementedError(
147
- "currenty works only for 'de' and 'en' languages, not for '{}'.".format(
148
- language
149
- )
150
- )
151
-
152
- return model, decoder
153
-
154
 
155
- def get_models(language, output_folder, version, model_type):
156
- from omegaconf import OmegaConf
 
 
 
 
157
 
158
- output_folder = (
159
- Path(output_folder)
160
- if output_folder is not None
161
- else Path(os.path.dirname(__file__)).parent.parent
 
 
 
 
162
  )
163
- models_list_file = output_folder / f"latest_silero_model_{language}.yml"
164
- if not os.path.exists(models_list_file):
165
- app_logger.info(
166
- f"model {model_type} yml for '{language}' language, '{version}' version not found, download it in folder {output_folder}..."
167
- )
168
- torch.hub.download_url_to_file(
169
- "https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml",
170
- models_list_file,
171
- progress=False,
172
- )
173
- assert os.path.exists(models_list_file)
174
- return OmegaConf.load(models_list_file)
175
-
176
 
177
- def get_latest_model(language, output_folder, version, model_type, jit_model, **kwargs):
178
- models = get_models(language, output_folder, version, model_type)
179
- available_languages = list(models[model_type].keys())
180
- assert language in available_languages
181
-
182
- model, decoder = init_jit_model(
183
- model_url=models[model_type].get(language).get(version).get(jit_model),
184
- output_folder=output_folder,
185
  **kwargs,
186
  )
187
- return model, decoder
 
 
 
1
  import os
2
  from pathlib import Path
3
+ from typing import Union, Callable
4
+
5
  import torch
6
  import torch.nn as nn
7
+ from omegaconf import DictConfig, ListConfig
8
  from silero.utils import Decoder
9
 
10
+ from AIModels import NeuralASR
11
+ from ModelInterfaces import IASRModel
12
+ from constants import MODEL_NAME_DEFAULT, language_not_implemented, app_logger, sample_rate_start, silero_versions_dict
13
 
14
 
15
  default_speaker_dict = {
 
18
  }
19
 
20
 
21
+ def getASRModel(language: str, model_name: str = MODEL_NAME_DEFAULT) -> IASRModel:
22
+ models_dict = {
23
+ "whisper": __get_model_whisper,
24
+ "faster_whisper": __get_model_faster_whisper,
25
+ "silero": __get_model_silero
26
+ }
27
+ if model_name in models_dict:
28
+ fn = models_dict[model_name]
29
+ return fn(language)
30
+ models_supported = ", ".join(models_dict.keys())
31
+ raise ValueError(f"Model '{model_name}' not implemented. Supported models: {models_supported}.")
32
+
33
+
34
+ def __get_model_whisper(language: str) -> IASRModel:
35
+ from whisper_wrapper import WhisperASRModel
36
+ return WhisperASRModel(language=language)
37
+
38
+
39
+ def __get_model_faster_whisper(language: str) -> IASRModel:
40
+ from faster_whisper_wrapper import FasterWhisperASRModel
41
+ return FasterWhisperASRModel(language=language)
42
+
43
+
44
+ def __get_model_silero(language: str) -> IASRModel:
45
+ import tempfile
46
+ tmp_dir = tempfile.gettempdir()
47
+ if language == "de":
48
+ model, decoder, _ = __silero_stt(
49
+ language="de", version="v4", jit_model="jit_large", output_folder=tmp_dir
50
+ )
51
+ return __eval_apply_neural_asr(model, decoder, language)
52
+ elif language == "en":
53
+ model, decoder, _ = __silero_stt(language="en", output_folder=tmp_dir)
54
+ return __eval_apply_neural_asr(model, decoder, language)
55
+ raise ValueError(language_not_implemented.format(language))
56
+
57
+
58
+ def __eval_apply_neural_asr(model: nn.Module, decoder: Decoder, language: str):
59
+ app_logger.info(f"LOADED silero model language: {language}, version: '{silero_versions_dict[language]}'")
60
+ model.eval()
61
+ app_logger.info(f"EVALUATED silero model language: {language}, version: '{silero_versions_dict[language]}'")
62
+ return NeuralASR(model, decoder)
63
+
64
+
65
+ def getTranslationModel(language: str) -> tuple:
66
+ from transformers import AutoTokenizer
67
+ from transformers import AutoModelForSeq2SeqLM
68
+ if language == 'de':
69
+ model = AutoModelForSeq2SeqLM.from_pretrained(
70
+ "Helsinki-NLP/opus-mt-de-en")
71
+ tokenizer = AutoTokenizer.from_pretrained(
72
+ "Helsinki-NLP/opus-mt-de-en")
73
+ # Cache models to avoid Hugging face processing (not needed now)
74
+ # with open('translation_model_de.pickle', 'wb') as handle:
75
+ # pickle.dump(model, handle)
76
+ # with open('translation_tokenizer_de.pickle', 'wb') as handle:
77
+ # pickle.dump(tokenizer, handle)
78
+ else:
79
+ raise ValueError(language_not_implemented.format(language))
80
+
81
+ return model, tokenizer
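A quick sketch (not part of this commit) of how the (model, tokenizer) pair returned by getTranslationModel('de') could be consumed. Only the function and the Helsinki-NLP/opus-mt-de-en checkpoint come from the diff above; the generate/decode calls are standard transformers usage and the sample sentence is invented.

```python
# Hypothetical consumer of getTranslationModel(); assumes models.py is importable
# from the project root, as laid out in this commit.
from models import getTranslationModel

model, tokenizer = getTranslationModel("de")
batch = tokenizer(["Wie geht es dir?"], return_tensors="pt", padding=True)
generated = model.generate(**batch)  # MarianMT seq2seq generation
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```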
82
+
83
+
84
+ def __silero_tts(language: str = "en", version: str = "latest", output_folder: Path | str = None, **kwargs) -> tuple[nn.Module, str, int, str, dict, Callable, str]:
85
+ """
86
+ Modified function to create instances of Silero Text-To-Speech Models.
87
+ Please see https://github.com/snakers4/silero-models?tab=readme-ov-file#text-to-speech for usage examples.
88
89
+
90
+ Args:
91
+ language (str): Language of the model. Available options are ['ru', 'en', 'de', 'es', 'fr']. Default is 'en'.
92
+ version (str): Version of the model to use. Default is 'latest'.
93
+ output_folder (Path | str): Path to the folder where the model will be saved. Default is None.
94
+ **kwargs: Additional keyword arguments.
95
+ Returns:
96
+ tuple: Depending on the model version and the input arguments, returns a tuple containing:
97
+ - model: The loaded TTS model.
98
+ - symbols (str): The set of symbols used by the model (only for older model versions).
99
+ - sample_rate (int): The sample rate of the model.
100
+ - example_text (str): Example text for the model.
101
+ - speaker (dict): The default speaker settings for the selected language.
102
+ - apply_tts (function): Function to apply TTS (only for older model versions).
103
+ - model_id (str): The model ID (only for older model versions).
104
+
105
+ Raises:
106
+ AssertionError: If the specified language is not in the supported list.
107
  """
108
  output_folder = Path(output_folder)
109
  current_model_lang = default_speaker_dict[language]
 
111
  if language in default_speaker_dict:
112
  model_id = current_model_lang["model_id"]
113
 
114
+ models = __get_models(language, output_folder, version, model_type="tts_models")
115
  available_languages = list(models.tts_models.keys())
116
  assert (
117
+ language in available_languages
118
  ), f"Language not in the supported list {available_languages}"
119
 
120
  tts_models_lang = models.tts_models[language]
 
152
  return model, symbols, sample_rate, example_text, apply_tts, model_id
153
 
154
 
155
+ def __get_models(language: str, output_folder: str | Path, version: str, model_type: str) -> Union[DictConfig, ListConfig]:
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  """
157
+ Retrieve and load the model configuration for a specified language and model type.
158
+
159
+ Args:
160
+ language (str): The language for which the model is required.
161
+ output_folder (str or Path): The folder where the model configuration file should be saved
162
+ version (str): The version of the model.
163
+ model_type (str): The type of the model.
164
+
165
+ Returns:
166
+ OmegaConf: The loaded model configuration.
167
+
168
+ Raises:
169
+ AssertionError: If the model configuration file does not exist after attempting to download it.
170
+
171
+ Notes:
172
+ If the model configuration file does not exist in the specified output folder, it will be downloaded
173
+ from a predefined URL and saved in the output folder.
174
+ """
175
+ from omegaconf import OmegaConf
176
+
177
+ output_folder = (
178
+ Path(output_folder)
179
+ if output_folder is not None
180
+ else Path(os.path.dirname(__file__)).parent.parent
181
  )
182
+ models_list_file = output_folder / f"latest_silero_model_{language}.yml"
183
+ app_logger.info(f"models_list_file:{models_list_file}.")
184
+ if not os.path.exists(models_list_file):
185
+ app_logger.info(
186
+ f"model {model_type} yml for '{language}' language, '{version}' version not found, download it in folder {output_folder}..."
187
+ )
188
+ torch.hub.download_url_to_file(
189
+ "https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml",
190
+ str(models_list_file),
191
+ progress=False,
192
+ )
193
+ assert os.path.exists(models_list_file)
194
+ return OmegaConf.load(models_list_file)
195
 
196
+
197
+ def __get_latest_stt_model(language: str, output_folder: str | Path, version: str, model_type: str, jit_model: str, **kwargs) -> tuple[nn.Module, Decoder]:
198
+ """
199
+ Retrieve the latest Speech-to-Text (STT) model for a given language and model type.
200
+
201
+ Args:
202
+ language (str): The language for which the STT model is required.
203
+ output_folder (str): The directory where the model will be saved.
204
+ version (str): The version of the model to retrieve.
205
+ model_type (str): The model family key in the Silero models.yml (e.g., 'stt_models').
206
+ jit_model (str): The specific JIT model to use.
207
+ **kwargs: Additional keyword arguments to pass to the model initialization function.
208
+
209
+ Returns:
210
+ tuple: A tuple containing the model and the decoder.
211
+
212
+ Raises:
213
+ AssertionError: If the specified language is not available in the model type.
214
+ """
215
+ models = __get_models(language, output_folder, version, model_type)
216
+ available_languages = list(models[model_type].keys())
217
+ assert language in available_languages
218
+
219
+ model, decoder = init_jit_model(
220
+ model_url=models[model_type].get(language).get(version).get(jit_model),
221
+ output_folder=output_folder,
222
  **kwargs,
223
  )
224
+ return model, decoder
 
 
225
 
226
 
227
  def init_jit_model(
228
+ model_url: str,
229
+ device: torch.device = torch.device("cpu"),
230
+ output_folder: Path | str = None,
231
+ ) -> tuple[torch.nn.Module, Decoder]:
232
+ """
233
+ Initialize a JIT model from a given URL.
234
+
235
+ Args:
236
+ model_url (str): The URL to download the model from.
237
+ device (torch.device, optional): The device to load the model on. Defaults to CPU.
238
+ output_folder (Path | str, optional): The folder to save the downloaded model.
239
+ If None, defaults to a 'model' directory in the current file's directory.
240
+
241
+ Returns:
242
+ Tuple[torch.jit.ScriptModule, Decoder]: The loaded JIT model and its corresponding decoder.
243
+ """
244
  torch.set_grad_enabled(False)
245
 
246
  app_logger.info(
 
260
 
261
  if not os.path.isfile(model_path):
262
  app_logger.info(f"downloading model_path: '{model_path}' ...")
263
+ torch.hub.download_url_to_file(model_url, str(model_path), progress=True)
264
  app_logger.info(f"model_path {model_path} downloaded!")
265
  model = torch.jit.load(model_path, map_location=device)
266
  model.eval()
267
  return model, Decoder(model.labels)
268
 
269
 
270
+ def __silero_stt(
271
+ language: str = "en",
272
+ version: str = "latest",
273
+ jit_model: str = "jit",
274
+ output_folder: Path | str = None,
275
+ **kwargs,
276
+ ) -> tuple[nn.Module, Decoder, tuple[Callable, Callable, Callable, Callable]]:
277
+ """
278
+ Modified function to create instances of Silero Speech-To-Text Model(s).
279
+ Please see https://github.com/snakers4/silero-models?tab=readme-ov-file#speech-to-text for usage examples.
 
 
 
 
 
 
 
 
280
 
281
+ Args:
282
+ language (str): Language of the model. Available options are ['en', 'de', 'es'].
283
+ version (str): Version of the model to use. Default is "latest".
284
+ jit_model (str): Type of JIT model to use. Default is "jit".
285
+ output_folder (Path | str, optional): Output folder needed in case of docker build. Default is None.
286
+ **kwargs: Additional keyword arguments.
287
 
288
+ Returns:
289
+ tuple: A tuple containing the model, the decoder object, and a tuple of utility functions (read_batch, split_into_batches, read_audio, prepare_model_input).
290
+ """
291
+ from silero.utils import (
292
+ read_audio,
293
+ read_batch,
294
+ split_into_batches,
295
+ prepare_model_input,
296
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
297
 
298
+ model, decoder = __get_latest_stt_model(
299
+ language,
300
+ output_folder,
301
+ version,
302
+ model_type="stt_models",
303
+ jit_model=jit_model,
 
 
304
  **kwargs,
305
  )
306
+ utils = (read_batch, split_into_batches, read_audio, prepare_model_input)
307
+
308
+ return model, decoder, utils
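To illustrate the backend selection introduced above, here is a minimal usage sketch (not part of the diff). Only getASRModel(), the backend keys whisper/faster_whisper/silero, the FasterWhisperASRModel class name and the ValueError behaviour come from this commit; the chosen language and the invalid backend name are arbitrary.

```python
# Illustrative only: exercising the getASRModel() factory from models.py.
from models import getASRModel

asr = getASRModel("en", model_name="faster_whisper")  # lazily imports faster_whisper_wrapper
print(type(asr).__name__)                             # e.g. FasterWhisperASRModel

try:
    getASRModel("en", model_name="not-a-backend")
except ValueError as err:
    print(err)  # lists the supported models: whisper, faster_whisper, silero
```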
packages.txt DELETED
@@ -1 +0,0 @@
1
- ffmpeg
 
 
pre-requirements.txt DELETED
@@ -1 +0,0 @@
1
- pip
 
 
aip_trainer/pronunciationTrainer.py β†’ pronunciationTrainer.py RENAMED
@@ -5,29 +5,22 @@ import epitran
5
  import numpy as np
6
  import torch
7
 
8
- from . import WordMatching as wm
9
- from . import WordMetrics
10
- from . import app_logger
11
- from .models import AIModels, ModelInterfaces as mi, RuleBasedModels, models as mo
 
 
12
 
13
 
14
- def getTrainer(language: str):
15
-
16
- device = torch.device('cpu')
17
-
18
- model, decoder = mo.getASRModel(language)
19
- model = model.to(device)
20
- model.eval()
21
- asr_model = AIModels.NeuralASR(model, decoder)
22
-
23
  if language == 'de':
24
- epitran_deu_latn = epitran.Epitran('deu-Latn')
25
- phonem_converter = RuleBasedModels.EpitranPhonemConverter(epitran_deu_latn)
26
  elif language == 'en':
27
  phonem_converter = RuleBasedModels.EngPhonemConverter()
28
  else:
29
- raise ValueError('Language not implemented')
30
-
31
  trainer = PronunciationTrainer(asr_model, phonem_converter)
32
 
33
  return trainer
@@ -50,7 +43,7 @@ class PronunciationTrainer:
50
  current_words_pronunciation_accuracy = []
51
  categories_thresholds = np.array([80, 60, 59])
52
 
53
- sampling_rate = 16000
54
 
55
  def __init__(self, asr_model: mi.IASRModel, word_to_ipa_coverter: mi.ITextToPhonemModel) -> None:
56
  self.asr_model = asr_model
@@ -67,22 +60,36 @@ class PronunciationTrainer:
67
 
68
  return audio_transcript, word_locations_in_samples
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  ##################### ASR Functions ###########################
71
 
72
  def processAudioForGivenText(self, recordedAudio: torch.Tensor = None, real_text=None):
73
 
74
  start = time.time()
75
- app_logger.info('starting getAudioTranscript...')
76
- recording_transcript, recording_ipa, word_locations = self.getAudioTranscript(recordedAudio)
77
-
78
- duration = time.time() - start
79
- app_logger.info(f'Time for NN to transcript audio: {duration}.')
80
 
81
  start = time.time()
82
  real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices = self.matchSampleAndRecordedWords(
83
  real_text, recording_transcript)
84
- duration = time.time() - start
85
- app_logger.info(f'Time for matching transcripts: {duration}.')
86
 
87
  start_time, end_time = self.getWordLocationsFromRecordInSeconds(
88
  word_locations, mapped_words_indices)
@@ -104,22 +111,20 @@ class PronunciationTrainer:
104
  def getAudioTranscript(self, recordedAudio: torch.Tensor = None):
105
  current_recorded_audio = recordedAudio
106
 
107
- app_logger.info('starting preprocessAudio...')
108
- current_recorded_audio = self.preprocessAudio(current_recorded_audio)
109
-
110
- app_logger.info('starting processAudio...')
111
  self.asr_model.processAudio(current_recorded_audio)
112
 
113
- app_logger.info('starting getTranscriptAndWordsLocations...')
114
  current_recorded_transcript, current_recorded_word_locations = self.getTranscriptAndWordsLocations(
115
  current_recorded_audio.shape[1])
116
- app_logger.info('starting convertToPhonem...')
117
- current_recorded_ipa = self.ipa_converter.convertToPhonem(current_recorded_transcript)
118
 
119
- app_logger.info('ok, return audio transcript!')
120
  return current_recorded_transcript, current_recorded_ipa, current_recorded_word_locations
121
 
122
- def getWordLocationsFromRecordInSeconds(self, word_locations, mapped_words_indices) -> tuple[str, str]:
 
123
  start_time = []
124
  end_time = []
125
  for word_idx in range(len(mapped_words_indices)):
@@ -135,10 +140,10 @@ class PronunciationTrainer:
135
  def matchSampleAndRecordedWords(self, real_text, recorded_transcript):
136
  words_estimated = recorded_transcript.split()
137
 
138
- if real_text is None:
139
- words_real = self.current_transcript[0].split()
140
- else:
141
  words_real = real_text.split()
 
 
142
 
143
  mapped_words, mapped_words_indices = wm.get_best_mapped_words(
144
  words_estimated, words_real)
@@ -154,7 +159,7 @@ class PronunciationTrainer:
154
  self.ipa_converter.convertToPhonem(mapped_words[word_idx])))
155
  return real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices
156
 
157
- def getPronunciationAccuracy(self, real_and_transcribed_words_ipa) -> tuple[float, list]:
158
  total_mismatches = 0.
159
  number_of_phonemes = 0.
160
  current_words_pronunciation_accuracy = []
@@ -191,4 +196,4 @@ class PronunciationTrainer:
191
  return np.argmin(abs(self.categories_thresholds-accuracy))
192
 
193
  def preprocessAudio(self, audio: torch.tensor) -> torch.tensor:
194
- return preprocessAudioStandalone(audio=audio)
 
5
  import numpy as np
6
  import torch
7
 
8
+ import ModelInterfaces as mi
9
+ import RuleBasedModels
10
+ import WordMatching as wm
11
+ import WordMetrics
12
+ import models as mo
13
+ from constants import app_logger, MODEL_NAME_DEFAULT, sample_rate_resample
14
 
15
 
16
+ def getTrainer(language: str, model_name: str = MODEL_NAME_DEFAULT):
17
+ asr_model = mo.getASRModel(language, model_name=model_name)
 
 
 
 
 
 
 
18
  if language == 'de':
19
+ phonem_converter = RuleBasedModels.EpitranPhonemConverter(epitran.Epitran('deu-Latn'))
 
20
  elif language == 'en':
21
  phonem_converter = RuleBasedModels.EngPhonemConverter()
22
  else:
23
+ raise ValueError(f"Language '{language}' not implemented")
 
24
  trainer = PronunciationTrainer(asr_model, phonem_converter)
25
 
26
  return trainer
 
43
  current_words_pronunciation_accuracy = []
44
  categories_thresholds = np.array([80, 60, 59])
45
 
46
+ sampling_rate = sample_rate_resample
47
 
48
  def __init__(self, asr_model: mi.IASRModel, word_to_ipa_coverter: mi.ITextToPhonemModel) -> None:
49
  self.asr_model = asr_model
 
60
 
61
  return audio_transcript, word_locations_in_samples
62
 
63
+ # def getWordsRelativeIntonation(self, Audio: torch.tensor, word_locations: list):
64
+ # intonations = torch.zeros((len(word_locations), 1))
65
+ # intonation_fade_samples = 0.3*self.sampling_rate
66
+ # app_logger.info(f"intonations.shape: {intonations.shape}.")
67
+ # for word in range(len(word_locations)):
68
+ # intonation_start = int(np.maximum(
69
+ # 0, word_locations[word][0]-intonation_fade_samples))
70
+ # intonation_end = int(np.minimum(
71
+ # Audio.shape[1]-1, word_locations[word][1]+intonation_fade_samples))
72
+ # intonations[word] = torch.sqrt(torch.mean(
73
+ # Audio[0][intonation_start:intonation_end]**2))
74
+ #
75
+ # intonations = intonations/torch.mean(intonations)
76
+ # return intonations
77
+
78
  ##################### ASR Functions ###########################
79
 
80
  def processAudioForGivenText(self, recordedAudio: torch.Tensor = None, real_text=None):
81
 
82
  start = time.time()
83
+ recording_transcript, recording_ipa, word_locations = self.getAudioTranscript(
84
+ recordedAudio)
85
+ time_transcript_audio = time.time() - start
86
+ app_logger.info(f'Time for NN to transcript audio: {time_transcript_audio:.2f}.')
 
87
 
88
  start = time.time()
89
  real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices = self.matchSampleAndRecordedWords(
90
  real_text, recording_transcript)
91
+ time_matching_transcripts = time.time() - start
92
+ app_logger.info(f'Time for matching transcripts: {time_matching_transcripts:.3f}.')
93
 
94
  start_time, end_time = self.getWordLocationsFromRecordInSeconds(
95
  word_locations, mapped_words_indices)
 
111
  def getAudioTranscript(self, recordedAudio: torch.Tensor = None):
112
  current_recorded_audio = recordedAudio
113
 
114
+ current_recorded_audio = self.preprocessAudio(
115
+ current_recorded_audio)
 
 
116
  self.asr_model.processAudio(current_recorded_audio)
117
 
 
118
  current_recorded_transcript, current_recorded_word_locations = self.getTranscriptAndWordsLocations(
119
  current_recorded_audio.shape[1])
120
+ current_recorded_ipa = self.ipa_converter.convertToPhonem(
121
+ current_recorded_transcript)
122
 
123
+ # time.sleep(10000)
124
  return current_recorded_transcript, current_recorded_ipa, current_recorded_word_locations
125
 
126
+ def getWordLocationsFromRecordInSeconds(self, word_locations, mapped_words_indices) -> list:
127
+ app_logger.info(f"len_list: word_locations:{len(word_locations)}, mapped_words_indices:{len(mapped_words_indices)}, {len(word_locations) == len(mapped_words_indices)}...")
128
  start_time = []
129
  end_time = []
130
  for word_idx in range(len(mapped_words_indices)):
 
140
  def matchSampleAndRecordedWords(self, real_text, recorded_transcript):
141
  words_estimated = recorded_transcript.split()
142
 
143
+ try:
 
 
144
  words_real = real_text.split()
145
+ except AttributeError:
146
+ raise ValueError("Real text is None, but should be a string.")
147
 
148
  mapped_words, mapped_words_indices = wm.get_best_mapped_words(
149
  words_estimated, words_real)
 
159
  self.ipa_converter.convertToPhonem(mapped_words[word_idx])))
160
  return real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices
161
 
162
+ def getPronunciationAccuracy(self, real_and_transcribed_words_ipa) -> float:
163
  total_mismatches = 0.
164
  number_of_phonemes = 0.
165
  current_words_pronunciation_accuracy = []
 
196
  return np.argmin(abs(self.categories_thresholds-accuracy))
197
 
198
  def preprocessAudio(self, audio: torch.tensor) -> torch.tensor:
199
+ return preprocessAudioStandalone(audio)
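A hedged usage sketch of the refactored trainer (not taken from the repository's tests): getTrainer(), its new model_name argument, processAudioForGivenText() and the ValueError raised for a missing real_text all appear in the diff above, while the placeholder waveform and sentence are invented.

```python
import torch

import pronunciationTrainer

trainer = pronunciationTrainer.getTrainer("de", model_name="faster_whisper")

recorded = torch.zeros((1, 16000))  # stand-in for a real mono recording tensor
result = trainer.processAudioForGivenText(recorded, real_text="Hallo Welt")

# real_text=None is no longer tolerated: matchSampleAndRecordedWords() now raises
# ValueError("Real text is None, but should be a string.") instead of falling back
# to a previously stored transcript.
```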
requirements-dev.txt CHANGED
@@ -1,3 +1,2 @@
1
- bson
2
  pytest
3
  pytest-cov
 
 
1
  pytest
2
  pytest-cov
requirements-flask.txt DELETED
@@ -1,21 +0,0 @@
1
- audioread
2
- dtwalign
3
- eng_to_ipa
4
- epitran==1.25.1
5
- flask
6
- flask_cors
7
- gunicorn
8
- omegaconf
9
- ortools==9.11.4210
10
- pandas
11
- pickle-mixin
12
- python-dotenv
13
- requests
14
- sentencepiece
15
- silero==0.4.1
16
- soundfile==0.12.1
17
- sqlalchemy
18
- structlog
19
- torch
20
- torchaudio
21
- transformers
 
requirements-gradio.txt DELETED
@@ -1 +0,0 @@
1
- gradio==5.11.0
 
 
requirements.txt CHANGED
@@ -1,19 +1,23 @@
1
- asgi-correlation-id
2
  audioread
3
  dtwalign
4
  eng_to_ipa
5
- epitran==1.25.1
6
- gunicorn
 
 
7
  omegaconf
8
- ortools==9.11.4210
 
9
  pandas
10
  pickle-mixin
11
- python-dotenv
12
  requests
 
13
  sentencepiece
14
- silero==0.4.1
15
- soundfile==0.12.1
 
16
  structlog
17
- torch
18
- torchaudio
 
19
  transformers
 
 
1
  audioread
2
  dtwalign
3
  eng_to_ipa
4
+ epitran
5
+ faster-whisper
6
+ flask
7
+ flask_cors
8
  omegaconf
9
+ openai-whisper
10
+ ortools
11
  pandas
12
  pickle-mixin
 
13
  requests
14
+ sacremoses # suggested by marian translation model
15
  sentencepiece
16
+ silero
17
+ soundfile
18
+ sqlalchemy
19
  structlog
20
+ -f https://download.pytorch.org/whl/torch_stable.html
21
+ torch
22
+ torchaudio
23
  transformers
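Because models.py imports the new ASR backends lazily, a missing package only surfaces when that backend is actually selected. A small sanity check (purely illustrative, not shipped with the repo) can confirm that the modules behind the new requirements resolve:

```python
# Note: the pip names are openai-whisper and faster-whisper; the importable
# module names checked here are whisper and faster_whisper.
import importlib.util

for module in ("whisper", "faster_whisper", "silero", "torch", "torchaudio"):
    found = importlib.util.find_spec(module) is not None
    print(f"{module}: {'ok' if found else 'missing'}")
```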
aip_trainer/utils/session_logger.py β†’ session_logger.py RENAMED
@@ -28,9 +28,9 @@ def drop_color_message_key(_, __, event_dict: EventDict) -> EventDict:
28
 
29
  def setup_logging(json_logs: bool = False, log_level: str = "INFO"):
30
  """Enhance the configuration of structlog.
31
- Needed for correlation id injection with fastapi middleware in samgis-web.
32
- After the use of logging_middleware() in samgis_web.web.middlewares, add also the CorrelationIdMiddleware from
33
- 'asgi_correlation_id' package. (See 'tests/web/test_middlewares.py' in samgis_web).
34
  To change an input parameter like the log level, re-run the function changing the parameter
35
  (no need to re-instantiate the logger instance: it's a hot change)
36
 
 
28
 
29
  def setup_logging(json_logs: bool = False, log_level: str = "INFO"):
30
  """Enhance the configuration of structlog.
31
+ Needed for correlation id injection when FastAPI middleware is used within the app.
32
+ If a logging_middleware() is used in the middlewares module, also add the CorrelationIdMiddleware from the
33
+ 'asgi_correlation_id' package.
34
  To change an input parameter like the log level, re-run the function changing the parameter
35
  (no need to re-instantiate the logger instance: it's a hot change)
36
 
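A minimal sketch of wiring up the relocated logger module, assuming the app imports it as session_logger from the project root. setup_logging() and its defaults come from the file above; the structlog.get_logger() call is standard structlog usage, not taken from this repository.

```python
import structlog

from session_logger import setup_logging

setup_logging(json_logs=False, log_level="DEBUG")  # re-run later to hot-change the level
log = structlog.get_logger(__name__)
log.info("pronunciation trainer started")
```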
static/.gitignore DELETED
@@ -1,3 +0,0 @@
1
- playwright-report/*
2
- node_modules
3
- test-results/*
 
 
 
 
static/.vscode/launch.json DELETED
@@ -1,20 +0,0 @@
1
- {
2
- // Use IntelliSense to learn about possible attributes.
3
- // Hover to view descriptions of existing attributes.
4
- // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
- "version": "0.2.0",
6
- "configurations": [
7
- {
8
- "type": "node",
9
- "request": "launch",
10
- "name": "Launch Program",
11
- "skipFiles": [
12
- "<node_internals>/**"
13
- ],
14
- "program": "${workspaceFolder}/tests/test-1.spec.ts",
15
- "outFiles": [
16
- "${workspaceFolder}/**/*.js"
17
- ]
18
- }
19
- ]
20
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
static/css/{style.css β†’ style-new.css} RENAMED
@@ -2,6 +2,21 @@ body {
2
  background: #f2f2f2;
3
  }
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  .expanded {
7
  margin: auto;
@@ -18,7 +33,13 @@ h1 {
18
 
19
  a.disabled {
20
  pointer-events: none;
21
- color: #ccc;
 
 
 
 
 
 
22
  background-color: #ccc;
23
  }
24
 
@@ -29,6 +50,31 @@ a.disabled {
29
  display: flex;
30
  }
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  /* ############## Next button ##### */
33
  .button-next {
34
  border-radius: 4px;
@@ -40,11 +86,10 @@ a.disabled {
40
  box-sizing: border-box;
41
  position: absolute;
42
  top: 0;
43
- left: 0%;
44
- right: 2%;
45
- bottom: 2%;
46
  background-color: #58636d;
47
- width: 10em;
 
48
 
49
  transition: all 0.5s;
50
  cursor: pointer;
@@ -127,41 +172,24 @@ a.disabled {
127
  display: block;
128
  position: absolute;
129
  left: 2%;
130
- top: 15%;
131
- transform: translate(-0%, -0%);
132
- height: 45%;
133
- width: 96%;
134
- max-width: 96%;
135
- background: #ffff;
136
- overflow: hidden;
137
- border-radius: 20px;
138
- box-shadow: 0 0 20px 8px #d0d0d0;
139
- }
140
-
141
- .container2 {
142
- display: block;
143
- position: absolute;
144
- left: 2%;
145
- top: 63%;
146
  transform: translate(-0%, -0%);
147
- height: 10%;
148
  width: 96%;
149
  max-width: 96%;
150
  background: #ffff;
151
  overflow: hidden;
152
  border-radius: 20px;
153
  box-shadow: 0 0 20px 8px #d0d0d0;
154
- overflow: scroll;
155
- max-height: 15%;
156
  }
157
 
158
  .container-small {
159
  position: fixed;
160
- left: 73%;
161
- top: 95%;
162
  transform: translate(-0%, -0%);
163
- height: 4%;
164
- width: 15%;
165
  background: #ffff;
166
  overflow: hidden;
167
  border-radius: 20px;
@@ -238,6 +266,17 @@ a.disabled {
238
  font-size: 3.5em !important;
239
  }
240
 
 
 
 
 
 
 
 
 
 
 
 
241
  .mic-button-div {
242
  position: fixed;
243
  left: 50%;
@@ -349,6 +388,75 @@ a.disabled {
349
  width: 100%;
350
  }
351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  /* ############ Links and credits ####*/
353
 
354
  .link-icon-div {
@@ -362,7 +470,7 @@ a.disabled {
362
  .credits-icon-div {
363
  position: fixed;
364
  left: 90.5%;
365
- top: 96%;
366
  font-size: x-small;
367
  }
368
 
@@ -401,9 +509,9 @@ a.disabled {
401
  display: block;
402
  position: absolute;
403
  left: 2%;
404
- top: 15%;
405
  transform: translate(-0%, -0%);
406
- height: 85%;
407
  width: 96%;
408
  max-width: 96%;
409
  background: #ffff;
@@ -412,23 +520,6 @@ a.disabled {
412
  box-shadow: 0 0 20px 8px #d0d0d0;
413
  }
414
 
415
- .container2 {
416
- display: block;
417
- position: absolute;
418
- left: 2%;
419
- top: 63%;
420
- transform: translate(-0%, -0%);
421
- height: 10%;
422
- width: 96%;
423
- max-width: 96%;
424
- background: #ffff;
425
- overflow: hidden;
426
- border-radius: 20px;
427
- box-shadow: 0 0 20px 8px #d0d0d0;
428
- overflow: scroll;
429
- max-height: 15%;
430
- }
431
-
432
  .icon-text {
433
  font-size: 0.8em !important;
434
  text-align: center;
@@ -445,7 +536,7 @@ a.disabled {
445
  /* 80px */
446
  height: 3.5em;
447
  padding-top: 0.4em;
448
- left: 50%;
449
  line-height: 0px;
450
  border: 6px solid #fff;
451
  border-radius: 50%;
@@ -460,7 +551,7 @@ a.disabled {
460
 
461
  .mic-button-div {
462
  position: fixed;
463
- left: 50%;
464
  top: 80%
465
  }
466
 
@@ -502,4 +593,4 @@ a.disabled {
502
  font-size: 0.8em;
503
  }
504
 
505
- }
 
2
  background: #f2f2f2;
3
  }
4
 
5
+ .flex {
6
+ display: flex;
7
+ }
8
+
9
+ .text-align-center {
10
+ text-align: center;
11
+ }
12
+
13
+ .display-block {
14
+ display: block;
15
+ }
16
+
17
+ .display-inline-block {
18
+ display: inline-block;
19
+ }
20
 
21
  .expanded {
22
  margin: auto;
 
33
 
34
  a.disabled {
35
  pointer-events: none;
36
+ color: black;
37
+ background-color: #ccc;
38
+ }
39
+
40
+ .color-disabled {
41
+ pointer-events: none;
42
+ color: black;
43
  background-color: #ccc;
44
  }
45
 
 
50
  display: flex;
51
  }
52
 
53
+ .darkgreen {
54
+ color: white;
55
+ background-color: #467387;
56
+ }
57
+
58
+ /* text button */
59
+ .text-button {
60
+ border: none;
61
+ text-align: center;
62
+ text-decoration: none;
63
+ display: inline-block;
64
+ font-size: 16px;
65
+ margin: 4px 2px;
66
+ height: fit-content;
67
+ width: 4em;
68
+ }
69
+ .text-button-div {
70
+ position: absolute;
71
+ top: 38%;
72
+ }
73
+ #input-uploader-audio-file {
74
+ width: 100px;
75
+ white-space: normal;
76
+ }
77
+
78
  /* ############## Next button ##### */
79
  .button-next {
80
  border-radius: 4px;
 
86
  box-sizing: border-box;
87
  position: absolute;
88
  top: 0;
89
+ left: 94%;
 
 
90
  background-color: #58636d;
91
+ width: 6%;
92
+ height: 100%;
93
 
94
  transition: all 0.5s;
95
  cursor: pointer;
 
172
  display: block;
173
  position: absolute;
174
  left: 2%;
175
+ top: 18%;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  transform: translate(-0%, -0%);
177
+ height: 59%;
178
  width: 96%;
179
  max-width: 96%;
180
  background: #ffff;
181
  overflow: hidden;
182
  border-radius: 20px;
183
  box-shadow: 0 0 20px 8px #d0d0d0;
 
 
184
  }
185
 
186
  .container-small {
187
  position: fixed;
188
+ left: 68%;
189
+ top: 79%;
190
  transform: translate(-0%, -0%);
191
+ height: 7%;
192
+ width: 30%;
193
  background: #ffff;
194
  overflow: hidden;
195
  border-radius: 20px;
 
266
  font-size: 3.5em !important;
267
  }
268
 
269
+ .form-audio-file {
270
+ position: fixed;
271
+ left: 25%;
272
+ top: 82%;
273
+ }
274
+ .form-audio-file-label {
275
+ position: fixed;
276
+ left: 25%;
277
+ top: 86%;
278
+ }
279
+
280
  .mic-button-div {
281
  position: fixed;
282
  left: 50%;
 
388
  width: 100%;
389
  }
390
 
391
+ /* ############ checkbox for using DTW */
392
+ .container-dtw-div {
393
+ position: absolute;
394
+ top: 60%;
395
+ }
396
+ .container-label-dtw {
397
+ padding-left: 35px;
398
+ cursor: pointer;
399
+ font-size: 2em;
400
+ -webkit-user-select: none;
401
+ -moz-user-select: none;
402
+ -ms-user-select: none;
403
+ user-select: none;
404
+ }
405
+
406
+ /* Hide the browser's default checkbox */
407
+ .container-label-dtw input {
408
+ position: absolute;
409
+ opacity: 0;
410
+ cursor: pointer;
411
+ height: 0;
412
+ width: 0;
413
+ }
414
+
415
+ /* Create a custom checkbox */
416
+ .checkmark {
417
+ position: absolute;
418
+ margin-top: 0.4em;
419
+ left: 0;
420
+ height: 25px;
421
+ width: 25px;
422
+ background-color: #eee;
423
+ }
424
+
425
+ /* On mouse-over, add a grey background color */
426
+ .container:hover input ~ .checkmark {
427
+ background-color: #ccc;
428
+ }
429
+
430
+ /* When the checkbox is checked, add a blue background */
431
+ .container input:checked ~ .checkmark {
432
+ background-color: #467387;
433
+ }
434
+
435
+ /* Create the checkmark/indicator (hidden when not checked) */
436
+ .checkmark:after {
437
+ content: "";
438
+ position: absolute;
439
+ display: none;
440
+ }
441
+
442
+ /* Show the checkmark when checked */
443
+ .container input:checked ~ .checkmark:after {
444
+ display: block;
445
+ }
446
+
447
+ /* Style the checkmark/indicator */
448
+ .container .checkmark:after {
449
+ left: 9px;
450
+ top: 5px;
451
+ width: 5px;
452
+ height: 10px;
453
+ border: solid white;
454
+ border-width: 0 3px 3px 0;
455
+ -webkit-transform: rotate(45deg);
456
+ -ms-transform: rotate(45deg);
457
+ transform: rotate(45deg);
458
+ }
459
+
460
  /* ############ Links and credits ####*/
461
 
462
  .link-icon-div {
 
470
  .credits-icon-div {
471
  position: fixed;
472
  left: 90.5%;
473
+ top: 95%;
474
  font-size: x-small;
475
  }
476
 
 
509
  display: block;
510
  position: absolute;
511
  left: 2%;
512
+ top: 22%;
513
  transform: translate(-0%, -0%);
514
+ height: 55%;
515
  width: 96%;
516
  max-width: 96%;
517
  background: #ffff;
 
520
  box-shadow: 0 0 20px 8px #d0d0d0;
521
  }
522
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  .icon-text {
524
  font-size: 0.8em !important;
525
  text-align: center;
 
536
  /* 80px */
537
  height: 3.5em;
538
  padding-top: 0.4em;
539
+ left: 40%;
540
  line-height: 0px;
541
  border: 6px solid #fff;
542
  border-radius: 50%;
 
551
 
552
  .mic-button-div {
553
  position: fixed;
554
+ left: 40%;
555
  top: 80%
556
  }
557
 
 
593
  font-size: 0.8em;
594
  }
595
 
596
+ }