alessandro trinca tornidor committed on
Commit
85b7206
·
1 Parent(s): 7810fbd

feat: port whisper and faster-whisper support from https://github.com/Thiagohgl/ai-pronunciation-trainer

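The commit message above says it ports whisper and faster-whisper support; the new `faster_whisper_wrapper.py` in the file list below suggests the speech-recognition path now goes through the `faster-whisper` package. Its contents are not visible in this truncated view, so the following is only a minimal sketch of what a faster-whisper transcription helper with word-level timestamps generally looks like; the model size, device settings and helper name are assumptions, not the commit's actual code:

```python
# Hedged sketch only: this is NOT the faster_whisper_wrapper.py added by this commit.
# Assumes the faster-whisper package is installed and a local audio file path is given.
from faster_whisper import WhisperModel


def transcribe_with_word_timestamps(audio_path: str, language: str = "en"):
    # a small model on CPU with int8 quantization keeps the sketch lightweight (assumption)
    model = WhisperModel("base", device="cpu", compute_type="int8")
    segments, _info = model.transcribe(audio_path, language=language, word_timestamps=True)
    words = []
    for segment in segments:
        for word in segment.words:
            # each recognized word carries its text plus start/end offsets in seconds
            words.append({"word": word.word.strip(), "start": word.start, "end": word.end})
    transcript = " ".join(item["word"] for item in words)
    return transcript, words
```

Word-level timestamps matter here because the trainer replays the student recording sliced per word (see `lambdaSpeechToScore.py` below), but how the real wrapper maps them is not shown in this view.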
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .coveragerc +3 -3
  2. .gitignore +153 -118
  3. .idea/inspectionProfiles/Project_Default.xml +15 -0
  4. .idea/vcs.xml +6 -0
  5. aip_trainer/models/AIModels.py → AIModels.py +31 -3
  6. Dockerfile +0 -37
  7. aip_trainer/models/ModelInterfaces.py → ModelInterfaces.py +1 -1
  8. README.md +2 -2
  9. aip_trainer/models/RuleBasedModels.py → RuleBasedModels.py +14 -6
  10. aip_trainer/WordMatching.py → WordMatching.py +85 -51
  11. aip_trainer/WordMetrics.py → WordMetrics.py +28 -4
  12. aip_trainer/__init__.py +0 -21
  13. aip_trainer/lambdas/__init__.py +0 -1
  14. aip_trainer/lambdas/data_de_en_with_categories.json +0 -0
  15. aip_trainer/lambdas/lambdaGetSample.py +0 -106
  16. aip_trainer/models/__init__.py +0 -0
  17. aip_trainer/utils/__init__.py +0 -0
  18. aip_trainer/utils/split_cosmic_ray_report.py +0 -33
  19. aip_trainer/utils/typing_hints.py +0 -19
  20. aip_trainer/utils/utilities.py +0 -57
  21. app.py +53 -24
  22. app_description.md +11 -0
  23. aip_trainer/lambdas/app_description.md → app_headline.md +3 -1
  24. constants.py +31 -0
  25. cosmic_ray_config.toml +0 -8
  26. tests/test_data_de_en_2.pickle → data_de_en_2.pickle +0 -0
  27. databases/data_de.csv +0 -0
  28. databases/data_en.csv +0 -0
  29. dockerfiles/apt_preferences +0 -9
  30. dockerfiles/debian.sources +0 -17
  31. dockerfiles/dockerfile-base +0 -72
  32. faster_whisper_wrapper.py +56 -0
  33. images/{MainScreen.png → MainScreen.jpg} +2 -2
  34. aip_trainer/lambdas/js.py → js.py +0 -14
  35. lambdaChangeModel.py +14 -0
  36. lambdaGetSample.py +145 -0
  37. aip_trainer/lambdas/lambdaSpeechToScore.py → lambdaSpeechToScore.py +176 -57
  38. aip_trainer/lambdas/lambdaTTS.py → lambdaTTS.py +29 -5
  39. aip_trainer/models/models.py → models.py +209 -88
  40. packages.txt +0 -1
  41. pre-requirements.txt +0 -1
  42. aip_trainer/pronunciationTrainer.py → pronunciationTrainer.py +44 -39
  43. requirements-dev.txt +0 -1
  44. requirements-flask.txt +0 -21
  45. requirements-gradio.txt +0 -1
  46. requirements.txt +13 -9
  47. aip_trainer/utils/session_logger.py → session_logger.py +3 -3
  48. static/.gitignore +0 -3
  49. static/.vscode/launch.json +0 -20
  50. static/css/{style.css → style-new.css} +142 -51
.coveragerc CHANGED
@@ -1,9 +1,9 @@
 [run]
-source = samgis
-omit = ./venv/*,__version__.py,*tests*,*apps.py,*manage.py,*__init__.py,*migrations*,*asgi*,*wsgi*,*admin.py,*urls.py,./tests/*,aip_trainer/lambdas/js.py
+source = ./*.py
+omit = ./tests/*,./tests/**/*,./*venv*/*,__version__.py,*tests*,*app.py,js.py,*manage.py,*__init__.py,*migrations*,*asgi*,*wsgi*,*admin.py,*urls.py
 
 [report]
-omit = ./venv/*,*tests*,*apps.py,*manage.py,*__init__.py,*migrations*,*asgi*,*wsgi*,*admin.py,*urls.py,./tests/*,aip_trainer/lambdas/js.py
+omit = ./*venv*/*,*tests*,*app.py,*manage.py,*__init__.py,*migrations*,js.py,*asgi*,*wsgi*,*admin.py,*urls.py
 
 exclude_lines =
     if __name__ == .__main__.:
.gitignore CHANGED
@@ -1,63 +1,3 @@
1
-
2
- # Created by https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode
3
-
4
- ### Linux ###
5
- *~
6
-
7
- # temporary files which can be created if a process still has a handle open of a deleted file
8
- .fuse_hidden*
9
-
10
- # KDE directory preferences
11
- .directory
12
-
13
- # Linux trash folder which might appear on any partition or disk
14
- .Trash-*
15
-
16
- # .nfs files are created when an open file is removed but is still being accessed
17
- .nfs*
18
-
19
- ### OSX ###
20
- *.DS_Store
21
- */*.DS_Store
22
- */**/*.DS_Store
23
- .AppleDouble
24
- .LSOverride
25
-
26
- # Icon must end with two \r
27
- Icon
28
-
29
- # Thumbnails
30
- ._*
31
-
32
- # Files that might appear in the root of a volume
33
- .DocumentRevisions-V100
34
- .fseventsd
35
- .Spotlight-V100
36
- .TemporaryItems
37
- .Trashes
38
- .VolumeIcon.icns
39
- .com.apple.timemachine.donotpresent
40
-
41
- # Directories potentially created on remote AFP share
42
- .AppleDB
43
- .AppleDesktop
44
- Network Trash Folder
45
- Temporary Items
46
- .apdisk
47
-
48
- # CMake
49
- cmake-build-debug/
50
-
51
- # Ruby plugin and RubyMine
52
- /.rakeTasks
53
-
54
- # Crashlytics plugin (for Android Studio and IntelliJ)
55
- com_crashlytics_export_strings.xml
56
- crashlytics.properties
57
- crashlytics-build.properties
58
- fabric.properties
59
-
60
- ### Python ###
61
  # Byte-compiled / optimized / DLL files
62
  __pycache__/
63
  *.py[cod]
@@ -80,9 +20,14 @@ parts/
80
  sdist/
81
  var/
82
  wheels/
 
 
83
  *.egg-info/
84
  .installed.cfg
85
  *.egg
 
 
 
86
 
87
  # PyInstaller
88
  # Usually these files are written by a python script from a template
@@ -97,19 +42,31 @@ pip-delete-this-directory.txt
97
  # Unit test / coverage reports
98
  htmlcov/
99
  .tox/
 
100
  .coverage
101
  .coverage.*
102
  .cache
103
- .pytest_cache/
104
  nosetests.xml
105
  coverage.xml
106
  *.cover
 
107
  .hypothesis/
 
 
 
 
 
108
 
109
  # Translations
110
  *.mo
111
  *.pot
112
 
 
 
 
 
 
 
113
  # Flask stuff:
114
  instance/
115
  .webassets-cache
@@ -118,8 +75,7 @@ instance/
118
  .scrapy
119
 
120
  # Sphinx documentation
121
- docs/_build/doctrees/*
122
- docs/_build/html/*
123
 
124
  # PyBuilder
125
  target/
@@ -127,25 +83,37 @@ target/
127
  # Jupyter Notebook
128
  .ipynb_checkpoints
129
 
 
 
 
 
130
  # pyenv
131
  .python-version
132
 
133
- # celery beat schedule file
134
- celerybeat-schedule.*
 
 
 
 
 
 
 
 
 
 
 
135
 
136
  # SageMath parsed files
137
  *.sage.py
138
 
139
  # Environments
140
  .env
141
- .env*
142
- .venv
143
  .venv*
144
  env/
145
- venv/
146
  ENV/
147
  env.bak/
148
- venv.bak/
149
 
150
  # Spyder project settings
151
  .spyderproject
@@ -159,55 +127,24 @@ venv.bak/
159
 
160
  # mypy
161
  .mypy_cache/
 
 
162
 
163
- ### VisualStudioCode ###
164
- .vscode/*
165
- !.vscode/settings.json
166
- !.vscode/tasks.json
167
- !.vscode/launch.json
168
- !.vscode/extensions.json
169
- .history
170
 
171
- ### Windows ###
172
- # Windows thumbnail cache files
173
- Thumbs.db
174
- ehthumbs.db
175
- ehthumbs_vista.db
176
-
177
- # Folder config file
178
- Desktop.ini
179
-
180
- # Recycle Bin used on file shares
181
- $RECYCLE.BIN/
182
-
183
- # Windows Installer files
184
- *.cab
185
- *.msi
186
- *.msm
187
- *.msp
188
-
189
- # Windows shortcuts
190
- *.lnk
191
-
192
- # Build folder
193
-
194
- */build/*
195
 
196
  # custom
197
- *.ori
198
- tmp
199
- nohup.out
200
- /tests/events.tar
201
- function_dump_*.json
202
- *.yml
203
 
204
- # onnx models
205
- *.onnx
206
 
207
- # End of https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode
208
-
209
- ## .idea files
210
- # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
211
  # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
212
 
213
  # User-specific stuff
@@ -217,6 +154,9 @@ function_dump_*.json
217
  .idea/**/dictionaries
218
  .idea/**/shelf
219
 
 
 
 
220
  # Generated files
221
  .idea/**/contentModel.xml
222
 
@@ -237,9 +177,14 @@ function_dump_*.json
237
  # When using Gradle or Maven with auto-import, you should exclude module files,
238
  # since they will be recreated, and may cause churn. Uncomment if using
239
  # auto-import.
 
 
 
240
  # .idea/modules.xml
241
- .idea/*.iml
242
  # .idea/modules
 
 
243
 
244
  # CMake
245
  cmake-build-*/
@@ -262,6 +207,9 @@ atlassian-ide-plugin.xml
262
  # Cursive Clojure plugin
263
  .idea/replstate.xml
264
 
 
 
 
265
  # Crashlytics plugin (for Android Studio and IntelliJ)
266
  com_crashlytics_export_strings.xml
267
  crashlytics.properties
@@ -274,11 +222,98 @@ fabric.properties
274
  # Android studio 3.1+ serialized cache file
275
  .idea/caches/build_file_checksums.ser
276
 
 
 
 
 
 
 
 
 
277
  # Sonarlint plugin
278
- .idea/sonarlint
279
- /.idea/modules.xml
 
 
 
280
 
281
- # node_modules
282
- node_modules
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
 
284
- *.jit
 
 
1
  # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
  *.py[cod]
 
20
  sdist/
21
  var/
22
  wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
  *.egg-info/
26
  .installed.cfg
27
  *.egg
28
+ MANIFEST
29
+ static/node_modules/*
30
+ static/dist/*
31
 
32
  # PyInstaller
33
  # Usually these files are written by a python script from a template
 
42
  # Unit test / coverage reports
43
  htmlcov/
44
  .tox/
45
+ .nox/
46
  .coverage
47
  .coverage.*
48
  .cache
 
49
  nosetests.xml
50
  coverage.xml
51
  *.cover
52
+ *.py,cover
53
  .hypothesis/
54
+ .pytest_cache/
55
+ static/test-results/*
56
+ cosmic-ray-results/*
57
+ cosmic_ray.sqlite
58
+ static/playwright-report/*
59
 
60
  # Translations
61
  *.mo
62
  *.pot
63
 
64
+ # Django stuff:
65
+ *.log
66
+ local_settings.py
67
+ db.sqlite3
68
+ db.sqlite3-journal
69
+
70
  # Flask stuff:
71
  instance/
72
  .webassets-cache
 
75
  .scrapy
76
 
77
  # Sphinx documentation
78
+ docs/_build/
 
79
 
80
  # PyBuilder
81
  target/
 
83
  # Jupyter Notebook
84
  .ipynb_checkpoints
85
 
86
+ # IPython
87
+ profile_default/
88
+ ipython_config.py
89
+
90
  # pyenv
91
  .python-version
92
 
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101
+ __pypackages__/
102
+
103
+ # Celery stuff
104
+ celerybeat-schedule
105
+ celerybeat.pid
106
 
107
  # SageMath parsed files
108
  *.sage.py
109
 
110
  # Environments
111
  .env
 
 
112
  .venv*
113
  env/
114
+ venv*
115
  ENV/
116
  env.bak/
 
117
 
118
  # Spyder project settings
119
  .spyderproject
 
127
 
128
  # mypy
129
  .mypy_cache/
130
+ .dmypy.json
131
+ dmypy.json
132
 
133
+ # Pyre type checker
134
+ .pyre/
 
 
 
 
 
135
 
136
+ tmp/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  # custom
139
+ translation_model_de.pickle
140
+ translation_tokenizer_de.pickle
141
+ test.ogg
 
 
 
142
 
143
+ # Created by https://www.toptal.com/developers/gitignore/api/jetbrains,windows,linux,visualstudiocode
144
+ # Edit at https://www.toptal.com/developers/gitignore?templates=jetbrains,windows,linux,visualstudiocode
145
 
146
+ ### JetBrains ###
147
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
 
 
148
  # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
149
 
150
  # User-specific stuff
 
154
  .idea/**/dictionaries
155
  .idea/**/shelf
156
 
157
+ # AWS User-specific
158
+ .idea/**/aws.xml
159
+
160
  # Generated files
161
  .idea/**/contentModel.xml
162
 
 
177
  # When using Gradle or Maven with auto-import, you should exclude module files,
178
  # since they will be recreated, and may cause churn. Uncomment if using
179
  # auto-import.
180
+ # .idea/artifacts
181
+ # .idea/compiler.xml
182
+ # .idea/jarRepositories.xml
183
  # .idea/modules.xml
184
+ # .idea/*.iml
185
  # .idea/modules
186
+ # *.iml
187
+ # *.ipr
188
 
189
  # CMake
190
  cmake-build-*/
 
207
  # Cursive Clojure plugin
208
  .idea/replstate.xml
209
 
210
+ # SonarLint plugin
211
+ .idea/sonarlint/
212
+
213
  # Crashlytics plugin (for Android Studio and IntelliJ)
214
  com_crashlytics_export_strings.xml
215
  crashlytics.properties
 
222
  # Android studio 3.1+ serialized cache file
223
  .idea/caches/build_file_checksums.ser
224
 
225
+ ### JetBrains Patch ###
226
+ # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
227
+
228
+ # *.iml
229
+ # modules.xml
230
+ # .idea/misc.xml
231
+ # *.ipr
232
+
233
  # Sonarlint plugin
234
+ # https://plugins.jetbrains.com/plugin/7973-sonarlint
235
+ .idea/**/sonarlint/
236
+
237
+ # SonarQube Plugin
238
+ # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
239
+ .idea/**/sonarIssues.xml
240
+
241
+ # Markdown Navigator plugin
242
+ # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
243
+ .idea/**/markdown-navigator.xml
244
+ .idea/**/markdown-navigator-enh.xml
245
+ .idea/**/markdown-navigator/
246
+
247
+ # Cache file creation bug
248
+ # See https://youtrack.jetbrains.com/issue/JBR-2257
249
+ .idea/$CACHE_FILE$
250
+
251
+ # CodeStream plugin
252
+ # https://plugins.jetbrains.com/plugin/12206-codestream
253
+ .idea/codestream.xml
254
+
255
+ # Azure Toolkit for IntelliJ plugin
256
+ # https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
257
+ .idea/**/azureSettings.xml
258
+
259
+ ### Linux ###
260
+ *~
261
+
262
+ # temporary files which can be created if a process still has a handle open of a deleted file
263
+ .fuse_hidden*
264
+
265
+ # KDE directory preferences
266
+ .directory
267
+
268
+ # Linux trash folder which might appear on any partition or disk
269
+ .Trash-*
270
+
271
+ # .nfs files are created when an open file is removed but is still being accessed
272
+ .nfs*
273
+
274
+ ### VisualStudioCode ###
275
+ .vscode/*
276
+ !.vscode/settings.json
277
+ !.vscode/tasks.json
278
+ !.vscode/launch.json
279
+ !.vscode/extensions.json
280
+ !.vscode/*.code-snippets
281
+
282
+ # Local History for Visual Studio Code
283
+ .history/
284
+
285
+ # Built Visual Studio Code Extensions
286
+ *.vsix
287
+
288
+ ### VisualStudioCode Patch ###
289
+ # Ignore all local history of files
290
+ .history
291
+ .ionide
292
+
293
+ ### Windows ###
294
+ # Windows thumbnail cache files
295
+ Thumbs.db
296
+ Thumbs.db:encryptable
297
+ ehthumbs.db
298
+ ehthumbs_vista.db
299
 
300
+ # Dump file
301
+ *.stackdump
302
+
303
+ # Folder config file
304
+ [Dd]esktop.ini
305
+
306
+ # Recycle Bin used on file shares
307
+ $RECYCLE.BIN/
308
+
309
+ # Windows Installer files
310
+ *.cab
311
+ *.msi
312
+ *.msix
313
+ *.msm
314
+ *.msp
315
+
316
+ # Windows shortcuts
317
+ *.lnk
318
 
319
+ # End of https://www.toptal.com/developers/gitignore/api/jetbrains,windows,linux,visualstudiocode
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="HtmlUnknownAttribute" enabled="true" level="WARNING" enabled_by_default="true">
5
+ <option name="myValues">
6
+ <value>
7
+ <list size="1">
8
+ <item index="0" class="java.lang.String" itemvalue="label" />
9
+ </list>
10
+ </value>
11
+ </option>
12
+ <option name="myCustomValuesEnabled" value="true" />
13
+ </inspection_tool>
14
+ </profile>
15
+ </component>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
aip_trainer/models/AIModels.py → AIModels.py RENAMED
@@ -1,7 +1,6 @@
1
  import numpy as np
2
  import torch
3
-
4
- from aip_trainer.models import ModelInterfaces
5
 
6
 
7
  class NeuralASR(ModelInterfaces.IASRModel):
@@ -21,7 +20,6 @@ class NeuralASR(ModelInterfaces.IASRModel):
21
  def getWordLocations(self) -> list:
22
  """Get the pair of words location from audio"""
23
  assert self.word_locations_in_samples is not None, 'Can get word locations without having processed the audio'
24
-
25
  return self.word_locations_in_samples
26
 
27
  def processAudio(self, audio: torch.Tensor):
@@ -32,3 +30,33 @@ class NeuralASR(ModelInterfaces.IASRModel):
32
 
33
  self.audio_transcript, self.word_locations_in_samples = self.decoder(
34
  nn_output[0, :, :].detach(), audio_length_in_samples, word_align=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import numpy as np
2
  import torch
3
+ import ModelInterfaces
 
4
 
5
 
6
  class NeuralASR(ModelInterfaces.IASRModel):
 
20
  def getWordLocations(self) -> list:
21
  """Get the pair of words location from audio"""
22
  assert self.word_locations_in_samples is not None, 'Can get word locations without having processed the audio'
 
23
  return self.word_locations_in_samples
24
 
25
  def processAudio(self, audio: torch.Tensor):
 
30
 
31
  self.audio_transcript, self.word_locations_in_samples = self.decoder(
32
  nn_output[0, :, :].detach(), audio_length_in_samples, word_align=True)
33
+
34
+
35
+ class NeuralTTS(ModelInterfaces.ITextToSpeechModel):
36
+ def __init__(self, model: torch.nn.Module, sampling_rate: int) -> None:
37
+ super().__init__()
38
+ self.model = model
39
+ self.sampling_rate = sampling_rate
40
+
41
+ def getAudioFromSentence(self, sentence: str) -> np.array:
42
+ with torch.inference_mode():
43
+ audio_transcript = self.model.apply_tts(texts=[sentence],
44
+ sample_rate=self.sampling_rate)[0]
45
+
46
+ return audio_transcript
47
+
48
+
49
+ class NeuralTranslator(ModelInterfaces.ITranslationModel):
50
+ def __init__(self, model: torch.nn.Module, tokenizer) -> None:
51
+ super().__init__()
52
+ self.model = model
53
+ self.tokenizer = tokenizer
54
+
55
+ def translateSentence(self, sentence: str) -> str:
56
+ """Get the transcripts of the process audio"""
57
+ tokenized_text = self.tokenizer(sentence, return_tensors='pt')
58
+ translation = self.model.generate(**tokenized_text)
59
+ translated_text = self.tokenizer.batch_decode(
60
+ translation, skip_special_tokens=True)[0]
61
+
62
+ return translated_text
Dockerfile DELETED
@@ -1,37 +0,0 @@
1
- FROM registry.gitlab.com/aletrn/ai-pronunciation-trainer:0.5.0
2
-
3
- ARG ARCH
4
- ARG WORKDIR_ROOT
5
- ENV PYTHONPATH="${WORKDIR_ROOT}:${WORKDIR_ROOT}/.venv:${PYTHONPATH}:/usr/local/lib/python3/dist-packages"
6
- ENV MPLCONFIGDIR=/tmp/matplotlib
7
- ENV IS_DOCKER_CONTAINER="YES"
8
- ENV LOG_JSON_FORMAT="TRUE"
9
- ENV LOG_LEVEL="INFO"
10
-
11
- ENV VIRTUAL_ENV=${WORKDIR_ROOT}/.venv PATH="${WORKDIR_ROOT}/.venv/bin:$PATH"
12
-
13
- COPY --chown=python:python . ${WORKDIR_ROOT}/.
14
-
15
- RUN python --version
16
- RUN pip list
17
- RUN echo "PATH: ${PATH}."
18
- RUN echo "WORKDIR_ROOT: ${WORKDIR_ROOT}."
19
- RUN ls -l ${WORKDIR_ROOT}
20
- RUN ls -ld ${WORKDIR_ROOT}
21
- RUN python -c "import sys; print(sys.path)"
22
- RUN python -c "import epitran"
23
- RUN python -c "import flask"
24
- RUN python -c "import pandas"
25
- RUN python -c "from torch import Tensor"
26
- RUN python -c "import gunicorn"
27
- RUN df -h
28
- RUN ls -l ${WORKDIR_ROOT}/webApp.py
29
- RUN ls -l ${WORKDIR_ROOT}/static/
30
-
31
- USER 999
32
- ENV PATH="${WORKDIR_ROOT}:${WORKDIR_ROOT}/.venv/bin:$PATH"
33
- RUN echo "PATH: $PATH ..."
34
- RUN echo "PYTHONPATH: $PYTHONPATH ..."
35
- RUN echo "MPLCONFIGDIR: $MPLCONFIGDIR ..."
36
-
37
- CMD ["gunicorn", "--bind", "0.0.0.0:3000", "webApp:app"]
 
 
aip_trainer/models/ModelInterfaces.py → ModelInterfaces.py RENAMED
@@ -1,5 +1,5 @@
-import abc
 
+import abc
 import numpy as np
 
 
README.md CHANGED
@@ -89,7 +89,7 @@ find aip_trainer -name "__pycache__" -exec rm -rf {} \;
 Then execute the tests again:
 
 ```bash
-pytest --cov=aip_trainer --cov-report=term-missing && coverage html
+python -m pytest tests/models/test_models_faster_whisper.py; echo "# start pytest complete test suite #"; IS_TESTING=TRUE python -m pytest tests --cov="." --cov-report=term-missing && coverage html
 ```
 
 ### Backend tests execution on Windows
@@ -106,7 +106,7 @@ Normally I use Visual Studio Code to write and execute my playwright tests, howe
 
 ```bash
 pnpm install
-pnpm playwright test
+pnpm playwright test --workers 1 --retries 4 --project=chromium
 ```
 
 ### Unused classes and functions (now removed)
aip_trainer/models/RuleBasedModels.py → RuleBasedModels.py RENAMED
@@ -1,8 +1,20 @@
 
 
 
 
1
  import eng_to_ipa
2
 
3
- from aip_trainer.models import ModelInterfaces
4
- from aip_trainer import app_logger
5
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  class EpitranPhonemConverter(ModelInterfaces.ITextToPhonemModel):
8
  word_locations_in_samples = None
@@ -13,9 +25,7 @@ class EpitranPhonemConverter(ModelInterfaces.ITextToPhonemModel):
13
  self.epitran_model = epitran_model
14
 
15
  def convertToPhonem(self, sentence: str) -> str:
16
- app_logger.debug(f'starting EpitranPhonemConverter.convertToPhonem for sentence/token "{sentence}"...')
17
  phonem_representation = self.epitran_model.transliterate(sentence)
18
- app_logger.debug(f'EpitranPhonemConverter: got phonem_representation for sentence/token "{sentence}"!')
19
  return phonem_representation
20
 
21
 
@@ -25,8 +35,6 @@ class EngPhonemConverter(ModelInterfaces.ITextToPhonemModel):
25
  super().__init__()
26
 
27
  def convertToPhonem(self, sentence: str) -> str:
28
- app_logger.debug(f'starting EngPhonemConverter.convertToPhonem for sentence/token "{sentence}"...')
29
  phonem_representation = eng_to_ipa.convert(sentence)
30
  phonem_representation = phonem_representation.replace('*','')
31
- app_logger.debug(f'EngPhonemConverter: got phonem_representation for sentence/token "{sentence}"!')
32
  return phonem_representation
 
1
+ import ModelInterfaces
2
+ import torch
3
+ import numpy as np
4
+ import epitran
5
  import eng_to_ipa
6
 
 
 
7
 
8
+ def get_phonem_converter(language: str):
9
+ if language == 'de':
10
+ phonem_converter = EpitranPhonemConverter(
11
+ epitran.Epitran('deu-Latn'))
12
+ elif language == 'en':
13
+ phonem_converter = EngPhonemConverter()
14
+ else:
15
+ raise ValueError('Language not implemented')
16
+
17
+ return phonem_converter
18
 
19
  class EpitranPhonemConverter(ModelInterfaces.ITextToPhonemModel):
20
  word_locations_in_samples = None
 
25
  self.epitran_model = epitran_model
26
 
27
  def convertToPhonem(self, sentence: str) -> str:
 
28
  phonem_representation = self.epitran_model.transliterate(sentence)
 
29
  return phonem_representation
30
 
31
 
 
35
  super().__init__()
36
 
37
  def convertToPhonem(self, sentence: str) -> str:
 
38
  phonem_representation = eng_to_ipa.convert(sentence)
39
  phonem_representation = phonem_representation.replace('*','')
 
40
  return phonem_representation
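
For context, here is a short usage sketch of the phoneme converters exposed by the new module-level `get_phonem_converter` factory above. It assumes the `epitran` German data (`deu-Latn`) and the `eng_to_ipa` package are installed, and the printed IPA strings are illustrative rather than verified output:

```python
# Usage sketch for RuleBasedModels.get_phonem_converter (assumptions noted in the text above).
import RuleBasedModels

de_converter = RuleBasedModels.get_phonem_converter("de")
en_converter = RuleBasedModels.get_phonem_converter("en")

print(de_converter.convertToPhonem("Hallo Welt"))   # IPA transliteration via epitran
print(en_converter.convertToPhonem("hello world"))  # IPA conversion via eng_to_ipa
```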
aip_trainer/WordMatching.py → WordMatching.py RENAMED
@@ -1,22 +1,24 @@
1
  import time
2
  from string import punctuation
 
3
 
4
  import numpy as np
5
  from dtwalign import dtw_from_distance_matrix
6
  from ortools.sat.python import cp_model
7
 
8
- from . import WordMetrics, app_logger
 
9
 
10
  offset_blank = 1
11
  TIME_THRESHOLD_MAPPING = 5.0
12
 
13
 
14
- def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.array:
15
  number_of_real_words = len(words_real)
16
  number_of_estimated_words = len(words_estimated)
17
 
18
  word_distance_matrix = np.zeros(
19
- (number_of_estimated_words+offset_blank, number_of_real_words))
20
  for idx_estimated in range(number_of_estimated_words):
21
  for idx_real in range(number_of_real_words):
22
  word_distance_matrix[idx_estimated, idx_real] = WordMetrics.edit_distance_python(
@@ -25,7 +27,7 @@ def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.arra
25
  if offset_blank == 1:
26
  for idx_real in range(number_of_real_words):
27
  word_distance_matrix[number_of_estimated_words,
28
- idx_real] = len(words_real[idx_real])
29
  return word_distance_matrix
30
 
31
 
@@ -33,37 +35,37 @@ def get_best_path_from_distance_matrix(word_distance_matrix):
33
  modelCpp = cp_model.CpModel()
34
 
35
  number_of_real_words = word_distance_matrix.shape[1]
36
- number_of_estimated_words = word_distance_matrix.shape[0]-1
37
 
38
  number_words = np.maximum(number_of_real_words, number_of_estimated_words)
39
 
40
  estimated_words_order = [modelCpp.NewIntVar(0, int(
41
- number_words - 1 + offset_blank), 'w%i' % i) for i in range(number_words+offset_blank)]
42
 
43
  # They are in ascending order
44
- for word_idx in range(number_words-1):
45
  modelCpp.Add(
46
- estimated_words_order[word_idx+1] >= estimated_words_order[word_idx])
47
 
48
  total_phoneme_distance = 0
49
  real_word_at_time = {}
50
  for idx_estimated in range(number_of_estimated_words):
51
  for idx_real in range(number_of_real_words):
52
  real_word_at_time[idx_estimated, idx_real] = modelCpp.NewBoolVar(
53
- 'real_word_at_time'+str(idx_real)+'-'+str(idx_estimated))
54
  modelCpp.Add(estimated_words_order[idx_estimated] == idx_real).OnlyEnforceIf(
55
  real_word_at_time[idx_estimated, idx_real])
56
  total_phoneme_distance += word_distance_matrix[idx_estimated,
57
- idx_real]*real_word_at_time[idx_estimated, idx_real]
58
 
59
  # If no word in time, difference is calculated from empty string
60
  for idx_real in range(number_of_real_words):
61
  word_has_a_match = modelCpp.NewBoolVar(
62
- 'word_has_a_match'+str(idx_real))
63
  modelCpp.Add(sum([real_word_at_time[idx_estimated, idx_real] for idx_estimated in range(
64
  number_of_estimated_words)]) == 1).OnlyEnforceIf(word_has_a_match)
65
  total_phoneme_distance += word_distance_matrix[number_of_estimated_words,
66
- idx_real]*word_has_a_match.Not()
67
 
68
  # Loss should be minimized
69
  modelCpp.Minimize(total_phoneme_distance)
@@ -79,18 +81,16 @@ def get_best_path_from_distance_matrix(word_distance_matrix):
79
  (solver.Value(estimated_words_order[word_idx])))
80
 
81
  return np.array(mapped_indices, dtype=int)
82
- except Exception as ex:
83
- app_logger.error(f"ex:{ex}.")
84
  return []
85
 
86
 
87
- def get_resulting_string(mapped_indices: np.array, words_estimated: list, words_real: list) -> tuple[list, list]:
88
  mapped_words = []
89
  mapped_words_indices = []
90
  WORD_NOT_FOUND_TOKEN = '-'
91
  number_of_real_words = len(words_real)
92
  for word_idx in range(number_of_real_words):
93
- app_logger.debug(f"{word_idx} => {mapped_indices} == {word_idx}, {mapped_indices == word_idx} #")
94
  position_of_real_word_indices = np.where(
95
  mapped_indices == word_idx)[0].astype(int)
96
 
@@ -109,59 +109,93 @@ def get_resulting_string(mapped_indices: np.array, words_estimated: list, words_
109
  error = 99999
110
  best_possible_combination = ''
111
  best_possible_idx = -1
112
- best_possible_combination, best_possible_idx = inner_get_resulting_string(
113
- best_possible_combination, best_possible_idx, error, position_of_real_word_indices,
114
- word_idx, words_estimated, words_real
115
- )
 
 
 
 
 
 
116
 
117
  mapped_words.append(best_possible_combination)
118
  mapped_words_indices.append(best_possible_idx)
119
- # continue
120
-
121
- return mapped_words, mapped_words_indices
122
-
123
-
124
- def inner_get_resulting_string(
125
- best_possible_combination, best_possible_idx, error, position_of_real_word_indices, word_idx, words_estimated, words_real
126
- ):
127
- for single_word_idx in position_of_real_word_indices:
128
- idx_above_word = single_word_idx >= len(words_estimated)
129
- if idx_above_word:
130
  continue
131
- error_word = WordMetrics.edit_distance_python(
132
- words_estimated[single_word_idx], words_real[word_idx])
133
- if error_word < error:
134
- error = error_word * 1
135
- best_possible_combination = words_estimated[single_word_idx]
136
- best_possible_idx = single_word_idx
137
- return best_possible_combination, best_possible_idx
138
 
 
139
 
140
- def get_best_mapped_words(words_estimated: list, words_real: list) -> tuple[list, list]:
141
 
 
 
142
  word_distance_matrix = get_word_distance_matrix(
143
  words_estimated, words_real)
144
-
145
  start = time.time()
146
- mapped_indices = get_best_path_from_distance_matrix(word_distance_matrix)
147
-
148
- duration_of_mapping = time.time()-start
149
- # In case or-tools doesn't converge, go to a faster, low-quality solution
150
- if len(mapped_indices) == 0 or duration_of_mapping > TIME_THRESHOLD_MAPPING+0.5:
151
- mapped_indices = (dtw_from_distance_matrix(
152
- word_distance_matrix)).path[:len(words_estimated), 1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
- mapped_words, mapped_words_indices = get_resulting_string(
155
- mapped_indices, words_estimated, words_real)
156
 
157
- return mapped_words, mapped_words_indices
 
 
 
 
 
 
 
 
 
 
158
 
159
 
160
  def getWhichLettersWereTranscribedCorrectly(real_word, transcribed_word):
161
- is_leter_correct = [None]*len(real_word)
162
  for idx, letter in enumerate(real_word):
 
 
163
  if letter == transcribed_word[idx] or letter in punctuation:
164
  is_leter_correct[idx] = 1
165
  else:
166
  is_leter_correct[idx] = 0
167
  return is_leter_correct
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import time
2
  from string import punctuation
3
+ from typing import List, Tuple
4
 
5
  import numpy as np
6
  from dtwalign import dtw_from_distance_matrix
7
  from ortools.sat.python import cp_model
8
 
9
+ import WordMetrics
10
+ from constants import app_logger
11
 
12
  offset_blank = 1
13
  TIME_THRESHOLD_MAPPING = 5.0
14
 
15
 
16
+ def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.ndarray:
17
  number_of_real_words = len(words_real)
18
  number_of_estimated_words = len(words_estimated)
19
 
20
  word_distance_matrix = np.zeros(
21
+ (number_of_estimated_words + offset_blank, number_of_real_words))
22
  for idx_estimated in range(number_of_estimated_words):
23
  for idx_real in range(number_of_real_words):
24
  word_distance_matrix[idx_estimated, idx_real] = WordMetrics.edit_distance_python(
 
27
  if offset_blank == 1:
28
  for idx_real in range(number_of_real_words):
29
  word_distance_matrix[number_of_estimated_words,
30
+ idx_real] = len(words_real[idx_real])
31
  return word_distance_matrix
32
 
33
 
 
35
  modelCpp = cp_model.CpModel()
36
 
37
  number_of_real_words = word_distance_matrix.shape[1]
38
+ number_of_estimated_words = word_distance_matrix.shape[0] - 1
39
 
40
  number_words = np.maximum(number_of_real_words, number_of_estimated_words)
41
 
42
  estimated_words_order = [modelCpp.NewIntVar(0, int(
43
+ number_words - 1 + offset_blank), 'w%i' % i) for i in range(number_words + offset_blank)]
44
 
45
  # They are in ascending order
46
+ for word_idx in range(number_words - 1):
47
  modelCpp.Add(
48
+ estimated_words_order[word_idx + 1] >= estimated_words_order[word_idx])
49
 
50
  total_phoneme_distance = 0
51
  real_word_at_time = {}
52
  for idx_estimated in range(number_of_estimated_words):
53
  for idx_real in range(number_of_real_words):
54
  real_word_at_time[idx_estimated, idx_real] = modelCpp.NewBoolVar(
55
+ 'real_word_at_time' + str(idx_real) + '-' + str(idx_estimated))
56
  modelCpp.Add(estimated_words_order[idx_estimated] == idx_real).OnlyEnforceIf(
57
  real_word_at_time[idx_estimated, idx_real])
58
  total_phoneme_distance += word_distance_matrix[idx_estimated,
59
+ idx_real] * real_word_at_time[idx_estimated, idx_real]
60
 
61
  # If no word in time, difference is calculated from empty string
62
  for idx_real in range(number_of_real_words):
63
  word_has_a_match = modelCpp.NewBoolVar(
64
+ 'word_has_a_match' + str(idx_real))
65
  modelCpp.Add(sum([real_word_at_time[idx_estimated, idx_real] for idx_estimated in range(
66
  number_of_estimated_words)]) == 1).OnlyEnforceIf(word_has_a_match)
67
  total_phoneme_distance += word_distance_matrix[number_of_estimated_words,
68
+ idx_real] * word_has_a_match.Not()
69
 
70
  # Loss should be minimized
71
  modelCpp.Minimize(total_phoneme_distance)
 
81
  (solver.Value(estimated_words_order[word_idx])))
82
 
83
  return np.array(mapped_indices, dtype=int)
84
+ except:
 
85
  return []
86
 
87
 
88
+ def get_resulting_string(mapped_indices: np.ndarray, words_estimated: list, words_real: list) -> Tuple[List, List]:
89
  mapped_words = []
90
  mapped_words_indices = []
91
  WORD_NOT_FOUND_TOKEN = '-'
92
  number_of_real_words = len(words_real)
93
  for word_idx in range(number_of_real_words):
 
94
  position_of_real_word_indices = np.where(
95
  mapped_indices == word_idx)[0].astype(int)
96
 
 
109
  error = 99999
110
  best_possible_combination = ''
111
  best_possible_idx = -1
112
+ for single_word_idx in position_of_real_word_indices:
113
+ idx_above_word = single_word_idx >= len(words_estimated)
114
+ if idx_above_word:
115
+ continue
116
+ error_word = WordMetrics.edit_distance_python(
117
+ words_estimated[single_word_idx], words_real[word_idx])
118
+ if error_word < error:
119
+ error = error_word * 1
120
+ best_possible_combination = words_estimated[single_word_idx]
121
+ best_possible_idx = single_word_idx
122
 
123
  mapped_words.append(best_possible_combination)
124
  mapped_words_indices.append(best_possible_idx)
 
 
 
 
 
 
 
 
 
 
 
125
  continue
 
 
 
 
 
 
 
126
 
127
+ return mapped_words, mapped_words_indices
128
 
 
129
 
130
+ def get_best_mapped_words(words_estimated: list | str, words_real: list | str, use_dtw:bool = False) -> tuple[list, list]:
131
+ app_logger.info(f"words_estimated: '{words_estimated}', words_real: '{words_real}', use_dtw:{use_dtw}.")
132
  word_distance_matrix = get_word_distance_matrix(
133
  words_estimated, words_real)
134
+ app_logger.debug(f"word_distance_matrix: '{word_distance_matrix}'.")
135
  start = time.time()
136
+ app_logger.info(f"use_dtw: '{use_dtw}'.")
137
+ if use_dtw:
138
+ alignment = (dtw_from_distance_matrix(word_distance_matrix.T))
139
+ app_logger.debug(f"alignment: '{alignment}'.")
140
+ mapped_indices = alignment.get_warping_path()[:len(words_estimated)]
141
+ app_logger.debug(f"mapped_indices: '{mapped_indices}'.")
142
+ duration_of_mapping = time.time()-start
143
+ else:
144
+ mapped_indices = get_best_path_from_distance_matrix(word_distance_matrix)
145
+ app_logger.debug(f"mapped_indices: '{mapped_indices}'.")
146
+ duration_of_mapping = time.time()-start
147
+ # In case or-tools doesn't converge, go to a faster, low-quality solution
148
+ check_mapped_indices_or_duration = len(mapped_indices) == 0 or duration_of_mapping > TIME_THRESHOLD_MAPPING+0.5
149
+ app_logger.info(f"check_mapped_indices_or_duration: '{check_mapped_indices_or_duration}'.")
150
+ if check_mapped_indices_or_duration:
151
+ #mapped_indices = (dtw_from_distance_matrix(
152
+ # word_distance_matrix)).path[:len(words_estimated), 1]
153
+ word_distance_matrix_transposed = word_distance_matrix.T
154
+ app_logger.debug(f"word_distance_matrix_transposed: '{word_distance_matrix_transposed}'.")
155
+ alignment = dtw_from_distance_matrix(word_distance_matrix_transposed)
156
+ app_logger.debug(f"check_mapped_indices_or_duration, alignment: '{alignment}'.")
157
+ mapped_indices = alignment.get_warping_path()
158
+ app_logger.debug(f"check_mapped_indices_or_duration, mapped_indices: '{mapped_indices}'.")
159
+
160
+ mapped_words, mapped_words_indices = get_resulting_string(mapped_indices, words_estimated, words_real)
161
+ app_logger.debug(f"mapped_words: '{mapped_words}', mapped_words_indices: '{mapped_words_indices}', duration_of_mapping:{duration_of_mapping}.")
162
+ return mapped_words, mapped_words_indices
163
 
 
 
164
 
165
+ ## Faster, but not optimal
166
+ # def get_best_mapped_words_dtw(words_estimated: list, words_real: list) -> list:
167
+ # from dtwalign import dtw_from_distance_matrix
168
+ # word_distance_matrix = get_word_distance_matrix(
169
+ # words_estimated, words_real)
170
+ # mapped_indices = dtw_from_distance_matrix(
171
+ # word_distance_matrix).path[:-1, 0]
172
+ #
173
+ # mapped_words, mapped_words_indices = get_resulting_string(
174
+ # mapped_indices, words_estimated, words_real)
175
+ # return mapped_words, mapped_words_indices
176
 
177
 
178
  def getWhichLettersWereTranscribedCorrectly(real_word, transcribed_word):
179
+ is_leter_correct = [None] * len(real_word)
180
  for idx, letter in enumerate(real_word):
181
+ letter = letter.lower()
182
+ transcribed_word[idx] = transcribed_word[idx].lower()
183
  if letter == transcribed_word[idx] or letter in punctuation:
184
  is_leter_correct[idx] = 1
185
  else:
186
  is_leter_correct[idx] = 0
187
  return is_leter_correct
188
+
189
+
190
+ # def parseLetterErrorsToHTML(word_real, is_leter_correct):
191
+ # word_colored = ''
192
+ # correct_color_start = '*'
193
+ # correct_color_end = '*'
194
+ # wrong_color_start = '-'
195
+ # wrong_color_end = '-'
196
+ # for idx, letter in enumerate(word_real):
197
+ # if is_leter_correct[idx] == 1:
198
+ # word_colored += correct_color_start + letter + correct_color_end
199
+ # else:
200
+ # word_colored += wrong_color_start + letter + wrong_color_end
201
+ # return word_colored
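
The DTW fallback above relies on `dtwalign.dtw_from_distance_matrix` and `get_warping_path()`, the same calls used in `get_best_mapped_words`. A tiny self-contained illustration follows; the distance values are invented for the example and the matrix is written already transposed (rows for real words, columns for estimated words), mirroring `word_distance_matrix.T` in the code above:

```python
# Toy illustration of the dtwalign fallback; the distances below are made up.
import numpy as np
from dtwalign import dtw_from_distance_matrix

# rows: real words, columns: estimated words (same orientation as word_distance_matrix.T)
distance_matrix_transposed = np.array([
    [0.0, 3.0, 4.0],
    [3.0, 1.0, 2.0],
    [5.0, 2.0, 0.5],
])
alignment = dtw_from_distance_matrix(distance_matrix_transposed)
# indices along the lowest-cost alignment path, used here as the word-mapping fallback
print(alignment.get_warping_path())
```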
aip_trainer/WordMetrics.py → WordMetrics.py RENAMED
@@ -1,9 +1,33 @@
1
  import numpy as np
2
 
3
- from aip_trainer import app_logger
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- # https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/
7
  def edit_distance_python(seq1, seq2):
8
  size_x = len(seq1) + 1
9
  size_y = len(seq2) + 1
@@ -27,5 +51,5 @@ def edit_distance_python(seq1, seq2):
27
  matrix[x-1,y-1] + 1,
28
  matrix[x,y-1] + 1
29
  )
30
- app_logger.debug("matrix:{}\n".format(matrix))
31
- return matrix[size_x - 1, size_y - 1]
 
1
  import numpy as np
2
 
3
+ # ref from https://gitlab.com/-/snippets/1948157
4
+ # For some variants, look here https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
5
 
6
+ # Pure python
7
+ def edit_distance_python2(a, b):
8
+ # This version is commutative, so as an optimization we force |a|>=|b|
9
+ if len(a) < len(b):
10
+ return edit_distance_python(b, a)
11
+ if len(b) == 0: # Can deal with empty sequences faster
12
+ return len(a)
13
+ # Only two rows are really needed: the one currently filled in, and the previous
14
+ distances = []
15
+ distances.append([i for i in range(len(b)+1)])
16
+ distances.append([0 for _ in range(len(b)+1)])
17
+ # We can prefill the first row:
18
+ costs = [0 for _ in range(3)]
19
+ for i, a_token in enumerate(a, start=1):
20
+ distances[1][0] += 1 # Deals with the first column.
21
+ for j, b_token in enumerate(b, start=1):
22
+ costs[0] = distances[1][j-1] + 1
23
+ costs[1] = distances[0][j] + 1
24
+ costs[2] = distances[0][j-1] + (0 if a_token == b_token else 1)
25
+ distances[1][j] = min(costs)
26
+ # Move to the next row:
27
+ distances[0][:] = distances[1][:]
28
+ return distances[1][len(b)]
29
 
30
+ #https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/
31
  def edit_distance_python(seq1, seq2):
32
  size_x = len(seq1) + 1
33
  size_y = len(seq2) + 1
 
51
  matrix[x-1,y-1] + 1,
52
  matrix[x,y-1] + 1
53
  )
54
+ #print (matrix)
55
+ return matrix[size_x - 1, size_y - 1]
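
A quick worked example of the two Levenshtein implementations above (the expected values are standard edit distances; note that `edit_distance_python` returns a NumPy scalar while `edit_distance_python2` returns a plain int):

```python
# Small sanity check for the functions above, assuming this module is importable as WordMetrics.
import WordMetrics

print(WordMetrics.edit_distance_python("machen", "machten"))    # 1.0 - one insertion
print(WordMetrics.edit_distance_python2("sitting", "kitten"))   # 3 - classic example
```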
aip_trainer/__init__.py DELETED
@@ -1,21 +0,0 @@
1
- import os
2
- from pathlib import Path
3
-
4
- import structlog
5
- from dotenv import load_dotenv
6
-
7
- from aip_trainer.utils import session_logger
8
-
9
-
10
- load_dotenv()
11
- PROJECT_ROOT_FOLDER = Path(globals().get("__file__", "./_")).absolute().parent.parent
12
- LOG_JSON_FORMAT = bool(os.getenv("LOG_JSON_FORMAT"))
13
- log_level = os.getenv("LOG_LEVEL", "INFO")
14
- sample_rate_start = int(os.getenv('SAMPLE_RATE', 48000))
15
- accepted_sample_rates = [48000, 24000, 16000, 8000]
16
- try:
17
- assert sample_rate_start in accepted_sample_rates
18
- except AssertionError:
19
- raise ValueError(f"cannot use a sample rate of value '{sample_rate_start}', should be one of {accepted_sample_rates} ...")
20
- session_logger.setup_logging(json_logs=LOG_JSON_FORMAT, log_level=log_level)
21
- app_logger = structlog.stdlib.get_logger(__name__)
 
 
aip_trainer/lambdas/__init__.py DELETED
@@ -1 +0,0 @@
1
-
 
 
aip_trainer/lambdas/data_de_en_with_categories.json DELETED
The diff for this file is too large to render. See raw diff
 
aip_trainer/lambdas/lambdaGetSample.py DELETED
@@ -1,106 +0,0 @@
1
- import json
2
- import pickle
3
- from pathlib import Path
4
-
5
- import epitran
6
- import pandas as pd
7
-
8
- from aip_trainer import PROJECT_ROOT_FOLDER, app_logger
9
- from aip_trainer.models import RuleBasedModels
10
- from aip_trainer.utils.typing_hints import BodyGetSampleRequest
11
-
12
-
13
- class TextDataset:
14
- def __init__(self, table, language='-'):
15
- self.table_dataframe = table
16
- self.number_of_samples = len(table)
17
- self.language = language
18
-
19
- def __getitem__(self, idx):
20
- language_sentence = f"{self.language}_sentence" if self.language != '-' else 'sentence'
21
- language_series = self.table_dataframe[language_sentence]
22
- return [language_series.iloc[idx]]
23
-
24
- def __len__(self):
25
- return self.number_of_samples
26
-
27
- def get_category_from_df_by_language(self, language: str, category_value:int):
28
- selector = self.table_dataframe[f"{language}_category"] == category_value
29
- df_by_category = self.table_dataframe[selector]
30
- return df_by_category
31
-
32
- def get_random_sample_from_df(self, language: str, category_value:int):
33
- app_logger.info(f"language={language}, category_value={category_value}.")
34
- choice = self.table_dataframe.sample(n=1)
35
- if category_value !=0:
36
- df_language_filtered_by_category_and_language = self.get_category_from_df_by_language(language, category_value)
37
- choice = df_language_filtered_by_category_and_language.sample(n=1)
38
- return [choice[f"{language}_sentence"].iloc[0]]
39
-
40
-
41
- sample_folder = Path(PROJECT_ROOT_FOLDER / "aip_trainer" / "lambdas")
42
- lambda_database = {}
43
- lambda_ipa_converter = {}
44
-
45
- with open(sample_folder / 'data_de_en_with_categories.json', 'r') as src:
46
- df = pd.read_json(src)
47
-
48
- lambda_database['de'] = TextDataset(df, 'de')
49
- lambda_database['en'] = TextDataset(df, 'en')
50
- lambda_translate_new_sample = False
51
- lambda_ipa_converter['de'] = RuleBasedModels.EpitranPhonemConverter(
52
- epitran.Epitran('deu-Latn'))
53
- lambda_ipa_converter['en'] = RuleBasedModels.EngPhonemConverter()
54
-
55
-
56
- def lambda_handler(event, context):
57
- event_body = event["body"]
58
- body = BodyGetSampleRequest.model_validate_json(event_body)
59
- current_transcript = get_random_selection(body.language, body.category, is_gradio_output=False, transcript=body.transcript)
60
- current_transcript = current_transcript[0] if isinstance(current_transcript, list) else current_transcript
61
- current_ipa = lambda_ipa_converter[body.language].convertToPhonem(current_transcript)
62
-
63
- app_logger.info(f"real_transcript='{current_transcript}', ipa_transcript='{current_ipa}'.")
64
- result = {
65
- 'real_transcript': current_transcript,
66
- 'ipa_transcript': current_ipa,
67
- 'transcript_translation': ""
68
- }
69
-
70
- return json.dumps(result)
71
-
72
-
73
- def get_random_selection(language: str, category: int, is_gradio_output=True, transcript=None):
74
- if transcript is not None and isinstance(transcript, str) and len(transcript) > 0:
75
- return transcript
76
- lambda_df_lang = lambda_database[language]
77
- current_transcript = lambda_df_lang.get_random_sample_from_df(language, category)
78
- app_logger.info(f"category={category}, language={language}, current_transcript={current_transcript}.")
79
- return current_transcript[0] if is_gradio_output else current_transcript
80
-
81
-
82
- def getSentenceCategory(sentence) -> int:
83
- number_of_words = len(sentence.split())
84
- categories_word_limits = [0, 8, 20, 100000]
85
- for category in range(len(categories_word_limits) - 1):
86
- if categories_word_limits[category] < number_of_words <= categories_word_limits[category + 1]:
87
- return category + 1
88
-
89
-
90
- def get_pickle2json_dataframe(
91
- custom_pickle_filename_no_ext: Path | str = 'data_de_en_2',
92
- custom_folder: Path = sample_folder
93
- ):
94
- custom_folder = Path(custom_folder)
95
- with open(custom_folder / f'{custom_pickle_filename_no_ext}.pickle', 'rb') as handle:
96
- df2 = pickle.load(handle)
97
- pass
98
- df2["de_category"] = df2["de_sentence"].apply(getSentenceCategory)
99
- print("de_category added")
100
- df2["en_category"] = df2["en_sentence"].apply(getSentenceCategory)
101
- print("en_category added")
102
- df_json = df2.to_json()
103
- with open(custom_folder / f'{custom_pickle_filename_no_ext}.json', 'w') as dst:
104
- dst.write(df_json)
105
- print("data_de_en_with_categories.json written")
106
-
 
 
 
 
 
 
 
 
 
 
 
aip_trainer/models/__init__.py DELETED
File without changes
aip_trainer/utils/__init__.py DELETED
File without changes
aip_trainer/utils/split_cosmic_ray_report.py DELETED
@@ -1,33 +0,0 @@
1
- from pathlib import Path
2
-
3
-
4
- def get_cosmic_ray_report_filtered(input_filename, suffix="filtered", separator="============", filter_string_list: list = None):
5
- if filter_string_list is None:
6
- filter_string_list = ["test outcome: TestOutcome.KILLED"]
7
- filename, ext = Path(input_filename).stem, Path(input_filename).suffix
8
- working_dir = input_filename.parent
9
- # Read the input file
10
- with open(input_filename, 'r') as file:
11
- content = file.read()
12
-
13
- # Split the content into sections
14
- sections = content.split(separator)
15
- filtered_sections = [section for section in sections]
16
-
17
- # Filter out sections containing "test outcome: TestOutcome.KILLED"
18
- for filter_string in filter_string_list:
19
- filtered_sections = [section for section in filtered_sections if filter_string not in section]
20
-
21
- # Join the filtered sections back into a single string
22
- filtered_content = separator.join(filtered_sections)
23
-
24
- # Write the filtered content to a new file
25
- with open(working_dir / f'{filename}_{suffix}{ext}', 'w') as file:
26
- file.write(filtered_content)
27
-
28
-
29
-
30
- if __name__ == "__main__":
31
- from aip_trainer import PROJECT_ROOT_FOLDER
32
- _input_filename = "cosmic-ray-models2.txt"
33
- get_cosmic_ray_report_filtered(PROJECT_ROOT_FOLDER / "tmp" / _input_filename)
 
 
aip_trainer/utils/typing_hints.py DELETED
@@ -1,19 +0,0 @@
1
- from typing import Annotated, Optional, TypeAlias
2
- from pydantic import BaseModel
3
-
4
- import annotated_types
5
-
6
-
7
- Category: TypeAlias = Annotated[int, annotated_types.Ge(0), annotated_types.Le(4)]
8
-
9
-
10
- class BodyGetSampleRequest(BaseModel):
11
- category: Optional[Category] = 0
12
- language: str
13
- transcript: Optional[str] = ""
14
-
15
-
16
- class BodySpeechToScoreRequest(BaseModel):
17
- base64Audio: str
18
- language: str
19
- title: str
 
 
aip_trainer/utils/utilities.py DELETED
@@ -1,57 +0,0 @@
1
- """Various utilities (logger, time benchmark, args dump, numerical and stats info)"""
2
-
3
- from copy import deepcopy
4
- from aip_trainer import app_logger
5
- from aip_trainer.utils.serialize import serialize
6
-
7
-
8
- def hash_calculate(arr_or_path, is_file: bool, read_mode: str = "rb") -> str | bytes:
9
- """
10
- Return computed hash from input variable (typically a numpy array).
11
-
12
- Args:
13
- arr: input variable
14
-
15
- Returns:
16
- computed hash from input variable
17
- """
18
- from hashlib import sha256
19
- from base64 import b64encode
20
- from numpy import ndarray as np_ndarray
21
-
22
- if is_file:
23
- with open(arr_or_path, read_mode) as file_to_check:
24
- # read contents of the file
25
- arr_or_path = file_to_check.read()
26
- # # pipe contents of the file through
27
- # try:
28
- # return hashlib.sha256(data).hexdigest()
29
- # except TypeError:
30
- # app_logger.warning(
31
- # f"TypeError, re-try encoding arg:{arr_or_path},type:{type(arr_or_path)}."
32
- # )
33
- # return hashlib.sha256(data.encode("utf-8")).hexdigest()
34
-
35
- if isinstance(arr_or_path, np_ndarray):
36
- hash_fn = sha256(arr_or_path.data)
37
- elif isinstance(arr_or_path, dict):
38
- import json
39
-
40
- serialized = serialize(arr_or_path)
41
- variable_to_hash = json.dumps(serialized, sort_keys=True).encode("utf-8")
42
- hash_fn = sha256(variable_to_hash)
43
- elif isinstance(arr_or_path, str):
44
- try:
45
- hash_fn = sha256(arr_or_path)
46
- except TypeError:
47
- app_logger.warning(
48
- f"TypeError, re-try encoding arg:{arr_or_path},type:{type(arr_or_path)}."
49
- )
50
- hash_fn = sha256(arr_or_path.encode("utf-8"))
51
- elif isinstance(arr_or_path, bytes):
52
- hash_fn = sha256(arr_or_path)
53
- else:
54
- raise ValueError(
55
- f"variable 'arr':{arr_or_path} of type '{type(arr_or_path)}' not yet handled."
56
- )
57
- return b64encode(hash_fn.digest())
 
 
app.py CHANGED
@@ -1,8 +1,12 @@
1
  from pathlib import Path
2
  import gradio as gr
3
 
4
- from aip_trainer import PROJECT_ROOT_FOLDER, app_logger, sample_rate_start
5
- from aip_trainer.lambdas import js, lambdaGetSample, lambdaSpeechToScore, lambdaTTS
 
 
 
 
6
 
7
 
8
  css = """
@@ -38,9 +42,34 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
38
  app_logger.info("start gradio app building...")
39
 
40
  project_root_folder = Path(PROJECT_ROOT_FOLDER)
41
- with open(project_root_folder / "aip_trainer" / "lambdas" / "app_description.md", "r", encoding="utf-8") as app_description_src:
 
 
 
42
  md_app_description = app_description_src.read()
43
- gr.Markdown(md_app_description.format(sample_rate_start=sample_rate_start))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  with gr.Row():
45
  with gr.Column(scale=4, min_width=300):
46
  with gr.Row(elem_id="id-choose-random-phrase-by-language-and-difficulty"):
@@ -108,10 +137,10 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
108
  visible=False,
109
  )
110
  text_recording_ipa = gr.Textbox(
111
- placeholder=None, label="Student phonetic transcription", elem_id="text-student-recording-ipa-id-element"
112
  )
113
  text_ideal_ipa = gr.Textbox(
114
- placeholder=None, label="Ideal phonetic transcription", elem_id="text-ideal-ipa-id-element"
115
  )
116
  text_raw_json_output_hidden = gr.Textbox(placeholder=None, label="text_raw_json_output_hidden", visible=False)
117
  with gr.Group(elem_classes="speech-output-group background-white"):
@@ -127,11 +156,11 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
127
  elem_classes="speech-output-html background-white",
128
  )
129
  with gr.Row():
130
- with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col1"):
131
- num_pronunciation_accuracy = gr.Number(label="Current score %", elem_id="number-pronunciation-accuracy-id-element")
132
- with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col2"):
133
  num_score_de = gr.Number(label="Global score DE %", value=0, interactive=False, elem_id="number-score-de-id-element")
134
- with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3"):
135
  num_score_en = gr.Number(label="Global score EN %", value=0, interactive=False, elem_id="number-score-en-id-element")
136
  btn_recognize_speech_accuracy = gr.Button(value="Get speech accuracy score (%)", elem_id="btn-recognize-speech-accuracy-id-element")
137
  with gr.Row(elem_id="id-replay-splitted-audio-by-words"):
@@ -139,17 +168,17 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
139
  with gr.Column(scale=1, min_width=50):
140
  num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, minimum=0, value=0, interactive=False)
141
  with gr.Column(scale=4, min_width=100):
142
- audio_splitted_student_recording_stt = gr.Audio(
143
- label="Splitted student speech output",
144
  type="filepath",
145
  show_download_button=True,
146
- elem_id="audio-splitted-student-recording-stt-id-element",
147
  )
148
  text_selected_recognized_word_hidden = gr.Textbox(label="text_selected_recognized_word", value="placeholder", interactive=False, visible=False)
149
 
150
  def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
151
  import json
152
- _transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _num_tot_recognized_word, first_audio_file, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
153
  new_num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, value=0)
154
  words_list = _transcribed_text.split()
155
  first_word = words_list[0]
@@ -165,7 +194,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
165
  text_raw_json_output_hidden: _res,
166
  num_tot_recognized_words: _num_tot_recognized_word,
167
  num_selected_recognized_word: new_num_selected_recognized_word,
168
- audio_splitted_student_recording_stt: first_audio_file,
169
  text_selected_recognized_word_hidden: first_word,
170
  num_audio_duration_hidden: first_audio_duration
171
  }
@@ -199,7 +228,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
199
  num_score_en,
200
  num_tot_recognized_words,
201
  num_selected_recognized_word,
202
- audio_splitted_student_recording_stt,
203
  text_selected_recognized_word_hidden,
204
  num_audio_duration_hidden
205
  ],
@@ -229,7 +258,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
229
  clear3,
230
  inputs=[],
231
  outputs=[
232
- audio_student_recording_stt, audio_tts, audio_splitted_student_recording_stt, text_recording_ipa, text_ideal_ipa, text_transcribed_hidden,
233
  num_pronunciation_accuracy, num_selected_recognized_word, num_pronunciation_accuracy
234
  ],
235
  )
@@ -280,18 +309,18 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
280
  num_selected_recognized_word.input(
281
  fn=lambdaSpeechToScore.get_selected_word,
282
  inputs=[num_selected_recognized_word, text_raw_json_output_hidden],
283
- outputs=[audio_splitted_student_recording_stt, text_selected_recognized_word_hidden, num_audio_duration_hidden],
284
  )
285
- audio_splitted_student_recording_stt.play(
286
  fn=None,
287
  inputs=[text_selected_recognized_word_hidden, radio_language, num_audio_duration_hidden],
288
- outputs=audio_splitted_student_recording_stt,
289
  js=js.js_play_audio
290
  )
291
 
292
  @gradio_app.load(inputs=[local_storage], outputs=[num_score_de, num_score_en])
293
  def load_from_local_storage(saved_values):
294
- print("loading from local storage", saved_values)
295
  return saved_values[0], saved_values[1]
296
 
297
  @gr.on([num_score_de.change, num_score_en.change], inputs=[num_score_de, num_score_en], outputs=[local_storage])
@@ -302,6 +331,6 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
302
  if __name__ == "__main__":
303
  try:
304
  gradio_app.launch()
305
- except Exception as e:
306
- app_logger.error(f"Error: {e}")
307
- raise e
 
1
  from pathlib import Path
2
  import gradio as gr
3
 
4
+ import js
5
+ from constants import (PROJECT_ROOT_FOLDER, app_logger, sample_rate_start, MODEL_NAME_DEFAULT, model_urls,
6
+ sample_rate_resample, samplerate_tts, silero_versions_dict)
7
+ import lambdaGetSample
8
+ import lambdaSpeechToScore
9
+ import lambdaTTS
10
 
11
 
12
  css = """
 
42
  app_logger.info("start gradio app building...")
43
 
44
  project_root_folder = Path(PROJECT_ROOT_FOLDER)
45
+ with open(project_root_folder / "app_headline.md", "r", encoding="utf-8") as app_headline_src:
46
+ md_app_headline = app_headline_src.read()
47
+ gr.Markdown(md_app_headline)
48
+ with open(project_root_folder / "app_description.md", "r", encoding="utf-8") as app_description_src:
49
  md_app_description = app_description_src.read()
50
+ model_url = model_urls[MODEL_NAME_DEFAULT]
51
+ app_logger.info(f"model_urls:{model_urls} ...")
52
+ models_names_urls_list = ""
53
+ other_supported_models = {k: v for k, v in model_urls.items() if k != MODEL_NAME_DEFAULT}
54
+ for model_name, model_url in other_supported_models.items():
55
+ app_logger.info(f"model_name: {model_name}, model_url: {model_url} ...")
56
+ models_names_urls_list += """\n - [{model_name}]({model_url})""".format(model_name=model_name, model_url=model_url)
57
+ if model_name == "silero":
58
+ models_names_urls_list += " (German version: {}, English version: {})".format(silero_versions_dict["de"], silero_versions_dict["en"])
59
+ app_logger.info(f"models_names_urls_list: '{models_names_urls_list}' ...")
60
+ with gr.Accordion(
61
+ "Click here for expand and show current env variables samplerate values, the selected model and the supported ones",
62
+ open=False,
63
+ elem_id="accordion-models-env-variables-id-element"
64
+ ):
65
+ gr.Markdown(md_app_description.format(
66
+ sample_rate_start=sample_rate_start,
67
+ model_name=MODEL_NAME_DEFAULT,
68
+ model_url=model_url,
69
+ models_names_urls_list=models_names_urls_list,
70
+ sample_rate_resample=sample_rate_resample,
71
+ samplerate_tts=samplerate_tts
72
+ ))
73
  with gr.Row():
74
  with gr.Column(scale=4, min_width=300):
75
  with gr.Row(elem_id="id-choose-random-phrase-by-language-and-difficulty"):
 
137
  visible=False,
138
  )
139
  text_recording_ipa = gr.Textbox(
140
+ placeholder="-", label="Student phonetic transcription", elem_id="text-student-recording-ipa-id-element", interactive=False
141
  )
142
  text_ideal_ipa = gr.Textbox(
143
+ placeholder="-", label="Ideal phonetic transcription", elem_id="text-ideal-ipa-id-element", interactive=False
144
  )
145
  text_raw_json_output_hidden = gr.Textbox(placeholder=None, label="text_raw_json_output_hidden", visible=False)
146
  with gr.Group(elem_classes="speech-output-group background-white"):
 
156
  elem_classes="speech-output-html background-white",
157
  )
158
  with gr.Row():
159
+ with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col1", elem_id="id-current-speech-accuracy-score-container"):
160
+ num_pronunciation_accuracy = gr.Number(label="Current score %", elem_id="number-pronunciation-accuracy-id-element", interactive=False, value=0)
161
+ with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col2", elem_id="id-global-speech-accuracy-score-de-container"):
162
  num_score_de = gr.Number(label="Global score DE %", value=0, interactive=False, elem_id="number-score-de-id-element")
163
+ with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3", elem_id="id-global-speech-accuracy-score-en-container"):
164
  num_score_en = gr.Number(label="Global score EN %", value=0, interactive=False, elem_id="number-score-en-id-element")
165
  btn_recognize_speech_accuracy = gr.Button(value="Get speech accuracy score (%)", elem_id="btn-recognize-speech-accuracy-id-element")
166
  with gr.Row(elem_id="id-replay-splitted-audio-by-words"):
 
168
  with gr.Column(scale=1, min_width=50):
169
  num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, minimum=0, value=0, interactive=False)
170
  with gr.Column(scale=4, min_width=100):
171
+ audio_sliced_student_recording_stt = gr.Audio(
172
+ label="Sliced student speech output",
173
  type="filepath",
174
  show_download_button=True,
175
+ elem_id="audio-sliced-student-recording-stt-id-element",
176
  )
177
  text_selected_recognized_word_hidden = gr.Textbox(label="text_selected_recognized_word", value="placeholder", interactive=False, visible=False)
178
 
179
  def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
180
  import json
181
+ _transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _num_tot_recognized_word, first_audio_file, _res, _ = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
182
  new_num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, value=0)
183
  words_list = _transcribed_text.split()
184
  first_word = words_list[0]
 
194
  text_raw_json_output_hidden: _res,
195
  num_tot_recognized_words: _num_tot_recognized_word,
196
  num_selected_recognized_word: new_num_selected_recognized_word,
197
+ audio_sliced_student_recording_stt: first_audio_file,
198
  text_selected_recognized_word_hidden: first_word,
199
  num_audio_duration_hidden: first_audio_duration
200
  }
 
228
  num_score_en,
229
  num_tot_recognized_words,
230
  num_selected_recognized_word,
231
+ audio_sliced_student_recording_stt,
232
  text_selected_recognized_word_hidden,
233
  num_audio_duration_hidden
234
  ],
 
258
  clear3,
259
  inputs=[],
260
  outputs=[
261
+ audio_student_recording_stt, audio_tts, audio_sliced_student_recording_stt, text_recording_ipa, text_ideal_ipa, text_transcribed_hidden,
262
  num_pronunciation_accuracy, num_selected_recognized_word, num_pronunciation_accuracy
263
  ],
264
  )
 
309
  num_selected_recognized_word.input(
310
  fn=lambdaSpeechToScore.get_selected_word,
311
  inputs=[num_selected_recognized_word, text_raw_json_output_hidden],
312
+ outputs=[audio_sliced_student_recording_stt, text_selected_recognized_word_hidden, num_audio_duration_hidden],
313
  )
314
+ audio_sliced_student_recording_stt.play(
315
  fn=None,
316
  inputs=[text_selected_recognized_word_hidden, radio_language, num_audio_duration_hidden],
317
+ outputs=audio_sliced_student_recording_stt,
318
  js=js.js_play_audio
319
  )
320
 
321
  @gradio_app.load(inputs=[local_storage], outputs=[num_score_de, num_score_en])
322
  def load_from_local_storage(saved_values):
323
+ app_logger.info(f"loading from local storage: {saved_values} ...")
324
  return saved_values[0], saved_values[1]
325
 
326
  @gr.on([num_score_de.change, num_score_en.change], inputs=[num_score_de, num_score_en], outputs=[local_storage])
 
331
  if __name__ == "__main__":
332
  try:
333
  gradio_app.launch()
334
+ except Exception as ex:
335
+ app_logger.error(f"Error: {ex}")
336
+ raise ex
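The event handlers above (for example `get_updated_score_by_language`) return a dict keyed by output components rather than a positional tuple, so a single callback can update only the outputs it needs. A minimal, self-contained sketch of that Gradio pattern, with purely illustrative components and values:

```python
import gradio as gr

with gr.Blocks() as demo:
    audio_in = gr.Audio(type="filepath", label="recording")
    txt_out = gr.Textbox(label="transcript")
    num_out = gr.Number(label="score")

    def score_recording(audio_path):
        # returning a dict keyed by components (instead of a tuple matching
        # `outputs` positionally) updates exactly the listed components
        return {txt_out: f"got recording: {audio_path}", num_out: 87.5}

    gr.Button("score").click(score_recording, inputs=[audio_in], outputs=[txt_out, num_out])

if __name__ == "__main__":
    demo.launch()
```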
app_description.md ADDED
@@ -0,0 +1,11 @@
1
+ ## Models and variables
2
+
3
+ Right now this tool uses:
4
+
5
+ - [{model_name}]({model_url}) as the STT (speech-to-text) model; other supported models are:
6
+ {models_names_urls_list}
7
+ - <u>{sample_rate_start}</u> as the input samplerate value (empirical tests suggest 48000 as the best sample rate)
8
+ - <u>{sample_rate_resample}</u> as the resampled samplerate value
9
+ - <u>{samplerate_tts}</u> as the TTS (text-to-speech) samplerate value
10
+
11
+
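The curly-brace names above are `str.format` placeholders that `app.py` fills at startup. A rough sketch of that substitution, with example values taken from the defaults in `constants.py` (the rendered list of other models is illustrative):

```python
from pathlib import Path

template = Path("app_description.md").read_text(encoding="utf-8")
rendered = template.format(
    model_name="whisper",
    model_url="https://pypi.org/project/openai-whisper/",
    models_names_urls_list="\n - [faster_whisper](https://pypi.org/project/faster-whisper/)",
    sample_rate_start=48000,
    sample_rate_resample=16000,
    samplerate_tts=16000,
)
print(rendered)  # markdown shown inside the models/env-variables accordion
```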
aip_trainer/lambdas/app_description.md → app_headline.md RENAMED
@@ -1,4 +1,6 @@
1
  # AI Pronunciation Trainer
2
 
3
  See [my fork](https://github.com/trincadev/ai-pronunciation-trainer) of [AI Pronunciation Trainer](https://github.com/Thiagohgl/ai-pronunciation-trainer) repository
4
- for more details. Right now this tool uses {sample_rate_start} as sample rate value. From empirical tests the best sample rate value is 48000.
 
 
 
1
  # AI Pronunciation Trainer
2
 
3
  See [my fork](https://github.com/trincadev/ai-pronunciation-trainer) of [AI Pronunciation Trainer](https://github.com/Thiagohgl/ai-pronunciation-trainer) repository
4
+ for more details.
5
+
6
+
constants.py ADDED
@@ -0,0 +1,31 @@
1
+ import os
2
+ from pathlib import Path
3
+ import structlog
4
+ import session_logger
5
+
6
+ PROJECT_ROOT_FOLDER = Path(__file__).parent
7
+ ALLOWED_ORIGIN = os.getenv('ALLOWED_ORIGIN', 'http://localhost:3000')
8
+ LOG_JSON_FORMAT = bool(os.getenv("LOG_JSON_FORMAT"))
9
+ IS_TESTING = bool(os.getenv('IS_TESTING', ""))
10
+ STSCOREAPIKEY = os.getenv('STSCOREAPIKEY', "stscore_apikey_placeholder")
11
+ log_level = os.getenv("LOG_LEVEL", "INFO")
12
+ USE_DTW = bool(os.getenv("USE_DTW"))
13
+ MODEL_NAME_TESTING = "whisper"
14
+ _MODEL_NAME_DEFAULT = os.getenv("MODEL_NAME_DEFAULT", MODEL_NAME_TESTING)
15
+ MODEL_NAME_DEFAULT = MODEL_NAME_TESTING if IS_TESTING else _MODEL_NAME_DEFAULT
16
+ DEVICE = os.getenv("DEVICE", "cpu")
17
+ tmp_audio_extension = os.getenv('TMP_AUDIO_EXTENSION', '.wav')
18
+ session_logger.setup_logging(json_logs=LOG_JSON_FORMAT, log_level=log_level)
19
+ app_logger = structlog.stdlib.get_logger(__name__)
20
+ sample_rate_start = int(os.getenv('SAMPLE_RATE', 48000))
21
+ sample_rate_resample = 16000
22
+ samplerate_tts = 16000
23
+ language_not_implemented = "Language '{}' not implemented. Supported languages: 'de', 'en'."
24
+ SILERO_VERSION_DE = "v4"
25
+ SILERO_VERSION_EN = "latest"
26
+ silero_versions_dict = {"de": SILERO_VERSION_DE, "en": SILERO_VERSION_EN}
27
+ model_urls = {
28
+ "faster_whisper": "https://pypi.org/project/faster-whisper/",
29
+ "silero": "https://pypi.org/project/silero/",
30
+ "whisper": "https://pypi.org/project/openai-whisper/",
31
+ }
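Most of these values are read from environment variables once, at import time, so they have to be set before `constants` is first imported. A minimal sketch of overriding them from Python (the chosen values are examples; note that `IS_TESTING` forces the `whisper` model regardless of `MODEL_NAME_DEFAULT`):

```python
import os

# set overrides before the first import of `constants`
os.environ["MODEL_NAME_DEFAULT"] = "faster_whisper"
os.environ["SAMPLE_RATE"] = "48000"
os.environ["LOG_LEVEL"] = "DEBUG"

import constants

print(constants.MODEL_NAME_DEFAULT)   # "faster_whisper" (unless IS_TESTING is set)
print(constants.sample_rate_start)    # 48000
```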
cosmic_ray_config.toml DELETED
@@ -1,8 +0,0 @@
1
- [cosmic-ray]
2
- module-path = "aip_trainer/models/models.py"
3
- timeout = 30.0
4
- excluded-modules = []
5
- test-command = "python -m pytest tests/models/test_models.py"
6
-
7
- [cosmic-ray.distributor]
8
- name = "local"
tests/test_data_de_en_2.pickle → data_de_en_2.pickle RENAMED
File without changes
databases/data_de.csv ADDED
The diff for this file is too large to render. See raw diff
 
databases/data_en.csv ADDED
The diff for this file is too large to render. See raw diff
 
dockerfiles/apt_preferences DELETED
@@ -1,9 +0,0 @@
1
- Explanation: Uninstall or do not install any Debian-originated
2
- Explanation: package versions other than those in the stable distro
3
- Package: *
4
- Pin: release a=stable
5
- Pin-Priority: 900
6
-
7
- Package: zlib1g
8
- Pin: release a=trixie
9
- Pin-Priority: -10
dockerfiles/debian.sources DELETED
@@ -1,17 +0,0 @@
1
- Types: deb deb-src
2
- URIs: http://deb.debian.org/debian
3
- Suites: bookworm bookworm-updates
4
- Components: main
5
- Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg
6
-
7
- Types: deb deb-src
8
- URIs: http://deb.debian.org/debian-security
9
- Suites: bookworm-security
10
- Components: main
11
- Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg
12
-
13
- Types: deb
14
- URIs: http://deb.debian.org/debian
15
- Suites: trixie
16
- Components: main
17
- Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg
 
dockerfiles/dockerfile-base DELETED
@@ -1,72 +0,0 @@
1
- # Include global ARGs at the dockerfile top
2
- ARG ARCH="x86_64"
3
- ARG WORKDIR_ROOT="/var/task"
4
-
5
-
6
- FROM python:3.12-bookworm AS builder_global
7
-
8
- ARG ARCH
9
- ARG WORKDIR_ROOT
10
- ARG POETRY_NO_INTERACTION
11
- ARG POETRY_VIRTUALENVS_IN_PROJECT
12
- ARG POETRY_VIRTUALENVS_CREATE
13
- ARG POETRY_CACHE_DIR
14
- ARG ZLIB1G="http://ftp.it.debian.org/debian/pool/main/z/zlib/zlib1g_1.3.dfsg-3+b1_amd64.deb"
15
- ENV PYTHONPATH="${WORKDIR_ROOT}:${PYTHONPATH}:/usr/local/lib/python3/dist-packages"
16
- ENV MPLCONFIGDIR=/tmp/matplotlib
17
- ARG USER="999"
18
-
19
-
20
- RUN echo "ARCH: $ARCH, ARG POETRY_CACHE_DIR: ${POETRY_CACHE_DIR}, ENV PYTHONPATH: $PYTHONPATH, USER: $USER ..."
21
- # RUN groupadd -g 999 python && useradd -r -u 999 -g python python
22
-
23
- # Set working directory to function root directory
24
- WORKDIR ${WORKDIR_ROOT}
25
- COPY --chown=python:python requirements.txt ${WORKDIR_ROOT}/
26
-
27
- # avoid segment-geospatial exception caused by missing libGL.so.1 library
28
- RUN echo "BUILDER: check libz.s* before start" && ls -l /usr/lib/${ARCH}-linux-gnu/libz.so*
29
- RUN apt update && apt install -y curl ffmpeg libgl1 python3-pip && apt clean
30
- COPY --chown=python:python ./dockerfiles/apt_preferences /etc/apt/preferences
31
- COPY --chown=python:python ./dockerfiles/debian.sources /etc/apt/sources.list.d/debian.sources
32
- RUN apt update && apt install -t trixie zlib1g -y && apt clean
33
- RUN echo "BUILDER: check libz.s* after install from trixie" && ls -l /usr/lib/${ARCH}-linux-gnu/libz.so*
34
-
35
- RUN ls -l /etc/apt/sources* /etc/apt/preferences*
36
-
37
- # poetry installation path is NOT within ${WORKDIR_ROOT}: not needed for runtime docker image
38
- RUN python3 -m venv ${WORKDIR_ROOT}/.venv
39
- ENV PATH="${WORKDIR_ROOT}/.venv/bin:$PATH"
40
- RUN . ${WORKDIR_ROOT}/.venv/bin/activate && python -m pip install -r ${WORKDIR_ROOT}/requirements.txt
41
-
42
- # USER 999
43
-
44
-
45
- FROM python:3.12-slim-bookworm AS runtime
46
-
47
- RUN groupadd -g 999 python && useradd -r -u 999 -g python python
48
-
49
- ARG ARCH
50
- ARG WORKDIR_ROOT
51
- ENV PYTHONPATH="${WORKDIR_ROOT}:${WORKDIR_ROOT}/.venv:${PYTHONPATH}:/usr/local/lib/python3/dist-packages"
52
- ENV MPLCONFIGDIR=/tmp/matplotlib
53
-
54
- ENV VIRTUAL_ENV=${WORKDIR_ROOT}/.venv PATH="${WORKDIR_ROOT}/.venv/bin:$PATH"
55
-
56
- RUN apt update && apt install -y ffmpeg && apt clean
57
- RUN echo "COPY --chown=python:python --from=builder_global /usr/lib/${ARCH}-linux-gnu/libGL.so* /usr/lib/${ARCH}-linux-gnu/"
58
- RUN echo "RUNTIME: check libz.s* before upgrade" && ls -l /usr/lib/${ARCH}-linux-gnu/libz.so*
59
- RUN echo "RUNTIME: remove libz.s* to force upgrade" && rm /usr/lib/${ARCH}-linux-gnu/libz.so*
60
- COPY --chown=python:python --from=builder_global /usr/lib/${ARCH}-linux-gnu/libz.so* /usr/lib/${ARCH}-linux-gnu/
61
- COPY --chown=python:python --from=builder_global /lib/${ARCH}-linux-gnu/libexpat.so* /lib/${ARCH}-linux-gnu/
62
- RUN echo "RUNTIME: check libz.s* after copy" && ls -l /usr/lib/${ARCH}-linux-gnu/libz.so*
63
- COPY --chown=python:python --from=builder_global ${WORKDIR_ROOT}/.venv ${WORKDIR_ROOT}/.venv
64
- RUN echo "check ffmpeg files..."
65
- RUN ls -ld /usr/share/ffmpeg || echo "ffpeg folder not found!"
66
- RUN ls -l /usr/bin/ff* || echo "ffpeg bin not found!"
67
- RUN ls -l /usr/share/ffmpeg || echo "ffpeg folder share not found!"
68
- RUN . ${WORKDIR_ROOT}/.venv && which python && pip list
69
-
70
- RUN echo "new WORKDIR_ROOT after hidden venv COPY --chown=python:python => ${WORKDIR_ROOT}"
71
- RUN ls -ld ${WORKDIR_ROOT}/
72
- RUN ls -lA ${WORKDIR_ROOT}/
faster_whisper_wrapper.py ADDED
@@ -0,0 +1,56 @@
1
+ from typing import Union
2
+
3
+ import numpy as np
4
+ import onnxruntime
5
+ import torch
6
+ from faster_whisper import WhisperModel
7
+
8
+ from ModelInterfaces import IASRModel
9
+ from constants import sample_rate_resample, app_logger, IS_TESTING, DEVICE
10
+
11
+ device = onnxruntime.get_device()
12
+ device = "cpu" if IS_TESTING or device.lower() == DEVICE.lower() else device
13
+ app_logger.info(f"device: {device} #")
14
+ device_compute = "int8_float16" if device == "cuda" else "int8"
15
+ app_logger.info(f"device: {device}, device_compute: {device_compute} #")
16
+
17
+
18
+ def parse_word_info(word_info, sample_rate):
19
+ start_ts = float(word_info.start) * sample_rate
20
+ end_ts = float(word_info.end) * sample_rate
21
+ word = word_info.word
22
+ return {"word": word, "start_ts": start_ts, "end_ts": end_ts}
23
+
24
+
25
+ class FasterWhisperASRModel(IASRModel):
26
+ def __init__(self, model_name="base", language=None):
27
+ self.asr = WhisperModel(model_name, device=device, compute_type=device_compute)
28
+ self._transcript = ""
29
+ self._word_locations = []
30
+ self.sample_rate = sample_rate_resample
31
+ self.language = language
32
+
33
+ def processAudio(self, audio:Union[np.ndarray, torch.Tensor]):
34
+ # 'audio' is expected to be a numpy array or torch tensor of audio samples with shape [1, N].
35
+ if isinstance(audio, torch.Tensor):
36
+ audio = audio.detach().cpu().numpy()
37
+ segments, info = self.asr.transcribe(audio=audio[0], language=self.language, word_timestamps=True, beam_size=5, temperature=0, vad_filter=True)
38
+ app_logger.debug(f"segments: type={type(segments)}, segments complete: {segments} #")
39
+ app_logger.info(f"info: type={type(info)}, info complete: {info} #")
40
+ transcript = []
41
+ count = 0
42
+ for segment in segments:
43
+ app_logger.debug(f"single segment: {type(segment)}, segment: {segment} #")
44
+ transcript.append(segment.text)
45
+ segment_word_locations = [parse_word_info(word_info, sample_rate=self.sample_rate) for word_info in segment.words]
46
+ self._word_locations.extend(segment_word_locations)
47
+ app_logger.info(f"elaborated segment {count}: type={type(segment)}, len(words):{len(segment.words)}, text:{segment.text} #")
48
+ count += 1
49
+ app_logger.info(f"transcript: {transcript} #")
50
+ self._transcript = " ".join(transcript)
51
+
52
+ def getTranscript(self) -> str:
53
+ return self._transcript
54
+
55
+ def getWordLocations(self) -> list:
56
+ return self._word_locations
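A minimal usage sketch for this wrapper, assuming a mono recording already resampled to 16 kHz (`sample_rate_resample`); the file name is illustrative. Since `processAudio` indexes `audio[0]`, the input is expected as a `[1, N]` array or tensor:

```python
import numpy as np
import soundfile as sf

from faster_whisper_wrapper import FasterWhisperASRModel

signal, sr = sf.read("recording_16khz_mono.wav", dtype="float32")  # illustrative path
audio = np.expand_dims(signal, axis=0)  # shape [1, N]

asr = FasterWhisperASRModel(model_name="base", language="en")
asr.processAudio(audio)
print(asr.getTranscript())     # plain transcript
print(asr.getWordLocations())  # word timestamps, expressed in samples
```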
images/{MainScreen.png → MainScreen.jpg} RENAMED
File without changes
aip_trainer/lambdas/js.py → js.py RENAMED
@@ -1,11 +1,4 @@
1
  js_update_ipa_output = """
2
- /**
3
- * Updates the CSS text of the given text based on the correctness of each letter.
4
- *
5
- * @param text - The text to be displayed.
6
- * @param letters - A string representing the correctness of each letter in the text.
7
- * @param idxSelectedWord - The index of the selected word to be underlined.
8
- */
9
  function updateCssText(text, letters, idxSelectedWord) {
10
  let wordsArr = text.split(" ")
11
  let lettersWordsArr = letters.split(" ")
@@ -31,13 +24,6 @@ function updateCssText(text, letters, idxSelectedWord) {
31
  """
32
 
33
  js_play_audio = """
34
- /**
35
- * Plays the given text as audio using the Web Speech API.
36
- *
37
- * @param text - The text to be spoken.
38
- * @param language - The language code for the speech synthesis (e.g., 'en' for English, 'de' for German).
39
- * @param sleepTime - Optional. The time in seconds to wait before starting the speech synthesis. Default is 0.
40
- */
41
  function playAudio(text, language, sleepTime = 0) {
42
  let voice_idx = 0;
43
  let voice_synth = null;
 
1
  js_update_ipa_output = """
 
 
 
 
 
 
 
2
  function updateCssText(text, letters, idxSelectedWord) {
3
  let wordsArr = text.split(" ")
4
  let lettersWordsArr = letters.split(" ")
 
24
  """
25
 
26
  js_play_audio = """
 
 
 
 
 
 
 
27
  function playAudio(text, language, sleepTime = 0) {
28
  let voice_idx = 0;
29
  let voice_synth = null;
lambdaChangeModel.py ADDED
@@ -0,0 +1,14 @@
1
+ import json
2
+
3
+ import pronunciationTrainer
4
+
5
+
6
+ trainer_SST_lambda = {'de': pronunciationTrainer.getTrainer("de"), 'en': pronunciationTrainer.getTrainer("en")}
7
+
8
+
9
+ def lambda_handler(event, context):
10
+ data = json.loads(event['body'])
11
+ model_name = data['modelName']
12
+ trainer_SST_lambda["de"] = pronunciationTrainer.getTrainer("de", model_name=model_name)
13
+ trainer_SST_lambda["en"] = pronunciationTrainer.getTrainer("en", model_name=model_name)
14
+ return f'Model changed to {model_name}!'
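A short sketch of how this handler can be invoked; the event shape mirrors the other lambda handlers in the repository, and the chosen model name is one of the keys in `constants.model_urls`:

```python
import json

import lambdaChangeModel

event = {"body": json.dumps({"modelName": "faster_whisper"})}
print(lambdaChangeModel.lambda_handler(event, context=None))
# expected: "Model changed to faster_whisper!"
```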
lambdaGetSample.py ADDED
1
+ import json
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+
6
+ import RuleBasedModels
7
+ from constants import app_logger
8
+
9
+
10
+ class TextDataset:
11
+ def __init__(self, table, language):
12
+ self.table_dataframe = table
13
+ self.language = language
14
+
15
+ def __getitem__(self, idx):
16
+ line = [self.table_dataframe['sentence'].iloc[idx]]
17
+ return line
18
+
19
+ def __len__(self):
20
+ return len(self.table_dataframe)
21
+
22
+ def get_category_from_df(self, category_value:int):
23
+ selector = self.table_dataframe["category"] == category_value
24
+ df_by_category = self.table_dataframe[selector]
25
+ return df_by_category
26
+
27
+ def get_random_sample_from_df(self, category_value:int):
28
+ app_logger.info(f"language={self.language}, category_value={category_value}.")
29
+ choice = self.table_dataframe.sample(n=1)
30
+ if category_value !=0:
31
+ df_language_filtered_by_category = self.get_category_from_df(category_value)
32
+ choice = df_language_filtered_by_category.sample(n=1)
33
+ sentence = choice["sentence"].iloc[0]
34
+ app_logger.info(f"sentence={sentence} ...")
35
+ return [sentence]
36
+
37
+
38
+ sample_folder = Path(__file__).parent / "databases"
39
+ lambda_database = {}
40
+ lambda_ipa_converter = {}
41
+ available_languages = ['de', 'en']
42
+
43
+ for lang in available_languages:
44
+ # avoid using ";" or "," as separator because these are present within the dataframe sentences
45
+ df = pd.read_csv(sample_folder / f'data_{lang}.csv', delimiter='|')
46
+ lambda_database[lang] = TextDataset(df, lang)
47
+ lambda_ipa_converter[lang] = RuleBasedModels.get_phonem_converter(lang)
48
+
49
+ lambda_translate_new_sample = False
50
+
51
+
52
+ def lambda_handler(event, context):
53
+ """
54
+ lambda handler to return a random text sample from the dataset.
55
+
56
+ Parameters:
57
+ event (dict): The event data passed to the Lambda function.
58
+ context (dict): The context in which the Lambda function is called.
59
+
60
+ Returns:
61
+ str: The JSON-encoded result.
62
+ """
63
+ try:
64
+ body = json.loads(event['body'])
65
+
66
+ try:
67
+ category = int(body['category'])
68
+ except KeyError:
69
+ category = 0
70
+ language = body['language']
71
+ try:
72
+ current_transcript = str(body["transcript"])
73
+ except KeyError:
74
+ current_transcript = get_random_selection(language, category)
75
+ current_ipa = lambda_ipa_converter[language].convertToPhonem(current_transcript)
76
+
77
+ app_logger.info(f"real_transcript='{current_transcript}', ipa_transcript='{current_ipa}'.")
78
+ result = {
79
+ 'real_transcript': [current_transcript],
80
+ 'ipa_transcript': current_ipa,
81
+ 'transcript_translation': ""
82
+ }
83
+
84
+ return json.dumps(result)
85
+ except Exception as ex:
86
+ app_logger.error(f"ex: {ex} ...")
87
+ raise ex
88
+
89
+
90
+ def get_random_selection(language: str, category: int) -> str:
91
+ """
92
+ Get a random text sample from the dataset.
93
+
94
+ Parameters:
95
+ language (str): The language code.
96
+ category (int): The category value to filter the dataset.
97
+
98
+ Returns:
99
+ str: The selected text sample.
100
+ """
101
+ lambda_df_lang = lambda_database[language]
102
+ current_transcript = lambda_df_lang.get_random_sample_from_df(category)
103
+ app_logger.info(f"category={category}, language={language}, current_transcript={current_transcript}.")
104
+ return current_transcript[0]
105
+
106
+
107
+ def getSentenceCategory(sentence: str) -> int:
108
+ number_of_words = len(sentence.split())
109
+ categories_word_limits = [0, 8, 20, 100000]
110
+ for category in range(len(categories_word_limits)-1):
111
+ if categories_word_limits[category] < number_of_words <= categories_word_limits[category + 1]:
112
+ return category+1
113
+ raise ValueError(f"category not assigned for sentence '{sentence}' ...")
114
+
115
+
116
+ def get_enriched_dataframe_csv(
117
+ language: str,
118
+ custom_dataframe_csv_filename_no_ext: str = "data",
119
+ custom_folder: Path = sample_folder
120
+ ) -> None:
121
+ """
122
+ Read a csv dataframe, add a 'category' column and write the enriched csv back to disk.
123
+
124
+ Parameters:
125
+ language (str): The language code (e.g. "de" for German).
126
+ custom_dataframe_csv_filename_no_ext (str): The csv dataframe without extension.
127
+ custom_folder (Path): The folder containing the csv dataframe.
128
+
129
+ Returns:
130
+ None
131
+ """
132
+ custom_folder = Path(custom_folder).absolute()
133
+ df_filename = custom_folder / f'{custom_dataframe_csv_filename_no_ext}_{language}.csv'
134
+ with open(df_filename, 'r') as handle:
135
+ df2 = pd.read_csv(handle, sep="|")
136
+ df2["category"] = df2["sentence"].apply(getSentenceCategory)
137
+ app_logger.info("de_category added")
138
+ output_path = custom_folder / f'{custom_dataframe_csv_filename_no_ext}_{language}.csv'
139
+ df2.to_csv(output_path, index=False, sep="|")
140
+ app_logger.info(f"written {output_path} ...")
141
+
142
+
143
+ if __name__ == '__main__':
144
+ get_enriched_dataframe_csv("de")
145
+ get_enriched_dataframe_csv("en")
aip_trainer/lambdas/lambdaSpeechToScore.py → lambdaSpeechToScore.py RENAMED
@@ -4,52 +4,75 @@ import os
4
  from pathlib import Path
5
  import tempfile
6
  import time
 
7
 
8
  import audioread
9
  import numpy as np
10
  import torch
11
  from torchaudio.transforms import Resample
12
 
13
- from aip_trainer import WordMatching as wm, app_logger
14
- from aip_trainer import pronunciationTrainer, sample_rate_start
15
- from aip_trainer.utils.typing_hints import BodySpeechToScoreRequest
 
16
 
17
 
18
- trainer_SST_lambda = {
19
- 'de': pronunciationTrainer.getTrainer("de"),
20
- 'en': pronunciationTrainer.getTrainer("en")
21
- }
22
- transform = Resample(orig_freq=sample_rate_start, new_freq=16000)
23
 
24
 
25
- def lambda_handler(event, context):
26
- event_body = event['body']
27
- data = BodySpeechToScoreRequest.model_validate_json(event_body)
 
 
 
 
 
 
 
 
 
 
28
 
29
- real_text = data.title
30
- base64_audio = data.base64Audio
31
  app_logger.debug(f"base64Audio:{base64_audio} ...")
32
  file_bytes_or_audiotmpfile = base64.b64decode(base64_audio[22:].encode('utf-8'))
33
- language = data.language
 
 
 
 
 
34
 
35
  if len(real_text) == 0:
36
- return {
37
- 'statusCode': 200,
38
- 'headers': {
39
- 'Access-Control-Allow-Headers': '*',
40
- 'Access-Control-Allow-Credentials': "true",
41
- 'Access-Control-Allow-Origin': 'http://127.0.0.1:3000/',
42
- 'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
43
- },
44
- 'body': ''
45
- }
46
- output = get_speech_to_score_dict(real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile, language=language, remove_random_file=False)
47
  output = json.dumps(output)
48
  app_logger.debug(f"output: {output} ...")
49
  return output
50
 
51
 
52
- def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True, extension: str = ".ogg"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  from soundfile import LibsndfileError
54
  app_logger.info(f"real_text:{real_text} ...")
55
  app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
@@ -72,23 +95,19 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
72
  app_logger.debug("writing streaming data to file on disk...")
73
  with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=extension, delete=False) as f1:
74
  f1.write(file_bytes_or_audiotmpfile)
 
75
  duration = time.time() - start0
76
  app_logger.info(f'Saved binary data in file in {duration}s.')
77
- random_file_name = f1.name
78
 
79
  start = time.time()
80
- app_logger.info(f'Loading {extension} file file {random_file_name} ...')
81
  try:
82
  signal, samplerate = soundfile_load(random_file_name)
83
  except LibsndfileError as sfe:
84
  # https://github.com/beetbox/audioread/issues/144
85
  # deprecation warnings => pip install standard-aifc standard-sunau
86
  app_logger.error(f"Error reading file {random_file_name}: {sfe}, re-try with audioread...")
87
- try:
88
- signal, samplerate = audioread_load(random_file_name)
89
- except ModuleNotFoundError as mnfe:
90
- app_logger.error(f"Error reading file {random_file_name}: {mnfe}, try read https://github.com/beetbox/audioread/issues/144")
91
- raise mnfe
92
 
93
  duration = time.time() - start
94
  app_logger.info(f'Read {extension} file {random_file_name} in {duration}s.')
@@ -103,11 +122,11 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
103
  result = language_trainer_sst_lambda.processAudioForGivenText(signal_transformed, real_text)
104
  app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
105
 
106
- start = time.time()
107
- if remove_random_file:
108
- os.remove(random_file_name)
109
- duration = time.time() - start
110
- app_logger.info(f'Deleted file {random_file_name} in {duration}s.')
111
 
112
  start = time.time()
113
  real_transcripts_ipa = ' '.join(
@@ -125,9 +144,9 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
125
 
126
  is_letter_correct_all_words = ''
127
  for idx, word_real in enumerate(words_real):
128
- mapped_letters, _ = wm.get_best_mapped_words(
129
- mapped_words[idx], word_real
130
- )
131
 
132
  is_letter_correct = wm.getWhichLettersWereTranscribedCorrectly(
133
  word_real, mapped_letters) # , mapped_letters_indices)
@@ -146,21 +165,40 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
146
  return {
147
  'real_transcript': result['recording_transcript'],
148
  'ipa_transcript': ipa_transcript,
149
- 'pronunciation_accuracy': float(f"{pronunciation_accuracy:.2f}"),
150
  'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
151
  'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
152
  'pair_accuracy_category': pair_accuracy_category,
153
  'start_time': result['start_time'],
154
  'end_time': result['end_time'],
155
- 'is_letter_correct_all_words': is_letter_correct_all_words
 
156
  }
157
 
158
 
159
- def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
160
- output = get_speech_to_score_dict(real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile, language=language, remove_random_file=remove_random_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  real_transcripts = output['real_transcripts']
162
  is_letter_correct_all_words = output['is_letter_correct_all_words']
163
- pronunciation_accuracy = output['pronunciation_accuracy']
 
164
  ipa_transcript = output['ipa_transcript']
165
  real_transcripts_ipa = output['real_transcripts_ipa']
166
  end_time = [float(x) for x in output['end_time'].split(" ")]
@@ -169,17 +207,45 @@ def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str |
169
  app_logger.debug(f"start splitting recorded audio into {num_words} words...")
170
 
171
  audio_files, audio_durations = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
 
 
 
 
 
 
 
172
  output = {'audio_files': audio_files, "audio_durations": audio_durations, **output}
173
  first_audio_file = audio_files[0]
174
- return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, num_words, first_audio_file, json.dumps(output)
 
 
 
 
 
175
 
 
 
 
 
176
 
177
- def soundfile_write(audiofile: str | Path, data: np.ndarray, samplerate: int):
 
 
178
  import soundfile as sf
179
  sf.write(audiofile, data, samplerate)
180
 
181
 
182
- def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str]:
 
 
 
 
 
 
 
 
 
 
183
  recognition_output = json.loads(raw_json_output)
184
  list_audio_files = recognition_output["audio_files"]
185
  real_transcripts = recognition_output["real_transcripts"]
@@ -194,10 +260,23 @@ def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str
194
 
195
 
196
  def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float]) -> tuple[list[str], list[float]]:
 
 
 
 
 
 
 
 
 
 
 
197
  import soundfile as sf
198
  audio_files = []
199
  audio_durations = []
 
200
  for n, (start_nth, end_nth) in enumerate(zip(start_time, end_time)):
 
201
  signal_nth, samplerate = soundfile_load(audiotmpfile, offset=start_nth, duration=end_nth - start_nth)
202
  audiofile = get_file_with_custom_suffix(audiotmpfile, f"_part{n}_start{start_nth}_end{end_nth}")
203
  soundfile_write(audiofile=audiofile, data=signal_nth, samplerate=samplerate)
@@ -210,20 +289,52 @@ def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], e
210
 
211
 
212
  def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> Path:
 
 
 
 
 
 
 
 
 
 
213
  pathname = Path(basefile)
214
  dirname, filename_no_ext, filename_ext = pathname.parent, pathname.stem, pathname.suffix
215
- output_file = Path(dirname) / f"{filename_no_ext}_{custom_suffix}.{filename_ext}"
216
  return output_file
217
 
218
 
219
  # From Librosa
220
 
221
- def calc_start_end(sr_native, time_position, n_channels):
 
 
 
 
 
 
 
 
 
 
 
222
  return int(np.round(sr_native * time_position)) * n_channels
223
 
224
 
225
- def soundfile_load(path: str | Path, offset: float = 0.0, duration: float = None, dtype=np.float32):
226
- """Load an audio buffer using soundfile. Taken from librosa """
 
 
 
 
 
 
 
 
 
 
 
227
  import soundfile as sf
228
 
229
  if isinstance(path, sf.SoundFile):
@@ -250,10 +361,18 @@ def soundfile_load(path: str | Path, offset: float = 0.0, duration: float = None
250
  return y, sr_native
251
 
252
 
253
- def audioread_load(path, offset=0.0, duration=None, dtype=np.float32):
254
- """Load an audio buffer using audioread.
255
-
256
  This loads one block at a time, and then concatenates the results.
 
 
 
 
 
 
 
 
 
257
  """
258
  y = []
259
  app_logger.debug(f"reading audio file at path:{path} ...")
@@ -309,7 +428,7 @@ def audioread_load(path, offset=0.0, duration=None, dtype=np.float32):
309
  # From Librosa
310
 
311
 
312
- def buf_to_float(x, n_bytes=2, dtype=np.float32):
313
  """Convert an integer buffer to floating point values.
314
  This is primarily useful when loading integer-valued wav data
315
  into numpy arrays.
 
4
  from pathlib import Path
5
  import tempfile
6
  import time
7
+ from typing import Dict, Any, LiteralString
8
 
9
  import audioread
10
  import numpy as np
11
  import torch
12
  from torchaudio.transforms import Resample
13
 
14
+ import WordMatching as wm
15
+ import pronunciationTrainer
16
+ import utilsFileIO
17
+ from constants import app_logger, sample_rate_resample, sample_rate_start, USE_DTW, IS_TESTING, tmp_audio_extension
18
 
19
 
20
+ trainer_SST_lambda = {'de': pronunciationTrainer.getTrainer("de"), 'en': pronunciationTrainer.getTrainer("en")}
21
+ transform = Resample(orig_freq=sample_rate_start, new_freq=sample_rate_resample)
 
 
 
22
 
23
 
24
+ def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
25
+ """
26
+ Lambda handler for speech-to-score.
27
+
28
+ Parameters:
29
+ event (Dict[str, Any]): The event data containing the request body.
30
+ context (Any): The context in which the lambda function is executed.
31
+
32
+ Returns:
33
+ Dict[str, Any]: The response containing the speech-to-score results.
34
+ """
35
+ body = event['body']
36
+ data = json.loads(body)
37
 
38
+ real_text = data['title']
39
+ base64_audio = data["base64Audio"]
40
  app_logger.debug(f"base64Audio:{base64_audio} ...")
41
  file_bytes_or_audiotmpfile = base64.b64decode(base64_audio[22:].encode('utf-8'))
42
+ language = data['language']
43
+ try:
44
+ use_dtw = data["useDTW"]
45
+ app_logger.info(f'use_dtw: "{type(use_dtw)}", "{use_dtw}".')
46
+ except KeyError:
47
+ use_dtw = USE_DTW
48
 
49
  if len(real_text) == 0:
50
+ return utilsFileIO.return_response_ok('{}')
51
+ output = get_speech_to_score_dict(
52
+ real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile, language=language, use_dtw=use_dtw
53
+ )
54
+ output["pronunciation_accuracy"] = f"{int(output["pronunciation_accuracy"])}"
 
 
 
 
 
 
55
  output = json.dumps(output)
56
  app_logger.debug(f"output: {output} ...")
57
  return output
58
 
59
 
60
+ def get_speech_to_score_dict(
61
+ real_text: str, file_bytes_or_audiotmpfile: str | bytes | dict, language: str = "en", extension: str = tmp_audio_extension, use_dtw: bool = False
62
+ ) -> Dict[str | Any, float | LiteralString | str | Any]:
63
+ """
64
+ Process the audio file and return a dictionary with speech-to-score results.
65
+
66
+ Parameters:
67
+ use_dtw:
68
+ real_text (str): The text to be matched with the audio.
69
+ file_bytes_or_audiotmpfile (str | bytes | dict): The audio file in bytes or a temporary file.
70
+ language (str): The language of the audio.
71
+ extension (str): The file extension of the audio file.
72
+
73
+ Returns:
74
+ Dict[str | Any, float | LiteralString | str | Any]: The speech-to-score results.
75
+ """
76
  from soundfile import LibsndfileError
77
  app_logger.info(f"real_text:{real_text} ...")
78
  app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
 
95
  app_logger.debug("writing streaming data to file on disk...")
96
  with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=extension, delete=False) as f1:
97
  f1.write(file_bytes_or_audiotmpfile)
98
+ random_file_name = f1.name
99
  duration = time.time() - start0
100
  app_logger.info(f'Saved binary data in file in {duration}s.')
 
101
 
102
  start = time.time()
103
+ app_logger.info(f"Loading temp '{random_file_name}' file...")
104
  try:
105
  signal, samplerate = soundfile_load(random_file_name)
106
  except LibsndfileError as sfe:
107
  # https://github.com/beetbox/audioread/issues/144
108
  # deprecation warnings => pip install standard-aifc standard-sunau
109
  app_logger.error(f"Error reading file {random_file_name}: {sfe}, re-try with audioread...")
110
+ signal, samplerate = audioread_load(random_file_name)
 
 
 
 
111
 
112
  duration = time.time() - start
113
  app_logger.info(f'Read {extension} file {random_file_name} in {duration}s.')
 
122
  result = language_trainer_sst_lambda.processAudioForGivenText(signal_transformed, real_text)
123
  app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
124
 
125
+ # start = time.time()
126
+ # if remove_random_file:
127
+ # os.remove(random_file_name)
128
+ # duration = time.time() - start
129
+ # app_logger.info(f'Deleted file {random_file_name} in {duration}s.')
130
 
131
  start = time.time()
132
  real_transcripts_ipa = ' '.join(
 
144
 
145
  is_letter_correct_all_words = ''
146
  for idx, word_real in enumerate(words_real):
147
+
148
+ mapped_letters, mapped_letters_indices = wm.get_best_mapped_words(
149
+ mapped_words[idx], word_real, use_dtw=use_dtw)
150
 
151
  is_letter_correct = wm.getWhichLettersWereTranscribedCorrectly(
152
  word_real, mapped_letters) # , mapped_letters_indices)
 
165
  return {
166
  'real_transcript': result['recording_transcript'],
167
  'ipa_transcript': ipa_transcript,
168
+ 'pronunciation_accuracy': pronunciation_accuracy,
169
  'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
170
  'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
171
  'pair_accuracy_category': pair_accuracy_category,
172
  'start_time': result['start_time'],
173
  'end_time': result['end_time'],
174
+ 'is_letter_correct_all_words': is_letter_correct_all_words,
175
+ "random_file_name": random_file_name
176
  }
177
 
178
 
179
+ def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True) -> tuple:
180
+ """
181
+ Process the audio file and return a tuple with speech-to-score results.
182
+
183
+ Parameters:
184
+ real_text (str): The text to be matched with the audio.
185
+ file_bytes_or_audiotmpfile (str | dict): The audio file in bytes or a temporary file.
186
+ language (str): The language of the audio.
187
+ remove_random_file (bool): Whether to remove the temporary file after processing.
188
+
189
+ Returns:
190
+ tuple: A tuple containing real transcripts, letter correctness, pronunciation accuracy, IPA transcript, real transcripts in IPA, number of words, first audio file, and JSON output.
191
+ """
192
+ output = get_speech_to_score_dict(
193
+ real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile,
194
+ language=language
195
+ )
196
+ random_file_name = output["random_file_name"]
197
+ del output["random_file_name"]
198
  real_transcripts = output['real_transcripts']
199
  is_letter_correct_all_words = output['is_letter_correct_all_words']
200
+ pronunciation_accuracy = f"{output['pronunciation_accuracy']:.2f}"
201
+ output["pronunciation_accuracy"] = pronunciation_accuracy
202
  ipa_transcript = output['ipa_transcript']
203
  real_transcripts_ipa = output['real_transcripts_ipa']
204
  end_time = [float(x) for x in output['end_time'].split(" ")]
 
207
  app_logger.debug(f"start splitting recorded audio into {num_words} words...")
208
 
209
  audio_files, audio_durations = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
210
+
211
+ remove_random_file = not IS_TESTING and remove_random_file
212
+ if remove_random_file:
213
+ app_logger.info(f"{IS_TESTING} => remove_random_file:{remove_random_file}, removing:{random_file_name} ...")
214
+ Path(random_file_name).unlink(missing_ok=True)
215
+ app_logger.info(f"removed:{random_file_name} ...")
216
+
217
  output = {'audio_files': audio_files, "audio_durations": audio_durations, **output}
218
  first_audio_file = audio_files[0]
219
+ return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, num_words, first_audio_file, json.dumps(output), random_file_name
220
+
221
+
222
+ def soundfile_write(audiofile: str | Path, data: np.ndarray, samplerate: int) -> None:
223
+ """
224
+ Write audio data to a file using soundfile.
225
 
226
+ Parameters:
227
+ audiofile (str | Path): The path to the audio file.
228
+ data (np.ndarray): The audio data to write.
229
+ samplerate (int): The sample rate of the audio data.
230
 
231
+ Returns:
232
+ None
233
+ """
234
  import soundfile as sf
235
  sf.write(audiofile, data, samplerate)
236
 
237
 
238
+ def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str, str, float]:
239
+ """
240
+ Get the selected word, its audio file, and duration from the recognition output.
241
+
242
+ Parameters:
243
+ idx_recorded_word (int): The index of the recorded word.
244
+ raw_json_output (str): The JSON output from the recognition process.
245
+
246
+ Returns:
247
+ tuple: A tuple containing the audio file, the current word, and its duration.
248
+ """
249
  recognition_output = json.loads(raw_json_output)
250
  list_audio_files = recognition_output["audio_files"]
251
  real_transcripts = recognition_output["real_transcripts"]
 
260
 
261
 
262
  def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float]) -> tuple[list[str], list[float]]:
263
+ """
264
+ Split the audio file into segments based on start and end times.
265
+
266
+ Parameters:
267
+ audiotmpfile (str | Path): The path to the audio file.
268
+ start_time (list[float]): The start times of the segments.
269
+ end_time (list[float]): The end times of the segments.
270
+
271
+ Returns:
272
+ tuple: A tuple containing a list of audio files and their durations.
273
+ """
274
  import soundfile as sf
275
  audio_files = []
276
  audio_durations = []
277
+ app_logger.info(f"start_time:{start_time}, end_time:{end_time} ...")
278
  for n, (start_nth, end_nth) in enumerate(zip(start_time, end_time)):
279
+ # assert start_nth < end_nth, f"start_nth:{start_nth} (index {n}) should be less than end_nth:{end_nth} (start_time:{start_time}, end_time:{end_time})..."
280
  signal_nth, samplerate = soundfile_load(audiotmpfile, offset=start_nth, duration=end_nth - start_nth)
281
  audiofile = get_file_with_custom_suffix(audiotmpfile, f"_part{n}_start{start_nth}_end{end_nth}")
282
  soundfile_write(audiofile=audiofile, data=signal_nth, samplerate=samplerate)
 
289
 
290
 
291
  def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> Path:
292
+ """
293
+ Generate a file path with a custom suffix.
294
+
295
+ Parameters:
296
+ basefile (str | Path): The base file path.
297
+ custom_suffix (str): The custom suffix to add to the file name.
298
+
299
+ Returns:
300
+ Path: The new file path with the custom suffix.
301
+ """
302
  pathname = Path(basefile)
303
  dirname, filename_no_ext, filename_ext = pathname.parent, pathname.stem, pathname.suffix
304
+ output_file = dirname / f"{filename_no_ext}_{custom_suffix}.{filename_ext}"
305
  return output_file
306
 
307
 
308
  # From Librosa
309
 
310
+ def calc_start_end(sr_native: int, time_position: float, n_channels: int) -> int:
311
+ """
312
+ Calculate the start or end position in samples.
313
+
314
+ Parameters:
315
+ sr_native (int): The native sample rate.
316
+ time_position (float): The time position in seconds.
317
+ n_channels (int): The number of audio channels.
318
+
319
+ Returns:
320
+ int: The start or end position in samples.
321
+ """
322
  return int(np.round(sr_native * time_position)) * n_channels
323
 
324
 
325
+ def soundfile_load(path: str | Path, offset: float = 0.0, duration: float = None, dtype=np.float32) -> tuple[np.ndarray, int]:
326
+ """
327
+ Load an audio buffer using soundfile.
328
+
329
+ Parameters:
330
+ path (str | Path): The path to the audio file.
331
+ offset (float): The offset in seconds to start reading the file.
332
+ duration (float): The duration in seconds to read from the file.
333
+ dtype (np.float32): The data type of the audio buffer.
334
+
335
+ Returns:
336
+ tuple: A tuple containing the audio buffer and the sample rate.
337
+ """
338
  import soundfile as sf
339
 
340
  if isinstance(path, sf.SoundFile):
 
361
  return y, sr_native
362
 
363
 
364
+ def audioread_load(path: str | Path, offset: float = 0.0, duration: float = None, dtype=np.float32) -> tuple[np.ndarray, int]:
365
+ """
 
366
  This loads one block at a time, and then concatenates the results.
367
+
368
+ Parameters:
369
+ path (str | Path): The path to the audio file.
370
+ offset (float): The offset in seconds to start reading the file.
371
+ duration (float): The duration in seconds to read from the file.
372
+ dtype (np.float32): The data type of the audio buffer.
373
+
374
+ Returns:
375
+ tuple: A tuple containing the audio buffer and the sample rate.
376
  """
377
  y = []
378
  app_logger.debug(f"reading audio file at path:{path} ...")
 
428
  # From Librosa
429
 
430
 
431
+ def buf_to_float(x: np.ndarray, n_bytes: int = 2, dtype: np.float32 = np.float32) -> np.ndarray:
432
  """Convert an integer buffer to floating point values.
433
  This is primarily useful when loading integer-valued wav data
434
  into numpy arrays.
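A sketch of scoring a recording with the tuple-returning helper used by the Gradio app; `student_recording.wav` is an illustrative path to an existing recording at the configured input sample rate, and the function now returns nine items, the last one being the temporary file name:

```python
import lambdaSpeechToScore

(real_transcripts, letters_correctness, accuracy, ipa_transcript,
 real_transcripts_ipa, num_words, first_word_audio, raw_json,
 tmp_file) = lambdaSpeechToScore.get_speech_to_score_tuple(
    real_text="Hello world",
    file_bytes_or_audiotmpfile="student_recording.wav",  # illustrative path
    language="en",
    remove_random_file=False,
)
print(accuracy)              # e.g. "87.50" (formatted with two decimals)
print(real_transcripts_ipa)  # target sentence in IPA
```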
aip_trainer/lambdas/lambdaTTS.py → lambdaTTS.py RENAMED
@@ -1,12 +1,36 @@
1
- import random
 
 
2
  import tempfile
3
  from pathlib import Path
4
 
5
- from aip_trainer import app_logger
 
 
 
 
 
6
 
7
 
8
  def get_tts(text: str, language: str, tmp_prefix="audio_", tmp_suffix=".wav") -> str:
9
- from aip_trainer.models import models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  if text is None or len(text) == 0:
12
  raise ValueError(f"cannot read an empty/None text: '{text}'...")
@@ -15,11 +39,11 @@ def get_tts(text: str, language: str, tmp_prefix="audio_", tmp_suffix=".wav") ->
15
 
16
  tmp_dir = Path(tempfile.gettempdir())
17
  try:
18
- model, _, speaker, sample_rate = models.silero_tts(
19
  language, output_folder=tmp_dir
20
  )
21
  except ValueError:
22
- model, _, sample_rate, _, _, speaker = models.silero_tts(
23
  language, output_folder=tmp_dir
24
  )
25
  app_logger.info(f"model speaker #0: {speaker} ...")
 
1
+ import base64
2
+ import json
3
+ import os
4
  import tempfile
5
  from pathlib import Path
6
 
7
+ import soundfile as sf
8
+
9
+ import AIModels
10
+ import models
11
+ import utilsFileIO
12
+ from constants import app_logger, sample_rate_resample
13
 
14
 
15
  def get_tts(text: str, language: str, tmp_prefix="audio_", tmp_suffix=".wav") -> str:
16
+ """
17
+ Generate text-to-speech (TTS) audio for the given text and language.
18
+
19
+ Args:
20
+ text (str): The text to be converted to speech.
21
+ language (str): The language of the text. Supported languages are "en" (English) and "de" (German).
22
+ tmp_prefix (str, optional): The filename prefix used for the temporary audio file.
23
+ tmp_suffix (str, optional): The filename suffix (extension) used for the temporary audio file.
24
+
25
+ Returns:
26
+ str: The path to the generated audio file.
27
+
28
+ Raises:
29
+ NotImplementedError: If the provided language is not supported.
30
+
31
+ Notes:
32
+ This function uses the Silero TTS model to generate the audio. The model and speaker are selected based on the provided language.
33
+ """
34
 
35
  if text is None or len(text) == 0:
36
  raise ValueError(f"cannot read an empty/None text: '{text}'...")
 
39
 
40
  tmp_dir = Path(tempfile.gettempdir())
41
  try:
42
+ model, _, speaker, sample_rate = models.__silero_tts(
43
  language, output_folder=tmp_dir
44
  )
45
  except ValueError:
46
+ model, _, sample_rate, _, _, speaker = models.__silero_tts(
47
  language, output_folder=tmp_dir
48
  )
49
  app_logger.info(f"model speaker #0: {speaker} ...")
aip_trainer/models/models.py → models.py RENAMED
@@ -1,11 +1,15 @@
1
  import os
2
  from pathlib import Path
3
- import tempfile
 
4
  import torch
5
  import torch.nn as nn
 
6
  from silero.utils import Decoder
7
 
8
- from aip_trainer import app_logger, sample_rate_start
 
 
9
 
10
 
11
  default_speaker_dict = {
@@ -14,11 +18,92 @@ default_speaker_dict = {
14
  }
15
 
16
 
17
- def silero_tts(language="en", version="latest", output_folder: Path | str = None, **kwargs):
18
- """Silero Text-To-Speech Models
19
- language (str): language of the model, now available are ['ru', 'en', 'de', 'es', 'fr']
20
- Returns a model and a set of utils
21
- Please see https://github.com/snakers4/silero-models for usage examples
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  """
23
  output_folder = Path(output_folder)
24
  current_model_lang = default_speaker_dict[language]
@@ -26,10 +111,10 @@ def silero_tts(language="en", version="latest", output_folder: Path | str = None
26
  if language in default_speaker_dict:
27
  model_id = current_model_lang["model_id"]
28
 
29
- models = get_models(language, output_folder, version, model_type="tts_models")
30
  available_languages = list(models.tts_models.keys())
31
  assert (
32
- language in available_languages
33
  ), f"Language not in the supported list {available_languages}"
34
 
35
  tts_models_lang = models.tts_models[language]
@@ -67,46 +152,95 @@ def silero_tts(language="en", version="latest", output_folder: Path | str = None
67
  return model, symbols, sample_rate, example_text, apply_tts, model_id
68
 
69
 
70
- def silero_stt(
71
- language="en",
72
- version="latest",
73
- jit_model="jit",
74
- output_folder: Path | str = None,
75
- **kwargs,
76
- ):
77
- """Modified Silero Speech-To-Text Model(s) function
78
- language (str): language of the model, now available are ['en', 'de', 'es']
79
- version:
80
- jit_model:
81
- output_folder: needed in case of docker build
82
- Returns a model, decoder object and a set of utils
83
- Please see https://github.com/snakers4/silero-models for usage examples
84
  """
85
- from silero.utils import (
86
- read_audio,
87
- read_batch,
88
- split_into_batches,
89
- prepare_model_input,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- model, decoder = get_latest_model(
93
- language,
94
- output_folder,
95
- version,
96
- model_type="stt_models",
97
- jit_model=jit_model,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  **kwargs,
99
  )
100
- utils = (read_batch, split_into_batches, read_audio, prepare_model_input)
101
-
102
- return model, decoder, utils
103
 
104
 
105
  def init_jit_model(
106
- model_url: str,
107
- device: torch.device = torch.device("cpu"),
108
- output_folder: Path | str = None,
109
- ):
 
 
 
 
 
 
 
 
 
 
 
 
110
  torch.set_grad_enabled(False)
111
 
112
  app_logger.info(
@@ -126,62 +260,49 @@ def init_jit_model(
126
 
127
  if not os.path.isfile(model_path):
128
  app_logger.info(f"downloading model_path: '{model_path}' ...")
129
- torch.hub.download_url_to_file(model_url, model_path, progress=True)
130
  app_logger.info(f"model_path {model_path} downloaded!")
131
  model = torch.jit.load(model_path, map_location=device)
132
  model.eval()
133
  return model, Decoder(model.labels)
134
 
135
 
136
- # second returned type here is the custom class src.silero.utils.Decoder from snakers4/silero-models
137
- def getASRModel(language: str) -> tuple[nn.Module, Decoder]:
138
- tmp_dir = tempfile.gettempdir()
139
- if language == "de":
140
- model, decoder, _ = silero_stt(
141
- language="de", version="v4", jit_model="jit_large", output_folder=tmp_dir
142
- )
143
- elif language == "en":
144
- model, decoder, _ = silero_stt(language="en", output_folder=tmp_dir)
145
- else:
146
- raise NotImplementedError(
147
- "currenty works only for 'de' and 'en' languages, not for '{}'.".format(
148
- language
149
- )
150
- )
151
-
152
- return model, decoder
153
-
154
 
155
- def get_models(language, output_folder, version, model_type):
156
- from omegaconf import OmegaConf
 
 
 
 
157
 
158
- output_folder = (
159
- Path(output_folder)
160
- if output_folder is not None
161
- else Path(os.path.dirname(__file__)).parent.parent
 
 
 
 
162
  )
163
- models_list_file = output_folder / f"latest_silero_model_{language}.yml"
164
- if not os.path.exists(models_list_file):
165
- app_logger.info(
166
- f"model {model_type} yml for '{language}' language, '{version}' version not found, download it in folder {output_folder}..."
167
- )
168
- torch.hub.download_url_to_file(
169
- "https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml",
170
- models_list_file,
171
- progress=False,
172
- )
173
- assert os.path.exists(models_list_file)
174
- return OmegaConf.load(models_list_file)
175
-
176
 
177
- def get_latest_model(language, output_folder, version, model_type, jit_model, **kwargs):
178
- models = get_models(language, output_folder, version, model_type)
179
- available_languages = list(models[model_type].keys())
180
- assert language in available_languages
181
-
182
- model, decoder = init_jit_model(
183
- model_url=models[model_type].get(language).get(version).get(jit_model),
184
- output_folder=output_folder,
185
  **kwargs,
186
  )
187
- return model, decoder
 
 
 
1
  import os
2
  from pathlib import Path
3
+ from typing import Union, Callable
4
+
5
  import torch
6
  import torch.nn as nn
7
+ from omegaconf import DictConfig, ListConfig
8
  from silero.utils import Decoder
9
 
10
+ from AIModels import NeuralASR
11
+ from ModelInterfaces import IASRModel
12
+ from constants import MODEL_NAME_DEFAULT, language_not_implemented, app_logger, sample_rate_start, silero_versions_dict
13
 
14
 
15
  default_speaker_dict = {
 
18
  }
19
 
20
 
21
+ def getASRModel(language: str, model_name: str = MODEL_NAME_DEFAULT) -> IASRModel:
22
+ models_dict = {
23
+ "whisper": __get_model_whisper,
24
+ "faster_whisper": __get_model_faster_whisper,
25
+ "silero": __get_model_silero
26
+ }
27
+ if model_name in models_dict:
28
+ fn = models_dict[model_name]
29
+ return fn(language)
30
+ models_supported = ", ".join(models_dict.keys())
31
+ raise ValueError(f"Model '{model_name}' not implemented. Supported models: {models_supported}.")
32
+
33
+
34
+ def __get_model_whisper(language: str) -> IASRModel:
35
+ from whisper_wrapper import WhisperASRModel
36
+ return WhisperASRModel(language=language)
37
+
38
+
39
+ def __get_model_faster_whisper(language: str) -> IASRModel:
40
+ from faster_whisper_wrapper import FasterWhisperASRModel
41
+ return FasterWhisperASRModel(language=language)
42
+
43
+
44
+ def __get_model_silero(language: str) -> IASRModel:
45
+ import tempfile
46
+ tmp_dir = tempfile.gettempdir()
47
+ if language == "de":
48
+ model, decoder, _ = __silero_stt(
49
+ language="de", version="v4", jit_model="jit_large", output_folder=tmp_dir
50
+ )
51
+ return __eval_apply_neural_asr(model, decoder, language)
52
+ elif language == "en":
53
+ model, decoder, _ = __silero_stt(language="en", output_folder=tmp_dir)
54
+ return __eval_apply_neural_asr(model, decoder, language)
55
+ raise ValueError(language_not_implemented.format(language))
56
+
57
+
58
+ def __eval_apply_neural_asr(model: nn.Module, decoder: Decoder, language: str):
59
+ app_logger.info(f"LOADED silero model language: {language}, version: '{silero_versions_dict[language]}'")
60
+ model.eval()
61
+ app_logger.info(f"EVALUATED silero model language: {language}, version: '{silero_versions_dict[language]}'")
62
+ return NeuralASR(model, decoder)
63
+
64
+
65
+ def getTranslationModel(language: str) -> tuple:
66
+ from transformers import AutoTokenizer
67
+ from transformers import AutoModelForSeq2SeqLM
68
+ if language == 'de':
69
+ model = AutoModelForSeq2SeqLM.from_pretrained(
70
+ "Helsinki-NLP/opus-mt-de-en")
71
+ tokenizer = AutoTokenizer.from_pretrained(
72
+ "Helsinki-NLP/opus-mt-de-en")
73
+ # Cache models to avoid Hugging face processing (not needed now)
74
+ # with open('translation_model_de.pickle', 'wb') as handle:
75
+ # pickle.dump(model, handle)
76
+ # with open('translation_tokenizer_de.pickle', 'wb') as handle:
77
+ # pickle.dump(tokenizer, handle)
78
+ else:
79
+ raise ValueError(language_not_implemented.format(language))
80
+
81
+ return model, tokenizer
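A quick sketch (not part of this commit) of how the (model, tokenizer) pair returned by getTranslationModel('de') could be consumed. Only the function and the Helsinki-NLP/opus-mt-de-en checkpoint come from the diff above; the generate/decode calls are standard transformers usage and the sample sentence is invented.

```python
# Hypothetical consumer of getTranslationModel(); assumes models.py is importable
# from the project root, as laid out in this commit.
from models import getTranslationModel

model, tokenizer = getTranslationModel("de")
batch = tokenizer(["Wie geht es dir?"], return_tensors="pt", padding=True)
generated = model.generate(**batch)  # MarianMT seq2seq generation
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```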
82
+
83
+
84
+ def __silero_tts(language: str = "en", version: str = "latest", output_folder: Path | str = None, **kwargs) -> tuple[nn.Module, str, int, str, dict, Callable, str]:
85
+ """
86
+ Modified function to create instances of Silero Text-To-Speech Models.
87
+ Please see https://github.com/snakers4/silero-models?tab=readme-ov-file#text-to-speech for usage examples.
88
89
+
90
+ Args:
91
+ language (str): Language of the model. Available options are ['ru', 'en', 'de', 'es', 'fr']. Default is 'en'.
92
+ version (str): Version of the model to use. Default is 'latest'.
93
+ output_folder (Path | str): Path to the folder where the model will be saved. Default is None.
94
+ **kwargs: Additional keyword arguments.
95
+ Returns:
96
+ tuple: Depending on the model version and the input arguments, returns a tuple containing:
97
+ - model: The loaded TTS model.
98
+ - symbols (str): The set of symbols used by the model (only for older model versions).
99
+ - sample_rate (int): The sample rate of the model.
100
+ - example_text (str): Example text for the model.
101
+ - speaker (dict): The default speaker settings for the selected language.
102
+ - apply_tts (function): Function to apply TTS (only for older model versions).
103
+ - model_id (str): The model ID (only for older model versions).
104
+
105
+ Raises:
106
+ AssertionError: If the specified language is not in the supported list.
107
  """
108
  output_folder = Path(output_folder)
109
  current_model_lang = default_speaker_dict[language]
 
111
  if language in default_speaker_dict:
112
  model_id = current_model_lang["model_id"]
113
 
114
+ models = __get_models(language, output_folder, version, model_type="tts_models")
115
  available_languages = list(models.tts_models.keys())
116
  assert (
117
+ language in available_languages
118
  ), f"Language not in the supported list {available_languages}"
119
 
120
  tts_models_lang = models.tts_models[language]
 
152
  return model, symbols, sample_rate, example_text, apply_tts, model_id
153
 
154
 
155
+ def __get_models(language: str, output_folder: str | Path, version: str, model_type: str) -> Union[DictConfig, ListConfig]:
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  """
157
+ Retrieve and load the model configuration for a specified language and model type.
158
+
159
+ Args:
160
+ language (str): The language for which the model is required.
161
+ output_folder (str or Path): The folder where the model configuration file should be saved
162
+ version (str): The version of the model.
163
+ model_type (str): The type of the model.
164
+
165
+ Returns:
166
+ OmegaConf: The loaded model configuration.
167
+
168
+ Raises:
169
+ AssertionError: If the model configuration file does not exist after attempting to download it.
170
+
171
+ Notes:
172
+ If the model configuration file does not exist in the specified output folder, it will be downloaded
173
+ from a predefined URL and saved in the output folder.
174
+ """
175
+ from omegaconf import OmegaConf
176
+
177
+ output_folder = (
178
+ Path(output_folder)
179
+ if output_folder is not None
180
+ else Path(os.path.dirname(__file__)).parent.parent
181
  )
182
+ models_list_file = output_folder / f"latest_silero_model_{language}.yml"
183
+ app_logger.info(f"models_list_file:{models_list_file}.")
184
+ if not os.path.exists(models_list_file):
185
+ app_logger.info(
186
+ f"model {model_type} yml for '{language}' language, '{version}' version not found, download it in folder {output_folder}..."
187
+ )
188
+ torch.hub.download_url_to_file(
189
+ "https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml",
190
+ str(models_list_file),
191
+ progress=False,
192
+ )
193
+ assert os.path.exists(models_list_file)
194
+ return OmegaConf.load(models_list_file)
195
 
196
+
197
+ def __get_latest_stt_model(language: str, output_folder: str | Path, version: str, model_type: str, jit_model: str, **kwargs) -> tuple[nn.Module, Decoder]:
198
+ """
199
+ Retrieve the latest Speech-to-Text (STT) model for a given language and model type.
200
+
201
+ Args:
202
+ language (str): The language for which the STT model is required.
203
+ output_folder (str): The directory where the model will be saved.
204
+ version (str): The version of the model to retrieve.
205
+ model_type (str): The model family key in the Silero models.yml (e.g., 'stt_models').
206
+ jit_model (str): The specific JIT model to use.
207
+ **kwargs: Additional keyword arguments to pass to the model initialization function.
208
+
209
+ Returns:
210
+ tuple: A tuple containing the model and the decoder.
211
+
212
+ Raises:
213
+ AssertionError: If the specified language is not available in the model type.
214
+ """
215
+ models = __get_models(language, output_folder, version, model_type)
216
+ available_languages = list(models[model_type].keys())
217
+ assert language in available_languages
218
+
219
+ model, decoder = init_jit_model(
220
+ model_url=models[model_type].get(language).get(version).get(jit_model),
221
+ output_folder=output_folder,
222
  **kwargs,
223
  )
224
+ return model, decoder
 
 
225
 
226
 
227
  def init_jit_model(
228
+ model_url: str,
229
+ device: torch.device = torch.device("cpu"),
230
+ output_folder: Path | str = None,
231
+ ) -> tuple[torch.nn.Module, Decoder]:
232
+ """
233
+ Initialize a JIT model from a given URL.
234
+
235
+ Args:
236
+ model_url (str): The URL to download the model from.
237
+ device (torch.device, optional): The device to load the model on. Defaults to CPU.
238
+ output_folder (Path | str, optional): The folder to save the downloaded model.
239
+ If None, defaults to a 'model' directory in the current file's directory.
240
+
241
+ Returns:
242
+ Tuple[torch.jit.ScriptModule, Decoder]: The loaded JIT model and its corresponding decoder.
243
+ """
244
  torch.set_grad_enabled(False)
245
 
246
  app_logger.info(
 
260
 
261
  if not os.path.isfile(model_path):
262
  app_logger.info(f"downloading model_path: '{model_path}' ...")
263
+ torch.hub.download_url_to_file(model_url, str(model_path), progress=True)
264
  app_logger.info(f"model_path {model_path} downloaded!")
265
  model = torch.jit.load(model_path, map_location=device)
266
  model.eval()
267
  return model, Decoder(model.labels)
268
 
269
 
270
+ def __silero_stt(
271
+ language: str = "en",
272
+ version: str = "latest",
273
+ jit_model: str = "jit",
274
+ output_folder: Path | str = None,
275
+ **kwargs,
276
+ ) -> tuple[nn.Module, Decoder, tuple[Callable, Callable, Callable, Callable]]:
277
+ """
278
+ Modified function to create instances of Silero Speech-To-Text Model(s).
279
+ Please see https://github.com/snakers4/silero-models?tab=readme-ov-file#speech-to-text for usage examples.
 
 
 
 
 
 
 
 
280
 
281
+ Args:
282
+ language (str): Language of the model. Available options are ['en', 'de', 'es'].
283
+ version (str): Version of the model to use. Default is "latest".
284
+ jit_model (str): Type of JIT model to use. Default is "jit".
285
+ output_folder (Path | str, optional): Output folder needed in case of docker build. Default is None.
286
+ **kwargs: Additional keyword arguments.
287
 
288
+ Returns:
289
+ tuple: A tuple containing the model, the decoder object, and a tuple of utility functions (read_batch, split_into_batches, read_audio, prepare_model_input).
290
+ """
291
+ from silero.utils import (
292
+ read_audio,
293
+ read_batch,
294
+ split_into_batches,
295
+ prepare_model_input,
296
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
297
 
298
+ model, decoder = __get_latest_stt_model(
299
+ language,
300
+ output_folder,
301
+ version,
302
+ model_type="stt_models",
303
+ jit_model=jit_model,
 
 
304
  **kwargs,
305
  )
306
+ utils = (read_batch, split_into_batches, read_audio, prepare_model_input)
307
+
308
+ return model, decoder, utils
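To illustrate the backend selection introduced above, here is a minimal usage sketch (not part of the diff). Only getASRModel(), the backend keys whisper/faster_whisper/silero, the FasterWhisperASRModel class name and the ValueError behaviour come from this commit; the chosen language and the invalid backend name are arbitrary.

```python
# Illustrative only: exercising the getASRModel() factory from models.py.
from models import getASRModel

asr = getASRModel("en", model_name="faster_whisper")  # lazily imports faster_whisper_wrapper
print(type(asr).__name__)                             # e.g. FasterWhisperASRModel

try:
    getASRModel("en", model_name="not-a-backend")
except ValueError as err:
    print(err)  # lists the supported models: whisper, faster_whisper, silero
```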
packages.txt DELETED
@@ -1 +0,0 @@
1
- ffmpeg
 
 
pre-requirements.txt DELETED
@@ -1 +0,0 @@
1
- pip
 
 
aip_trainer/pronunciationTrainer.py β†’ pronunciationTrainer.py RENAMED
@@ -5,29 +5,22 @@ import epitran
5
  import numpy as np
6
  import torch
7
 
8
- from . import WordMatching as wm
9
- from . import WordMetrics
10
- from . import app_logger
11
- from .models import AIModels, ModelInterfaces as mi, RuleBasedModels, models as mo
 
 
12
 
13
 
14
- def getTrainer(language: str):
15
-
16
- device = torch.device('cpu')
17
-
18
- model, decoder = mo.getASRModel(language)
19
- model = model.to(device)
20
- model.eval()
21
- asr_model = AIModels.NeuralASR(model, decoder)
22
-
23
  if language == 'de':
24
- epitran_deu_latn = epitran.Epitran('deu-Latn')
25
- phonem_converter = RuleBasedModels.EpitranPhonemConverter(epitran_deu_latn)
26
  elif language == 'en':
27
  phonem_converter = RuleBasedModels.EngPhonemConverter()
28
  else:
29
- raise ValueError('Language not implemented')
30
-
31
  trainer = PronunciationTrainer(asr_model, phonem_converter)
32
 
33
  return trainer
@@ -50,7 +43,7 @@ class PronunciationTrainer:
50
  current_words_pronunciation_accuracy = []
51
  categories_thresholds = np.array([80, 60, 59])
52
 
53
- sampling_rate = 16000
54
 
55
  def __init__(self, asr_model: mi.IASRModel, word_to_ipa_coverter: mi.ITextToPhonemModel) -> None:
56
  self.asr_model = asr_model
@@ -67,22 +60,36 @@ class PronunciationTrainer:
67
 
68
  return audio_transcript, word_locations_in_samples
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  ##################### ASR Functions ###########################
71
 
72
  def processAudioForGivenText(self, recordedAudio: torch.Tensor = None, real_text=None):
73
 
74
  start = time.time()
75
- app_logger.info('starting getAudioTranscript...')
76
- recording_transcript, recording_ipa, word_locations = self.getAudioTranscript(recordedAudio)
77
-
78
- duration = time.time() - start
79
- app_logger.info(f'Time for NN to transcript audio: {duration}.')
80
 
81
  start = time.time()
82
  real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices = self.matchSampleAndRecordedWords(
83
  real_text, recording_transcript)
84
- duration = time.time() - start
85
- app_logger.info(f'Time for matching transcripts: {duration}.')
86
 
87
  start_time, end_time = self.getWordLocationsFromRecordInSeconds(
88
  word_locations, mapped_words_indices)
@@ -104,22 +111,20 @@ class PronunciationTrainer:
104
  def getAudioTranscript(self, recordedAudio: torch.Tensor = None):
105
  current_recorded_audio = recordedAudio
106
 
107
- app_logger.info('starting preprocessAudio...')
108
- current_recorded_audio = self.preprocessAudio(current_recorded_audio)
109
-
110
- app_logger.info('starting processAudio...')
111
  self.asr_model.processAudio(current_recorded_audio)
112
 
113
- app_logger.info('starting getTranscriptAndWordsLocations...')
114
  current_recorded_transcript, current_recorded_word_locations = self.getTranscriptAndWordsLocations(
115
  current_recorded_audio.shape[1])
116
- app_logger.info('starting convertToPhonem...')
117
- current_recorded_ipa = self.ipa_converter.convertToPhonem(current_recorded_transcript)
118
 
119
- app_logger.info('ok, return audio transcript!')
120
  return current_recorded_transcript, current_recorded_ipa, current_recorded_word_locations
121
 
122
- def getWordLocationsFromRecordInSeconds(self, word_locations, mapped_words_indices) -> tuple[str, str]:
 
123
  start_time = []
124
  end_time = []
125
  for word_idx in range(len(mapped_words_indices)):
@@ -135,10 +140,10 @@ class PronunciationTrainer:
135
  def matchSampleAndRecordedWords(self, real_text, recorded_transcript):
136
  words_estimated = recorded_transcript.split()
137
 
138
- if real_text is None:
139
- words_real = self.current_transcript[0].split()
140
- else:
141
  words_real = real_text.split()
 
 
142
 
143
  mapped_words, mapped_words_indices = wm.get_best_mapped_words(
144
  words_estimated, words_real)
@@ -154,7 +159,7 @@ class PronunciationTrainer:
154
  self.ipa_converter.convertToPhonem(mapped_words[word_idx])))
155
  return real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices
156
 
157
- def getPronunciationAccuracy(self, real_and_transcribed_words_ipa) -> tuple[float, list]:
158
  total_mismatches = 0.
159
  number_of_phonemes = 0.
160
  current_words_pronunciation_accuracy = []
@@ -191,4 +196,4 @@ class PronunciationTrainer:
191
  return np.argmin(abs(self.categories_thresholds-accuracy))
192
 
193
  def preprocessAudio(self, audio: torch.tensor) -> torch.tensor:
194
- return preprocessAudioStandalone(audio=audio)
 
5
  import numpy as np
6
  import torch
7
 
8
+ import ModelInterfaces as mi
9
+ import RuleBasedModels
10
+ import WordMatching as wm
11
+ import WordMetrics
12
+ import models as mo
13
+ from constants import app_logger, MODEL_NAME_DEFAULT, sample_rate_resample
14
 
15
 
16
+ def getTrainer(language: str, model_name: str = MODEL_NAME_DEFAULT):
17
+ asr_model = mo.getASRModel(language, model_name=model_name)
 
 
 
 
 
 
 
18
  if language == 'de':
19
+ phonem_converter = RuleBasedModels.EpitranPhonemConverter(epitran.Epitran('deu-Latn'))
 
20
  elif language == 'en':
21
  phonem_converter = RuleBasedModels.EngPhonemConverter()
22
  else:
23
+ raise ValueError(f"Language '{language}' not implemented")
 
24
  trainer = PronunciationTrainer(asr_model, phonem_converter)
25
 
26
  return trainer
 
43
  current_words_pronunciation_accuracy = []
44
  categories_thresholds = np.array([80, 60, 59])
45
 
46
+ sampling_rate = sample_rate_resample
47
 
48
  def __init__(self, asr_model: mi.IASRModel, word_to_ipa_coverter: mi.ITextToPhonemModel) -> None:
49
  self.asr_model = asr_model
 
60
 
61
  return audio_transcript, word_locations_in_samples
62
 
63
+ # def getWordsRelativeIntonation(self, Audio: torch.tensor, word_locations: list):
64
+ # intonations = torch.zeros((len(word_locations), 1))
65
+ # intonation_fade_samples = 0.3*self.sampling_rate
66
+ # app_logger.info(f"intonations.shape: {intonations.shape}.")
67
+ # for word in range(len(word_locations)):
68
+ # intonation_start = int(np.maximum(
69
+ # 0, word_locations[word][0]-intonation_fade_samples))
70
+ # intonation_end = int(np.minimum(
71
+ # Audio.shape[1]-1, word_locations[word][1]+intonation_fade_samples))
72
+ # intonations[word] = torch.sqrt(torch.mean(
73
+ # Audio[0][intonation_start:intonation_end]**2))
74
+ #
75
+ # intonations = intonations/torch.mean(intonations)
76
+ # return intonations
77
+
78
  ##################### ASR Functions ###########################
79
 
80
  def processAudioForGivenText(self, recordedAudio: torch.Tensor = None, real_text=None):
81
 
82
  start = time.time()
83
+ recording_transcript, recording_ipa, word_locations = self.getAudioTranscript(
84
+ recordedAudio)
85
+ time_transcript_audio = time.time() - start
86
+ app_logger.info(f'Time for NN to transcript audio: {time_transcript_audio:.2f}.')
 
87
 
88
  start = time.time()
89
  real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices = self.matchSampleAndRecordedWords(
90
  real_text, recording_transcript)
91
+ time_matching_transcripts = time.time() - start
92
+ app_logger.info(f'Time for matching transcripts: {time_matching_transcripts:.3f}.')
93
 
94
  start_time, end_time = self.getWordLocationsFromRecordInSeconds(
95
  word_locations, mapped_words_indices)
 
111
  def getAudioTranscript(self, recordedAudio: torch.Tensor = None):
112
  current_recorded_audio = recordedAudio
113
 
114
+ current_recorded_audio = self.preprocessAudio(
115
+ current_recorded_audio)
 
 
116
  self.asr_model.processAudio(current_recorded_audio)
117
 
 
118
  current_recorded_transcript, current_recorded_word_locations = self.getTranscriptAndWordsLocations(
119
  current_recorded_audio.shape[1])
120
+ current_recorded_ipa = self.ipa_converter.convertToPhonem(
121
+ current_recorded_transcript)
122
 
123
+ # time.sleep(10000)
124
  return current_recorded_transcript, current_recorded_ipa, current_recorded_word_locations
125
 
126
+ def getWordLocationsFromRecordInSeconds(self, word_locations, mapped_words_indices) -> list:
127
+ app_logger.info(f"len_list: word_locations:{len(word_locations)}, mapped_words_indices:{len(mapped_words_indices)}, {len(word_locations) == len(mapped_words_indices)}...")
128
  start_time = []
129
  end_time = []
130
  for word_idx in range(len(mapped_words_indices)):
 
140
  def matchSampleAndRecordedWords(self, real_text, recorded_transcript):
141
  words_estimated = recorded_transcript.split()
142
 
143
+ try:
 
 
144
  words_real = real_text.split()
145
+ except AttributeError:
146
+ raise ValueError("Real text is None, but should be a string.")
147
 
148
  mapped_words, mapped_words_indices = wm.get_best_mapped_words(
149
  words_estimated, words_real)
 
159
  self.ipa_converter.convertToPhonem(mapped_words[word_idx])))
160
  return real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices
161
 
162
+ def getPronunciationAccuracy(self, real_and_transcribed_words_ipa) -> float:
163
  total_mismatches = 0.
164
  number_of_phonemes = 0.
165
  current_words_pronunciation_accuracy = []
 
196
  return np.argmin(abs(self.categories_thresholds-accuracy))
197
 
198
  def preprocessAudio(self, audio: torch.tensor) -> torch.tensor:
199
+ return preprocessAudioStandalone(audio)
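A hedged usage sketch of the refactored trainer (not taken from the repository's tests): getTrainer(), its new model_name argument, processAudioForGivenText() and the ValueError raised for a missing real_text all appear in the diff above, while the placeholder waveform and sentence are invented.

```python
import torch

import pronunciationTrainer

trainer = pronunciationTrainer.getTrainer("de", model_name="faster_whisper")

recorded = torch.zeros((1, 16000))  # stand-in for a real mono recording tensor
result = trainer.processAudioForGivenText(recorded, real_text="Hallo Welt")

# real_text=None is no longer tolerated: matchSampleAndRecordedWords() now raises
# ValueError("Real text is None, but should be a string.") instead of falling back
# to a previously stored transcript.
```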
requirements-dev.txt CHANGED
@@ -1,3 +1,2 @@
1
- bson
2
  pytest
3
  pytest-cov
 
 
1
  pytest
2
  pytest-cov
requirements-flask.txt DELETED
@@ -1,21 +0,0 @@
1
- audioread
2
- dtwalign
3
- eng_to_ipa
4
- epitran==1.25.1
5
- flask
6
- flask_cors
7
- gunicorn
8
- omegaconf
9
- ortools==9.11.4210
10
- pandas
11
- pickle-mixin
12
- python-dotenv
13
- requests
14
- sentencepiece
15
- silero==0.4.1
16
- soundfile==0.12.1
17
- sqlalchemy
18
- structlog
19
- torch
20
- torchaudio
21
- transformers
 
requirements-gradio.txt DELETED
@@ -1 +0,0 @@
1
- gradio==5.11.0
 
 
requirements.txt CHANGED
@@ -1,19 +1,23 @@
1
- asgi-correlation-id
2
  audioread
3
  dtwalign
4
  eng_to_ipa
5
- epitran==1.25.1
6
- gunicorn
 
 
7
  omegaconf
8
- ortools==9.11.4210
 
9
  pandas
10
  pickle-mixin
11
- python-dotenv
12
  requests
 
13
  sentencepiece
14
- silero==0.4.1
15
- soundfile==0.12.1
 
16
  structlog
17
- torch
18
- torchaudio
 
19
  transformers
 
 
1
  audioread
2
  dtwalign
3
  eng_to_ipa
4
+ epitran
5
+ faster-whisper
6
+ flask
7
+ flask_cors
8
  omegaconf
9
+ openai-whisper
10
+ ortools
11
  pandas
12
  pickle-mixin
 
13
  requests
14
+ sacremoses # suggested by marian translation model
15
  sentencepiece
16
+ silero
17
+ soundfile
18
+ sqlalchemy
19
  structlog
20
+ -f https://download.pytorch.org/whl/torch_stable.html
21
+ torch
22
+ torchaudio
23
  transformers
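Because models.py imports the new ASR backends lazily, a missing package only surfaces when that backend is actually selected. A small sanity check (purely illustrative, not shipped with the repo) can confirm that the modules behind the new requirements resolve:

```python
# Note: the pip names are openai-whisper and faster-whisper; the importable
# module names checked here are whisper and faster_whisper.
import importlib.util

for module in ("whisper", "faster_whisper", "silero", "torch", "torchaudio"):
    found = importlib.util.find_spec(module) is not None
    print(f"{module}: {'ok' if found else 'missing'}")
```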
aip_trainer/utils/session_logger.py β†’ session_logger.py RENAMED
@@ -28,9 +28,9 @@ def drop_color_message_key(_, __, event_dict: EventDict) -> EventDict:
28
 
29
  def setup_logging(json_logs: bool = False, log_level: str = "INFO"):
30
  """Enhance the configuration of structlog.
31
- Needed for correlation id injection with fastapi middleware in samgis-web.
32
- After the use of logging_middleware() in samgis_web.web.middlewares, add also the CorrelationIdMiddleware from
33
- 'asgi_correlation_id' package. (See 'tests/web/test_middlewares.py' in samgis_web).
34
  To change an input parameter like the log level, re-run the function changing the parameter
35
  (no need to re-instantiate the logger instance: it's a hot change)
36
 
 
28
 
29
  def setup_logging(json_logs: bool = False, log_level: str = "INFO"):
30
  """Enhance the configuration of structlog.
31
+ Needed for correlation id injection when FastAPI middleware is used within the app.
32
+ If a logging_middleware() is used in the middlewares module, also add the CorrelationIdMiddleware from the
33
+ 'asgi_correlation_id' package.
34
  To change an input parameter like the log level, re-run the function changing the parameter
35
  (no need to re-instantiate the logger instance: it's a hot change)
36
 
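A minimal sketch of wiring up the relocated logger module, assuming the app imports it as session_logger from the project root. setup_logging() and its defaults come from the file above; the structlog.get_logger() call is standard structlog usage, not taken from this repository.

```python
import structlog

from session_logger import setup_logging

setup_logging(json_logs=False, log_level="DEBUG")  # re-run later to hot-change the level
log = structlog.get_logger(__name__)
log.info("pronunciation trainer started")
```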
static/.gitignore DELETED
@@ -1,3 +0,0 @@
1
- playwright-report/*
2
- node_modules
3
- test-results/*
 
 
 
 
static/.vscode/launch.json DELETED
@@ -1,20 +0,0 @@
1
- {
2
- // Use IntelliSense to learn about possible attributes.
3
- // Hover to view descriptions of existing attributes.
4
- // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
- "version": "0.2.0",
6
- "configurations": [
7
- {
8
- "type": "node",
9
- "request": "launch",
10
- "name": "Launch Program",
11
- "skipFiles": [
12
- "<node_internals>/**"
13
- ],
14
- "program": "${workspaceFolder}/tests/test-1.spec.ts",
15
- "outFiles": [
16
- "${workspaceFolder}/**/*.js"
17
- ]
18
- }
19
- ]
20
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
static/css/{style.css β†’ style-new.css} RENAMED
@@ -2,6 +2,21 @@ body {
2
  background: #f2f2f2;
3
  }
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  .expanded {
7
  margin: auto;
@@ -18,7 +33,13 @@ h1 {
18
 
19
  a.disabled {
20
  pointer-events: none;
21
- color: #ccc;
 
 
 
 
 
 
22
  background-color: #ccc;
23
  }
24
 
@@ -29,6 +50,31 @@ a.disabled {
29
  display: flex;
30
  }
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  /* ############## Next button ##### */
33
  .button-next {
34
  border-radius: 4px;
@@ -40,11 +86,10 @@ a.disabled {
40
  box-sizing: border-box;
41
  position: absolute;
42
  top: 0;
43
- left: 0%;
44
- right: 2%;
45
- bottom: 2%;
46
  background-color: #58636d;
47
- width: 10em;
 
48
 
49
  transition: all 0.5s;
50
  cursor: pointer;
@@ -127,41 +172,24 @@ a.disabled {
127
  display: block;
128
  position: absolute;
129
  left: 2%;
130
- top: 15%;
131
- transform: translate(-0%, -0%);
132
- height: 45%;
133
- width: 96%;
134
- max-width: 96%;
135
- background: #ffff;
136
- overflow: hidden;
137
- border-radius: 20px;
138
- box-shadow: 0 0 20px 8px #d0d0d0;
139
- }
140
-
141
- .container2 {
142
- display: block;
143
- position: absolute;
144
- left: 2%;
145
- top: 63%;
146
  transform: translate(-0%, -0%);
147
- height: 10%;
148
  width: 96%;
149
  max-width: 96%;
150
  background: #ffff;
151
  overflow: hidden;
152
  border-radius: 20px;
153
  box-shadow: 0 0 20px 8px #d0d0d0;
154
- overflow: scroll;
155
- max-height: 15%;
156
  }
157
 
158
  .container-small {
159
  position: fixed;
160
- left: 73%;
161
- top: 95%;
162
  transform: translate(-0%, -0%);
163
- height: 4%;
164
- width: 15%;
165
  background: #ffff;
166
  overflow: hidden;
167
  border-radius: 20px;
@@ -238,6 +266,17 @@ a.disabled {
238
  font-size: 3.5em !important;
239
  }
240
 
 
 
 
 
 
 
 
 
 
 
 
241
  .mic-button-div {
242
  position: fixed;
243
  left: 50%;
@@ -349,6 +388,75 @@ a.disabled {
349
  width: 100%;
350
  }
351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  /* ############ Links and credits ####*/
353
 
354
  .link-icon-div {
@@ -362,7 +470,7 @@ a.disabled {
362
  .credits-icon-div {
363
  position: fixed;
364
  left: 90.5%;
365
- top: 96%;
366
  font-size: x-small;
367
  }
368
 
@@ -401,9 +509,9 @@ a.disabled {
401
  display: block;
402
  position: absolute;
403
  left: 2%;
404
- top: 15%;
405
  transform: translate(-0%, -0%);
406
- height: 85%;
407
  width: 96%;
408
  max-width: 96%;
409
  background: #ffff;
@@ -412,23 +520,6 @@ a.disabled {
412
  box-shadow: 0 0 20px 8px #d0d0d0;
413
  }
414
 
415
- .container2 {
416
- display: block;
417
- position: absolute;
418
- left: 2%;
419
- top: 63%;
420
- transform: translate(-0%, -0%);
421
- height: 10%;
422
- width: 96%;
423
- max-width: 96%;
424
- background: #ffff;
425
- overflow: hidden;
426
- border-radius: 20px;
427
- box-shadow: 0 0 20px 8px #d0d0d0;
428
- overflow: scroll;
429
- max-height: 15%;
430
- }
431
-
432
  .icon-text {
433
  font-size: 0.8em !important;
434
  text-align: center;
@@ -445,7 +536,7 @@ a.disabled {
445
  /* 80px */
446
  height: 3.5em;
447
  padding-top: 0.4em;
448
- left: 50%;
449
  line-height: 0px;
450
  border: 6px solid #fff;
451
  border-radius: 50%;
@@ -460,7 +551,7 @@ a.disabled {
460
 
461
  .mic-button-div {
462
  position: fixed;
463
- left: 50%;
464
  top: 80%
465
  }
466
 
@@ -502,4 +593,4 @@ a.disabled {
502
  font-size: 0.8em;
503
  }
504
 
505
- }
 
2
  background: #f2f2f2;
3
  }
4
 
5
+ .flex {
6
+ display: flex;
7
+ }
8
+
9
+ .text-align-center {
10
+ text-align: center;
11
+ }
12
+
13
+ .display-block {
14
+ display: block;
15
+ }
16
+
17
+ .display-inline-block {
18
+ display: inline-block;
19
+ }
20
 
21
  .expanded {
22
  margin: auto;
 
33
 
34
  a.disabled {
35
  pointer-events: none;
36
+ color: black;
37
+ background-color: #ccc;
38
+ }
39
+
40
+ .color-disabled {
41
+ pointer-events: none;
42
+ color: black;
43
  background-color: #ccc;
44
  }
45
 
 
50
  display: flex;
51
  }
52
 
53
+ .darkgreen {
54
+ color: white;
55
+ background-color: #467387;
56
+ }
57
+
58
+ /* text button */
59
+ .text-button {
60
+ border: none;
61
+ text-align: center;
62
+ text-decoration: none;
63
+ display: inline-block;
64
+ font-size: 16px;
65
+ margin: 4px 2px;
66
+ height: fit-content;
67
+ width: 4em;
68
+ }
69
+ .text-button-div {
70
+ position: absolute;
71
+ top: 38%;
72
+ }
73
+ #input-uploader-audio-file {
74
+ width: 100px;
75
+ white-space: normal;
76
+ }
77
+
78
  /* ############## Next button ##### */
79
  .button-next {
80
  border-radius: 4px;
 
86
  box-sizing: border-box;
87
  position: absolute;
88
  top: 0;
89
+ left: 94%;
 
 
90
  background-color: #58636d;
91
+ width: 6%;
92
+ height: 100%;
93
 
94
  transition: all 0.5s;
95
  cursor: pointer;
 
172
  display: block;
173
  position: absolute;
174
  left: 2%;
175
+ top: 18%;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  transform: translate(-0%, -0%);
177
+ height: 59%;
178
  width: 96%;
179
  max-width: 96%;
180
  background: #ffff;
181
  overflow: hidden;
182
  border-radius: 20px;
183
  box-shadow: 0 0 20px 8px #d0d0d0;
 
 
184
  }
185
 
186
  .container-small {
187
  position: fixed;
188
+ left: 68%;
189
+ top: 79%;
190
  transform: translate(-0%, -0%);
191
+ height: 7%;
192
+ width: 30%;
193
  background: #ffff;
194
  overflow: hidden;
195
  border-radius: 20px;
 
266
  font-size: 3.5em !important;
267
  }
268
 
269
+ .form-audio-file {
270
+ position: fixed;
271
+ left: 25%;
272
+ top: 82%;
273
+ }
274
+ .form-audio-file-label {
275
+ position: fixed;
276
+ left: 25%;
277
+ top: 86%;
278
+ }
279
+
280
  .mic-button-div {
281
  position: fixed;
282
  left: 50%;
 
388
  width: 100%;
389
  }
390
 
391
+ /* ############ checkbox for using DTW */
392
+ .container-dtw-div {
393
+ position: absolute;
394
+ top: 60%;
395
+ }
396
+ .container-label-dtw {
397
+ padding-left: 35px;
398
+ cursor: pointer;
399
+ font-size: 2em;
400
+ -webkit-user-select: none;
401
+ -moz-user-select: none;
402
+ -ms-user-select: none;
403
+ user-select: none;
404
+ }
405
+
406
+ /* Hide the browser's default checkbox */
407
+ .container-label-dtw input {
408
+ position: absolute;
409
+ opacity: 0;
410
+ cursor: pointer;
411
+ height: 0;
412
+ width: 0;
413
+ }
414
+
415
+ /* Create a custom checkbox */
416
+ .checkmark {
417
+ position: absolute;
418
+ margin-top: 0.4em;
419
+ left: 0;
420
+ height: 25px;
421
+ width: 25px;
422
+ background-color: #eee;
423
+ }
424
+
425
+ /* On mouse-over, add a grey background color */
426
+ .container:hover input ~ .checkmark {
427
+ background-color: #ccc;
428
+ }
429
+
430
+ /* When the checkbox is checked, add a blue background */
431
+ .container input:checked ~ .checkmark {
432
+ background-color: #467387;
433
+ }
434
+
435
+ /* Create the checkmark/indicator (hidden when not checked) */
436
+ .checkmark:after {
437
+ content: "";
438
+ position: absolute;
439
+ display: none;
440
+ }
441
+
442
+ /* Show the checkmark when checked */
443
+ .container input:checked ~ .checkmark:after {
444
+ display: block;
445
+ }
446
+
447
+ /* Style the checkmark/indicator */
448
+ .container .checkmark:after {
449
+ left: 9px;
450
+ top: 5px;
451
+ width: 5px;
452
+ height: 10px;
453
+ border: solid white;
454
+ border-width: 0 3px 3px 0;
455
+ -webkit-transform: rotate(45deg);
456
+ -ms-transform: rotate(45deg);
457
+ transform: rotate(45deg);
458
+ }
459
+
460
  /* ############ Links and credits ####*/
461
 
462
  .link-icon-div {
 
470
  .credits-icon-div {
471
  position: fixed;
472
  left: 90.5%;
473
+ top: 95%;
474
  font-size: x-small;
475
  }
476
 
 
509
  display: block;
510
  position: absolute;
511
  left: 2%;
512
+ top: 22%;
513
  transform: translate(-0%, -0%);
514
+ height: 55%;
515
  width: 96%;
516
  max-width: 96%;
517
  background: #ffff;
 
520
  box-shadow: 0 0 20px 8px #d0d0d0;
521
  }
522
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  .icon-text {
524
  font-size: 0.8em !important;
525
  text-align: center;
 
536
  /* 80px */
537
  height: 3.5em;
538
  padding-top: 0.4em;
539
+ left: 40%;
540
  line-height: 0px;
541
  border: 6px solid #fff;
542
  border-radius: 50%;
 
551
 
552
  .mic-button-div {
553
  position: fixed;
554
+ left: 40%;
555
  top: 80%
556
  }
557
 
 
593
  font-size: 0.8em;
594
  }
595
 
596
+ }