# alessandro trinca tornidor
# feat: port whisper and faster-whisper support from https://github.com/Thiagohgl/ai-pronunciation-trainer
# 85b7206
"""Various utilities (Serialize objects, time benchmark, args dump, numerical and stats info)"""
from pathlib import Path
from typing import Mapping
import numpy as np
from constants import app_logger
def serialize(obj: any, include_none: bool = False):
    """
    Convert the input object into a serializable representation.

    Args:
        obj: object to convert
        include_none: when True, keys/attributes whose value is None are kept
            while converting dicts and objects; when False they are dropped

    Returns:
        the converted object, as produced by _serialize()
    """
    serializable = _serialize(obj, include_none)
    return serializable
def _serialize(obj: object, include_none: bool):
    """
    Recursively convert obj into a serializable value.

    Numpy scalars become the matching Python scalar, numpy arrays become
    (nested) lists, lists and tuples become lists of converted elements,
    bytes and exceptions become dicts, and anything else is converted
    through _serialize_object().

    Args:
        obj: value to convert
        include_none: forwarded to nested conversions; when True, keys/attributes
            whose value is None are kept

    Returns:
        the converted value. If the conversion raises, the error is logged and a
        string describing the object and its type is returned instead.
    """
    from numpy import bool_ as np_bool, floating as np_floating, integer as np_integer, ndarray as np_ndarray

    primitive = (int, float, str, bool)
    try:
        if obj is None:
            return None
        if isinstance(obj, np_bool):
            # numpy booleans are neither np.integer nor Python bool: convert them explicitly
            return bool(obj)
        if isinstance(obj, np_integer):
            return int(obj)
        if isinstance(obj, np_floating):
            # checked before the primitive types because np.float64 is also a Python float
            return float(obj)
        if isinstance(obj, np_ndarray):
            return obj.tolist()
        if isinstance(obj, primitive):
            return obj
        if type(obj) is list:
            return _serialize_list(obj, include_none)
        if type(obj) is tuple:
            # convert every element like the list branch does, so numpy scalars,
            # bytes or nested containers inside a tuple do not leak through as-is
            return [_serialize(elem, include_none) for elem in obj]
        if type(obj) is bytes:
            return _serialize_bytes(obj)
        if isinstance(obj, Exception):
            return _serialize_exception(obj)
        return _serialize_object(obj, include_none)
    except Exception as e_serialize:
        # best effort: never propagate, fall back to a descriptive string
        app_logger.error(f"e_serialize::{e_serialize}, type_obj:{type(obj)}, obj:{obj}.")
        return f"object_name:{str(obj)}__object_type_str:{str(type(obj))}."
def _serialize_object(obj: object, include_none: bool) -> dict:
    """
    Convert a mapping (through its items) or a generic object (through its
    instance attributes) into a dict of converted values.

    Args:
        obj: a mapping, or an object exposing __dict__
        include_none: when False, entries whose value is None are dropped

    Returns:
        dict with the same keys (or attribute names), each value converted by _serialize()

    Raises:
        AttributeError: if obj is not a mapping and has no __dict__
    """
    if isinstance(obj, Mapping):
        # dict subclasses and other mappings (e.g. OrderedDict, defaultdict) keep
        # their data in their items, not in __dict__
        entries = list(obj.items())
    else:
        entries = [(name, getattr(obj, name)) for name in obj.__dict__.keys()]
    return {
        key: _serialize(value, include_none)
        for key, value in entries
        if include_none or value is not None
    }
def _serialize_list(ls: list, include_none: bool) -> list:
    """
    Convert every element of a list with _serialize().

    Args:
        ls: list to convert
        include_none: forwarded unchanged to _serialize() for each element

    Returns:
        new list holding the converted elements, in the same order
    """
    converted = []
    for item in ls:
        converted.append(_serialize(item, include_none))
    return converted
def _serialize_bytes(b: bytes) -> dict[str, str]:
    """
    Wrap raw bytes into a dict carrying their base64 text representation.

    Args:
        b: bytes to encode

    Returns:
        dict with the base64-encoded content (ascii text) under "value" and the
        literal tag "bytes" under "type"
    """
    from base64 import b64encode

    return {"value": str(b64encode(b), "ascii"), "type": "bytes"}
def _serialize_exception(e: Exception) -> dict[str, str]:
    """
    Describe an exception as a dict.

    Args:
        e: exception to describe

    Returns:
        dict with the exception message under "msg", the string form of its class
        under "type", plus every instance attribute found in e.__dict__ (an
        attribute named "msg" or "type" replaces the default value)
    """
    described = {"msg": str(e), "type": str(type(e))}
    # instance attributes are merged last, so they win over the two default keys
    described.update(e.__dict__)
    return described
def hash_calculate(arr_or_path: np.ndarray | str | Path, is_file: bool, read_mode: str = "rb") -> str | bytes:
"""
Return computed hash from input variable (typically a numpy array).
Args:
arr_or_path: variable to hash (numpy array, string, Path-like object, dict, bytes)
is_file: read the variable from a file
read_mode: used when is_file is True to read the file in binary or text mode
Returns:
computed hash from input variable
"""
from hashlib import sha256
from base64 import b64encode
from numpy import ndarray as np_ndarray
if is_file:
with open(arr_or_path, read_mode) as file_to_check:
# read contents of the file
arr_or_path = file_to_check.read()
# # pipe contents of the file through
# try:
# return hashlib.sha256(data).hexdigest()
# except TypeError:
# app_logger.warning(
# f"TypeError, re-try encoding arg:{arr_or_path},type:{type(arr_or_path)}."
# )
# return hashlib.sha256(data.encode("utf-8")).hexdigest()
if isinstance(arr_or_path, np_ndarray):
hash_fn = sha256(arr_or_path.data)
elif isinstance(arr_or_path, dict):
import json
serialized = serialize(arr_or_path)
variable_to_hash = json.dumps(serialized, sort_keys=True).encode("utf-8")
hash_fn = sha256(variable_to_hash)
elif isinstance(arr_or_path, str):
try:
hash_fn = sha256(arr_or_path)
except TypeError:
app_logger.error(
f"TypeError, re-try encoding arg:{arr_or_path},type:{type(arr_or_path)}."
)
hash_fn = sha256(arr_or_path.encode("utf-8"))
elif isinstance(arr_or_path, bytes):
hash_fn = sha256(arr_or_path)
else:
raise ValueError(
f"variable 'arr':{arr_or_path} of type '{type(arr_or_path)}' not yet handled."
)
return b64encode(hash_fn.digest())