naomi-app-api / utils.py
vumichien's picture
Create utils.py
532a2ea
raw
history blame
4.96 kB
import subprocess
import numpy as np
import requests
import json
from typing import Dict, List
import random
import torch
from joblib import Parallel, delayed
import os
def random_runner(target_prob, size):
indice = random.choices(range(0, size[1]), k=size[0])
value = target_prob[range(len(indice)), indice].sum().detach().numpy().item()
return indice, value
def query(data, model_id, api_token) -> Dict:
"""
Helper function to query text from audio file by huggingface api inference.
"""
headers = {"Authorization": f"Bearer {api_token}"}
api_url = f"https://api-inference.huggingface.co/models/{model_id}"
response = requests.request("POST", api_url, headers=headers, data=data)
return json.loads(response.content.decode("utf-8"))
def query_process(filename, model_id, api_token) -> Dict:
"""
Helper function to query text from audio file by huggingface api inference.
"""
headers = {"Authorization": f"Bearer {api_token}"}
api_url = f"https://api-inference.huggingface.co/models/{model_id}"
with open(filename, "rb") as f:
data = f.read()
response = requests.request("POST", api_url, headers=headers, data=data)
return json.loads(response.content.decode("utf-8"))
def query_dummy(raw_data, processor, model):
inputs = processor(raw_data, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
return transcription[0]
def query_raw(raw_data, word, processor, processor_with_lm, model, temperature=15) -> List:
"""
Helper function to query draw file to huggingface api inference.
"""
input_values = processor(raw_data, sampling_rate=16000, return_tensors="pt").input_values
with torch.no_grad():
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
top1_prediction = processor_with_lm.decode(logits[0].cpu().numpy())['text']
if word != top1_prediction.replace(" ", ""):
pad_token_id = processor.tokenizer.pad_token_id
word_delimiter_token_id = processor.tokenizer.word_delimiter_token_id
value_top5, ind_top5 = torch.topk(logits, 3)
target_index = ind_top5[(predicted_ids != word_delimiter_token_id) & (predicted_ids != pad_token_id)]
target_prob = value_top5[(predicted_ids != word_delimiter_token_id) & (predicted_ids != pad_token_id)]
size = target_index.size()
trial = size[1]**4//2
prediction_list = Parallel(n_jobs=1, backend="multiprocessing")(
delayed(random_runner)(target_prob, size) for _ in range(trial)
)
target_dict = {i[1]: i[0] for i in prediction_list}
target_dict = sorted(target_dict.items(), reverse=True)
results = {}
for top_pred in target_dict[:temperature]:
indices = top_pred[1]
output_sentence = processor.decode(target_index[range(size[0]), indices]).lower()
results[output_sentence] = top_pred[0]
results = sorted(results.items(), key=lambda x: x[1], reverse=True)
return results
else:
return [(word, 100)]
def find_different(target, prediction):
# target_word = set(target)
# prediction_word = set(prediction)
# difference = target_word.symmetric_difference(prediction_word)
# wrong_words = [word for word in target_word if word in list(difference)]
if len(target) != len(prediction):
target = target[:len(prediction)]
wrong_words = [str(1) if target[index] != prediction[index] else str(0) for index in range(len(target))]
return "".join(wrong_words)
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
"""
Helper function to read an audio file through ffmpeg.
"""
ar = f"{sampling_rate}"
ac = "1"
format_for_conversion = "f32le"
ffmpeg_command = [
"ffmpeg",
"-i",
"pipe:0",
"-ac",
ac,
"-ar",
ar,
"-f",
format_for_conversion,
"-hide_banner",
"-loglevel",
"quiet",
"pipe:1",
]
try:
ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
except FileNotFoundError:
raise ValueError("ffmpeg was not found but is required to load audio files from filename")
output_stream = ffmpeg_process.communicate(bpayload)
out_bytes = output_stream[0]
audio = np.frombuffer(out_bytes, np.float32)
# if audio.shape[0] == 0:
# raise ValueError("Malformed soundfile")
return audio
def get_model_size(model):
torch.save(model.state_dict(), 'temp_saved_model.pt')
model_size_in_mb = os.path.getsize('temp_saved_model.pt') >> 20
os.remove('temp_saved_model.pt')
return model_size_in_mb