from fastapi import FastAPI, File, Form
import datetime
import time
import torch
import os
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM, AutoConfig
from huggingface_hub import hf_hub_download
from fuzzywuzzy import fuzz
from utils import ffmpeg_read, query_dummy, query_raw, find_different

## config
API_TOKEN = os.environ["API_TOKEN"]
MODEL_PATH = os.environ["MODEL_PATH"]
PITCH_PATH = os.environ["PITCH_PATH"]
QUANTIZED_MODEL_PATH = hf_hub_download(repo_id=MODEL_PATH, filename='quantized_model.pt', token=API_TOKEN)
QUANTIZED_PITCH_MODEL_PATH = hf_hub_download(repo_id=PITCH_PATH, filename='quantized_model.pt', token=API_TOKEN)

## word preprocessor
processor_with_lm = Wav2Vec2ProcessorWithLM.from_pretrained(MODEL_PATH, use_auth_token=API_TOKEN)
processor = Wav2Vec2Processor.from_pretrained(MODEL_PATH, use_auth_token=API_TOKEN)

### quantized word model
# Rebuild the architecture from its config, apply dynamic int8 quantization to
# the Linear layers, then load the pre-quantized weights. Dynamic quantization
# runs on CPU, so the checkpoint is mapped there explicitly.
config = AutoConfig.from_pretrained(MODEL_PATH, use_auth_token=API_TOKEN)
dummy_model = Wav2Vec2ForCTC(config)
quantized_model = torch.quantization.quantize_dynamic(dummy_model, {torch.nn.Linear}, dtype=torch.qint8, inplace=True)
quantized_model.load_state_dict(torch.load(QUANTIZED_MODEL_PATH, map_location="cpu"))

## pitch preprocessor
processor_pitch = Wav2Vec2Processor.from_pretrained(PITCH_PATH, use_auth_token=API_TOKEN)

### quantized pitch model
config = AutoConfig.from_pretrained(PITCH_PATH, use_auth_token=API_TOKEN)
dummy_pitch_model = Wav2Vec2ForCTC(config)
quantized_pitch_model = torch.quantization.quantize_dynamic(dummy_pitch_model, {torch.nn.Linear}, dtype=torch.qint8, inplace=True)
quantized_pitch_model.load_state_dict(torch.load(QUANTIZED_PITCH_MODEL_PATH, map_location="cpu"))

app = FastAPI()


@app.get("/")
def read_root():
    return {"Message": "Application startup complete"}
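
# Illustrative warm-up hook (an addition, not part of the original app): the
# first forward pass through a freshly loaded model can pay one-time setup
# costs, so running one second of silence at startup keeps the first real
# request fast. The 16000-sample buffer matches the 16 kHz rate used below.
@app.on_event("startup")
def warm_up():
    silent = np.zeros(16000, dtype=np.float32)
    inputs = processor(silent, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        quantized_model(inputs.input_values)
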
@app.post("/naomi_api_score/")
async def predict(
    file: bytes = File(...),
    word: str = Form(...),
    pitch: str = Form("None"),
    temperature: int = Form(...),
):
    """
    Transform the input audio, get text and pitch predictions from the models,
    and compute a Levenshtein distance score against the target word and pitch.

    Parameters
    ----------
    file : bytes
        input audio file
    word : str
        target hiragana word used to compute the word score
    pitch : str
        target pitch pattern used to compute the pitch score ("None" skips pitch scoring)
    temperature : int
        difficulty of the AI model

    Returns
    -------
    timestamp : str
        current time, Year-Month-Day-Hours:Minutes:Seconds
    running_time : str
        running time in seconds
    error message : str
        error message, if any
    audio duration : float
        duration of the source audio in seconds
    word predict : str
        predicted text
    pitch predict : str
        predicted pitch pattern
    wrong word index : str (e.g. "100")
        per-character mismatch mask of the prediction against the target word
    wrong pitch index : str (e.g. "100")
        per-character mismatch mask of the prediction against the target pitch
    score : int
        Levenshtein distance score combining word and pitch
    debug : float
        sum of absolute sample values, useful for spotting silent uploads
    """
    upload_audio = ffmpeg_read(file, sampling_rate=16000)
    # Cast to a plain float so the response stays JSON-serializable.
    debug = float(np.sum(np.abs(upload_audio)))
    audio_duration = len(upload_audio) / 16000
    current_time = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    start_time = time.time()

    error_message, score = None, None
    # Only compare lengths when a pitch target was actually supplied; the
    # default "None" sentinel would otherwise trigger a spurious error.
    if pitch != "None" and len(word) != len(pitch):
        error_message = "Length of word and pitch input is not equal"

    word_preds = query_raw(upload_audio, word, processor, processor_with_lm, quantized_model, temperature=temperature)
    pitch_preds = None
    if pitch != "None":
        pitch_preds = query_dummy(upload_audio, processor_pitch, quantized_pitch_model)

    # find the best word candidate
    word_score_list = [fuzz.ratio(word, word_predict[0]) for word_predict in word_preds]
    word_score = max(word_score_list)
    best_word_predict = word_preds[word_score_list.index(word_score)][0]
    wrong_word = find_different(word, best_word_predict)  # mark mismatched characters

    # find the best pitch candidate; fall back to word-only scoring when no
    # pitch target was given or the pitch model returned nothing
    best_pitch_predict, wrong_pitch = None, None
    if pitch != "None" and pitch_preds is not None:
        best_pitch_predict = pitch_preds.replace(" ", "")
        # Pad with "1" or truncate so the pitch string matches the word length.
        if len(best_pitch_predict) < len(best_word_predict):
            best_pitch_predict = best_pitch_predict + "1" * (len(best_word_predict) - len(best_pitch_predict))
        else:
            best_pitch_predict = best_pitch_predict[:len(best_word_predict)]
        pitch_score = fuzz.ratio(pitch, best_pitch_predict)
        score = int((word_score * 2 + pitch_score) / 3)  # word counts double vs. pitch
        wrong_pitch = find_different(pitch, best_pitch_predict)  # mark mismatched pitches
    else:
        score = int(word_score)

    return {"timestamp": current_time,
            "running_time": f"{round(time.time() - start_time, 4)} s",
            "error message": error_message,
            "audio duration": audio_duration,
            "word predict": best_word_predict,
            "pitch predict": best_pitch_predict,
            "wrong word index": wrong_word,
            "wrong pitch index": wrong_pitch,
            "score": score,
            "debug": debug,
            }
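
# Example client call (a sketch, not part of the service): "sample.wav", the
# localhost URL, and the word/pitch targets below are placeholder assumptions.
# The pitch string must have the same length as the word. Run the server with
# uvicorn first; this block only executes when the file is invoked directly.
if __name__ == "__main__":
    import requests

    with open("sample.wav", "rb") as f:
        response = requests.post(
            "http://localhost:8000/naomi_api_score/",
            files={"file": ("sample.wav", f, "audio/wav")},
            data={"word": "こんにちは", "pitch": "01111", "temperature": 1},
        )
    print(response.json())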