Spaces:

Detomo
/

naomi-app-api

Runtime error

App Files Files Community

vumichien committed on Feb 15, 2023

Commit

4c7e2b8

1 Parent(s): f08373d

Update main.py

Browse files

Files changed (1) hide show

main.py +107 -1

main.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from fastapi import FastAPI
 import datetime
 import torch
 import os
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM, AutoConfig
@@ -39,4 +40,109 @@ app = FastAPI()
 @app.get("/")
 def read_root():
-    return {"Message": "Application startup complete"}

 from fastapi import FastAPI
 import datetime
+import time
 import torch
 import os
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM, AutoConfig
 @app.get("/")
 def read_root():
+    return {"Message": "Application startup complete"}
@app.post("/naomi_api_score/")
async def predict(
        file: bytes = File(...),
        word: str = Form(...),
        pitch: str = Form("None"),
        temperature: int = Form(...),
):
    """Score an uploaded pronunciation recording against a target word and pitch.

    The audio is decoded at 16 kHz, candidate transcriptions are obtained from
    ``query_raw`` and (when a target pitch is supplied) a pitch string from
    ``query_dummy``. The candidate closest to ``word`` according to
    ``fuzz.ratio`` is kept, and the final score combines word and pitch
    similarity.

    Parameters
    ----------
    file : bytes
        Raw bytes of the uploaded audio file.
    word : str
        Target word (hiragana, per the original author's notes) used for the
        word score.
    pitch : str
        Target pitch pattern. The literal string ``"None"`` (the default)
        means that no pitch scoring is requested.
    temperature : int
        Forwarded unchanged to ``query_raw``; described by the original author
        as the difficulty of the AI model.

    Returns
    -------
    dict
        ``"timestamp"`` : str
            Request time formatted as Year-Month(abbreviated)-Day-HH:MM:SS.
        ``"running_time"`` : str
            Seconds spent after audio decoding, rounded to 4 places, with a
            trailing ``" s"``.
        ``"error message"`` : str or None
            Description of an input or prediction problem, if any.
        ``"audio duration"`` : float
            Length of the decoded audio in seconds.
        ``"word predict"`` : str or None
            Best matching predicted text.
        ``"pitch predict"`` : str or None
            Predicted pitch, padded/truncated to the predicted word length.
        ``"wrong word index"`` : result of ``find_different`` or None
            Mismatch between the target word and the predicted word.
        ``"wrong pitch index"`` : result of ``find_different`` or None
            Mismatch between the target pitch and the predicted pitch.
        ``"score"`` : int or None
            ``(2 * word_score + pitch_score) / 3`` when pitch is scored, the
            word score alone when no pitch was requested, ``None`` when a
            required prediction is missing.
    """
    sampling_rate = 16000
    upload_audio = ffmpeg_read(file, sampling_rate=sampling_rate)
    audio_duration = len(upload_audio) / sampling_rate
    # "%b" is the documented abbreviated-month directive; "%h" is only a
    # platform-dependent alias for it and is not available everywhere.
    current_time = datetime.datetime.now().strftime("%Y-%b-%d-%H:%M:%S")
    # Monotonic clock: elapsed time must not be affected by system clock changes.
    start_time = time.perf_counter()

    has_pitch = pitch != "None"

    # Every response field gets a default so that no branch below can leave a
    # name unbound when the response dictionary is built.
    error_message = None
    score = None
    best_word_predict = None
    wrong_word = None
    best_pitch_predict = None
    wrong_pitch = None

    # Comparing lengths only makes sense when a real pitch string was sent;
    # otherwise the sentinel "None" would be measured against the word.
    if has_pitch and len(word) != len(pitch):
        error_message = "Length of word and pitch input is not equal"

    word_preds = query_raw(upload_audio, word, processor, processor_with_lm,
                           quantized_model, temperature=temperature)
    pitch_preds = None
    if has_pitch:
        pitch_preds = query_dummy(upload_audio, processor_pitch, quantized_pitch_model)

    # Find the candidate transcription closest to the target word.
    word_scores = [fuzz.ratio(word, candidate[0]) for candidate in word_preds]
    if word_scores:
        word_score = max(word_scores)
        # index() returns the first occurrence, so ties keep the earliest candidate.
        best_word_predict = word_preds[word_scores.index(word_score)][0]
        wrong_word = find_different(word, best_word_predict)

        if not has_pitch:
            score = int(word_score)
        elif pitch_preds is not None:
            target_len = len(best_word_predict)
            # Force the pitch string to the predicted word's length:
            # truncate when too long, pad with "1" when too short.
            best_pitch_predict = pitch_preds.replace(" ", "")[:target_len].ljust(target_len, "1")
            pitch_score = fuzz.ratio(pitch, best_pitch_predict)
            # The word similarity counts twice as much as the pitch similarity.
            score = int((word_score * 2 + pitch_score) / 3)
            wrong_pitch = find_different(pitch, best_pitch_predict)
        elif error_message is None:
            # Pitch scoring was requested but no pitch prediction came back.
            error_message = "Pitch prediction is not available"
    elif error_message is None:
        error_message = "No word prediction is available"

    return {
        "timestamp": current_time,
        "running_time": f"{round(time.perf_counter() - start_time, 4)} s",
        "error message": error_message,
        "audio duration": audio_duration,
        "word predict": best_word_predict,
        "pitch predict": best_pitch_predict,
        "wrong word index": wrong_word,
        "wrong pitch index": wrong_pitch,
        "score": score,
    }