kevinwang676
/

GPT-SoVITS-v3-api

ONNX

Model card Files Files and versions

xet

Community

kevinwang676 committed on Mar 29

Commit

10d4c29

verified ·

1 Parent(s): a9f4026

Update api_v2.py

Browse files

Files changed (1) hide show

api_v2.py +85 -50

api_v2.py CHANGED Viewed

@@ -129,14 +129,23 @@ cut_method_names = get_cut_method_names()
 import os
 import sys
 import traceback
-from typing import Generator
 import requests
 import tempfile
 import urllib.parse
 from pathlib import Path
 # Function to check if a path is a URL and download it if needed
-def process_audio_path(audio_path):
     if audio_path and (audio_path.startswith('http://') or audio_path.startswith('https://') or
                       audio_path.startswith('s3://')):
         try:
@@ -175,13 +184,13 @@ def process_audio_path(audio_path):
                         f.write(chunk)
             print(f"Downloaded to: {local_path}")
-            return local_path
         except Exception as e:
             print(f"Error downloading audio file: {e}")
             raise Exception(f"Failed to download audio from URL: {e}")
     # Not a URL: return the original path unchanged (a failed download raises above instead of falling through)
-    return audio_path
 parser = argparse.ArgumentParser(description="GPT-SoVITS api")
 parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="tts_infer路径")
@@ -332,40 +341,14 @@ def check_params(req:dict):
 async def tts_handle(req:dict):
     """
     Text to speech handler.
-    Args:
-        req (dict):
-            {
-                "text": "",                   # str.(required) text to be synthesized
-                "text_lang: "",               # str.(required) language of the text to be synthesized
-                "ref_audio_path": "",         # str.(required) reference audio path or URL
-                "aux_ref_audio_paths": [],    # list.(optional) auxiliary reference audio paths or URLs
-                "prompt_text": "",            # str.(optional) prompt text for the reference audio
-                "prompt_lang": "",            # str.(required) language of the prompt text for the reference audio
-                "top_k": 5,                   # int. top k sampling
-                "top_p": 1,                   # float. top p sampling
-                "temperature": 1,             # float. temperature for sampling
-                "text_split_method": "cut5",  # str. text split method, see text_segmentation_method.py for details.
-                "batch_size": 1,              # int. batch size for inference
-                "batch_threshold": 0.75,      # float. threshold for batch splitting.
-                "split_bucket: True,          # bool. whether to split the batch into multiple buckets.
-                "speed_factor":1.0,           # float. control the speed of the synthesized audio.
-                "fragment_interval":0.3,      # float. to control the interval of the audio fragment.
-                "seed": -1,                   # int. random seed for reproducibility.
-                "media_type": "wav",          # str. media type of the output audio, support "wav", "raw", "ogg", "aac".
-                "streaming_mode": False,      # bool. whether to return a streaming response.
-                "parallel_infer": True,       # bool.(optional) whether to use parallel inference.
-                "repetition_penalty": 1.35    # float.(optional) repetition penalty for T2S model.
-                "sample_steps": 32,           # int. number of sampling steps for VITS model V3.
-                "super_sampling": False,       # bool. whether to use super-sampling for audio when using VITS model V3.
-            }
-    returns:
-        StreamingResponse: audio stream response.
     """
     streaming_mode = req.get("streaming_mode", False)
     return_fragment = req.get("return_fragment", False)
     media_type = req.get("media_type", "wav")
     check_res = check_params(req)
     if check_res is not None:
@@ -376,36 +359,69 @@ async def tts_handle(req:dict):
     try:
         # Process ref_audio_path (download if it's a URL)
-        req["ref_audio_path"] = process_audio_path(req["ref_audio_path"])
         # Process aux_ref_audio_paths (download if they're URLs)
         if req.get("aux_ref_audio_paths"):
             aux_paths = []
             for aux_path in req["aux_ref_audio_paths"]:
-                aux_paths.append(process_audio_path(aux_path))
             req["aux_ref_audio_paths"] = aux_paths
-        tts_generator=tts_pipeline.run(req)
         if streaming_mode:
-            def streaming_generator(tts_generator:Generator, media_type:str):
                 if_frist_chunk = True
-                for sr, chunk in tts_generator:
-                    if if_frist_chunk and media_type == "wav":
-                        yield wave_header_chunk(sample_rate=sr)
-                        media_type = "raw"
-                        if_frist_chunk = False
-                    yield pack_audio(BytesIO(), chunk, sr, media_type).getvalue()
-            # _media_type = f"audio/{media_type}" if not (streaming_mode and media_type in ["wav", "raw"]) else f"audio/x-{media_type}"
-            return StreamingResponse(streaming_generator(tts_generator, media_type, ), media_type=f"audio/{media_type}")
         else:
             sr, audio_data = next(tts_generator)
             audio_data = pack_audio(BytesIO(), audio_data, sr, media_type).getvalue()
             return Response(audio_data, media_type=f"audio/{media_type}")
     except Exception as e:
-        return JSONResponse(status_code=400, content={"message": f"tts failed", "Exception": str(e)})
@@ -479,13 +495,32 @@ async def tts_post_endpoint(request: TTS_Request):
 @APP.get("/set_refer_audio")
 async def set_refer_aduio(refer_audio_path: str = None):
     try:
         # Process the path (download if it's a URL)
-        local_path = process_audio_path(refer_audio_path)
         tts_pipeline.set_ref_audio(local_path)
     except Exception as e:
         return JSONResponse(status_code=400, content={"message": f"set refer audio failed", "Exception": str(e)})
-    return JSONResponse(status_code=200, content={"message": "success"})
 # @APP.post("/set_refer_audio")

 import os
 import sys
 import traceback
+from typing import Generator, Tuple
 import requests
 import tempfile
 import urllib.parse
 from pathlib import Path
 # Function to check if a path is a URL and download it if needed
+def process_audio_path(audio_path) -> Tuple[str, bool]:
+    """
+    Process an audio path, downloading it if it's a URL.
+    Args:
+        audio_path (str): Path or URL to audio file
+    Returns:
+        Tuple[str, bool]: (local_path, is_temporary)
+    """
     if audio_path and (audio_path.startswith('http://') or audio_path.startswith('https://') or
                       audio_path.startswith('s3://')):
         try:
                         f.write(chunk)
             print(f"Downloaded to: {local_path}")
+            return local_path, True  # Return path and flag indicating it's temporary
         except Exception as e:
             print(f"Error downloading audio file: {e}")
             raise Exception(f"Failed to download audio from URL: {e}")
     # Not a URL: return the original path unchanged (a failed download raises above instead of falling through)
+    return audio_path, False  # Not a temporary file
 parser = argparse.ArgumentParser(description="GPT-SoVITS api")
 parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="tts_infer路径")
 async def tts_handle(req:dict):
     """
     Text to speech handler.
     """
     streaming_mode = req.get("streaming_mode", False)
     return_fragment = req.get("return_fragment", False)
     media_type = req.get("media_type", "wav")
+    temp_files = []  # Track temporary files for cleanup
+    print(f"----------现在使用的模型版本是：{tts_config.version}----------")
     check_res = check_params(req)
     if check_res is not None:
     try:
         # Process ref_audio_path (download if it's a URL)
+        ref_path, is_temp = process_audio_path(req["ref_audio_path"])
+        req["ref_audio_path"] = ref_path
+        if is_temp:
+            temp_files.append(ref_path)
         # Process aux_ref_audio_paths (download if they're URLs)
         if req.get("aux_ref_audio_paths"):
             aux_paths = []
             for aux_path in req["aux_ref_audio_paths"]:
+                local_path, is_temp = process_audio_path(aux_path)
+                aux_paths.append(local_path)
+                if is_temp:
+                    temp_files.append(local_path)
             req["aux_ref_audio_paths"] = aux_paths
+        tts_generator = tts_pipeline.run(req)
         if streaming_mode:
+            async def streaming_generator(tts_generator:Generator, media_type:str):
                 if_frist_chunk = True
+                try:
+                    for sr, chunk in tts_generator:
+                        if if_frist_chunk and media_type == "wav":
+                            yield wave_header_chunk(sample_rate=sr)
+                            media_type = "raw"
+                            if_frist_chunk = False
+                        yield pack_audio(BytesIO(), chunk, sr, media_type).getvalue()
+                finally:
+                    # Clean up temporary files after streaming completes
+                    for temp_file in temp_files:
+                        try:
+                            if os.path.exists(temp_file):
+                                os.remove(temp_file)
+                                print(f"Removed temporary file: {temp_file}")
+                        except Exception as e:
+                            print(f"Error removing temporary file {temp_file}: {e}")
+            return StreamingResponse(streaming_generator(tts_generator, media_type), media_type=f"audio/{media_type}")
         else:
             sr, audio_data = next(tts_generator)
             audio_data = pack_audio(BytesIO(), audio_data, sr, media_type).getvalue()
+            # Clean up temporary files after generation completes
+            for temp_file in temp_files:
+                try:
+                    if os.path.exists(temp_file):
+                        os.remove(temp_file)
+                        print(f"Removed temporary file: {temp_file}")
+                except Exception as e:
+                    print(f"Error removing temporary file {temp_file}: {e}")
             return Response(audio_data, media_type=f"audio/{media_type}")
     except Exception as e:
+        # Clean up temporary files in case of error
+        for temp_file in temp_files:
+            try:
+                if os.path.exists(temp_file):
+                    os.remove(temp_file)
+                    print(f"Removed temporary file: {temp_file}")
+            except Exception as cleanup_error:
+                print(f"Error removing temporary file {temp_file}: {cleanup_error}")
+        return JSONResponse(status_code=400, content={"message": f"tts failed", "Exception": str(e)})
 @APP.get("/set_refer_audio")
 async def set_refer_aduio(refer_audio_path: str = None):
+    temp_file = None
     try:
         # Process the path (download if it's a URL)
+        local_path, is_temp = process_audio_path(refer_audio_path)
+        if is_temp:
+            temp_file = local_path
+        # Store reference to the audio
         tts_pipeline.set_ref_audio(local_path)
+        # If temporary, remove after setting (since TTS pipeline should load the audio into memory)
+        if temp_file and os.path.exists(temp_file):
+            os.remove(temp_file)
+            print(f"Removed temporary file: {temp_file}")
+        return JSONResponse(status_code=200, content={"message": "success"})
     except Exception as e:
+        # Clean up temp file in case of error
+        if temp_file and os.path.exists(temp_file):
+            try:
+                os.remove(temp_file)
+                print(f"Removed temporary file: {temp_file}")
+            except Exception as cleanup_error:
+                print(f"Error removing temporary file {temp_file}: {cleanup_error}")
         return JSONResponse(status_code=400, content={"message": f"set refer audio failed", "Exception": str(e)})
 # @APP.post("/set_refer_audio")