enhanced_accessibility = False #@param {type:"boolean"}
#@markdown ---
#@markdown #### Please select your language:
#lang_select = "English" #@param ["English", "Spanish"]
#if lang_select == "English":
#    lang = "en"
#elif lang_select == "Spanish":
#    lang = "es"
#else:
#    raise Exception("Language not supported.")
#@markdown ---
use_gpu = False #@param {type:"boolean"}
from fastapi import FastAPI, Request, Form
from fastapi.responses import HTMLResponse
from fastapi.responses import FileResponse
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
import logging
# ...

app = FastAPI()
# Mount a directory to serve static files (e.g., CSS and JavaScript)
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="templates")
files = {}
# Configure logging
logging.basicConfig(level=logging.DEBUG)
# Voice/language options offered by the interface (eSpeak NG voice codes)
data = {
    "speaker_options": ["en", "en-us", "en-029", "en-gb-x-gbclan", "en-gb-x-rp", "en-gb-scotland", "en-gb-x-gbcwmd", "es", "de", "pl", "ar", "be", "bn", "bpy", "bs", "bg", "ca", "yue", "hak", "haw", "cmn", "hr", "cs", "da", "nl", "eo", "et", "fa", "fa-latn", "fi", "fr-be", "fr", "ga", "gd", "ka", "grc", "el", "kl", "gn", "gu", "ht", "he", "hi", "hu", "id", "io", "it", "ja", "kn", "kok", "ko", "ku", "kk", "ky", "la", "lb", "ltg", "lv", "lfn", "lt", "jbo", "mi", "mk", "ms", "ml", "mt", "mr", "nci", "ne", "nb", "nog", "or", "om", "pap", "pt-br", "pt", "ro", "ru", "ru-lv", "uk", "sjn", "sr", "tn", "sd", "shn", "si", "sk", "sl", "es-419", "sw", "sv", "ta", "th", "tk", "tt", "te", "tr", "ug", "ur", "uz", "vi-vn-x-central", "vi", "vi-vn-x-south"],
    "default_speaker": "en",
}
# Define a dictionary to store model configurations
model_configurations = {}
import math
import sys
from pathlib import Path
from enum import Enum
from typing import Iterable, List, Optional, Union

import numpy as np
import onnxruntime
import glob
#import ipywidgets as widgets
from pydub import AudioSegment
import tempfile
import uuid
import soundfile as sf
#from IPython.display import display, Audio, Markdown, clear_output
from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
# NOTE: `voices` is referenced by the templates below but was never defined in
# the original file; mirroring the speaker options under that name is an
# assumption.
voices = data["speaker_options"]

@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    return templates.TemplateResponse("interface.html", {"request": request, "data": data, "voices": voices})
import json
import os
import configparser

_LOGGER = logging.getLogger("piper_train.infer_onnx")

#if not os.path.exists("./content/piper/src/python/lng"):
#    import subprocess
#    command = "cp -r ./content/piper/notebooks/lng ./content/piper/src/python/lng"
#    subprocess.run(command, shell=True)

#sys.path.append('/content/piper/notebooks')
sys.path.append('./content/piper/src/python')
class Translator:
    def __init__(self):
        self.configs = {}

    def load_language(self, language_name):
        if language_name not in self.configs:
            config = configparser.ConfigParser()
            config.read(os.path.join(os.getcwd(), "lng", f"{language_name}.lang"))
            self.configs[language_name] = config

    def translate(self, language_name, string):
        if language_name == "en":
            return string
        elif language_name not in self.configs:
            self.load_language(language_name)
        config = self.configs[language_name]
        try:
            return config.get("Strings", string)
        except (configparser.NoOptionError, configparser.NoSectionError):
            # Fall back to the untranslated key rather than failing.
            if string:
                return string
            else:
                raise Exception("language engine error: This translation is corrupt!")

#from translator import *
lan = Translator()
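# Usage sketch (assumes translation files such as lng/es.lang with a [Strings]
# section, per Translator.load_language above): lan.translate("es", "Synthesize")
# returns the Spanish entry for "Synthesize", falling back to the key itself.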
def detect_onnx_models(path):
    onnx_models = glob.glob(path + '/*.onnx')
    onnx_configs = glob.glob(path + '/*.json')
    if len(onnx_models) > 1:
        return onnx_models, onnx_configs
    elif len(onnx_models) == 1:
        return onnx_models[0], onnx_configs[0]
    else:
        return None

renamed_audio_file = None
onnx_models = []  # populated at startup by load_model_data()
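# detect_onnx_models returns (list, list) when several models are present,
# a (str, str) pair for exactly one model, and None when the directory has
# no .onnx files; load_model_data() below normalizes all three cases.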
# NOTE: registering this as a FastAPI startup handler is an assumption; the
# original function had no decorator and was never invoked.
@app.on_event("startup")
async def load_model_data():
    # Load data for all models in the directory upon startup.
    global onnx_models, sess_options, providers
    models_path = "./content/piper/src/python"
    providers = [
        "CPUExecutionProvider"
        if use_gpu is False
        else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
    ]
    sess_options = onnxruntime.SessionOptions()
    # Collect data for all models in the directory and populate model_configurations
    detected = detect_onnx_models(models_path)
    if detected is None:
        logging.warning("No ONNX models found in %s", models_path)
        return
    onnx_models = detected[0]
    model_names = [onnx_models] if isinstance(onnx_models, str) else onnx_models
    for model_name in model_names:
        # Load the configuration data for each model (including speaker_id_map)
        config = load_model_configuration(model_name)
        model_configurations[model_name] = config
def load_model_configuration(model_name):
    # model_name is the path to the ONNX model file, e.g., 'voice.onnx'.
    # Piper ships configs alongside models as 'voice.onnx.json', which is the
    # same convention load_config() below relies on.
    config_file_path = f"{model_name}.json"
    try:
        with open(config_file_path, 'r') as config_file:
            return json.load(config_file)
    except FileNotFoundError:
        # Handle the case where the configuration file does not exist
        return None
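# Keys this app reads from a model's JSON config (all used further below):
# config["phoneme_type"], config["espeak"]["voice"], config["phoneme_id_map"],
# config["num_speakers"], config["speaker_id_map"], config["audio"]["sample_rate"].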
# The handler returns a rendered template, so HTMLResponse (not FileResponse,
# as one commented-out decorator suggested) is the appropriate response class.
@app.post("/", response_class=HTMLResponse)
async def main(
    request: Request,
    text_input: str = Form(default="1, 2, 3. This is a test. Enter some text to generate."),
    selected_model: str = Form(...),       # Selected model
    selected_speaker_id: str = Form(...),  # Selected speaker ID
    speaker: str = Form(...),
    speed_slider: float = Form(...),
    noise_scale_slider: float = Form(...),
    noise_scale_w_slider: float = Form(...),
    play: bool = Form(True)
):
    # Resolve the selected model and its speaker ID map.
    response_html = ""
    model = None
    config = None
    speaker_id_map = {}
    if isinstance(onnx_models, str):
        # Single-model setup: show a loading message and disable the form.
        response_html = """
        <div id="loading-message">Generating your audio, please wait...</div>
        <script>
            document.getElementById("synthesize_button").disabled = true;
        </script>
        """
    if selected_model in onnx_models:
        # The selected model is among the detected model file paths.
        config = load_model_configuration(selected_model)
        if config:
            model, _ = load_onnx(selected_model, sess_options, providers)
            speaker_id_map = config.get("speaker_id_map", {})
    if model is None:
        # The selected model was not found (or its config is missing).
        response_html = """
        <div id="error-message">Selected model not found.</div>
        <script>
            document.getElementById("synthesize_button").disabled = true;
        </script>
        """
        return templates.TemplateResponse("interface.html", {"request": request, "data": data, "voices": voices, "dynamic_content": response_html})
    auto_play = play
    # The original hardcoded speaker id 1; honoring the submitted
    # selected_speaker_id (falling back to 0) matches the form's intent.
    sid = int(selected_speaker_id) if selected_speaker_id.isdigit() else 0
    audio = inferencing(model, config, sid, text_input, speed_slider, noise_scale_slider, noise_scale_w_slider, auto_play)
    temp_dir = tempfile.mkdtemp()
    renamed_audio_file = os.path.join(temp_dir, "download.mp3")
    audio.export(renamed_audio_file, format="mp3")  # pydub shells out to ffmpeg here
    # Save the generated audio as a temporary file
    filepath = renamed_audio_file
    # Generate a unique file ID
    file_id = str(uuid.uuid4())
    # Store the file path with the generated file ID
    files[file_id] = filepath
    # Create a URL to download the file
    file_url = f'/download?fileId={file_id}'
    # Restore the form and return the response
    response_html += """
    <script>
        document.getElementById("loading-message").innerText = "Audio generated successfully!";
        document.getElementById("synthesize_button").disabled = false;
    </script>
    """
    return templates.TemplateResponse("interface.html", {
        "request": request,
        "file_url": file_url,
        "text_input": text_input,
        "data": data,
        "model_names": model_configurations.keys(),  # Available model names
        "selected_model": selected_model,            # The selected model
        "speaker_id_map": speaker_id_map,            # Speaker ID map for the selected model
        "selected_speaker_id": selected_speaker_id,  # The selected speaker ID
        "voices": voices,
        "dynamic_content": response_html,
    })
# Route path derived from the file_url generated above ('/download?fileId=...');
# the original function had no decorator.
@app.get("/download")
async def download_file(fileId: str):
    # Retrieve the file path from the dictionary using the file ID
    filepath = files.get(fileId)
    if filepath:
        # Create a FileResponse to serve the file for download
        return FileResponse(filepath, headers={"Content-Disposition": "attachment"})
    else:
        return {"error": "File not found"}
def load_onnx(model_path, sess_options, providers=["CPUExecutionProvider"]):
    _LOGGER.debug("Loading model from %s", model_path)
    config = load_config(model_path)
    # Use a distinct name for the session so the log below reports the path
    # (the original shadowed `model` and logged the session object).
    model = onnxruntime.InferenceSession(
        str(model_path),
        sess_options=sess_options,
        providers=providers,
    )
    _LOGGER.info("Loaded model from %s", model_path)
    return model, config

def load_config(model_path):
    # Piper stores the config next to the model as '<model>.onnx.json'.
    with open(f"{model_path}.json", "r") as file:
        return json.load(file)
PAD = "_"  # padding (0)
BOS = "^"  # beginning of sentence
EOS = "$"  # end of sentence

class PhonemeType(str, Enum):
    ESPEAK = "espeak"
    TEXT = "text"
def phonemize(config, text: str) -> List[List[str]]:
    """Text to phonemes grouped by sentence."""
    if config["phoneme_type"] == PhonemeType.ESPEAK:
        if config["espeak"]["voice"] == "ar":
            # Arabic diacritization
            # https://github.com/mush42/libtashkeel/
            text = tashkeel_run(text)
        return phonemize_espeak(text, config["espeak"]["voice"])
    if config["phoneme_type"] == PhonemeType.TEXT:
        return phonemize_codepoints(text)
    raise ValueError(f'Unexpected phoneme type: {config["phoneme_type"]}')
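# Illustrative only: with an espeak voice, phonemize(config, "Hi. Bye.") yields
# one list per sentence, e.g. [["h", "ˈ", "a", "ɪ"], ["b", "ˈ", "a", "ɪ"]]
# (one entry per phoneme codepoint; the exact symbols depend on the voice).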
def phonemes_to_ids(config, phonemes: List[str]) -> List[int]:
    """Phonemes to ids."""
    id_map = config["phoneme_id_map"]
    ids: List[int] = list(id_map[BOS])
    for phoneme in phonemes:
        if phoneme not in id_map:
            _LOGGER.warning("Missing phoneme from id map: %s", phoneme)
            continue
        ids.extend(id_map[phoneme])
        ids.extend(id_map[PAD])
    ids.extend(id_map[EOS])
    return ids
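# Worked example with a hypothetical id map {"^": [1], "$": [2], "_": [0],
# "a": [41]}: phonemes_to_ids(config, ["a", "a"]) -> [1, 41, 0, 41, 0, 2],
# i.e. BOS, then each phoneme followed by PAD, then EOS.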
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range."""
    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    audio_norm = audio_norm.astype("int16")
    return audio_norm
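# Example: a float signal peaking at 0.5 is scaled by 32767 / 0.5 so its peak
# lands at ±32767; the max(0.01, ...) floor caps the gain at 32767 / 0.01 for
# near-silent input instead of amplifying it without bound.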
def inferencing(model, config, sid, line, length_scale, noise_scale, noise_scale_w, auto_play=True):
    """Synthesize `line` with the given ONNX session; return a pydub AudioSegment."""
    audios = []
    if config["phoneme_type"] == "PhonemeType.ESPEAK":
        # Normalize configs that stored the enum repr instead of its value.
        config["phoneme_type"] = "espeak"
    sentence_phonemes = phonemize(config, line)
    for phonemes in sentence_phonemes:
        phoneme_ids = phonemes_to_ids(config, phonemes)
        num_speakers = config["num_speakers"]
        speaker_id = None if num_speakers == 1 else sid
        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        text_lengths = np.array([text.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_scale_w],
            dtype=np.float32,
        )
        # Use a separate variable so the caller's `sid` is not clobbered
        # between sentences (the original overwrote it with an ndarray).
        sid_array = None
        if speaker_id is not None:
            sid_array = np.array([speaker_id], dtype=np.int64)  # 1D array, as the model expects
        audio = model.run(
            None,
            {
                "input": text,
                "input_lengths": text_lengths,
                "scales": scales,
                "sid": sid_array,
            },
        )[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())
        audios.append(audio)
    merged_audio = np.concatenate(audios)
    sample_rate = config["audio"]["sample_rate"]
    temp_audio_path = os.path.join(tempfile.gettempdir(), "generated_audio.wav")
    sf.write(temp_audio_path, merged_audio, sample_rate)
    # The temp file is WAV, so read it back with from_wav (the original
    # mistakenly called from_mp3 on a WAV file).
    audio = AudioSegment.from_wav(temp_audio_path)
    return audio
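# Usage sketch (values are illustrative): with a session and config from
# load_onnx(), `seg = inferencing(model, config, 0, "Hello there.", 1.0,
# 0.667, 0.8)` returns a pydub AudioSegment; exporting it with
# seg.export(..., format="mp3") requires ffmpeg, which pydub shells out to.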
def denoise(
    audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float
) -> np.ndarray:
    """Subtract a scaled noise-bias spectrum from the audio's spectrogram."""
    audio_spec, audio_angles = transform(audio)
    a = bias_spec.shape[-1]
    b = audio_spec.shape[-1]
    repeats = max(1, math.ceil(b / a))
    # Tile the bias spectrum along time to cover the audio, then trim.
    bias_spec_repeat = np.repeat(bias_spec, repeats, axis=-1)[..., :b]
    audio_spec_denoised = audio_spec - (bias_spec_repeat * denoiser_strength)
    audio_spec_denoised = np.clip(audio_spec_denoised, a_min=0.0, a_max=None)
    audio_denoised = inverse(audio_spec_denoised, audio_angles)
    return audio_denoised
def stft(x, fft_size, hopsamp):
    """Compute and return the STFT of the supplied time domain signal x.

    Args:
        x (1-dim Numpy array): A time domain signal.
        fft_size (int): FFT size. Should be a power of 2, otherwise DFT will be used.
        hopsamp (int): The hop size, in samples.

    Returns:
        The STFT. The rows are the time slices and columns are the frequency bins.
    """
    window = np.hanning(fft_size)
    fft_size = int(fft_size)
    hopsamp = int(hopsamp)
    return np.array(
        [
            np.fft.rfft(window * x[i : i + fft_size])
            for i in range(0, len(x) - fft_size, hopsamp)
        ]
    )
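# Shape note: with fft_size=1024 and hopsamp=256 (the values used below), a
# signal of n samples yields about (n - 1024) / 256 time slices, each with
# 513 frequency bins (the rfft of a real 1024-sample frame).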
def istft(X, fft_size, hopsamp):
    """Invert a STFT into a time domain signal.

    Args:
        X (2-dim Numpy array): Input spectrogram. The rows are the time slices and columns are the frequency bins.
        fft_size (int): FFT size, in samples.
        hopsamp (int): The hop size, in samples.

    Returns:
        The inverse STFT.
    """
    fft_size = int(fft_size)
    hopsamp = int(hopsamp)
    window = np.hanning(fft_size)
    time_slices = X.shape[0]
    len_samples = int(time_slices * hopsamp + fft_size)
    x = np.zeros(len_samples)
    # Overlap-add the windowed inverse FFT of each time slice.
    for n, i in enumerate(range(0, len(x) - fft_size, hopsamp)):
        x[i : i + fft_size] += window * np.real(np.fft.irfft(X[n]))
    return x
def inverse(magnitude, phase):
    # Recombine magnitude and phase into real/imaginary parts, then invert
    # each signal in the batch.
    recombine_magnitude_phase = np.concatenate(
        [magnitude * np.cos(phase), magnitude * np.sin(phase)], axis=1
    )
    x_org = recombine_magnitude_phase
    n_b, n_f, n_t = x_org.shape  # pylint: disable=unpacking-non-sequence
    x = np.empty([n_b, n_f // 2, n_t], dtype=np.complex64)
    x.real = x_org[:, : n_f // 2]
    x.imag = x_org[:, n_f // 2 :]
    inverse_transform = []
    for y in x:
        y_ = istft(y.T, fft_size=1024, hopsamp=256)
        inverse_transform.append(y_[None, :])
    inverse_transform = np.concatenate(inverse_transform, 0)
    return inverse_transform
def transform(input_data):
    x = input_data
    real_part = []
    imag_part = []
    for y in x:
        y_ = stft(y, fft_size=1024, hopsamp=256).T
        real_part.append(y_.real[None, :, :])  # pylint: disable=unsubscriptable-object
        imag_part.append(y_.imag[None, :, :])  # pylint: disable=unsubscriptable-object
    real_part = np.concatenate(real_part, 0)
    imag_part = np.concatenate(imag_part, 0)
    magnitude = np.sqrt(real_part**2 + imag_part**2)
    phase = np.arctan2(imag_part, real_part)
    return magnitude, phase
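# transform() and inverse() round-trip a batch of 1-D signals through a
# magnitude/phase STFT representation; denoise() above subtracts a scaled
# noise-bias spectrum while the audio is in that domain.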
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
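    # Illustrative request against the running server (field names match the
    # Form(...) parameters of main(); the values are placeholders):
    #   curl -X POST http://localhost:7860/ \
    #     -F text_input="Hello world" \
    #     -F selected_model=./content/piper/src/python/voice.onnx \
    #     -F selected_speaker_id=0 -F speaker=en -F speed_slider=1.0 \
    #     -F noise_scale_slider=0.667 -F noise_scale_w_slider=0.8 -F play=true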