#!pip install torch
#!pip install noisereduce
#!pip install scipy
import requests
import base64
import re
import textwrap
import struct
import numpy as np
from scipy.io.wavfile import read, write
from json import JSONDecodeError
#import noisereduce as nr
import nltk
#nltk.download("punkt")  # uncomment on first run: sent_tokenize needs the punkt tokenizer data
test = False
# Maximum length (in characters) of a text fragment sent to the TTS API in one request
SENTENCE_SPLIT_LENGTH = 400
## ["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn","ja"]
def detect_language(sentence):
    """Detects the language of a sentence via the remote language-detection API."""
    url = "https://ruslanmv-hf-llm-api-collection.hf.space/detect"
    data = {"input_text": sentence}
    headers = {"Accept": "application/json", "Content-Type": "application/json"}
    response = requests.post(url, headers=headers, json=data)
    if response.status_code == 200:
        try:
            response_json = response.json()
            language = response_json.get("lang")  # "lang" holds the detected language code
            return language
        except JSONDecodeError:
            print("Error: Invalid JSON response from the language detection API.")
    else:
        print(f"Error: Language detection API call failed with status code {response.status_code}")
    return None  # Fallback if the API call fails
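# A quick, optional check of the language detector. The endpoint and the "lang"
# response key come from the function above; the sample sentences are illustrative only.
if test:
    for sample in ["Hello, how are you?", "Hola, ¿cómo estás?"]:
        print(sample, "->", detect_language(sample))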
def split_sentences(text, max_len):
    """Splits text into sentences no longer than max_len characters."""
    # Apply custom rules to enforce sentence breaks on double punctuation
    text = re.sub(r"(\s*\.{2})\s*", r".\1 ", text)  # for '..'
    text = re.sub(r"(\s*\!{2})\s*", r"!\1 ", text)  # for '!!'
    # Use NLTK to split the text into sentences
    sentences = nltk.sent_tokenize(text)
    # If a sentence is longer than max_len, wrap it into smaller fragments
    sentence_list = []
    for sent in sentences:
        if len(sent) > max_len:
            wrapped = textwrap.wrap(sent, max_len, break_long_words=True)
            sentence_list.extend(wrapped)
        else:
            sentence_list.append(sent)
    return sentence_list
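# Example of the splitting behaviour. The 60-character limit here is only for
# demonstration; the pipeline below uses SENTENCE_SPLIT_LENGTH.
if test:
    long_text = ("This is the first sentence. This is a second, much longer sentence that "
                 "will be wrapped into smaller fragments because it exceeds the limit.")
    print(split_sentences(long_text, 60))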
def get_voice_streaming2(sentence, language):
    """Makes a POST request to the text-to-speech API and returns the raw response.

    Iterating over the returned response yields the audio content in byte chunks.
    """
    url = "https://ruslanmv-hf-llm-api-collection.hf.space/tts"
    data = {"input_text": sentence, "from_language": language}
    headers = {"Accept": "application/json", "Content-Type": "application/json"}
    response = requests.post(url, headers=headers, json=data)
    return response
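# Minimal sketch of consuming the TTS response on its own, mirroring the loop in
# generate_speech_from_history2 below: iterating a requests.Response yields the body
# in small byte chunks, which are concatenated into the raw PCM stream.
if test:
    pcm_bytes = b""
    for chunk in get_voice_streaming2("Hello world", "en"):
        if chunk is not None:
            pcm_bytes += chunk
    print("received", len(pcm_bytes), "PCM bytes")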
def pcm_to_wav2(pcm_data, sample_rate=24000, channels=1, bit_depth=16):
    """Wraps raw PCM bytes in a minimal WAV (RIFF) header; passes the data through if it is already WAV."""
    if pcm_data.startswith(b"RIFF"):
        return pcm_data
    fmt_subchunk_size = 16
    data_subchunk_size = len(pcm_data)
    chunk_size = 4 + (8 + fmt_subchunk_size) + (8 + data_subchunk_size)
    # RIFF header, "fmt " subchunk (PCM format, byte rate, block align) and "data" subchunk
    wav_header = struct.pack('<4sI4s', b'RIFF', chunk_size, b'WAVE')
    fmt_subchunk = struct.pack('<4sIHHIIHH',
                               b'fmt ', fmt_subchunk_size, 1, channels,
                               sample_rate, sample_rate * channels * bit_depth // 8,
                               channels * bit_depth // 8, bit_depth)
    data_subchunk = struct.pack('<4sI', b'data', data_subchunk_size)
    return wav_header + fmt_subchunk + data_subchunk + pcm_data
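# Quick sanity check of the header layout: the WAV container should be exactly
# 44 bytes (12-byte RIFF header + 24-byte fmt subchunk + 8-byte data header)
# larger than the PCM payload. The silent one-second buffer is illustrative only.
if test:
    silence = b"\x00\x00" * 24000  # 1 s of 16-bit mono silence at 24 kHz
    wav_bytes = pcm_to_wav2(silence)
    assert len(wav_bytes) == len(silence) + 44
    with open("silence.wav", "wb") as f:
        f.write(wav_bytes)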
def generate_speech_from_history2(history, chatbot_role, sentence):
    """
    Generates speech audio from a given sentence, performing necessary preprocessing.

    Args:
        history (list): Conversation history (currently unused).
        chatbot_role (str): Role of the chatbot (currently unused).
        sentence (str): The sentence to be converted to speech.

    Returns:
        list: A list of dictionaries containing text and audio (base64-encoded WAV) for each sentence fragment.
    """
    language = "autodetect"
    if len(sentence) == 0:
        print("EMPTY SENTENCE")
        return []
    # Preprocessing steps:
    # - Remove special prompt token (</s>)
    sentence = sentence.replace("</s>", "")
    # - Remove code sections (enclosed in triple backticks)
    sentence = re.sub(r"```.*?```", "", sentence, flags=re.DOTALL)
    # - Remove inline code fragments (single backticks)
    sentence = re.sub(r"`.*?`", "", sentence, flags=re.DOTALL)
    # - Remove content within parentheses
    sentence = re.sub(r"\(.*?\)", "", sentence, flags=re.DOTALL)
    # - Remove remaining triple backticks
    sentence = sentence.replace("```", "")
    # - Replace ellipses with spaces
    sentence = sentence.replace("...", " ")
    # - Replace parentheses with spaces
    sentence = sentence.replace("(", " ")
    sentence = sentence.replace(")", " ")
    # - Remove assistant tag
    sentence = sentence.replace("<|assistant|>", "")
    if len(sentence) == 0:
        print("EMPTY SENTENCE after processing")
        return []
    # - Insert a space before trailing punctuation so fragments end cleanly
    sentence = re.sub(r"([^\x00-\x7F]|\w)([\.。?!]+)", r"\1 \2", sentence)
    print("Sentence for speech:", sentence)
    results = []
    try:
        if len(sentence) < SENTENCE_SPLIT_LENGTH:
            sentence_list = [sentence]
        else:
            # Split longer text into fragments the TTS API can handle
            sentence_list = split_sentences(sentence, SENTENCE_SPLIT_LENGTH)
            print("detected sentences:", sentence_list)
        for sentence in sentence_list:
            print("- sentence =", sentence)
            if any(c.isalnum() for c in sentence):
                if language == "autodetect":
                    language = detect_language(sentence)  # Detect language on the first fragment only
                    print("language", language)
                audio_stream = get_voice_streaming2(sentence, language)
                if audio_stream is not None:
                    sentence_wav_bytestream = b""
                    # Collect the raw audio chunks from the response
                    for chunk in audio_stream:
                        if chunk is not None:
                            sentence_wav_bytestream += chunk
                    # Wrap PCM in a WAV header and encode as base64
                    base64_audio = base64.b64encode(pcm_to_wav2(sentence_wav_bytestream)).decode('utf8')
                    print("base64_audio", base64_audio[:10])
                    results.append({"text": sentence, "audio": base64_audio})
                else:
                    # Handle the case where the audio stream is None (e.g., silent response)
                    results.append({"text": sentence, "audio": ""})
    except RuntimeError as e:
        if "device-side assert" in str(e):
            # Cannot recover from this error; the Space needs to be restarted
            print(
                f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
                flush=True,
            )
            # This error is unrecoverable; restart the Space
            # api.restart_space(repo_id=repo_id)
        else:
            print("RuntimeError: non device-side assert error:", str(e))
            raise e
    return results
if test:
    # Example usage
    history = []
    chatbot_role = "assistant"
    sentence = "Hello, how can I help you?"
    result = generate_speech_from_history2(history, chatbot_role, sentence)
    print(result)
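# Sketch of how a caller might use the result: each entry's "audio" field is a
# base64-encoded WAV, so decoding it and writing the bytes to disk yields a playable
# file. The output filename pattern is arbitrary.
if test:
    for i, item in enumerate(result):
        if item["audio"]:
            with open(f"speech_{i}.wav", "wb") as f:
                f.write(base64.b64decode(item["audio"]))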