from fastrtc import (
    ReplyOnPause, AdditionalOutputs, Stream,
    audio_to_bytes, aggregate_bytes_to_16bit
)
import gradio as gr
import numpy as np
import torch
import os
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan
)
from datasets import load_dataset
import scipy

# Check if CUDA is available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Step 1: Audio transcription with Whisper
def load_asr_model():
    model_id = "openai/whisper-small"  # Smaller version that's more efficient
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=False,
        torch_dtype=torch_dtype,
        device=device,
    )

# Step 2: Text generation with a smaller LLM
def load_llm_model():
    model_id = "facebook/opt-1.3b"  # A smaller language model
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True
    )
    model.to(device)
    return model, tokenizer

# Step 3: Text-to-Speech with a free model
def load_tts_model():
    model_id = "microsoft/speecht5_tts"
    processor = AutoProcessor.from_pretrained(model_id)
    model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
    model.to(device)
    # Load vocoder for waveform generation
    vocoder_id = "microsoft/speecht5_hifigan"
    vocoder = SpeechT5HifiGan.from_pretrained(vocoder_id)
    vocoder.to(device)
    # Load speaker embeddings
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7]["xvector"]).unsqueeze(0)
    return model, processor, vocoder, speaker_embeddings

# Initialize all models
print("Loading ASR model...")
asr_pipeline = load_asr_model()
print("Loading LLM model...")
llm_model, llm_tokenizer = load_llm_model()
print("Loading TTS model...")
tts_model, tts_processor, tts_vocoder, speaker_embeddings = load_tts_model()

# Chat history management
chat_history = []

def generate_response(prompt):
    # If chat history is empty, add a system message
    if not chat_history:
        chat_history.append({"role": "system", "content": "You are a helpful, friendly AI assistant. Keep your responses concise and conversational."})
    # Add user message to history
    chat_history.append({"role": "user", "content": prompt})
    # Flatten the history into a plain-text prompt for the (non-chat) OPT model
    full_prompt = ""
    for message in chat_history:
        if message["role"] == "system":
            full_prompt += f"System: {message['content']}\n"
        elif message["role"] == "user":
            full_prompt += f"User: {message['content']}\n"
        elif message["role"] == "assistant":
            full_prompt += f"Assistant: {message['content']}\n"
    full_prompt += "Assistant: "
    # Generate response
    inputs = llm_tokenizer(full_prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output = llm_model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )
    response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
    # The decoded text contains the whole prompt, so keep only what follows the last "Assistant: "
    response_text = response_text.split("Assistant: ")[-1].strip()
    # Add assistant response to history
    chat_history.append({"role": "assistant", "content": response_text})
    # Keep history at a reasonable size: the system message plus the last 9 messages
    if len(chat_history) > 10:
        chat_history.pop(1)
    return response_text

def text_to_speech(text):
    # Prepare inputs
    inputs = tts_processor(text=text, return_tensors="pt")
    # Generate a mel spectrogram conditioned on the speaker embedding
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"].to(device),
            speaker_embeddings.to(device)
        )
    # Convert the spectrogram to a waveform using the vocoder
    with torch.no_grad():
        waveform = tts_vocoder(speech)
    # Convert to numpy array
    audio_array = waveform.cpu().numpy().squeeze()
    # Normalize and convert to int16
    audio_array = (audio_array / np.max(np.abs(audio_array)) * 32767).astype(np.int16)
    # Reshape for fastrtc
    audio_array = audio_array.reshape(1, -1)
    return (16000, audio_array)  # SpeechT5's HiFi-GAN vocoder produces 16 kHz audio

def response(audio: tuple[int, np.ndarray]):
    # Step 1: Speech-to-Text (Whisper expects float32 audio in [-1, 1], so rescale int16 PCM)
    audio_float = audio[1].flatten().astype(np.float32)
    if np.abs(audio_float).max() > 1.0:
        audio_float = audio_float / 32768.0
    transcript = asr_pipeline({"sampling_rate": audio[0], "raw": audio_float})
    prompt = transcript["text"]
    # Step 2: Generate text response
    response_text = generate_response(prompt)
    # Step 3: Text-to-Speech
    sample_rate, audio_array = text_to_speech(response_text)
    # Stream the reply back in small chunks
    chunk_size = 3200  # 200ms chunks at 16kHz
    for i in range(0, audio_array.shape[1], chunk_size):
        chunk = audio_array[:, i:i+chunk_size]
        if chunk.size > 0:  # Ensure we don't yield empty chunks
            yield (sample_rate, chunk)

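# Wire the ASR -> LLM -> TTS pipeline into a FastRTC stream. ReplyOnPause buffers
# the caller's audio and invokes `response` once it detects a pause in speech,
# then plays back each yielded chunk as the reply.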
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(response),
)

# For testing without WebRTC
def demo():
    with gr.Blocks() as demo:
        gr.Markdown("# Local Voice Chatbot")
        audio_input = gr.Audio(sources=["microphone"], type="numpy")
        audio_output = gr.Audio()

        def process_audio(audio):
            if audio is None:
                return None
            sample_rate, audio_array = audio
            # Gradio's microphone input is int16 PCM; rescale to float32 in [-1, 1] for Whisper
            audio_float = audio_array.flatten().astype(np.float32)
            if np.abs(audio_float).max() > 1.0:
                audio_float = audio_float / 32768.0
            transcript = asr_pipeline({"sampling_rate": sample_rate, "raw": audio_float})
            prompt = transcript["text"]
            print(f"Transcribed: {prompt}")
            response_text = generate_response(prompt)
            print(f"Response: {response_text}")
            sample_rate, audio_array = text_to_speech(response_text)
            return (sample_rate, audio_array[0])

        audio_input.change(process_audio, inputs=[audio_input], outputs=[audio_output])
    demo.launch()

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--demo", action="store_true", help="Run Gradio demo instead of WebRTC")
    args = parser.parse_args()
    if args.demo:
        demo()
    else:
        # For running with FastRTC
        # You would need to add your FastRTC server code here
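        # One option (an assumption, not part of the original script): fastrtc's
        # Stream exposes a built-in Gradio UI, which could be launched with e.g.
        #   stream.ui.launch()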
        pass
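
# Usage (assuming this script is saved as app.py):
#   python app.py --demo    # test locally with the Gradio microphone demo
#   python app.py           # run the FastRTC WebRTC stream (see the else branch above)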