Spaces:

bobpopboom
/

chaty

Sleeping

App Files Files Community

hashhac commited on Mar 15

Commit

7afe31a

1 Parent(s): e724e7e

added

Browse files

Files changed (2) hide show

app.py +216 -37
requirements.txt +6 -15

app.py CHANGED Viewed

@@ -3,49 +3,228 @@ from fastrtc import (
     audio_to_bytes, aggregate_bytes_to_16bit
 )
 import gradio as gr
-from groq import Groq
 import numpy as np
-import anthropic
-from elevenlabs import ElevenLabs
-groq_client = Groq()
-claude_client = anthropic.Anthropic()
-tts_client = ElevenLabs()
-# See "Talk to Claude" in Cookbook for an example of how to keep
-# track of the chat history.
-def response(
-    audio: tuple[int, np.ndarray],
-):
-    prompt = groq_client.audio.transcriptions.create(
-        file=("audio-file.mp3", audio_to_bytes(audio)),
-        model="whisper-large-v3-turbo",
-        response_format="verbose_json",
-    ).text
-    response = claude_client.messages.create(
-        model="claude-3-5-haiku-20241022",
-        max_tokens=512,
-        messages=[{"role": "user", "content": prompt}],
     )
-    response_text = " ".join(
-        block.text
-        for block in response.content
-        if getattr(block, "type", None) == "text"
     )
-    iterator = tts_client.text_to_speech.convert_as_stream(
-        text=response_text,
-        voice_id="JBFqnCBsd6RMkjVDRZzb",
-        model_id="eleven_multilingual_v2",
-        output_format="pcm_24000"
     )
-    for chunk in aggregate_bytes_to_16bit(iterator):
-        audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
-        yield (24000, audio_array)
 stream = Stream(
     modality="audio",
     mode="send-receive",
     handler=ReplyOnPause(response),
-)

     audio_to_bytes, aggregate_bytes_to_16bit
 )
 import gradio as gr
 import numpy as np
+import torch
+import os
+from transformers import (
+    AutoModelForSpeechSeq2Seq,
+    AutoProcessor,
+    pipeline,
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    AutoModelForSeq2SeqLM
+)
+from datasets import load_dataset
+import scipy
+# Check if CUDA is available, otherwise use CPU
+device = "cuda" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# Step 1: Audio transcription with Whisper
+def load_asr_model():
+    model_id = "openai/whisper-small"  # Smaller version that's more efficient
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        low_cpu_mem_usage=True,
+        use_safetensors=True
     )
+    model.to(device)
+    processor = AutoProcessor.from_pretrained(model_id)
+    return pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        max_new_tokens=128,
+        chunk_length_s=30,
+        batch_size=16,
+        return_timestamps=False,
+        torch_dtype=torch_dtype,
+        device=device,
     )
+# Step 2: Text generation with a smaller LLM
+def load_llm_model():
+    model_id = "facebook/opt-1.3b"  # A smaller language model
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        low_cpu_mem_usage=True
     )
+    model.to(device)
+    return model, tokenizer
+# Step 3: Text-to-Speech with a free model
+def load_tts_model():
+    model_id = "microsoft/speecht5_tts"
+    processor = AutoProcessor.from_pretrained(model_id)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
+    model.to(device)
+    # Load vocoder for waveform generation
+    vocoder_id = "microsoft/speecht5_hifigan"
+    vocoder = AutoModelForCausalLM.from_pretrained(vocoder_id)
+    vocoder.to(device)
+    # Load speaker embeddings
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    speaker_embeddings = torch.tensor(embeddings_dataset[7]["xvector"]).unsqueeze(0)
+    return model, processor, vocoder, speaker_embeddings
+# Initialize all models
+print("Loading ASR model...")
+asr_pipeline = load_asr_model()
+print("Loading LLM model...")
+llm_model, llm_tokenizer = load_llm_model()
+print("Loading TTS model...")
+tts_model, tts_processor, tts_vocoder, speaker_embeddings = load_tts_model()
+# Chat history management
+chat_history = []
+def generate_response(prompt):
+    # If chat history is empty, add a system message
+    if not chat_history:
+        chat_history.append({"role": "system", "content": "You are a helpful, friendly AI assistant. Keep your responses concise and conversational."})
+    # Add user message to history
+    chat_history.append({"role": "user", "content": prompt})
+    # Prepare input for the model
+    full_prompt = ""
+    for message in chat_history:
+        if message["role"] == "system":
+            full_prompt += f"System: {message['content']}\n"
+        elif message["role"] == "user":
+            full_prompt += f"User: {message['content']}\n"
+        elif message["role"] == "assistant":
+            full_prompt += f"Assistant: {message['content']}\n"
+    full_prompt += "Assistant: "
+    # Generate response
+    inputs = llm_tokenizer(full_prompt, return_tensors="pt").to(device)
+    with torch.no_grad():
+        output = llm_model.generate(
+            **inputs,
+            max_new_tokens=128,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9
+        )
+    response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
+    response_text = response_text.split("Assistant: ")[-1].strip()
+    # Add assistant response to history
+    chat_history.append({"role": "assistant", "content": response_text})
+    # Keep history at a reasonable size
+    if len(chat_history) > 10:
+        # Keep system message and last 9 exchanges
+        chat_history.pop(1)
+    return response_text
+def text_to_speech(text):
+    # Prepare inputs
+    inputs = tts_processor(text=text, return_tensors="pt")
+    # Add speaker embeddings
+    inputs["speaker_embeddings"] = speaker_embeddings.to(device)
+    # Generate speech
+    with torch.no_grad():
+        speech = tts_model.generate_speech(
+            inputs["input_ids"].to(device),
+            speaker_embeddings.to(device)
+        )
+    # Convert to waveform using vocoder
+    with torch.no_grad():
+        waveform = tts_vocoder(speech)
+    # Convert to numpy array
+    audio_array = waveform.cpu().numpy().squeeze()
+    # Normalize and convert to int16
+    audio_array = (audio_array / np.max(np.abs(audio_array)) * 32767).astype(np.int16)
+    # Reshape for fastrtc
+    audio_array = audio_array.reshape(1, -1)
+    return (24000, audio_array)  # Using 24kHz sample rate
+def response(audio: tuple[int, np.ndarray]):
+    # Step 1: Speech-to-Text
+    transcript = asr_pipeline({"sampling_rate": audio[0], "raw": audio[1].flatten()})
+    prompt = transcript["text"]
+    # Step 2: Generate text response
+    response_text = generate_response(prompt)
+    # Step 3: Text-to-Speech
+    sample_rate, audio_array = text_to_speech(response_text)
+    # Convert to expected format
+    chunk_size = 4800  # 200ms chunks at 24kHz
+    for i in range(0, audio_array.shape[1], chunk_size):
+        chunk = audio_array[:, i:i+chunk_size]
+        if chunk.size > 0:  # Ensure we don't yield empty chunks
+            yield (sample_rate, chunk)
 stream = Stream(
     modality="audio",
     mode="send-receive",
     handler=ReplyOnPause(response),
+)
+# For testing without WebRTC
+def demo():
+    with gr.Blocks() as demo:
+        gr.Markdown("# Local Voice Chatbot")
+        audio_input = gr.Audio(sources=["microphone"], type="numpy")
+        audio_output = gr.Audio()
+        def process_audio(audio):
+            if audio is None:
+                return None
+            sample_rate, audio_array = audio
+            transcript = asr_pipeline({"sampling_rate": sample_rate, "raw": audio_array.flatten()})
+            prompt = transcript["text"]
+            print(f"Transcribed: {prompt}")
+            response_text = generate_response(prompt)
+            print(f"Response: {response_text}")
+            sample_rate, audio_array = text_to_speech(response_text)
+            return (sample_rate, audio_array[0])
+        audio_input.change(process_audio, inputs=[audio_input], outputs=[audio_output])
+    demo.launch()
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--demo", action="store_true", help="Run Gradio demo instead of WebRTC")
+    args = parser.parse_args()
+    if args.demo:
+        demo()
+    else:
+        # For running with FastRTC
+        # You would need to add your FastRTC server code here
+        pass

requirements.txt CHANGED Viewed

@@ -1,16 +1,7 @@
-fastapi
-uvicorn
-transformers
-torch
-numpy
-# librosa
-python-dotenv
-fastrtc[vad, tts]
-# SentencePiece
-# twilio
 gradio
-# torchaudio
-elevenlabs
-groq
-anthropic
-ffmpeg

+transformers
+torch
+datasets
+scipy
+fastrtc
 gradio
+accelerate