Spaces: Sleeping

hashhac committed
Commit · 6218f6a
1 Parent(s): fdd081d

try 2 orion time

Browse files
- app.py +43 -220
- requirements.txt +8 -8
app.py
CHANGED
@@ -1,197 +1,35 @@
 import gradio as gr
 import numpy as np
 import torch
-import pyttsx3
-import os
-from transformers import (
-    AutoModelForSpeechSeq2Seq,
-    AutoProcessor,
-    pipeline,
-    AutoTokenizer,
-    AutoModelForCausalLM
-)
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5ForSpeechToText
+from datasets import load_dataset
+import soundfile as sf

 # Check if CUDA is available, otherwise use CPU
 device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

-# Set up the local pyttsx3 TTS engine
-def load_local_tts():
-    engine = pyttsx3.init()
-    engine.setProperty('rate', 150)  # Speed of speech
-    engine.setProperty('volume', 0.9)  # Volume
-
-    voices = engine.getProperty('voices')
-    if len(voices) > 1:
-        engine.setProperty('voice', voices[1].id)  # Set female voice
-
-    return engine
-
-# Initialize the TTS engine
-print("Loading local TTS engine...")
-tts_engine = load_local_tts()
-
-def text_to_speech_local(text):
-    """Convert text to speech using pyttsx3 local TTS engine"""
-    import tempfile
-    import soundfile as sf
-
-    # Create a temporary file to store the audio
-    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
-        temp_filename = temp_file.name
-
-    # Generate speech to the temporary file
-    tts_engine.save_to_file(text, temp_filename)
-    tts_engine.runAndWait()
-
-    # Read the audio file
-    audio_data, sample_rate = sf.read(temp_filename)
-
-    # Convert to the expected format
-    if len(audio_data.shape) == 1:
-        audio_data = audio_data.reshape(1, -1)
-    else:
-        audio_data = audio_data[:, 0].reshape(1, -1)
-
-    # Ensure it's int16
-    audio_data = (audio_data * 32767).astype(np.int16)
-
-    # Clean up the temporary file
-    os.unlink(temp_filename)
-
-    return (sample_rate, audio_data)
-
-# Load ASR model (Whisper)
-def load_asr_model():
-    model_id = "openai/whisper-small"
-
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        model_id,
-        torch_dtype=torch_dtype,
-        low_cpu_mem_usage=True,
-        use_safetensors=True
-    )
-    model.to(device)
-
-    processor = AutoProcessor.from_pretrained(model_id)
-
-    return pipeline(
-        "automatic-speech-recognition",
-        model=model,
-        tokenizer=processor.tokenizer,
-        feature_extractor=processor.feature_extractor,
-        max_new_tokens=128,
-        chunk_length_s=30,
-        batch_size=16,
-        return_timestamps=False,
-        torch_dtype=torch_dtype,
-        device=device,
-    )
-
-# Load LLM model
-def load_llm_model():
-    model_id = "facebook/opt-1.3b"
-
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-    if tokenizer.pad_token is None:
-        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        torch_dtype=torch_dtype,
-        low_cpu_mem_usage=True
-    )
-
-    model.resize_token_embeddings(len(tokenizer))
-    model.config.pad_token_id = tokenizer.pad_token_id
-
-    if hasattr(model.config, "word_embed_proj_dim"):
-        model.config._remove_wrong_keys = False
-
-    model.to(device)
-
-    return model, tokenizer
-
-# Initialize models
-print("Loading ASR model...")
-asr_pipeline = load_asr_model()
+# Load SpeechT5 models and processor
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
+asr_model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr").to(device)
+tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)

-print("Loading LLM model...")
-llm_model, llm_tokenizer = load_llm_model()
-
-# Conversation state
-chat_history = []
+# Function to convert speech to text
+def speech_to_text(audio):
+    inputs = processor(audio, sampling_rate=16000, return_tensors="pt").input_values.to(device)
+    with torch.no_grad():
+        logits = asr_model(inputs).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.batch_decode(predicted_ids)[0]
+    return transcription

-def generate_response(prompt):
-    global chat_history
-    if len(chat_history) == 0:
-        chat_history.append({"role": "system", "content": "You are a helpful, friendly AI assistant. Keep your responses concise and conversational."})
-
-    # Add user message to history
-    chat_history.append({"role": "user", "content": prompt})
-
-    # Build prompt from chat history
-    full_prompt = ""
-    for message in chat_history:
-        if message["role"] == "system":
-            full_prompt += f"System: {message['content']}\n"
-        elif message["role"] == "user":
-            full_prompt += f"User: {message['content']}\n"
-        elif message["role"] == "assistant":
-            full_prompt += f"Assistant: {message['content']}\n"
-
-    full_prompt += "Assistant: "
-
-    # Encode input
-    encoded_input = llm_tokenizer.encode_plus(
-        full_prompt,
-        return_tensors="pt",
-        padding=False,
-        add_special_tokens=True,
-        return_attention_mask=True
-    )
-
-    input_ids = encoded_input["input_ids"].to(device)
-    attention_mask = torch.ones_like(input_ids).to(device)
-
-    # Generate response
+# Function to convert text to speech
+def text_to_speech(text):
+    inputs = processor(text, return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
-        try:
-            output = llm_model.generate(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                max_new_tokens=128,
-                do_sample=True,
-                temperature=0.7,
-                top_p=0.9,
-                pad_token_id=llm_tokenizer.pad_token_id,
-                eos_token_id=llm_tokenizer.eos_token_id,
-                use_cache=True
-            )
-        except Exception as e:
-            output = llm_model.generate(
-                input_ids=input_ids,
-                max_new_tokens=128,
-                do_sample=True,
-                temperature=0.7
-            )
-
-    # Decode output
-    response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
-    response_text = response_text.split("Assistant: ")[-1].strip()
-
-    # Add assistant response to history
-    chat_history.append({"role": "assistant", "content": response_text})
-
-    # Keep history manageable
-    if len(chat_history) > 10:
-        chat_history.pop(1)
-
-    return response_text
+        speech = tts_model.generate_speech(inputs)
+    return speech

+# Gradio demo
 def demo():
     with gr.Blocks() as demo:
         gr.Markdown("# Voice Chatbot")
@@ -205,55 +43,40 @@ def demo():
             if audio is None:
                 return None, "No audio detected."

-            # Handle the incoming audio from the microphone
-
-            # Process audio
-            sample_rate, audio_array = audio
-
-            # Convert to float32 for ASR
-            audio_float32 = audio_array.flatten().astype(np.float32) / 32768.0
+            # Convert audio to the correct format
+            sample_rate, audio_data = audio
+            audio_data = audio_data.flatten().astype(np.float32) / 32768.0  # Normalize to [-1.0, 1.0]

             # Speech-to-text
-            transcript = asr_pipeline({
-                "sampling_rate": sample_rate,
-                "raw": audio_float32
-            })
+            transcript = speech_to_text(audio_data)
+            print(f"Transcribed: {transcript}")

-            prompt = transcript["text"].strip()
-            conversation_text = f"You: {prompt}\n"
-            print(f"Transcribed: {prompt}")
-
-            # Generate response
-            response_text = generate_response(prompt)
-            conversation_text += f"Assistant: {response_text}\n"
+            # Generate response (for simplicity, echo the transcript)
+            response_text = transcript
             print(f"Response: {response_text}")

-            # Text-to-speech with the local engine
-            sample_rate, audio_array = text_to_speech_local(response_text)
-
-            # Reassemble the synthesized audio in 0.2 s chunks
-            response_audio = np.concatenate([audio_array[:, i:i+int(sample_rate*0.2)]
-                for i in range(0, audio_array.shape[1], int(sample_rate*0.2))
-                if audio_array[:, i:i+int(sample_rate*0.2)].size > 0], axis=1)
-
-            return (sample_rate, response_audio), conversation_text
+            # Text-to-speech
+            response_audio = text_to_speech(response_text)
+
+            # Save the response audio to a temporary file
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+                sf.write(temp_file.name, response_audio.cpu().numpy(), 16000)
+                temp_filename = temp_file.name
+
+            # Read the audio file
+            audio_data, sample_rate = sf.read(temp_filename)
+
+            # Clean up the temporary file
+            os.unlink(temp_filename)
+
+            return (sample_rate, audio_data), f"You: {transcript}\nAssistant: {response_text}"

         audio_input.change(process_audio,
-                           inputs=[audio_input],
-                           outputs=[audio_output, transcript_display])
+                           inputs=[audio_input],
+                           outputs=[audio_output, transcript_display])

         clear_btn = gr.Button("Clear Conversation")
         clear_btn.click(lambda: (None, ""), outputs=[audio_output, transcript_display])
-
-        # Add function to clear chat history
-        def reset_chat():
-            global chat_history
-            chat_history = []
-            return None, "Conversation history cleared."
-
-        reset_btn = gr.Button("Reset Chat History")
-        reset_btn.click(reset_chat, outputs=[audio_output, transcript_display])

     demo.launch()
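A note on the new ASR path: speech_to_text takes an argmax over .logits, which is the CTC decoding recipe (Wav2Vec2-style). SpeechT5ForSpeechToText is an encoder-decoder model, so a bare forward pass has no decoder input and argmax over its logits does not yield a transcription. A minimal sketch of the seq2seq decode, reusing the processor, asr_model, and device defined above:

    import torch

    def speech_to_text(audio):
        # The SpeechT5 ASR processor expects 16 kHz mono float audio.
        inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")
        input_values = inputs.input_values.to(device)
        with torch.no_grad():
            # Encoder-decoder ASR transcribes via generate(), not argmax over logits.
            predicted_ids = asr_model.generate(input_values=input_values, max_length=100)
        return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]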
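Similarly, the new text_to_speech tokenizes with the ASR processor and calls generate_speech with input_ids alone. The TTS checkpoint ships its own processor, SpeechT5 expects x-vector speaker embeddings (presumably what the otherwise unused `from datasets import load_dataset` was for), and without a vocoder generate_speech returns a mel spectrogram rather than a waveform. A sketch along the lines of the reference SpeechT5 example:

    import torch
    from transformers import SpeechT5Processor, SpeechT5HifiGan
    from datasets import load_dataset

    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

    # Speaker x-vectors from CMU ARCTIC, as in the reference example.
    embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0).to(device)

    def text_to_speech(text):
        input_ids = tts_processor(text=text, return_tensors="pt").input_ids.to(device)
        with torch.no_grad():
            # With a vocoder supplied, this returns a 16 kHz waveform tensor.
            return tts_model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)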
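Two smaller gaps in the new process_audio: it still uses tempfile and os, which the rewritten import block no longer brings in, and the microphone audio is fed to SpeechT5 at whatever rate Gradio recorded it (typically 44.1 or 48 kHz) rather than the 16 kHz the models expect. A hedged sketch; the helper name to_16k is ours, and scipy is already pinned in requirements.txt:

    import os
    import tempfile
    import numpy as np
    from scipy import signal

    def to_16k(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
        # Resample mono float32 audio to the 16 kHz rate SpeechT5 expects.
        if sample_rate == 16000:
            return audio_data
        target_len = int(len(audio_data) * 16000 / sample_rate)
        return signal.resample(audio_data, target_len).astype(np.float32)

    # In process_audio: transcript = speech_to_text(to_16k(audio_data, sample_rate))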
requirements.txt
CHANGED
@@ -1,16 +1,16 @@
-transformers
-torch
-datasets
-scipy
-fastrtc
+transformers
+torch
+datasets
+scipy
+fastrtc
 gradio
 accelerate
-sentencepiece
+sentencepiece
 fastrtc[vad,tts]
 torchaudio
 gtts
-pydub
+pydub
 scipy
-pyttsx3
+pyttsx3
 soundfile
 py-espeak-ng
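As rendered, each changed requirements.txt line is identical on both sides, so the +8/-8 is most likely whitespace or line-ending churn; note also that scipy is listed twice. With the switch to SpeechT5, a trimmed list might look like the sketch below, assuming nothing else in the Space still needs the pyttsx3/gTTS/espeak stack or fastrtc:

    transformers
    torch
    datasets
    scipy
    gradio
    accelerate
    sentencepiece
    soundfile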