Spaces:

Futuresony
/

Speech-recognition

Sleeping

App Files Community

Futuresony committed on Feb 24

Commit

7b1a576

verified ·

1 Parent(s): d7c7caa

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -36

app.py CHANGED Viewed

@@ -1,76 +1,79 @@
 import gradio as gr
-from asr import transcribe_auto
 from huggingface_hub import InferenceClient
 from ttsmms import download, TTS
 from langdetect import detect
-# Initialize text generation client
 client = InferenceClient("Futuresony/future_ai_12_10_2024.gguf")
-# Download and load TTS models for Swahili and English
 swahili_dir = download("swh", "./data/swahili")
-english_dir = download("eng", "./data/english")  # Ensure an English TTS model is available
 swahili_tts = TTS(swahili_dir)
 english_tts = TTS(english_dir)
def is_uncertain(question, response):
    """Return True when the model's response looks unreliable.

    A response is flagged when any of the following holds:

    * it has fewer than four whitespace-separated words;
    * it appears (case-insensitively) as a substring of the question,
      i.e. the model merely echoed part of the prompt;
    * it contains one of a fixed list of hedging phrases
      (case-insensitive match).

    Args:
        question: The prompt that was sent to the model.
        response: The text the model produced.

    Returns:
        bool: True if the response is considered unreliable.
    """
    # Lower-case the response once; it is reused by the echo check and by
    # every phrase comparison below.
    answer = response.lower()
    if len(response.split()) < 4 or answer in question.lower():
        return True
    uncertain_phrases = ["Kulingana na utafiti", "Inaaminika kuwa", "Ninadhani", "It is believed that", "Some people say"]
    return any(phrase.lower() in answer for phrase in uncertain_phrases)
 def generate_text(prompt):
-    """Generate a response from the text generation model."""
-    messages = [{"role": "user", "content": prompt}]
     response = ""
     for message in client.chat_completion(messages, max_tokens=512, stream=True, temperature=0.7, top_p=0.95):
         token = message.choices[0].delta.content
         response += token
-    if is_uncertain(prompt, response):
-        return "AI is uncertain about the response."
-    return response
# Function to detect language and generate speech
def text_to_speech(text):
    """Synthesize *text* into a WAV file and return the file path.

    The language is detected with ``langdetect``: a result of ``"sw"``
    selects the Swahili voice, anything else the English voice. The audio is
    always written to ``./output.wav``.

    Args:
        text: The text to read out.

    Returns:
        str: Path of the written WAV file.
    """
    # Local import keeps this block self-contained; langdetect is already a
    # dependency of this file.
    from langdetect.lang_detect_exception import LangDetectException

    wav_path = "./output.wav"
    try:
        lang = detect(text)
    except LangDetectException:
        # detect() raises when the text has no usable features (for example
        # an empty string); fall through to the default English voice.
        lang = None
    tts = swahili_tts if lang == "sw" else english_tts
    tts.synthesis(text, wav_path=wav_path)
    return wav_path
 def process_audio(audio):
-    # Step 1: Transcribe the audio
-    transcription = transcribe_auto(audio)
-    # Step 2: Generate text based on the transcription
     generated_text = generate_text(transcription)
-    # Step 3: Convert the generated text to speech
     speech = text_to_speech(generated_text)
     return transcription, generated_text, speech
 # Gradio Interface
 with gr.Blocks() as demo:
     gr.Markdown("<p align='center' style='font-size: 20px;'>End-to-End ASR, Text Generation, and TTS</p>")
     gr.HTML("<center>Upload or record audio. The model will transcribe, generate a response, and read it out.</center>")
     audio_input = gr.Audio(label="Input Audio", type="filepath")
     text_output = gr.Textbox(label="Transcription")
     generated_text_output = gr.Textbox(label="Generated Text")
     audio_output = gr.Audio(label="Output Speech")
     submit_btn = gr.Button("Submit")
     submit_btn.click(
         fn=process_audio,
         inputs=audio_input,
@@ -78,4 +81,4 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+import torch
+import torchaudio
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 from huggingface_hub import InferenceClient
 from ttsmms import download, TTS
 from langdetect import detect
+# Load ASR Model
+asr_model_name = "Futuresony/Future-sw_ASR-24-02-2025"
+processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
+asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
+# Load Text Generation Model
 client = InferenceClient("Futuresony/future_ai_12_10_2024.gguf")
def format_prompt(user_input):
    """Wrap *user_input* in the ``### User`` / ``### Assistant`` template.

    Returns:
        str: The user line, a newline, then the assistant cue.
    """
    user_turn = f"### User: {user_input}"
    assistant_cue = "### Assistant:"
    return f"{user_turn}\n{assistant_cue}"
+# Load TTS Models
 swahili_dir = download("swh", "./data/swahili")
+english_dir = download("eng", "./data/english")
 swahili_tts = TTS(swahili_dir)
 english_tts = TTS(english_dir)
# ASR Function
def transcribe(audio_file):
    """Transcribe an audio file to text with the Wav2Vec2 CTC model.

    The waveform is loaded with ``torchaudio``, reduced to a single channel,
    resampled to 16 kHz when needed, and decoded by taking the arg-max token
    at every frame.

    Args:
        audio_file: Path of the audio file, as accepted by
            ``torchaudio.load``.

    Returns:
        str: The decoded transcription.
    """
    target_rate = 16000  # sampling rate passed to the processor
    waveform, sample_rate = torchaudio.load(audio_file)
    # torchaudio.load returns (channels, frames) by default. Average the
    # channels so stereo input becomes one mono signal; a bare squeeze()
    # would leave a 2-D array for multi-channel files.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resampler(waveform)
    # Drop only the channel axis so the result is always 1-D.
    speech_array = waveform.squeeze(0).numpy()
    input_values = processor(speech_array, sampling_rate=target_rate, return_tensors="pt").input_values
    with torch.no_grad():
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
# Text Generation Function
def generate_text(prompt):
    """Generate a reply to *prompt* with the text generation model.

    The prompt is wrapped by ``format_prompt`` and sent as a single user
    message; the streamed chunks are concatenated into one string.

    Args:
        prompt: The user text to answer.

    Returns:
        str: The generated text with leading and trailing whitespace removed.
    """
    messages = [{"role": "user", "content": format_prompt(prompt)}]
    pieces = []
    for message in client.chat_completion(messages, max_tokens=512, stream=True, temperature=0.7, top_p=0.95):
        token = message.choices[0].delta.content
        # A streamed chunk may carry no text (content is None); appending it
        # to a string would raise TypeError, so such chunks are skipped.
        if token:
            pieces.append(token)
    return "".join(pieces).strip()
# TTS Function
def text_to_speech(text):
    """Synthesize *text* into a WAV file and return the file path.

    The language is detected with ``langdetect``: a result of ``"sw"``
    selects the Swahili voice, anything else the English voice. The audio is
    always written to ``./output.wav``.

    Args:
        text: The text to read out.

    Returns:
        str: Path of the written WAV file.
    """
    # Local import keeps this block self-contained; langdetect is already a
    # dependency of this file.
    from langdetect.lang_detect_exception import LangDetectException

    wav_path = "./output.wav"
    try:
        lang = detect(text)
    except LangDetectException:
        # detect() raises when the text has no usable features (for example
        # an empty string); fall through to the default English voice.
        lang = None
    tts = swahili_tts if lang == "sw" else english_tts
    tts.synthesis(text, wav_path=wav_path)
    return wav_path
# Combined Processing Function
def process_audio(audio):
    """Run the full pipeline: transcribe, generate a reply, synthesize speech.

    Args:
        audio: Path of the recorded or uploaded audio file. It is falsy
            (``None`` or empty) when no audio was supplied.

    Returns:
        tuple: ``(transcription, generated_text, speech)`` where *speech* is
        the value returned by ``text_to_speech``. When no audio was supplied
        the result is ``("", "", None)``.
    """
    # Without this guard a missing input reaches torchaudio.load() and fails
    # there; return empty outputs instead.
    if not audio:
        return "", "", None
    transcription = transcribe(audio)
    generated_text = generate_text(transcription)
    speech = text_to_speech(generated_text)
    return transcription, generated_text, speech
 # Gradio Interface
 # Builds the Blocks UI: a heading, a line of instructions, one audio input,
 # two text boxes and one audio output, plus a Submit button that calls
 # process_audio with the audio input.
 with gr.Blocks() as demo:
     gr.Markdown("<p align='center' style='font-size: 20px;'>End-to-End ASR, Text Generation, and TTS</p>")
     gr.HTML("<center>Upload or record audio. The model will transcribe, generate a response, and read it out.</center>")
     # type="filepath": the callback receives the audio as a file path.
     audio_input = gr.Audio(label="Input Audio", type="filepath")
     text_output = gr.Textbox(label="Transcription")
     generated_text_output = gr.Textbox(label="Generated Text")
     audio_output = gr.Audio(label="Output Speech")
     submit_btn = gr.Button("Submit")
     # NOTE(review): no outputs= argument is visible in this view, so the
     # three output components above are not wired to the callback here.
     # A line may be elided at the diff hunk boundary; confirm that
     # outputs=[text_output, generated_text_output, audio_output] is present.
     submit_btn.click(
         fn=process_audio,
         inputs=audio_input,
     )
 if __name__ == "__main__":
+    demo.launch()