Spaces:

EYEDOL
/

SALAMA

Sleeping

App Files Files Community

EYEDOL committed 6 days ago

Commit

6fefd54

verified ·

1 Parent(s): 8454ce0

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -21

app.py CHANGED Viewed

@@ -9,15 +9,16 @@ import numpy as np
 import onnxruntime
 import torch
 import librosa
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, pipeline
 from scipy.io.wavfile import write as write_wav
 import os
 import re
 from huggingface_hub import login
 # --- Login to Hugging Face using secret ---
 # Make sure the secret named "hugface" (holding your Hugging Face access token) is set in your Hugging Face Space > Settings > Repository secrets
-hf_token = os.environ.get("hugface")
 if not hf_token:
     raise ValueError("HF_TOKEN not found. Please set it in Hugging Face Space repository secrets.")
 login(token=hf_token)
@@ -25,7 +26,7 @@ print("Successfully logged into Hugging Face Hub!")
 # --- Configuration ---
 STT_MODEL_ID = "EYEDOL/SALAMA_C3"
-LLM_MODEL_ID = "google/gemma-3-1b-it"
 TTS_TOKENIZER_ID = "facebook/mms-tts-swh"
 TTS_ONNX_MODEL_PATH = "swahili_tts.onnx"
@@ -62,10 +63,13 @@ class WeeboAssistant:
         # LLM
         print(f"Loading LLM: {LLM_MODEL_ID}")
         self.llm_pipeline = pipeline(
             "text-generation",
             model=LLM_MODEL_ID,
             model_kwargs={"torch_dtype": self.torch_dtype},
             device=self.device,
         )
         print("LLM pipeline loaded successfully.")
@@ -118,6 +122,7 @@ class WeeboAssistant:
             messages.append({'role': 'user', 'content': turn[0]})
             if turn[1] is not None:
                 messages.append({'role': 'assistant', 'content': turn[1]})
         prompt = self.llm_pipeline.tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
@@ -125,17 +130,27 @@ class WeeboAssistant:
             self.llm_pipeline.tokenizer.eos_token_id,
             self.llm_pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
         ]
-        streamer = self.llm_pipeline(
-            prompt,
             max_new_tokens=512,
             eos_token_id=terminators,
             do_sample=True,
             temperature=0.6,
             top_p=0.9,
-            streamer=gr.TextIterator(),
         )
         return streamer
 assistant = WeeboAssistant()
@@ -146,31 +161,35 @@ def s2s_pipeline(audio_input, chat_history):
         chat_history.append((user_text or "(No valid speech detected)", None))
         yield chat_history, None, "Please record your voice again."
         return
-    chat_history.append((user_text, None))
-    yield chat_history, None, "..."
     response_stream = assistant.get_llm_response(chat_history)
     llm_response_text = ""
     for text_chunk in response_stream:
-        llm_response_text = text_chunk
         chat_history[-1] = (user_text, llm_response_text)
         yield chat_history, None, llm_response_text
     final_audio_path = assistant.generate_speech(llm_response_text)
     yield chat_history, final_audio_path, llm_response_text
 def t2t_pipeline(text_input, chat_history):
-    chat_history.append((text_input, None))
-    yield chat_history, "..."
     response_stream = assistant.get_llm_response(chat_history)
     llm_response_text = ""
     for text_chunk in response_stream:
-        llm_response_text = text_chunk
         chat_history[-1] = (text_input, llm_response_text)
-        yield chat_history, llm_response_text
 def clear_textbox():
-    return ""
 with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
@@ -191,14 +210,14 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
         with gr.TabItem("⌨️ Maandishi-kwa-Maandishi (Text-to-Text)"):
             t2t_chatbot = gr.Chatbot(label="Mazungumzo (Conversation)", bubble_full_width=False, height=500)
             with gr.Row():
-                t2t_text_in = gr.Textbox(label="Andika Hapa (Write Here)", placeholder="Habari yako...", scale=4)
                 t2t_submit_btn = gr.Button("Tuma (Submit)", variant="primary", scale=1)
         with gr.TabItem("🛠️ Zana (Tools)"):
             with gr.Row():
                 with gr.Column():
                     gr.Markdown("### Unukuzi wa Sauti (Speech Transcription)")
-                    tool_s2t_audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Sauti ya Kuingiza (Input Audio)")
                     tool_s2t_text_out = gr.Textbox(label="Maandishi Yaliyonukuliwa (Transcribed Text)", interactive=False)
                     tool_s2t_btn = gr.Button("Nukuu (Transcribe)")
                 with gr.Column():
@@ -212,12 +231,28 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
         inputs=[s2s_audio_in, s2s_chatbot],
         outputs=[s2s_chatbot, s2s_audio_out, s2s_text_out],
         queue=True
     )
     t2t_submit_btn.click(
         fn=t2t_pipeline,
         inputs=[t2t_text_in, t2t_chatbot],
-        outputs=[t2t_chatbot, t2t_text_in],
         queue=True
     ).then(
         fn=clear_textbox,
@@ -225,15 +260,18 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
         outputs=t2t_text_in
     )
     tool_s2t_btn.click(
         fn=assistant.transcribe_audio,
         inputs=tool_s2t_audio_in,
-        outputs=tool_s2t_text_out
     )
     tool_t2s_btn.click(
         fn=assistant.generate_speech,
         inputs=tool_t2s_text_in,
-        outputs=tool_t2s_audio_out
     )
-demo.queue().launch(debug=True)

 import onnxruntime
 import torch
 import librosa
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, pipeline, TextIteratorStreamer
 from scipy.io.wavfile import write as write_wav
 import os
 import re
 from huggingface_hub import login
+import threading # <-- FIX: Added threading import
 # --- Login to Hugging Face using secret ---
 # Make sure the secret named "hugface" (holding your Hugging Face access token) is set in your Hugging Face Space > Settings > Repository secrets
+hf_token = os.environ.get("hugface") #
 if not hf_token:
     raise ValueError("HF_TOKEN not found. Please set it in Hugging Face Space repository secrets.")
 login(token=hf_token)
 # --- Configuration ---
 STT_MODEL_ID = "EYEDOL/SALAMA_C3"
+LLM_MODEL_ID = "google/gemma-1.1-2b-it"
 TTS_TOKENIZER_ID = "facebook/mms-tts-swh"
 TTS_ONNX_MODEL_PATH = "swahili_tts.onnx"
         # LLM
         print(f"Loading LLM: {LLM_MODEL_ID}")
+        # <-- FIX: Initialize tokenizer separately to use it with the streamer
+        self.llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
         self.llm_pipeline = pipeline(
             "text-generation",
             model=LLM_MODEL_ID,
             model_kwargs={"torch_dtype": self.torch_dtype},
+            tokenizer=self.llm_tokenizer, # Pass the tokenizer here
             device=self.device,
         )
         print("LLM pipeline loaded successfully.")
             messages.append({'role': 'user', 'content': turn[0]})
             if turn[1] is not None:
                 messages.append({'role': 'assistant', 'content': turn[1]})
         prompt = self.llm_pipeline.tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
             self.llm_pipeline.tokenizer.eos_token_id,
             self.llm_pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
         ]
+        # <-- START OF FIX: Use TextIteratorStreamer instead of gr.TextIterator -->
+        streamer = TextIteratorStreamer(
+            self.llm_pipeline.tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        generation_kwargs = dict(
+            streamer=streamer,
             max_new_tokens=512,
             eos_token_id=terminators,
             do_sample=True,
             temperature=0.6,
             top_p=0.9,
         )
+        # Run the pipeline in a separate thread to enable streaming
+        thread = threading.Thread(target=self.llm_pipeline, args=[prompt], kwargs=generation_kwargs)
+        thread.start()
         return streamer
+        # <-- END OF FIX -->
 assistant = WeeboAssistant()
         chat_history.append((user_text or "(No valid speech detected)", None))
         yield chat_history, None, "Please record your voice again."
         return
+    chat_history.append((user_text, ""))
+    yield chat_history, None, "..." # Show thinking indicator
     response_stream = assistant.get_llm_response(chat_history)
     llm_response_text = ""
     for text_chunk in response_stream:
+        llm_response_text += text_chunk # <-- FIX: Append chunk to full response
         chat_history[-1] = (user_text, llm_response_text)
         yield chat_history, None, llm_response_text
     final_audio_path = assistant.generate_speech(llm_response_text)
     yield chat_history, final_audio_path, llm_response_text
 def t2t_pipeline(text_input, chat_history):
+    chat_history.append((text_input, ""))
+    yield chat_history
     response_stream = assistant.get_llm_response(chat_history)
     llm_response_text = ""
     for text_chunk in response_stream:
+        llm_response_text += text_chunk # <-- FIX: Append chunk to full response
         chat_history[-1] = (text_input, llm_response_text)
+        yield chat_history
 def clear_textbox():
+    return gr.Textbox(value="")
 with gr.Blocks(theme=gr.themes.Soft(), title="Msaidizi wa Kiswahili") as demo:
         with gr.TabItem("⌨️ Maandishi-kwa-Maandishi (Text-to-Text)"):
             t2t_chatbot = gr.Chatbot(label="Mazungumzo (Conversation)", bubble_full_width=False, height=500)
             with gr.Row():
+                t2t_text_in = gr.Textbox(show_label=False, placeholder="Habari yako...", scale=4, container=False)
                 t2t_submit_btn = gr.Button("Tuma (Submit)", variant="primary", scale=1)
         with gr.TabItem("🛠️ Zana (Tools)"):
             with gr.Row():
                 with gr.Column():
                     gr.Markdown("### Unukuzi wa Sauti (Speech Transcription)")
+                    tool_s2t_audio_in = gr.Audio(sources=["microphone", "upload"], type="numpy", label="Sauti ya Kuingiza (Input Audio)")
                     tool_s2t_text_out = gr.Textbox(label="Maandishi Yaliyonukuliwa (Transcribed Text)", interactive=False)
                     tool_s2t_btn = gr.Button("Nukuu (Transcribe)")
                 with gr.Column():
         inputs=[s2s_audio_in, s2s_chatbot],
         outputs=[s2s_chatbot, s2s_audio_out, s2s_text_out],
         queue=True
+    ).then(
+        fn=lambda: gr.Audio(value=None), # Clear audio input after submit
+        inputs=None,
+        outputs=s2s_audio_in
     )
     t2t_submit_btn.click(
         fn=t2t_pipeline,
         inputs=[t2t_text_in, t2t_chatbot],
+        outputs=[t2t_chatbot], # <-- FIX: Only output to the chatbot
+        queue=True
+    ).then(
+        fn=clear_textbox,
+        inputs=None,
+        outputs=t2t_text_in
+    )
+    # Also allow Enter key to submit text
+    t2t_text_in.submit(
+        fn=t2t_pipeline,
+        inputs=[t2t_text_in, t2t_chatbot],
+        outputs=[t2t_chatbot],
         queue=True
     ).then(
         fn=clear_textbox,
         outputs=t2t_text_in
     )
     tool_s2t_btn.click(
         fn=assistant.transcribe_audio,
         inputs=tool_s2t_audio_in,
+        outputs=tool_s2t_text_out,
+        queue=True
     )
     tool_t2s_btn.click(
         fn=assistant.generate_speech,
         inputs=tool_t2s_text_in,
+        outputs=tool_t2s_audio_out,
+        queue=True
     )
+demo.queue().launch(debug=True)