Spaces:

AitBAD
/

kab-asr-tanti

Running

App Files Files Community

Bouaziz-bad commited on Aug 7

Commit

aab61cd

1 Parent(s): 3697d7f

Add Kabyle ASR for free tier (GPL-3.0)

Browse files

Files changed (8) hide show

LICENSE.txt +8 -0
README.md +26 -14
app.py +22 -12
app_full.py +0 -137
backend.py +42 -0
frontend/package.js +0 -210
requirements.txt +10 -2
requirements_full.txt +0 -7

LICENSE.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+ Copyright (C) 2025 [Your Name or Organization]
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+(See full text at: https://www.gnu.org/licenses/gpl-3.0.txt)

README.md CHANGED Viewed

@@ -1,21 +1,33 @@
 ---
-title: Kab Asr Tanti
-emoji: 👀
-colorFrom: pink
-colorTo: indigo
-sdk: docker
-pinned: false
 license: gpl-3.0
-short_description: Backend of Kab ASR using nemo Nvidia
----
-# Kabyle ASR Web App on Hugging Face Spaces
-This is a Hugging Face Space for a Kabyle Automatic Speech Recognition (ASR) web application.
-The backend is a Flask app that uses the `nvidia/stt_kab_conformer_transducer_large` NeMo ASR model to transcribe Kabyle speech. The frontend is a separate React application that communicates with this backend.
-The application is deployed using Docker on Hugging Face Spaces, leveraging its generous free-tier memory to accommodate the large ASR model.
-[Add more details about the project, how to use it, etc.]
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Tanti - Kabyle ASR
+emoji: 🎤
+colorFrom: purple
+colorTo: blue
+sdk: gradio
+app_file: app.py
+python_version: "3.10"
 license: gpl-3.0
+short_description: "Kabyle speech-to-text using NeMo on CPU (free tier)."
+tags:
+  - asr
+  - kabyle
+  - nemo
+  - speech-to-text
+  - cpu
+  - gpl
+# Preload the large NeMo model during build
+preload_from_hub:
+  - nvidia/stt_kab_conformer_transducer_large
+# Allow up to 30 minutes startup (critical for CPU + large model)
+startup_duration_timeout: 30m
+# Allow embedding in Google Sites
+disable_embedding: false
+# No GPU (free tier)
+# Do NOT include suggested_hardware to default to cpu-basic
+---

app.py CHANGED Viewed

@@ -1,16 +1,26 @@
-from flask import Flask
-app = Flask(__name__)
-@app.route("/")
-def hello_world():
-    return "Hello from the backend!"
-@app.route("/health")
-def health_check():
-    return "Healthy", 200
-# Add a simple test route for transcription logic
-@app.route("/transcribe", methods=['POST'])
-def transcribe_test():
-    return "Backend received audio and is ready to transcribe!"

+# app.py
+import gradio as gr
+from backend import KabyleASR
+# Initialize ASR (happens once at startup)
+# KabyleASR() is constructed at import time; its __init__ calls load_model(),
+# so importing this module blocks until the model has been loaded (or raises).
+asr = KabyleASR()
+def transcribe_audio(audio):
+    """Return the transcription text for the audio value supplied by Gradio.
+
+    `audio` is whatever the Audio input below passes in (it is configured
+    with type="filepath"). When it is None a prompt message is returned and
+    the backend is not called; otherwise the value is forwarded unchanged to
+    asr.transcribe(), whose string result (text or an error message) is
+    returned as-is.
+    """
+    if audio is None:
+        return "Please upload an audio file."
+    return asr.transcribe(audio)
+# Gradio Interface
+# NOTE(review): requirements.txt in this same commit pins gradio==4.25.0.
+# Confirm that `flagging_mode`, `allow_screenshot` and `launch(ssr_mode=...)`
+# are all accepted by the Gradio version that actually runs in the Space.
+demo = gr.Interface(
+    fn=transcribe_audio,
+    inputs=gr.Audio(sources=["upload"], type="filepath"),
+    outputs=gr.Textbox(label="Kabyle Transcription", lines=6),
+    title="🎙️ Tanti: Kabyle ASR (Free Tier)",
+    description="Upload a Kabyle audio file. Transcription may take 1–2 minutes per 30 seconds of audio. Powered by NeMo on CPU.",
+    flagging_mode="never",
+    allow_screenshot=True
+)
+# Launch without SSR
+# Only runs when the file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    demo.launch(ssr_mode=False)

app_full.py DELETED Viewed

@@ -1,137 +0,0 @@
-# app.py - Flask server to handle ASR requests using the NeMo model (Corrected)
-import os
-import tempfile
-import logging
-import sys
-from flask import Flask, request, jsonify
-from flask_cors import CORS
-import nemo.collections.asr as nemo_asr
-from pydub import AudioSegment
-import re
-import datetime
-# --- Suppress verbose NeMo logging ---
-logging.getLogger('nemo_logger').setLevel(logging.ERROR)
-app = Flask(__name__)
-CORS(app)
-# --- Post-processing function to correct annexation in Kabyle transcription ---
-def post_process_kabyle_text(text):
-    """
-    Corrects annexation in Kabyle transcription by replacing spaces with dashes.
-    This version uses regular expressions for more robust pattern matching.
-    """
-    # Defensive check to ensure 'text' is a string before processing
-    if not isinstance(text, str):
-        print(f"Warning: Expected string for post-processing, but received type: {type(text)}. Skipping post-processing.")
-        return text
-    if not text:
-        return ""
-    # Ensure text is lowercase for consistent matching
-    text = text.lower()
-    # Define the sets of particles
-    PoPro = {'inu', 'inem', 'ines', 'nneɣ', 'ntex', 'nwen', 'nwent', 'nsen', 'nsent',
-             'iw', 'ik', 'im', 'is', 'w', 'k', 'm', 's', 'tneɣ', 'tentex', 'tsen', 'tsent'}
-    SpWo = {'deg', 'gar', 'ɣer', 'ɣur', 'fell', 'ɣef', 'ddaw', 'nnig', 'ɣid', 'aql', 'sɣur', 'sennig', 'deffir', 'sdat'}
-    StPaSp = {'i', 'am', 'at', 's', 'neɣ', 'aɣ'}
-    StPa = {'ak', 'as', 'aneɣ', 'anteɣ', 'awen', 'awent', 'asen', 'asent',
-            'k', 'm', 'ntex', 'wen', 'went', 'sen', 'sent', 'akem', 'att',
-            'aken', 'akent', 'aten', 'atent'}
-    DePa = {'a', 'agi', 'nni', 'ihin', 'nniden'}
-    DiPa = {'id', 'in'}
-    FuPa = {'ad', 'ara'}
-    DiObPa = {'yi', 'k', 'kem', 't', 'tt', 'ay', 'ken', 'kent', 'ten', 'tent',
-              'iyi', 'ik', 'ikem', 'it', 'itt', 'iken', 'ikent', 'iten', 'itent'}
-    InObPa = {'yi', 'yak', 'yam', 'yas', 'yaɣ', 'yawen', 'yawent', 'yasen', 'yasent'}
-    # Combine all particles that can be annexed.
-    all_annexable_particles = PoPro.union(SpWo, StPa, StPaSp, DePa, DiPa, FuPa, DiObPa, InObPa)
-    sorted_all_annexable = sorted(list(all_annexable_particles), key=len, reverse=True)
-    # Create a single regex pattern to handle all annexations in one go.
-    annexation_pattern = r'\b(\w{2,})\s+(' + '|'.join(sorted_all_annexable) + r')\b'
-    text = re.sub(annexation_pattern, r'\1-\2', text)
-    # Final cleanup for any remaining double spaces or trailing hyphens
-    text = re.sub(r'\s+', ' ', text).strip()
-    text = re.sub(r'-+', '-', text)
-    return text
-# --- Load the ASR model once at the beginning to avoid reloading on every request ---
-print("Loading NeMo ASR model...")
-try:
-    asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/stt_kab_conformer_transducer_large")
-    print("NeMo ASR model loaded successfully.")
-except Exception as e:
-    print(f"Error loading NeMo ASR model: {e}")
-    print("Please check your internet connection and ensure nemo_toolkit[asr] is correctly installed.")
-    asr_model = None
-@app.route('/transcribe', methods=['POST'])
-def transcribe():
-    if asr_model is None:
-        return jsonify({"error": "ASR model is not loaded."}), 503
-    if 'audio' not in request.files:
-        return jsonify({"error": "No audio file provided"}), 400
-    audio_file = request.files['audio']
-    if audio_file.filename == '':
-        return jsonify({"error": "No selected file"}), 400
-    temp_input_file = None
-    processed_file_path = None
-    try:
-        # Save the uploaded file to a temporary location
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio:
-            audio_file.save(tmp_audio.name)
-            temp_input_file = tmp_audio.name
-        try:
-            # The model requires the audio to be in a specific format (16kHz mono).
-            input_audio = AudioSegment.from_file(temp_input_file)
-            processed_audio = input_audio.set_frame_rate(16000).set_channels(1)
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as processed_tmp:
-                processed_audio.export(processed_tmp.name, format="wav")
-                processed_file_path = processed_tmp.name
-        except Exception as audio_e:
-            print(f"Error during audio processing with pydub: {audio_e}", file=sys.stderr)
-            return jsonify({"error": "Failed to process audio file. Please ensure it's a valid audio format."}), 500
-        try:
-            # Transcribe the processed file using the loaded model
-            transcription_list = asr_model.transcribe([processed_file_path])
-        except Exception as asr_e:
-            print(f"Error during transcription with NeMo model: {asr_e}", file=sys.stderr)
-            return jsonify({"error": "Transcription failed due to a model error."}), 500
-        if transcription_list and transcription_list[0] and hasattr(transcription_list[0], 'text'):
-            raw_transcription = transcription_list[0].text
-            final_transcription = post_process_kabyle_text(raw_transcription)
-            return jsonify({"transcription": final_transcription})
-        else:
-            print("ASR model returned an empty, invalid, or unexpected transcription object.")
-            return jsonify({"error": "Transcription failed. No text returned."}), 500
-    except Exception as e:
-        print(f"An unhandled server error occurred: {e}", file=sys.stderr)
-        return jsonify({"error": "An internal server error occurred."}), 500
-    finally:
-        # Cleanup temporary files
-        if temp_input_file and os.path.exists(temp_input_file):
-            os.remove(temp_input_file)
-        if processed_file_path and os.path.exists(processed_file_path):
-            os.remove(processed_file_path)
-if __name__ == '__main__':
-    print("Starting Flask server...")
-    print("Server running at http://127.0.0.1:5000")
-    app.run(debug=True)

backend.py ADDED Viewed

	@@ -0,0 +1,42 @@

+# backend.py
+import os
+import torch
+from nemo.collections.asr.models import EncDecRNNTBPEModel
+class KabyleASR:
+    """CPU-only wrapper around the pretrained NeMo Kabyle transducer model.
+
+    The model is loaded once when the object is constructed; `transcribe`
+    then turns a single audio file path into a plain-text string.
+    """
+    def __init__(self):
+        self.device = "cpu"  # Force CPU
+        self.model = None
+        self.load_model()
+    def load_model(self):
+        """Load the NeMo Kabyle model on CPU.
+
+        Raises:
+            RuntimeError: if downloading, moving or configuring the model
+                fails; the original exception is attached as the cause.
+        """
+        print("Loading NeMo ASR model for Kabyle (CPU mode)...")
+        try:
+            # Load from Hugging Face Hub
+            self.model = EncDecRNNTBPEModel.from_pretrained(
+                "nvidia/stt_kab_conformer_transducer_large"
+            )
+            self.model = self.model.to(self.device)
+            # NOTE(review): presumably disables dithering noise and frame
+            # padding for inference - confirm against the NeMo featurizer docs.
+            self.model.preprocessor.featurizer.dither = 0.0
+            self.model.preprocessor.featurizer.pad_to = 0
+            print("Model loaded successfully on CPU.")
+        except Exception as e:
+            # Chain the cause so the full traceback survives in the logs.
+            raise RuntimeError(f"Failed to load model: {str(e)}") from e
+    def transcribe(self, audio_file):
+        """Transcribe one audio file and return the text.
+
+        Args:
+            audio_file: path of the audio file to transcribe.
+
+        Returns:
+            The stripped transcription, an empty string when the model
+            returns nothing, or a human-readable error message string
+            (this method does not raise to the caller).
+        """
+        # Guard against None/empty paths as well as missing files;
+        # os.path.exists(None) would raise TypeError.
+        if not audio_file or not os.path.exists(audio_file):
+            return "Error: Audio file not found."
+        try:
+            # Transcribe (this will be slow on CPU)
+            with torch.no_grad():
+                transcriptions = self.model.transcribe(
+                    [audio_file],
+                    batch_size=1,
+                    num_workers=0  # CPU-friendly
+                )
+            # Depending on the NeMo release, transducer models return either
+            # a (best_hypotheses, all_hypotheses) tuple or a plain list, so
+            # reduce to the best-hypothesis list first.
+            if isinstance(transcriptions, tuple):
+                transcriptions = transcriptions[0] if transcriptions else []
+            if not transcriptions:
+                return ""
+            first = transcriptions[0]
+            # Unwrap a nested per-file list of hypotheses, if present.
+            if isinstance(first, (list, tuple)):
+                if not first:
+                    return ""
+                first = first[0]
+            # Entries may be plain strings or hypothesis objects with `.text`.
+            text = getattr(first, "text", first)
+            return str(text).strip()
+        except Exception as e:
+            return f"Transcription error: {str(e)}"

frontend/package.js DELETED Viewed

@@ -1,210 +0,0 @@
-import React, { useState, useRef } from 'react';
-// The URL of our new Flask backend server
-const BACKEND_URL = 'http://127.0.0.1:5000/transcribe';
-const App = () => {
-  const [isRecording, setIsRecording] = useState(false);
-  const [isLoading, setIsLoading] = useState(false);
-  const [statusMessage, setStatusMessage] = useState('Ready to transcribe Kabyle.');
-  const [transcription, setTranscription] = useState('');
-  const [audioURL, setAudioURL] = useState('');
-  const mediaRecorderRef = useRef(null);
-  const audioChunksRef = useRef([]);
-  const startRecording = async () => {
-    try {
-      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-      mediaRecorderRef.current = new MediaRecorder(stream);
-      audioChunksRef.current = [];
-      mediaRecorderRef.current.ondataavailable = event => {
-        audioChunksRef.current.push(event.data);
-      };
-      mediaRecorderRef.current.onstop = () => {
-        const audioBlob = new Blob(audioChunksRef.current, { type: 'audio/wav' });
-        const url = URL.createObjectURL(audioBlob);
-        setAudioURL(url);
-        stream.getTracks().forEach(track => track.stop()); // Stop microphone stream
-        handleTranscription(audioBlob);
-      };
-      mediaRecorderRef.current.start();
-      setIsRecording(true);
-      setStatusMessage('Recording started... Click again to stop.');
-    } catch (err) {
-      console.error("Error accessing microphone:", err);
-      setStatusMessage('Error: Could not access microphone. Please check permissions.');
-    }
-  };
-  const stopRecording = () => {
-    if (mediaRecorderRef.current && mediaRecorderRef.current.state === 'recording') {
-      mediaRecorderRef.current.stop();
-      setIsRecording(false);
-      setStatusMessage('Recording stopped. Processing audio...');
-    }
-  };
-  const handleFileUpload = (event) => {
-    const file = event.target.files[0];
-    if (file) {
-      const audioBlob = new Blob([file], { type: file.type });
-      const url = URL.createObjectURL(audioBlob);
-      setAudioURL(url);
-      handleTranscription(audioBlob);
-    }
-  };
-  const handleTranscription = async (audioBlob) => {
-    setIsLoading(true);
-    setStatusMessage('Transcribing audio...');
-    setTranscription('');
-    const transcribedText = await sendAudioToServer(audioBlob);
-    // Check if the transcription was successful
-    if (transcribedText && !transcribedText.startsWith("Error:")) {
-        // The server is now responsible for post-processing, so we display the text as-is.
-        setTranscription(transcribedText);
-        setStatusMessage('Transcription complete.');
-    } else {
-        setTranscription(transcribedText);
-        setStatusMessage('Transcription failed.');
-    }
-    setIsLoading(false);
-  };
-  // --- THIS IS THE NEW FUNCTION THAT SENDS AUDIO TO THE FLASK SERVER ---
-  const sendAudioToServer = async (audioBlob) => {
-    const formData = new FormData();
-    formData.append('audio', audioBlob, 'audio.wav');
-    try {
-      const response = await fetch(BACKEND_URL, {
-        method: 'POST',
-        body: formData,
-      });
-      if (!response.ok) {
-        throw new Error(`Server error: ${response.status} ${response.statusText}`);
-      }
-      const data = await response.json();
-      return data.transcription;
-    } catch (error) {
-      console.error("Error sending audio to server:", error);
-      return `Error: Failed to get transcription from server. ${error.message}`;
-    }
-  };
-  const handlePlayAudio = () => {
-    const audio = new Audio(audioURL);
-    audio.play();
-  };
-  return (
-    <div className="min-h-screen bg-gray-100 flex flex-col items-center justify-center font-sans p-4">
-      <style>
-        {`
-          @keyframes spin {
-            from {
-              transform: rotate(0deg);
-            }
-            to {
-              transform: rotate(360deg);
-            }
-          }
-          .animate-spin {
-            animation: spin 1s linear infinite;
-          }
-        `}
-      </style>
-      <div className="bg-white shadow-xl rounded-2xl p-8 max-w-2xl w-full text-center space-y-6">
-        <h1 className="text-4xl font-extrabold text-gray-800">Kabyle ASR Web App</h1>
-        <p className="text-gray-600">Record or upload audio to get a transcription.</p>
-        <div className="flex flex-col sm:flex-row justify-center items-center space-y-4 sm:space-y-0 sm:space-x-4 mt-6">
-          <button
-            onClick={isRecording ? stopRecording : startRecording}
-            className={`flex items-center justify-center px-6 py-3 rounded-xl font-bold text-lg transition-all duration-300 transform hover:scale-105 shadow-md
-              ${isRecording ? 'bg-red-500 text-white hover:bg-red-600' : 'bg-blue-600 text-white hover:bg-blue-700'}`}
-            disabled={isLoading}
-          >
-            {isRecording ? (
-              <>
-                <svg className="w-6 h-6 mr-2 animate-pulse" fill="none" stroke="currentColor" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
-                  <path strokeLinecap="round" strokeLinejoin="round" strokeWidth="2" d="M19 11a7 7 0 01-7 7m0 0a7 7 0 01-7-7m7 7v4m0 0H8m4 0h4m-4-8a4 4 0 01-4-4V5a4 4 0 118 0v4a4 4 0 01-4 4z"></path>
-                </svg>
-                Stop Recording
-              </>
-            ) : (
-              <>
-                <svg className="w-6 h-6 mr-2" fill="none" stroke="currentColor" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
-                  <path strokeLinecap="round" strokeLinejoin="round" strokeWidth="2" d="M19 11a7 7 0 01-7 7m0 0a7 7 0 01-7-7m7 7v4m0 0H8m4 0h4m-4-8a4 4 0 01-4-4V5a4 4 0 118 0v4a4 4 0 01-4 4z"></path>
-                </svg>
-                Start Recording
-              </>
-            )}
-          </button>
-          <label htmlFor="file-upload" className={`flex items-center justify-center px-6 py-3 rounded-xl font-bold text-lg transition-all duration-300 transform hover:scale-105 shadow-md
-            bg-gray-200 text-gray-800 hover:bg-gray-300 cursor-pointer ${isLoading || isRecording ? 'opacity-50 pointer-events-none' : ''}`}>
-            <svg className="w-6 h-6 mr-2" fill="none" stroke="currentColor" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
-              <path strokeLinecap="round" strokeLinejoin="round" strokeWidth="2" d="M4 16v1a3 3 0 003 3h10a3 3 0 003-3v-1m-4-8l-4-4m0 0L8 8m4-4v12"></path>
-            </svg>
-            Upload Audio
-            <input
-              id="file-upload"
-              type="file"
-              accept="audio/*"
-              className="hidden"
-              onChange={handleFileUpload}
-              disabled={isLoading || isRecording}
-            />
-          </label>
-        </div>
-        {audioURL && (
-            <div className="flex items-center justify-center mt-4">
-                <button
-                    onClick={handlePlayAudio}
-                    className="flex items-center px-4 py-2 rounded-lg bg-green-500 text-white font-semibold shadow-md hover:bg-green-600 transition-colors"
-                >
-                    <svg className="w-5 h-5 mr-2" fill="currentColor" viewBox="0 0 20 20" xmlns="http://www.w3.org/2000/svg">
-                      <path fillRule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zM9.555 7.168A1 1 0 008 8v4a1 1 0 001.555.832l3-2a1 1 0 000-1.664l-3-2z" clipRule="evenodd"></path>
-                    </svg>
-                    Play Audio
-                </button>
-            </div>
-        )}
-        <div className="mt-6 text-xl font-medium text-gray-700 h-8">
-          {isLoading ? (
-            <div className="flex items-center justify-center">
-              <svg className="w-6 h-6 animate-spin text-blue-500 mr-2" fill="none" stroke="currentColor" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
-                <path strokeLinecap="round" strokeLinejoin="round" strokeWidth="2" d="M4 4v5h.582m15.418 5v5h.582M18 10V4.5a2.5 2.5 0 00-2.5-2.5h-8A2.5 2.5 0 005 4.5V10m13 0l-3 3m0 0l-3 3m3-3v14m0-14H10"></path>
-              </svg>
-              <span className="text-blue-500">{statusMessage}</span>
-            </div>
-          ) : (
-            <span className="text-gray-500">{statusMessage}</span>
-          )}
-        </div>
-        {transcription && (
-          <div className="mt-8 p-6 bg-gray-50 rounded-xl shadow-inner text-left">
-            <h3 className="text-2xl font-bold text-gray-800 flex items-center mb-4">
-                <svg className="w-6 h-6 text-green-500 mr-2" fill="none" stroke="currentColor" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
-                  <path strokeLinecap="round" strokeLinejoin="round" strokeWidth="2" d="M9 12l2 2 4-4m6 2a9 9 0 11-18 0 9 9 0 0118 0z"></path>
-                </svg>
-                Transcription
-            </h3>
-            <p className="text-gray-800 text-xl leading-relaxed">{transcription}</p>
-          </div>
-        )}
-      </div>
-    </div>
-  );
-};
-export default App;

requirements.txt CHANGED Viewed

@@ -1,2 +1,10 @@
-Flask==2.3.3
-gunicorn==21.2.0

+torch==1.13.1
+torchaudio==0.13.1
+pytorch-lightning==1.9.5
+omegaconf>=2.0
+hydra-core
+numpy<1.24.0
+gradio==4.25.0
+# Install NeMo ASR only (lighter)
+git+https://github.com/NVIDIA/[email protected]#egg=nemo_toolkit[asr]&subdirectory=.

requirements_full.txt DELETED Viewed

@@ -1,7 +0,0 @@
-numpy
-typing_extensions
-Flask
-Flask-CORS
-pydub
-gunicorn
-nemo_toolkit[asr]