diff --git "a/notebooks/Audio_Challenge.ipynb" "b/notebooks/Audio_Challenge.ipynb" new file mode 100644--- /dev/null +++ "b/notebooks/Audio_Challenge.ipynb" @@ -0,0 +1,308 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "MQiEgneCUB5a" + }, + "source": [ + "# 🔊 Audio Section: Frugal AI Challenge\n", + "\n", + "## Strategy for solving the problem\n", + "\n", + "To minimize energy consumption, we deliberately **chose not to use deep learning techniques** such as CNN-based spectrogram analysis, LSTM on raw audio signals or transformer models, which are generally **more computationally intensive**.\n", + "\n", + "Instead, a more **lightweight approach** was adopted:\n", + "- Feature extraction from the audio signal (MFCCs and spectral contrast)\n", + "- Training a simple machine learning model (decision tree) on these extracted features\n", + "\n", + "Potential Improvements (Not Yet Tested)\n", + "- Hyperparameter tuning for better performance\n", + "- Exploring alternative lightweight ML models, such as logistic regression or k-nearest neighbors\n", + "- Feature extraction without Librosa, using NumPy directly to compute basic signal properties, further reducing dependencies and overhead.\n", + "\n", + "## Installation and library imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Lig3tpd66Jvi", + "outputId": "6de11ca3-52c0-483c-a0f9-d56dfabb3ff8" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: librosa in /usr/local/lib/python3.11/dist-packages (0.10.2.post1)\n", + "Requirement already satisfied: soundfile in /usr/local/lib/python3.11/dist-packages (0.13.1)\n", + "Requirement already satisfied: datasets in /usr/local/lib/python3.11/dist-packages (3.2.0)\n", + "Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.11/dist-packages (from librosa) (3.0.1)\n", 
+ "Requirement already satisfied: numpy!=1.22.0,!=1.22.1,!=1.22.2,>=1.20.3 in /usr/local/lib/python3.11/dist-packages (from librosa) (1.26.4)\n", + "Requirement already satisfied: scipy>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from librosa) (1.13.1)\n", + "Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.11/dist-packages (from librosa) (1.6.1)\n", + "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.11/dist-packages (from librosa) (1.4.2)\n", + "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.11/dist-packages (from librosa) (4.4.2)\n", + "Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.11/dist-packages (from librosa) (0.60.0)\n", + "Requirement already satisfied: pooch>=1.1 in /usr/local/lib/python3.11/dist-packages (from librosa) (1.8.2)\n", + "Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.11/dist-packages (from librosa) (0.5.0.post1)\n", + "Requirement already satisfied: typing-extensions>=4.1.1 in /usr/local/lib/python3.11/dist-packages (from librosa) (4.12.2)\n", + "Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.11/dist-packages (from librosa) (0.4)\n", + "Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.11/dist-packages (from librosa) (1.1.0)\n", + "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.11/dist-packages (from soundfile) (1.17.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from datasets) (3.17.0)\n", + "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (17.0.0)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.3.8)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from datasets) (2.2.2)\n", + "Requirement already satisfied: requests>=2.32.2 in 
/usr/local/lib/python3.11/dist-packages (from datasets) (2.32.3)\n", + "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.11/dist-packages (from datasets) (4.67.1)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.11/dist-packages (from datasets) (3.5.0)\n", + "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.70.16)\n", + "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.11/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.9.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.11/dist-packages (from datasets) (3.11.11)\n", + "Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.27.1)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from datasets) (24.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from datasets) (6.0.2)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.11/dist-packages (from cffi>=1.0->soundfile) (2.22)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (2.4.4)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (1.3.2)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (25.1.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (1.5.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (6.1.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (0.2.1)\n", + 
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (1.18.3)\n", + "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.11/dist-packages (from numba>=0.51.0->librosa) (0.43.0)\n", + "Requirement already satisfied: platformdirs>=2.5.0 in /usr/local/lib/python3.11/dist-packages (from pooch>=1.1->librosa) (4.3.6)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (3.4.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (2.3.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (2024.12.14)\n", + "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=0.20.0->librosa) (3.5.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2025.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n" + ] + } + ], + "source": [ + "!pip install librosa soundfile datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "pu_R5oY36QTL" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from datasets import load_dataset\n", + "from IPython.display import Audio\n", + "import 
# %% Imports (remaining lines of the notebook's import cell, unchanged)
import librosa
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import joblib
import itertools

# %% Configuration: every value a reader might want to tune lives here
SEED = 42        # shared by the train/test split and the classifier for reproducible runs
TEST_SIZE = 0.2  # fraction of clips held out for evaluation
N_BANDS = 6      # number of octave bands for spectral contrast (same as librosa's default)

# %% Load the dataset (streamed, so nothing is downloaded up front) and preview one clip
dataset = load_dataset("rfcx/frugalai", streaming=True)
example = next(iter(dataset['train']))
print(example)
# Play the clip at the rate stored with it instead of a hard-coded 12000.
Audio(example['audio']['array'], rate=example['audio']['sampling_rate'])


# %% Feature extraction
def extract_features(audio_array, sampling_rate):
    """Summarise one audio clip as its time-averaged spectral contrast.

    Parameters
    ----------
    audio_array : array-like of float
        Waveform samples of the clip (presumably mono, as in the previewed
        example; confirm if other inputs are ever passed).
    sampling_rate : int
        Sampling rate reported by the dataset. Kept in the signature for
        callers, but deliberately NOT forwarded to librosa (see note below).

    Returns
    -------
    numpy.ndarray
        1-D vector with ``N_BANDS + 1`` values: the spectral contrast of each
        band averaged over all frames. An empty clip yields a zero vector.
    """
    signal = np.asarray(audio_array, dtype=float)

    # Empty clips occur in the stream (librosa warns "n_fft=2048 is too large
    # for input signal of length=0"). Return an explicit neutral vector rather
    # than depending on how librosa pads a zero-length signal.
    if signal.size == 0:
        return np.zeros(N_BANDS + 1)

    # NOTE(review): `sr` is left at librosa's default (22050). With the default
    # fmin=200 and 6 bands the top band edge is 6400 Hz, which is above the
    # 6 kHz Nyquist of the 12 kHz clips, so passing sr=sampling_rate as-is is
    # expected to make librosa raise ParameterError. As written, band edges are
    # effectively relative to the clip's Nyquist frequency. If absolute bands
    # are wanted, lower fmin or N_BANDS together with forwarding `sr`, and
    # retrain the exported model.
    contrast = librosa.feature.spectral_contrast(y=signal, n_bands=N_BANDS)

    # Rows are bands and columns are frames: average over frames.
    return np.mean(contrast, axis=1)


# %% Build the feature matrix from the training stream
dataset_train = dataset['train']

feature_rows, labels = [], []
n_empty = 0
for sample in tqdm(dataset_train, desc="Extracting features"):
    audio = sample['audio']
    n_empty += int(len(audio['array']) == 0)
    feature_rows.append(extract_features(audio['array'], audio['sampling_rate']))
    labels.append(sample['label'])

# Stack once at the end. The previous per-100-rows vstack re-copied the whole
# matrix on every batch, saved no memory (all rows are kept anyway), and failed
# when the stream held fewer rows than one batch.
X_total = np.vstack(feature_rows)
y_total = np.array(labels)
assert X_total.shape[0] == y_total.shape[0], "features and labels are misaligned"
print(f"{X_total.shape[0]} clips, {X_total.shape[1]} features each ({n_empty} empty clips)")

# %% Split, train and evaluate
# Stratify so both splits keep the class balance of the full set.
X_train, X_test, y_train, y_test = train_test_split(
    X_total,
    y_total,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=y_total,
)

# A fixed random_state makes the fitted tree, and therefore the exported
# model and the report below, identical across reruns.
clf = DecisionTreeClassifier(random_state=SEED)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# %% [markdown]
# ## Model export

# %% Persist the fitted classifier; inference must use the same extract_features
model_filename = "model_audio.pkl"
joblib.dump(clf, model_filename)
print(f"Model name : {model_filename}")