Update app.py
Browse files
app.py
CHANGED
@@ -16,13 +16,28 @@ import torch
|
|
16 |
|
17 |
# Fix: Add trust_remote_code=True
|
18 |
import fsspec
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
|
21 |
-
fsspec.config.conf["timeout"] = 20000 # minutes
|
22 |
-
|
23 |
-
dataset = load_dataset("librispeech_asr", "clean", split="train", trust_remote_code=True)
|
24 |
|
|
|
|
|
25 |
|
|
|
26 |
|
27 |
# Function to load & resample audio
|
28 |
def preprocess_audio(batch):
|
|
|
16 |
|
17 |
# Fix: Add trust_remote_code=True
|
18 |
import fsspec
|
19 |
+
import os
|
20 |
+
import tarfile
|
21 |
+
|
22 |
+
# Define paths
|
23 |
+
dataset_tar_path = "dev-clean.tar.gz" # Path in your repo
|
24 |
+
extract_path = "./librispeech_dev_clean" # Extracted folder
|
25 |
+
|
26 |
+
# Extract the dataset archive unless it has already been extracted
|
27 |
+
if not os.path.exists(extract_path):
|
28 |
+
print("Extracting dataset...")
|
29 |
+
with tarfile.open(dataset_tar_path, "r:gz") as tar:
|
30 |
+
tar.extractall(extract_path)
|
31 |
+
print("Extraction complete.")
|
32 |
+
else:
|
33 |
+
print("Dataset already extracted.")
|
34 |
|
35 |
+
from datasets import load_dataset
|
|
|
|
|
|
|
36 |
|
37 |
+
# Load extracted dataset
|
38 |
+
dataset = load_dataset("librispeech_asr", data_dir=extract_path, split="train", trust_remote_code=True)
|
39 |
|
40 |
+
print("Dataset loaded successfully!")
|
41 |
|
42 |
# Function to load & resample audio
|
43 |
def preprocess_audio(batch):
|