tahirsher committed on
Commit
098a61e
·
verified ·
1 Parent(s): ee9adc4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -4
app.py CHANGED
@@ -16,13 +16,28 @@ import torch
16
 
17
  # Fix: Add trust_remote_code=True
18
  import fsspec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- # Set a higher timeout limit
21
- fsspec.config.conf["timeout"] = 20000 # minutes
22
-
23
- dataset = load_dataset("librispeech_asr", "clean", split="train", trust_remote_code=True)
24
 
 
 
25
 
 
26
 
27
  # Function to load & resample audio
28
  def preprocess_audio(batch):
 
16
 
17
  # Fix: Add trust_remote_code=True
18
  import fsspec
19
+ import os
20
+ import tarfile
21
+
22
+ # Define paths
23
+ dataset_tar_path = "dev-clean.tar.gz" # Path in your repo
24
+ extract_path = "./librispeech_dev_clean" # Extracted folder
25
+
26
+ # Check if dataset is already extracted, if not, extract it
27
+ if not os.path.exists(extract_path):
28
+ print("Extracting dataset...")
29
+ with tarfile.open(dataset_tar_path, "r:gz") as tar:
30
+ tar.extractall(extract_path)
31
+ print("Extraction complete.")
32
+ else:
33
+ print("Dataset already extracted.")
34
 
35
+ from datasets import load_dataset
 
 
 
36
 
37
+ # Load extracted dataset
38
+ dataset = load_dataset("librispeech_asr", data_dir=extract_path, split="train", trust_remote_code=True)
39
 
40
+ print("Dataset loaded successfully!")
41
 
42
  # Function to load & resample audio
43
  def preprocess_audio(batch):