Spaces:

tahirsher
/

ASR_Model_for_Transcription_into_Text

Sleeping

tahirsher committed on Mar 9

Commit

098a61e

verified ·

1 Parent(s): ee9adc4

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -16,13 +16,28 @@ import torch
 # Fix: Add trust_remote_code=True
 import fsspec
-# Set a higher timeout limit
-fsspec.config.conf["timeout"] = 20000  #  minutes
-dataset = load_dataset("librispeech_asr", "clean", split="train", trust_remote_code=True)
 # Function to load & resample audio
 def preprocess_audio(batch):

 # Fix: Add trust_remote_code=True
 import fsspec
+import os
+import tarfile
+# Define paths
+dataset_tar_path = "dev-clean.tar.gz"  # Path in your repo
+extract_path = "./librispeech_dev_clean"  # Extracted folder
+# Extract the dataset only if it has not been extracted already
+if not os.path.exists(extract_path):
+    print("Extracting dataset...")
+    with tarfile.open(dataset_tar_path, "r:gz") as tar:
+        tar.extractall(extract_path)
+    print("Extraction complete.")
+else:
+    print("Dataset already extracted.")
+from datasets import load_dataset
+# Load extracted dataset
+dataset = load_dataset("librispeech_asr", data_dir=extract_path, split="train", trust_remote_code=True)
+print("Dataset loaded successfully!")
 # Function to load & resample audio
 def preprocess_audio(batch):