#!/bin/bash
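
# Usage (invocation name illustrative):
#   bash untar_from_log.sh <root> <save_root>
# <root>      directory holding the downloaded tar files
# <save_root> directory the tars are extracted into
#
# Each line of file_log.txt is assumed to be comma-separated, matching the
# `read` loop below:
#   filename, size, local_sha256, original_filename, url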

# Define the root directory where the tar files are located and the
# directory to extract into; both are required positional arguments
if [ $# -ne 2 ]; then
    echo "Usage: $0 <root> <save_root>" >&2
    exit 1
fi
root=$1       # e.g. /data/scratch/pyp/datasets/emilia/downloads
save_root=$2  # e.g. /data/scratch/pyp/datasets/emilia/preprocessed/audio

mkdir -p "${save_root}"

# Input log files
log_file="file_log.txt"       # Full log of files to process
exist_log_file="file_log_debug.txt" # Log of already processed files
failure_log="untar_failures.log" # Log file for untar failures

# Clear previous failure log
> "$failure_log"

# Make sure the input log exists and the processed-files log exists
# (grep -f fails outright on a missing pattern file)
if [ ! -f "$log_file" ]; then
    echo "❌ Missing input log: $log_file" >&2
    exit 1
fi
touch "$exist_log_file"

# Create a temporary filtered log of files to process, dropping any
# log_file line that contains a filename recorded in exist_log_file
filtered_log="filtered_file_log.txt"
grep -v -F -f "$exist_log_file" "$log_file" > "$filtered_log"
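
# Example (hypothetical filename): if exist_log_file contains the line
# "EN_B00001.tar", grep -v -F -f drops every log_file line containing that
# string anywhere, i.e. matching is by fixed substring, not by whole field.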

# Count total filtered files
total_files=$(wc -l < "$filtered_log")
echo "Found $total_files entries to process in $filtered_log."

# Print the filtered files
echo "Filtered files to process:"
cat "$filtered_log"
echo

# Confirm before starting processing
read -p "Do you want to proceed with the above files? (y/n): " confirm
if [[ "$confirm" != "y" ]]; then
    echo "Operation canceled."
    rm -f "$filtered_log"
    exit 1
fi
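
# To run unattended, the prompt can be answered from stdin, e.g.:
#   echo y | bash untar_from_log.sh <root> <save_root>
# (script name illustrative, matching the usage note above)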

# Start time
start_time=$(date +%s)

# Counter for how many lines we've processed
count=0

# Process filtered log
while IFS=',' read -r filename size local_sha256 original_filename url; do
    count=$((count + 1))

    # Trim leading/trailing whitespace around each field (xargs also strips
    # quotes and backslashes, so values are assumed to be plain text)
    filename=$(echo "$filename" | xargs)
    size=$(echo "$size" | xargs)
    local_sha256=$(echo "$local_sha256" | xargs)
    original_filename=$(echo "$original_filename" | xargs)
    url=$(echo "$url" | xargs)

    # Construct the full path to the tar file
    tar_file="${root}/${original_filename}"

    # Check if the tar file exists
    if [ ! -f "$tar_file" ]; then
        echo "❌ File not found: $tar_file"
        echo "$filename, $size, $local_sha256, $original_filename, $url" >> "$failure_log"
    else
        # Try to untar the file
        echo "[$count/$total_files] Untarring $tar_file..."

        if ! tar -xf "$tar_file" -C "${save_root}"; then
            # If untar fails, log the failure
            echo "❌ Failed to untar: $tar_file"
            echo "$filename, $size, $local_sha256, $original_filename, $url" >> "$failure_log"
        else
            echo "✅ Successfully untarred: $tar_file"
            # Append successfully untarred filename to exist_log_file
            echo "$filename" >> "$exist_log_file"
        fi
    fi

    # Calculate elapsed time, average time per file, and ETA
    now=$(date +%s)
    elapsed=$(( now - start_time ))  # total seconds since the start
    if [ $count -gt 0 ]; then
        avg_time=$(awk "BEGIN { printf \"%.2f\", $elapsed / $count }")
        remain=$(( total_files - count ))
        eta_seconds=$(awk "BEGIN { printf \"%.0f\", $avg_time * $remain }")
        # date -ud "@N" renders N seconds as HH:MM:SS; this wraps past 24h
        eta_formatted=$(date -ud "@${eta_seconds}" +'%H:%M:%S')
        echo "Elapsed: ${elapsed}s | Avg/file: ${avg_time}s | Remaining: $remain files | ETA: ~${eta_formatted}"
    fi
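
    # Worked example: elapsed=120s after count=4 files gives avg_time=30.00;
    # with remain=10 files left, eta_seconds=300 and eta_formatted=00:05:00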

done < "$filtered_log"

# Clean up temporary filtered log
rm -f "$filtered_log"

# Summary
echo "Untar operation completed. Check $failure_log for any failures."