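# NOTE: "Spaces: Sleeping" -- this looks like a status banner captured along
# with the page this file was copied from, not program text.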
import base64
import html
import io
import json
import os
import re
import time
import uuid
from datetime import datetime

import pandas as pd
import requests
import streamlit as st
# Set page configuration
PAGE_CONFIG = {
    "page_title": "Speech Hate Detection - Annotation Tool",
    "page_icon": "🎧",
    "layout": "centered",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**PAGE_CONFIG)

# Constants
# Base URL that chunk file names are appended to when building download links.
HF_DATASET_URL = "https://huggingface.co/datasets/kcrl/Hs/resolve/main/"
# Local CSV file that annotation results are written to.
RESULTS_FILE = "annotation_results.csv"

# Debug flag - when True, debug_log() writes its messages into the page.
DEBUG_MODE = True
# Log debugging information if debug mode is enabled | |
def debug_log(message):
    """Write ``message`` to the page with a ``DEBUG:`` prefix.

    Does nothing when the module-level ``DEBUG_MODE`` flag is falsy.
    """
    if not DEBUG_MODE:
        return
    st.write(f"DEBUG: {message}")
# Initial debug message
debug_log("Application starting...")

# For Hugging Face Spaces deployment: when a /data directory is present,
# keep the results CSV there instead of in the working directory.
_persistent_storage_available = os.path.exists('/data')
if _persistent_storage_available:
    RESULTS_FILE = "/data/annotation_results.csv"
    debug_log(f"Using persistent storage at {RESULTS_FILE}")
# Function to check if file exists in the Hugging Face repository with exponential backoff | |
def check_file_exists(file_url, max_retries=3):
    """Report whether ``file_url`` can be fetched, without downloading it.

    Sends an HTTP ``HEAD`` request and follows redirects. Only request
    failures (timeouts, connection errors, ...) are retried, waiting
    1s, 2s, 4s, ... between attempts; any HTTP response ends the loop.

    Args:
        file_url: Full URL of the file to probe.
        max_retries: Maximum number of attempts.

    Returns:
        True if the final response has status 200. False for any other
        status, or when every attempt failed (including ``max_retries``
        of zero, where no request is made).
    """
    for attempt in range(max_retries):
        try:
            # requests.head() does not follow redirects unless asked to, so a
            # file served through a redirect would be reported as missing.
            # NOTE(review): the dataset's resolve/ URLs presumably redirect to
            # a storage host for large files -- confirm against a known chunk.
            response = requests.head(file_url, timeout=3, allow_redirects=True)
        except requests.RequestException as e:
            if attempt < max_retries - 1:
                # Exponential backoff: 1s, 2s, 4s, etc.
                wait_time = 2 ** attempt
                debug_log(f"Request failed, retrying in {wait_time}s: {str(e)}")
                time.sleep(wait_time)
            else:
                debug_log(f"Request failed after {max_retries} attempts: {str(e)}")
        else:
            return response.status_code == 200
    return False
# Function to check if a specific chunk exists | |
def check_chunk_exists(video_id, chunk_num):
    """Return whether chunk ``chunk_num`` of ``video_id`` is in the repository.

    The file probed is ``<video_id>_chunk_<NNNN>.wav`` under
    ``HF_DATASET_URL``, with the chunk number zero-padded to four digits.
    """
    return check_file_exists(
        f"{HF_DATASET_URL}{video_id}_chunk_{chunk_num:04d}.wav"
    )
# Function to find all chunks for a video by using binary search approach | |
def find_all_chunks_for_video(video_id, max_possible_chunks=500):
    """List the chunk numbers that exist for ``video_id``.

    Works in two passes: a binary search over ``1..max_possible_chunks``
    estimates the highest existing chunk, then every number from 1 up to
    that estimate is probed individually.

    Args:
        video_id: The video ID to check.
        max_possible_chunks: Upper limit for the binary search.

    Returns:
        List of chunk numbers whose files were found, in ascending order.
    """
    debug_log(f"Finding chunks for {video_id}...")

    # Pass 1: binary search for the point where files stop existing.
    # NOTE(review): this only finds the true maximum if chunks are numbered
    # without gaps; a missing chunk in the middle can cut the range short.
    lower, upper = 1, max_possible_chunks
    while lower <= upper:
        probe = (lower + upper) // 2
        if check_chunk_exists(video_id, probe):
            lower = probe + 1
        else:
            upper = probe - 1

    # 'upper' now holds the highest chunk the search saw; never go below 1 so
    # that chunk 1 is always probed in the second pass.
    highest_chunk = upper if upper > 1 else 1
    debug_log(f"Binary search found highest chunk: {highest_chunk}")

    # Pass 2: confirm each candidate individually.
    existing_chunks = []
    for candidate in range(1, highest_chunk + 1):
        # Throttle to avoid rate limits (0.1s between requests).
        time.sleep(0.1)
        if not check_chunk_exists(video_id, candidate):
            continue
        existing_chunks.append(candidate)

    debug_log(f"Found {len(existing_chunks)} chunks for {video_id}")
    return existing_chunks
# Function to build a list of audio file paths from video IDs with dynamic chunk detection | |
def build_file_list_from_video_ids(video_ids, check_existence=False):
    """Build the list of audio chunk descriptors for the given videos.

    Args:
        video_ids: List of video IDs.
        check_existence: When True, probe the repository and include only
            chunks that were found. When False, assume chunks 1-100 exist
            for every video and do no network checks.

    Returns:
        List of dicts with the keys ``id``, ``name``, ``url``, ``video_id``
        and ``chunk_num``, one per chunk.
    """

    def describe_chunk(owner_id, number):
        # One descriptor: <video>_chunk_<NNNN>, its .wav name and its URL.
        file_id = f"{owner_id}_chunk_{number:04d}"
        file_name = f"{file_id}.wav"
        return {
            "id": file_id,
            "name": file_name,
            "url": f"{HF_DATASET_URL}{file_name}",
            "video_id": owner_id,
            "chunk_num": number,
        }

    files = []
    debug_log(f"Building file list for {len(video_ids)} videos (check_existence={check_existence})...")

    # Progress bar advances once per video.
    progress_bar = st.progress(0)
    total_videos = len(video_ids)

    for position, video_id in enumerate(video_ids, start=1):
        progress_bar.progress(position / total_videos)

        if not check_existence:
            # No verification requested: use the default range of chunks (1-100).
            files.extend(describe_chunk(video_id, number) for number in range(1, 101))
            continue

        st.write(f"Finding chunks for video {video_id} ({position}/{total_videos})...")
        chunks = find_all_chunks_for_video(video_id)
        if not chunks:
            st.warning(f"No chunks found for video {video_id}")
            continue

        st.write(f"Found {len(chunks)} chunks for video {video_id}")
        files.extend(describe_chunk(video_id, number) for number in chunks)

    debug_log(f"Built file list with {len(files)} total files")
    return files
# Function to download file from Hugging Face with retry logic | |
def download_file_from_hf(file_url, max_retries=3):
    """Download ``file_url`` and return its bytes, retrying on failure.

    Both non-200 responses and raised exceptions are retried, waiting
    1s, 2s, 4s, ... between attempts. When the last attempt fails, the
    problem is shown with ``st.error``.

    Args:
        file_url: Full URL of the file to download.
        max_retries: Maximum number of attempts.

    Returns:
        The response body as bytes, or None if no attempt succeeded.
    """
    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        may_retry = attempt < final_attempt
        backoff = 2 ** attempt
        try:
            # Longer timeout than the existence check: audio bodies are larger.
            response = requests.get(file_url, timeout=10)
            if response.status_code == 200:
                return response.content
            if not may_retry:
                st.error(f"Failed to download file: HTTP {response.status_code}")
                return None
            debug_log(f"Download failed (HTTP {response.status_code}), retrying in {backoff}s")
            time.sleep(backoff)
        except Exception as e:
            if not may_retry:
                st.error(f"Error downloading file: {e}")
                return None
            debug_log(f"Download error, retrying in {backoff}s: {str(e)}")
            time.sleep(backoff)
    return None
# Create a unique ID for new annotators or retrieve existing | |
def get_annotator_id():
    """Return the annotator ID for this session, creating one if needed.

    The ID is cached in ``st.session_state.annotator_id``. On first use it
    is read from a ``.annotator_id`` file (under ``/data`` when that
    directory exists, otherwise the working directory); if the file is
    absent, a new UUID4 is generated and written to it.

    NOTE(review): the ID lives in a file on the app's filesystem, so every
    session that reads the same file gets the same ID -- confirm this is
    intended when several people use one deployment.
    """
    debug_log("Getting annotator ID...")

    if 'annotator_id' in st.session_state:
        return st.session_state.annotator_id

    id_path = '/data/.annotator_id' if os.path.exists('/data') else '.annotator_id'

    if os.path.exists(id_path):
        with open(id_path, 'r') as handle:
            st.session_state.annotator_id = handle.read().strip()
        debug_log("Retrieved existing annotator ID")
    else:
        # Generate a new ID and remember it for later runs.
        st.session_state.annotator_id = str(uuid.uuid4())
        with open(id_path, 'w') as handle:
            handle.write(st.session_state.annotator_id)
        debug_log("Created new annotator ID")

    return st.session_state.annotator_id
# Function to load annotation data from CSV | |
def load_annotations():
    """Load the annotation table from ``RESULTS_FILE``.

    If the file does not exist, an empty table with the expected columns
    is created, written to ``RESULTS_FILE`` and returned. Any error is
    reported with ``st.error`` and an empty table is returned instead.

    Returns:
        A pandas DataFrame of annotation records.
    """
    expected_columns = ['file_id', 'file_name', 'Label', 'annotator_id', 'timestamp', 'video_id']
    debug_log(f"Loading annotations from {RESULTS_FILE}")
    try:
        if not os.path.exists(RESULTS_FILE):
            # First run: create the file so later saves have something to extend.
            debug_log("No existing annotations found, creating new file")
            empty_table = pd.DataFrame(columns=expected_columns)
            empty_table.to_csv(RESULTS_FILE, index=False)
            return empty_table

        records = pd.read_csv(RESULTS_FILE)
        debug_log(f"Loaded {len(records)} annotation records")
        return records
    except Exception as e:
        st.error(f"Error loading annotations: {e}")
        debug_log(f"Error loading annotations: {str(e)}")
        return pd.DataFrame(columns=expected_columns)
# Function to save annotations to CSV | |
def save_annotation(df):
    """Write the whole annotation table ``df`` to ``RESULTS_FILE`` as CSV.

    Returns:
        True when the write succeeded; False when it raised, in which
        case the error is also shown with ``st.error``.
    """
    debug_log(f"Saving annotations to {RESULTS_FILE}")
    try:
        df.to_csv(RESULTS_FILE, index=False)
        debug_log("Annotations saved successfully")
    except Exception as e:
        st.error(f"Error saving annotation: {e}")
        debug_log(f"Error saving annotations: {str(e)}")
        return False
    return True
# Initialize application state (only on the first run of a session, i.e. while
# the 'initialized' key has not been created yet).
if 'initialized' not in st.session_state:
    debug_log("Initializing application state")
    initial_state = {
        'initialized': False,
        'current_file_index': 0,
        'current_file': None,
        'annotation_df': None,
        'all_files': [],
        'pending_files': [],
        'hate_count': 0,
        'non_hate_count': 0,
        'discard_count': 0,
        'page': 1,
        'files_per_page': 50,
        'lite_mode': False,
    }
    for state_key, state_value in initial_state.items():
        st.session_state[state_key] = state_value
# Application title and header: one markdown element carrying the page's CSS
# classes followed by the title banner.
APP_STYLE_AND_HEADER = """
<style>
.main-header {
    font-size: 26px;
    font-weight: bold;
    color: #ff4b4b;
    margin-bottom: 20px;
}
.sub-header {
    font-size: 18px;
    color: #555;
    margin-bottom: 30px;
}
.progress-container {
    margin: 20px 0;
    padding: 15px;
    background-color: #f9f9f9;
    border-radius: 5px;
}
.stats-container {
    display: flex;
    justify-content: space-around;
    margin-top: 20px;
    text-align: center;
    flex-wrap: wrap;
}
.stat-item {
    padding: 10px;
    min-width: 100px;
}
.stat-value {
    font-size: 24px;
    font-weight: bold;
    color: #4CAF50;
}
.stat-label {
    font-size: 14px;
    color: #666;
}
.audio-container {
    margin: 30px 0;
    padding: 20px;
    background-color: #f5f5f5;
    border-radius: 10px;
    text-align: center;
}
.file-info {
    font-size: 14px;
    color: #666;
    margin-top: 5px;
}
</style>
<div class="main-header">Speech Hate Detection - Annotation Tool</div>
"""
st.markdown(APP_STYLE_AND_HEADER, unsafe_allow_html=True)
# Quick start in lite mode: preload the saved annotations and flag lite mode,
# then let the user pick video IDs in the Configuration panel.
#
# 'initialized' is deliberately left False here. No file list exists yet, so
# marking the app as initialized would collapse the Configuration panel and
# show the "all files annotated" screen instead of the setup the message
# below asks for.
if not st.session_state.initialized:
    if st.button("⚡ Quick Start (Lite Mode)"):
        debug_log("Starting in lite mode")
        st.session_state.lite_mode = True
        st.session_state.annotation_df = load_annotations()
        st.rerun()
    # Rendered on every run while lite mode is active, because anything drawn
    # just before st.rerun() is discarded.
    if st.session_state.get("lite_mode"):
        st.success("Started in lite mode. Enter video IDs and click Initialize.")
# App configuration section (collapsible; starts expanded until the app has
# been initialized).
with st.expander("Configuration", expanded=not st.session_state.initialized):
    st.markdown("""
### Configuration
This tool loads audio files from the Hugging Face dataset at:
https://huggingface.co/datasets/kcrl/Hs
You can provide a list of video IDs for annotation by adding them in the text area below.
""")

    # Default video IDs (a short list for initial testing).
    default_video_ids = "0hJ2JGhM7TY\n1PRABBSTpiE\n4ewRgBMP_AY"

    # Allow user to input video IDs
    user_video_ids = st.text_area(
        "Video IDs to annotate (one per line)",
        value=default_video_ids,
        height=150,
        help="Enter the YouTube video IDs, one per line. The app will look for chunks of these videos.",
    )
    annotator_name = st.text_input(
        "Your Name (Optional)",
        help="Your name for tracking purposes",
    )
    # Off by default so the first load does not wait on network checks.
    check_files = st.checkbox(
        "Check if files exist (slower but more accurate)",
        value=False,
        help="Verifies each file exists before adding it to the list",
    )
    only_new_files = st.checkbox(
        "Only show new files (not previously annotated)",
        value=True,
        help="Skip files that have already been annotated",
    )

    init_column, reset_column = st.columns(2)

    with init_column:
        if st.button("Initialize Application"):
            debug_log("Initialize button clicked")
            annotator_id = get_annotator_id()

            # One ID per line; blank lines are ignored.
            video_ids = [line.strip() for line in user_video_ids.split('\n') if line.strip()]

            if not user_video_ids.strip():
                st.error("Please enter at least one video ID to annotate")
            elif not video_ids:
                st.error("Please enter at least one valid video ID")
            else:
                with st.spinner(f"Building file list for {len(video_ids)} videos..."):
                    all_files = build_file_list_from_video_ids(
                        video_ids,
                        check_existence=check_files,
                    )

                if not all_files:
                    st.error("No audio files found. Please check the video IDs and try again.")
                else:
                    st.session_state.all_files = all_files

                    # Load existing annotation CSV
                    annotation_df = load_annotations()
                    st.session_state.annotation_df = annotation_df

                    # Work out which files to leave out of the queue.
                    annotated_files = set()
                    if not annotation_df.empty:
                        by_this_annotator = annotation_df['annotator_id'] == annotator_id
                        if only_new_files:
                            # Skip anything annotated by any annotator.
                            annotated_files = set(annotation_df['file_id'].tolist())
                        else:
                            # Skip only what this annotator has already done.
                            annotated_files = set(annotation_df[by_this_annotator]['file_id'].tolist())

                        # Restore this annotator's running label counts.
                        own_labels = annotation_df[by_this_annotator]['Label']
                        st.session_state.hate_count = int((own_labels == 'Hate').sum())
                        st.session_state.non_hate_count = int((own_labels == 'Non-Hate').sum())
                        st.session_state.discard_count = int((own_labels == 'Discard').sum())

                    # Queue of files still to annotate, in file-list order.
                    pending_files = [entry for entry in all_files if entry['id'] not in annotated_files]
                    st.session_state.pending_files = pending_files

                    if not pending_files:
                        st.warning("All files have already been annotated. Try adding new video IDs or uncheck 'Only show new files'.")
                    else:
                        st.session_state.current_file = pending_files[0]
                        st.session_state.initialized = True
                        st.success(f"Application initialized successfully! Found {len(pending_files)} files to annotate.")
                        st.rerun()

    with reset_column:
        if st.button("Reset Application State"):
            # Clear the session state; the next run re-creates the defaults.
            for state_key in list(st.session_state.keys()):
                del st.session_state[state_key]
            st.success("Application state has been reset. You can start fresh.")
            st.rerun()
# Main annotation interface


def _render_stats(stats):
    """Render ``(value, label)`` pairs as one row of statistic tiles."""
    tiles = "".join(
        '<div class="stat-item">'
        f'<div class="stat-value">{value}</div>'
        f'<div class="stat-label">{label}</div>'
        '</div>'
        for value, label in stats
    )
    st.markdown(f'<div class="stats-container">{tiles}</div>', unsafe_allow_html=True)


def _advance_to_next_file(finished_message):
    """Drop the current file from the queue and show the next one.

    Reruns the script when another file is waiting; otherwise shows
    ``finished_message`` as a success notice.
    """
    st.session_state.pending_files.pop(0)
    if st.session_state.pending_files:
        st.session_state.current_file = st.session_state.pending_files[0]
        st.rerun()
    else:
        st.success(finished_message)


if st.session_state.initialized and st.session_state.pending_files:
    debug_log("Rendering main annotation interface")

    # Display current annotator. The name is typed by the user and rendered
    # as raw HTML, so it is escaped first.
    display_name = annotator_name if annotator_name else st.session_state.annotator_id
    st.markdown(
        f'<div class="sub-header">Annotator: {html.escape(str(display_name))}</div>',
        unsafe_allow_html=True,
    )

    # Display progress. Files that were skipped also count as done here,
    # because the figure is derived from the queue length.
    total_files = len(st.session_state.all_files)
    annotated_files = total_files - len(st.session_state.pending_files)
    progress_percentage = int((annotated_files / total_files) * 100) if total_files > 0 else 0
    st.markdown(
        '<div class="progress-container">'
        f'<div>Progress: {annotated_files}/{total_files} samples annotated ({progress_percentage}%)</div>'
        '<div style="margin-top: 10px; height: 10px; background-color: #eee; border-radius: 5px;">'
        f'<div style="height: 100%; width: {progress_percentage}%; background-color: #4CAF50; border-radius: 5px;"></div>'
        '</div>'
        '</div>',
        unsafe_allow_html=True,
    )

    # Display statistics
    _render_stats([
        (total_files, "Total Files"),
        (annotated_files, "Completed"),
        (len(st.session_state.pending_files), "Remaining"),
        (st.session_state.hate_count, "Hate"),
        (st.session_state.non_hate_count, "Non-Hate"),
        (st.session_state.discard_count, "Discard"),
    ])

    # Audio player section
    current_file = st.session_state.current_file

    # Get video ID from the file data, falling back to the file name.
    video_id = current_file.get('video_id', "Unknown")
    if video_id == "Unknown" and "_chunk_" in current_file['name']:
        video_id = current_file['name'].split("_chunk_")[0]

    # File names and video IDs derive from user-entered text; escape them.
    safe_file_name = html.escape(str(current_file['name']))
    safe_video_id = html.escape(str(video_id))
    st.markdown(
        '<div class="audio-container">'
        f'<div style="font-weight: bold; margin-bottom: 15px;">Currently Playing: {safe_file_name}</div>'
        f'<div class="file-info">Video ID: {safe_video_id}</div>'
        '</div>',
        unsafe_allow_html=True,
    )

    # Get the audio file
    if 'url' in current_file:
        audio_url = current_file['url']
        debug_log(f"Attempting to download audio from {audio_url}")
    else:
        # Fallback for entries that carry no 'url' key.
        audio_url = f"{HF_DATASET_URL}{current_file['name']}"
        debug_log(f"Attempting to download audio from fallback URL {audio_url}")
    with st.spinner("Loading audio file..."):
        audio_bytes = download_file_from_hf(audio_url)

    if audio_bytes:
        debug_log("Audio file downloaded successfully")

        # Display audio player
        st.audio(audio_bytes, format='audio/wav')

        # Annotation controls
        choice_column, skip_column = st.columns([3, 1])
        with choice_column:
            annotation = st.selectbox(
                "Select classification:",
                ["-- Select --", "Hate", "Non-Hate", "Discard"],
                index=0,
                help="Select 'Discard' for unclear audio, background noise, or non-relevant content",
                # A per-file key gives every file a fresh widget; without it
                # the previous file's label stays selected for the next one.
                key=f"annotation_{current_file['id']}",
            )
        with skip_column:
            # Two empty writes push the button down to line up with the selectbox.
            st.write("")
            st.write("")
            if st.button("Skip File"):
                debug_log("Skip file button clicked")
                _advance_to_next_file("All files have been processed!")

        if st.button("Submit & Load Next Sample", type="primary"):
            if annotation == "-- Select --":
                st.warning("Please select a classification before submitting.")
            else:
                debug_log(f"Submitting annotation: {annotation}")

                # Record the annotation
                new_row = {
                    'file_id': current_file['id'],
                    'file_name': current_file['name'],
                    'Label': annotation,
                    'annotator_id': st.session_state.annotator_id,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'video_id': video_id,
                }
                updated_df = pd.concat(
                    [st.session_state.annotation_df, pd.DataFrame([new_row])],
                    ignore_index=True,
                )

                # Commit to session state only after the CSV write succeeded;
                # otherwise a retry after a failed save would add the row and
                # bump the counter a second time.
                if save_annotation(updated_df):
                    debug_log("Annotation saved successfully")
                    st.session_state.annotation_df = updated_df

                    if annotation == "Hate":
                        st.session_state.hate_count += 1
                    elif annotation == "Non-Hate":
                        st.session_state.non_hate_count += 1
                    else:  # Discard
                        st.session_state.discard_count += 1

                    _advance_to_next_file("All files have been annotated! Great job!")
                else:
                    st.error("Failed to save annotation. Please try again.")
    else:
        debug_log(f"Failed to load audio file: {current_file['name']}")
        st.error(f"Failed to load audio file: {current_file['name']}. The file may not exist in the repository.")

        # Skip button for files that can't be loaded
        if st.button("Skip This File", type="primary"):
            debug_log("Skipping unloadable file")
            _advance_to_next_file("All files have been processed!")

elif st.session_state.initialized and not st.session_state.pending_files:
    debug_log("All files annotated, showing summary")
    st.success("All files have been annotated! Thank you for your contribution!")

    # Show summary statistics
    _render_stats([
        (len(st.session_state.all_files), "Total Files"),
        (st.session_state.hate_count, "Hate"),
        (st.session_state.non_hate_count, "Non-Hate"),
        (st.session_state.discard_count, "Discard"),
    ])

    # Option to download the results. The table starts out as None in the
    # session defaults, so guard against that as well as against emptiness.
    results_df = st.session_state.annotation_df
    if results_df is not None and not results_df.empty:
        csv = results_df.to_csv(index=False)
        b64 = base64.b64encode(csv.encode()).decode()
        href = f'<a href="data:file/csv;base64,{b64}" download="annotation_results.csv">Download Results CSV</a>'
        st.markdown(href, unsafe_allow_html=True)

    # Two columns for buttons
    restart_column, more_column = st.columns(2)
    with restart_column:
        if st.button("Reset and Start Over"):
            debug_log("Reset and start over clicked")
            st.session_state.clear()
            st.rerun()
    with more_column:
        if st.button("Add More Videos"):
            debug_log("Add more videos clicked")
            # Keep the annotation data but reset the initialization
            st.session_state.initialized = False
            st.rerun()

else:
    debug_log("Showing initial configuration screen")
    st.info("Please configure and initialize the application using the Configuration section above.")

    # Example video IDs. Built from a list of lines so the markdown carries no
    # leading indentation (4+ leading spaces would render as a code block).
    st.markdown("\n".join([
        "### Example Video IDs",
        "You can use the following format in the Video IDs text area:",
        "```",
        "0hJ2JGhM7TY",
        "1PRABBSTpiE",
        "4ewRgBMP_AY",
        "```",
        "The app will look for files like:",
        "- 0hJ2JGhM7TY_chunk_0001.wav",
        "- 0hJ2JGhM7TY_chunk_0002.wav",
        "- 1PRABBSTpiE_chunk_0001.wav",
        "- etc.",
    ]))
# Add a footer with instructions (shown on every screen).
FOOTER_MARKDOWN = """
---
### Instructions:
1. Enter video IDs in the configuration section
2. Set your name (optional) and click "Initialize Application" to start
3. Listen to each audio sample
4. Select the appropriate classification:
   - **Hate**: Contains hate speech
   - **Non-Hate**: Does not contain hate speech
   - **Discard**: Poor audio quality, background noise, or irrelevant content
5. Click "Submit & Load Next Sample" to continue
6. Your progress is saved automatically
7. When all samples are annotated, you can download the results
### Adding New Data
When you add new data to the Hugging Face dataset:
1. Click "Add More Videos" after completing current annotations
2. Enter the new video IDs in the configuration
3. Make sure "Only show new files" is checked
4. Initialize the application again
This will only present files that haven't been annotated yet.
### Dataset Information
The audio files are sourced from the Hugging Face dataset:
[kcrl/Hs](https://huggingface.co/datasets/kcrl/Hs)
File naming follows the pattern: `[VIDEO_ID]_chunk_[CHUNK_NUMBER].wav`
Example: `0hJ2JGhM7TY_chunk_0001.wav`
"""
st.markdown(FOOTER_MARKDOWN)