Spaces:

Ayon128
/

HS-Testing

Sleeping

App Files Files Community

Ayon128 committed on Apr 4

Commit

171d989

verified ·

1 Parent(s): d19e5f0

Upload 2 files

Browse files

Files changed (2) hide show

app.py +728 -0
requirements.txt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,728 @@

+import base64
+import html
+import io
+import json
+import os
+import re
+import time
+import uuid
+from datetime import datetime
+
+import pandas as pd
+import requests
+import streamlit as st
+# Set page configuration: browser-tab title and icon, centered layout, sidebar collapsed.
+# NOTE(review): this call sits ahead of every other st.* call in the script
+# (including the first debug_log, which writes to the page) - keep it first
+# when reordering top-level statements.
+st.set_page_config(
+    page_title="Speech Hate Detection - Annotation Tool",
+    page_icon="🎧",
+    layout="centered",
+    initial_sidebar_state="collapsed"
+)
+# Constants
+# Base URL that chunk file names are appended to when building probe/download URLs
+HF_DATASET_URL = "https://huggingface.co/datasets/kcrl/Hs/resolve/main/"
+RESULTS_FILE = "annotation_results.csv"  # Local CSV file to store results
+# Debug flag - while True, debug_log() writes "DEBUG: ..." lines into the page itself
+DEBUG_MODE = True
+# Log debugging information if debug mode is enabled
+def debug_log(message):
+    if DEBUG_MODE:
+        st.write(f"DEBUG: {message}")
+# Initial debug message (only visible while DEBUG_MODE is True)
+debug_log("Application starting...")
+# For Hugging Face Spaces deployment: when a /data path exists, rebind
+# RESULTS_FILE so the results CSV is read from and written to that location
+# instead of the working directory.
+if os.path.exists('/data'):
+    # Use the persistent storage directory
+    RESULTS_FILE = "/data/annotation_results.csv"
+    debug_log(f"Using persistent storage at {RESULTS_FILE}")
+# Function to check if file exists in the Hugging Face repository with exponential backoff
+def check_file_exists(file_url, max_retries=3):
+    """
+    Checks if a file exists at the given URL without downloading the entire file.
+    Uses exponential backoff for retries.
+    Returns True if the file exists, False otherwise.
+    """
+    for attempt in range(max_retries):
+        try:
+            # Use a short timeout to avoid long waits
+            response = requests.head(file_url, timeout=3)
+            return response.status_code == 200
+        except Exception as e:
+            if attempt < max_retries - 1:
+                # Exponential backoff: 1s, 2s, 4s, etc.
+                wait_time = 2 ** attempt
+                debug_log(f"Request failed, retrying in {wait_time}s: {str(e)}")
+                time.sleep(wait_time)
+            else:
+                debug_log(f"Request failed after {max_retries} attempts: {str(e)}")
+                return False
+    return False
+# Function to check if a specific chunk exists
+def check_chunk_exists(video_id, chunk_num):
+    """Check if a specific chunk of a video exists in the repository"""
+    chunk_id = f"{chunk_num:04d}"
+    file_name = f"{video_id}_chunk_{chunk_id}.wav"
+    file_url = f"{HF_DATASET_URL}{file_name}"
+    return check_file_exists(file_url)
+# Function to find all chunks for a video by using binary search approach
+def find_all_chunks_for_video(video_id, max_possible_chunks=500):
+    """
+    Find all available chunks for a video ID using an optimized approach.
+    Uses binary search first to find the approximate range, then checks each file.
+    Args:
+        video_id: The video ID to check
+        max_possible_chunks: Upper limit for the binary search
+    Returns:
+        List of chunk numbers that exist
+    """
+    debug_log(f"Finding chunks for {video_id}...")
+    # First use binary search to find the upper bound
+    low = 1
+    high = max_possible_chunks
+    # Find an upper bound first (where files no longer exist)
+    while low <= high:
+        mid = (low + high) // 2
+        if check_chunk_exists(video_id, mid):
+            low = mid + 1
+        else:
+            high = mid - 1
+    # The highest existing chunk is at 'high'
+    highest_chunk = max(1, high)
+    debug_log(f"Binary search found highest chunk: {highest_chunk}")
+    # Now check each potential chunk from 1 to highest_chunk
+    existing_chunks = []
+    for chunk_num in range(1, highest_chunk + 1):
+        # Add some throttling to avoid rate limits (0.1s between requests)
+        time.sleep(0.1)
+        if check_chunk_exists(video_id, chunk_num):
+            existing_chunks.append(chunk_num)
+    debug_log(f"Found {len(existing_chunks)} chunks for {video_id}")
+    return existing_chunks
+# Function to build a list of audio file paths from video IDs with dynamic chunk detection
+def build_file_list_from_video_ids(video_ids, check_existence=False):
+    """
+    Creates a list of audio files based on the provided video IDs.
+    Dynamically detects how many chunks exist for each video.
+    Args:
+        video_ids: List of video IDs
+        check_existence: Whether to verify each file exists before adding it
+    Returns:
+        List of dictionaries with file info
+    """
+    files = []
+    debug_log(f"Building file list for {len(video_ids)} videos (check_existence={check_existence})...")
+    # Create progress bar for checking videos
+    progress_bar = st.progress(0)
+    for i, video_id in enumerate(video_ids):
+        # Update progress
+        progress_bar.progress((i + 1) / len(video_ids))
+        if check_existence:
+            # Find all chunks for this video
+            st.write(f"Finding chunks for video {video_id} ({i+1}/{len(video_ids)})...")
+            chunks = find_all_chunks_for_video(video_id)
+            if chunks:
+                st.write(f"Found {len(chunks)} chunks for video {video_id}")
+                for chunk_num in chunks:
+                    chunk_id = f"{chunk_num:04d}"
+                    file_id = f"{video_id}_chunk_{chunk_id}"
+                    file_name = f"{file_id}.wav"
+                    file_url = f"{HF_DATASET_URL}{file_name}"
+                    files.append({
+                        "id": file_id,
+                        "name": file_name,
+                        "url": file_url,
+                        "video_id": video_id,
+                        "chunk_num": chunk_num
+                    })
+            else:
+                st.warning(f"No chunks found for video {video_id}")
+        else:
+            # If not checking existence, use a default range of chunks (1-100)
+            # Reduced from 1-200 to speed up initial loading
+            for chunk_num in range(1, 101):
+                chunk_id = f"{chunk_num:04d}"
+                file_id = f"{video_id}_chunk_{chunk_id}"
+                file_name = f"{file_id}.wav"
+                file_url = f"{HF_DATASET_URL}{file_name}"
+                files.append({
+                    "id": file_id,
+                    "name": file_name,
+                    "url": file_url,
+                    "video_id": video_id,
+                    "chunk_num": chunk_num
+                })
+    debug_log(f"Built file list with {len(files)} total files")
+    return files
+# Function to download file from Hugging Face with retry logic
+def download_file_from_hf(file_url, max_retries=3):
+    for attempt in range(max_retries):
+        try:
+            response = requests.get(file_url, timeout=10)  # Increased timeout for audio downloads
+            if response.status_code == 200:
+                return response.content
+            else:
+                if attempt < max_retries - 1:
+                    wait_time = 2 ** attempt
+                    debug_log(f"Download failed (HTTP {response.status_code}), retrying in {wait_time}s")
+                    time.sleep(wait_time)
+                else:
+                    st.error(f"Failed to download file: HTTP {response.status_code}")
+                    return None
+        except Exception as e:
+            if attempt < max_retries - 1:
+                wait_time = 2 ** attempt
+                debug_log(f"Download error, retrying in {wait_time}s: {str(e)}")
+                time.sleep(wait_time)
+            else:
+                st.error(f"Error downloading file: {e}")
+                return None
+    return None
+# Create a unique ID for new annotators or retrieve existing
+def get_annotator_id():
+    debug_log("Getting annotator ID...")
+    if 'annotator_id' not in st.session_state:
+        # Check if we have a stored ID in local storage
+        annotator_id_file = '.annotator_id'
+        if os.path.exists('/data'):
+            annotator_id_file = '/data/.annotator_id'
+        if os.path.exists(annotator_id_file):
+            with open(annotator_id_file, 'r') as f:
+                st.session_state.annotator_id = f.read().strip()
+                debug_log(f"Retrieved existing annotator ID")
+        else:
+            # Generate a new ID
+            st.session_state.annotator_id = str(uuid.uuid4())
+            with open(annotator_id_file, 'w') as f:
+                f.write(st.session_state.annotator_id)
+                debug_log(f"Created new annotator ID")
+    return st.session_state.annotator_id
+# Function to load annotation data from CSV
+def load_annotations():
+    debug_log(f"Loading annotations from {RESULTS_FILE}")
+    try:
+        if os.path.exists(RESULTS_FILE):
+            df = pd.read_csv(RESULTS_FILE)
+            debug_log(f"Loaded {len(df)} annotation records")
+            return df
+        else:
+            # Create a new DataFrame if the file doesn't exist
+            debug_log("No existing annotations found, creating new file")
+            df = pd.DataFrame(columns=['file_id', 'file_name', 'Label', 'annotator_id', 'timestamp', 'video_id'])
+            df.to_csv(RESULTS_FILE, index=False)
+            return df
+    except Exception as e:
+        st.error(f"Error loading annotations: {e}")
+        debug_log(f"Error loading annotations: {str(e)}")
+        return pd.DataFrame(columns=['file_id', 'file_name', 'Label', 'annotator_id', 'timestamp', 'video_id'])
+# Function to save annotations to CSV
+def save_annotation(df):
+    debug_log(f"Saving annotations to {RESULTS_FILE}")
+    try:
+        df.to_csv(RESULTS_FILE, index=False)
+        debug_log("Annotations saved successfully")
+        return True
+    except Exception as e:
+        st.error(f"Error saving annotation: {e}")
+        debug_log(f"Error saving annotations: {str(e)}")
+        return False
+# Initialize application state
+if 'initialized' not in st.session_state:
+    debug_log("Initializing application state")
+    st.session_state.initialized = False
+    st.session_state.current_file_index = 0
+    st.session_state.current_file = None
+    st.session_state.annotation_df = None
+    st.session_state.all_files = []
+    st.session_state.pending_files = []
+    st.session_state.hate_count = 0
+    st.session_state.non_hate_count = 0
+    st.session_state.discard_count = 0
+    st.session_state.page = 1
+    st.session_state.files_per_page = 50
+    st.session_state.lite_mode = False
+# Application title and header
+# A single markdown call that carries (1) the page-wide CSS classes used by the
+# HTML snippets rendered further down the script - main-header, sub-header,
+# progress-container, stats-container, stat-item / stat-value / stat-label,
+# audio-container, file-info - and (2) the title banner itself.
+# unsafe_allow_html=True is passed so the raw <style>/<div> markup is emitted as HTML.
+st.markdown("""
+    <style>
+    .main-header {
+        font-size: 26px;
+        font-weight: bold;
+        color: #ff4b4b;
+        margin-bottom: 20px;
+    }
+    .sub-header {
+        font-size: 18px;
+        color: #555;
+        margin-bottom: 30px;
+    }
+    .progress-container {
+        margin: 20px 0;
+        padding: 15px;
+        background-color: #f9f9f9;
+        border-radius: 5px;
+    }
+    .stats-container {
+        display: flex;
+        justify-content: space-around;
+        margin-top: 20px;
+        text-align: center;
+        flex-wrap: wrap;
+    }
+    .stat-item {
+        padding: 10px;
+        min-width: 100px;
+    }
+    .stat-value {
+        font-size: 24px;
+        font-weight: bold;
+        color: #4CAF50;
+    }
+    .stat-label {
+        font-size: 14px;
+        color: #666;
+    }
+    .audio-container {
+        margin: 30px 0;
+        padding: 20px;
+        background-color: #f5f5f5;
+        border-radius: 10px;
+        text-align: center;
+    }
+    .file-info {
+        font-size: 14px;
+        color: #666;
+        margin-top: 5px;
+    }
+    </style>
+    <div class="main-header">Speech Hate Detection - Annotation Tool</div>
+    """, unsafe_allow_html=True)
+# Quick start in lite mode (new feature)
+if not st.session_state.initialized:
+    if st.button("⚡ Quick Start (Lite Mode)"):
+        debug_log("Starting in lite mode")
+        st.session_state.lite_mode = True
+        st.session_state.annotation_df = load_annotations()
+        st.session_state.initialized = True
+        st.success("Started in lite mode. Enter video IDs and click Initialize.")
+        st.rerun()
+# App configuration section (collapsible)
+with st.expander("Configuration", expanded=not st.session_state.initialized):
+    st.markdown("""
+    ### Configuration
+    This tool loads audio files from the Hugging Face dataset at:
+    https://huggingface.co/datasets/kcrl/Hs
+    You can provide a list of video IDs for annotation by adding them in the text area below.
+    """)
+    # Default video IDs
+    default_video_ids = "0hJ2JGhM7TY\n1PRABBSTpiE\n4ewRgBMP_AY"  # Reduced to just 3 for initial testing
+    # Allow user to input video IDs
+    user_video_ids = st.text_area(
+        "Video IDs to annotate (one per line)",
+        value=default_video_ids,
+        height=150,
+        help="Enter the YouTube video IDs, one per line. The app will look for chunks of these videos."
+    )
+    annotator_name = st.text_input("Your Name (Optional)",
+                                 help="Your name for tracking purposes")
+    # Set default to False to speed initial loading
+    check_files = st.checkbox("Check if files exist (slower but more accurate)", value=False,
+                            help="Verifies each file exists before adding it to the list")
+    only_new_files = st.checkbox("Only show new files (not previously annotated)", value=True,
+                               help="Skip files that have already been annotated")
+    col1, col2 = st.columns(2)
+    with col1:
+        if st.button("Initialize Application"):
+            debug_log("Initialize button clicked")
+            # Get annotator ID
+            annotator_id = get_annotator_id()
+            # First check if we have any video IDs
+            if not user_video_ids.strip():
+                st.error("Please enter at least one video ID to annotate")
+            else:
+                # Split by line and remove empty lines
+                video_ids = [vid.strip() for vid in user_video_ids.split('\n') if vid.strip()]
+                if not video_ids:
+                    st.error("Please enter at least one valid video ID")
+                else:
+                    # Load all audio files based on the video IDs
+                    with st.spinner(f"Building file list for {len(video_ids)} videos..."):
+                        all_files = build_file_list_from_video_ids(
+                            video_ids,
+                            check_existence=check_files
+                        )
+                    if not all_files:
+                        st.error("No audio files found. Please check the video IDs and try again.")
+                    else:
+                        st.session_state.all_files = all_files
+                        # Load existing annotation CSV
+                        annotation_df = load_annotations()
+                        st.session_state.annotation_df = annotation_df
+                        # Filter out files that have already been annotated by this annotator
+                        annotated_files = set()
+                        if not annotation_df.empty:
+                            if only_new_files:
+                                # If only showing new files, consider files annotated by any annotator
+                                annotated_files = set(annotation_df['file_id'].tolist())
+                            else:
+                                # Otherwise, only consider files annotated by this specific annotator
+                                annotated_files = set(annotation_df[annotation_df['annotator_id'] == annotator_id]['file_id'].tolist())
+                            # Count existing annotations by this annotator
+                            hate_count = len(annotation_df[(annotation_df['annotator_id'] == annotator_id) &
+                                                        (annotation_df['Label'] == 'Hate')])
+                            non_hate_count = len(annotation_df[(annotation_df['annotator_id'] == annotator_id) &
+                                                            (annotation_df['Label'] == 'Non-Hate')])
+                            discard_count = len(annotation_df[(annotation_df['annotator_id'] == annotator_id) &
+                                                            (annotation_df['Label'] == 'Discard')])
+                            st.session_state.hate_count = hate_count
+                            st.session_state.non_hate_count = non_hate_count
+                            st.session_state.discard_count = discard_count
+                        # Create list of pending files (not yet annotated)
+                        pending_files = [f for f in all_files if f['id'] not in annotated_files]
+                        st.session_state.pending_files = pending_files
+                        if pending_files:
+                            st.session_state.current_file = pending_files[0]
+                            st.session_state.initialized = True
+                            st.success(f"Application initialized successfully! Found {len(pending_files)} files to annotate.")
+                            st.rerun()
+                        else:
+                            st.warning("All files have already been annotated. Try adding new video IDs or uncheck 'Only show new files'.")
+    with col2:
+        if st.button("Reset Application State"):
+            # Clear the session state
+            for key in list(st.session_state.keys()):
+                del st.session_state[key]
+            st.success("Application state has been reset. You can start fresh.")
+            st.rerun()
+# Main annotation interface
+if st.session_state.initialized and st.session_state.pending_files:
+    debug_log("Rendering main annotation interface")
+    # Display current annotator
+    st.markdown(f"""
+    <div class="sub-header">
+        Annotator: {annotator_name if annotator_name else st.session_state.annotator_id}
+    </div>
+    """, unsafe_allow_html=True)
+    # Display progress
+    total_files = len(st.session_state.all_files)
+    annotated_files = total_files - len(st.session_state.pending_files)
+    progress_percentage = int((annotated_files / total_files) * 100) if total_files > 0 else 0
+    st.markdown(f"""
+    <div class="progress-container">
+        <div>Progress: {annotated_files}/{total_files} samples annotated ({progress_percentage}%)</div>
+        <div style="margin-top: 10px; height: 10px; background-color: #eee; border-radius: 5px;">
+            <div style="height: 100%; width: {progress_percentage}%; background-color: #4CAF50; border-radius: 5px;"></div>
+        </div>
+    </div>
+    """, unsafe_allow_html=True)
+    # Display statistics
+    st.markdown(f"""
+    <div class="stats-container">
+        <div class="stat-item">
+            <div class="stat-value">{len(st.session_state.all_files)}</div>
+            <div class="stat-label">Total Files</div>
+        </div>
+        <div class="stat-item">
+            <div class="stat-value">{annotated_files}</div>
+            <div class="stat-label">Completed</div>
+        </div>
+        <div class="stat-item">
+            <div class="stat-value">{len(st.session_state.pending_files)}</div>
+            <div class="stat-label">Remaining</div>
+        </div>
+        <div class="stat-item">
+            <div class="stat-value">{st.session_state.hate_count}</div>
+            <div class="stat-label">Hate</div>
+        </div>
+        <div class="stat-item">
+            <div class="stat-value">{st.session_state.non_hate_count}</div>
+            <div class="stat-label">Non-Hate</div>
+        </div>
+        <div class="stat-item">
+            <div class="stat-value">{st.session_state.discard_count}</div>
+            <div class="stat-label">Discard</div>
+        </div>
+    </div>
+    """, unsafe_allow_html=True)
+    # Audio player section
+    current_file = st.session_state.current_file
+    # Get video ID from the file data
+    video_id = current_file.get('video_id', "Unknown")
+    if video_id == "Unknown" and "_chunk_" in current_file['name']:
+        # Extract from filename as fallback
+        video_id = current_file['name'].split("_chunk_")[0]
+    st.markdown(f"""
+    <div class="audio-container">
+        <div style="font-weight: bold; margin-bottom: 15px;">Currently Playing: {current_file['name']}</div>
+        <div class="file-info">Video ID: {video_id}</div>
+    """, unsafe_allow_html=True)
+    # Get the audio file
+    if 'url' in current_file:
+        debug_log(f"Attempting to download audio from {current_file['url']}")
+        with st.spinner("Loading audio file..."):
+            audio_bytes = download_file_from_hf(current_file['url'])
+    else:
+        # Fallback for old format
+        fallback_url = f"{HF_DATASET_URL}{current_file['name']}"
+        debug_log(f"Attempting to download audio from fallback URL {fallback_url}")
+        with st.spinner("Loading audio file..."):
+            audio_bytes = download_file_from_hf(fallback_url)
+    if audio_bytes:
+        debug_log("Audio file downloaded successfully")
+        # Display audio player
+        st.audio(audio_bytes, format='audio/wav')
+        # Annotation controls
+        col1, col2 = st.columns([3, 1])
+        with col1:
+            annotation = st.selectbox(
+                "Select classification:",
+                ["-- Select --", "Hate", "Non-Hate", "Discard"],
+                index=0,
+                help="Select 'Discard' for unclear audio, background noise, or non-relevant content"
+            )
+        with col2:
+            st.write("")
+            st.write("")
+            if st.button("Skip File"):
+                debug_log("Skip file button clicked")
+                # Remove the current file from pending
+                st.session_state.pending_files.pop(0)
+                # Load the next file if available
+                if st.session_state.pending_files:
+                    st.session_state.current_file = st.session_state.pending_files[0]
+                    st.rerun()
+                else:
+                    st.success("All files have been processed!")
+        if st.button("Submit & Load Next Sample", type="primary"):
+            if annotation == "-- Select --":
+                st.warning("Please select a classification before submitting.")
+            else:
+                debug_log(f"Submitting annotation: {annotation}")
+                # Record the annotation
+                new_row = {
+                    'file_id': current_file['id'],
+                    'file_name': current_file['name'],
+                    'Label': annotation,
+                    'annotator_id': st.session_state.annotator_id,
+                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    'video_id': video_id
+                }
+                # Update the DataFrame
+                st.session_state.annotation_df = pd.concat([
+                    st.session_state.annotation_df,
+                    pd.DataFrame([new_row])
+                ], ignore_index=True)
+                # Update counts
+                if annotation == "Hate":
+                    st.session_state.hate_count += 1
+                elif annotation == "Non-Hate":
+                    st.session_state.non_hate_count += 1
+                else:  # Discard
+                    st.session_state.discard_count += 1
+                # Save the updated annotations
+                success = save_annotation(st.session_state.annotation_df)
+                if success:
+                    debug_log("Annotation saved successfully")
+                    # Remove the current file from pending
+                    st.session_state.pending_files.pop(0)
+                    # Prefetch next file if available (new optimization)
+                    if len(st.session_state.pending_files) > 0:
+                        debug_log("Prefetching next file in background")
+                        # We'll just set the next file, actual prefetching would require threading
+                    # Load the next file if available
+                    if st.session_state.pending_files:
+                        st.session_state.current_file = st.session_state.pending_files[0]
+                        st.rerun()
+                    else:
+                        st.success("All files have been annotated! Great job!")
+                else:
+                    st.error("Failed to save annotation. Please try again.")
+    else:
+        debug_log(f"Failed to load audio file: {current_file['name']}")
+        st.error(f"Failed to load audio file: {current_file['name']}. The file may not exist in the repository.")
+        # Skip button for files that can't be loaded
+        if st.button("Skip This File", type="primary"):
+            debug_log("Skipping unloadable file")
+            # Remove the current file from pending
+            st.session_state.pending_files.pop(0)
+            # Load the next file if available
+            if st.session_state.pending_files:
+                st.session_state.current_file = st.session_state.pending_files[0]
+                st.rerun()
+            else:
+                st.success("All files have been processed!")
+elif st.session_state.initialized and not st.session_state.pending_files:
+    debug_log("All files annotated, showing summary")
+    st.success("All files have been annotated! Thank you for your contribution!")
+    # Show summary statistics
+    st.markdown(f"""
+    <div class="stats-container">
+        <div class="stat-item">
+            <div class="stat-value">{len(st.session_state.all_files)}</div>
+            <div class="stat-label">Total Files</div>
+        </div>
+        <div class="stat-item">
+            <div class="stat-value">{st.session_state.hate_count}</div>
+            <div class="stat-label">Hate</div>
+        </div>
+        <div class="stat-item">
+            <div class="stat-value">{st.session_state.non_hate_count}</div>
+            <div class="stat-label">Non-Hate</div>
+        </div>
+        <div class="stat-item">
+            <div class="stat-value">{st.session_state.discard_count}</div>
+            <div class="stat-label">Discard</div>
+        </div>
+    </div>
+    """, unsafe_allow_html=True)
+    # Option to download the results
+    if not st.session_state.annotation_df.empty:
+        csv = st.session_state.annotation_df.to_csv(index=False)
+        b64 = base64.b64encode(csv.encode()).decode()
+        href = f'<a href="data:file/csv;base64,{b64}" download="annotation_results.csv">Download Results CSV</a>'
+        st.markdown(href, unsafe_allow_html=True)
+    # Two columns for buttons
+    col1, col2 = st.columns(2)
+    with col1:
+        if st.button("Reset and Start Over"):
+            debug_log("Reset and start over clicked")
+            st.session_state.clear()
+            st.rerun()
+    with col2:
+        if st.button("Add More Videos"):
+            debug_log("Add more videos clicked")
+            # Keep the annotation data but reset the initialization
+            st.session_state.initialized = False
+            st.rerun()
+else:
+    debug_log("Showing initial configuration screen")
+    st.info("Please configure and initialize the application using the Configuration section above.")
+    # Example video IDs
+    st.markdown("""
+    ### Example Video IDs
+    You can use the following format in the Video IDs text area:
+    ```
+    0hJ2JGhM7TY
+    1PRABBSTpiE
+    4ewRgBMP_AY
+    ```
+    The app will look for files like:
+    - 0hJ2JGhM7TY_chunk_0001.wav
+    - 0hJ2JGhM7TY_chunk_0002.wav
+    - 1PRABBSTpiE_chunk_0001.wav
+    - etc.
+    """)
+# Add a footer with instructions
+# Static help text rendered on every screen, below whichever branch of the
+# main interface was shown: annotation workflow, how to add new data, and the
+# dataset's file naming pattern.
+st.markdown("""
+---
+### Instructions:
+1. Enter video IDs in the configuration section
+2. Set your name (optional) and click "Initialize Application" to start
+3. Listen to each audio sample
+4. Select the appropriate classification:
+   - **Hate**: Contains hate speech
+   - **Non-Hate**: Does not contain hate speech
+   - **Discard**: Poor audio quality, background noise, or irrelevant content
+5. Click "Submit & Load Next Sample" to continue
+6. Your progress is saved automatically
+7. When all samples are annotated, you can download the results
+### Adding New Data
+When you add new data to the Hugging Face dataset:
+1. Click "Add More Videos" after completing current annotations
+2. Enter the new video IDs in the configuration
+3. Make sure "Only show new files" is checked
+4. Initialize the application again
+This will only present files that haven't been annotated yet.
+### Dataset Information
+The audio files are sourced from the Hugging Face dataset:
+[kcrl/Hs](https://huggingface.co/datasets/kcrl/Hs)
+File naming follows the pattern: `[VIDEO_ID]_chunk_[CHUNK_NUMBER].wav`
+Example: `0hJ2JGhM7TY_chunk_0001.wav`
+""")

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+streamlit>=1.27.0  # app.py calls st.rerun(), which is not available before 1.27.0
+pandas>=1.5.0
+requests>=2.28.0