Ayon128 committed on
Commit
171d989
·
verified ·
1 Parent(s): d19e5f0

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +728 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,728 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import requests
4
+ import io
5
+ import uuid
6
+ import os
7
+ import json
8
+ import base64
9
+ from datetime import datetime
10
+ import re
11
+ import time
12
+
13
+ # Set page configuration
14
+ st.set_page_config(
15
+ page_title="Speech Hate Detection - Annotation Tool",
16
+ page_icon="🎧",
17
+ layout="centered",
18
+ initial_sidebar_state="collapsed"
19
+ )
20
+
21
+ # Constants
22
+ HF_DATASET_URL = "https://huggingface.co/datasets/kcrl/Hs/resolve/main/"
23
+ RESULTS_FILE = "annotation_results.csv" # Local CSV file to store results
24
+
25
+ # Debug flag - enable to see detailed debug info
26
+ DEBUG_MODE = True
27
+
28
+ # Log debugging information if debug mode is enabled
29
def debug_log(message):
    """Write `message` to the page, prefixed with "DEBUG: ", when DEBUG_MODE is on."""
    if not DEBUG_MODE:
        return
    st.write(f"DEBUG: {message}")
32
+
33
+ # Initial debug message
34
+ debug_log("Application starting...")
35
+
36
+ # For Hugging Face Spaces deployment
37
+ if os.path.exists('/data'):
38
+ # Use the persistent storage directory
39
+ RESULTS_FILE = "/data/annotation_results.csv"
40
+ debug_log(f"Using persistent storage at {RESULTS_FILE}")
41
+
42
+ # Function to check if file exists in the Hugging Face repository with exponential backoff
43
def check_file_exists(file_url, max_retries=3):
    """
    Check whether a file exists at the given URL without downloading its body.

    Sends a HEAD request and follows redirects. Request-level failures
    (timeouts, connection errors, ...) are retried with exponential backoff.

    Args:
        file_url: Full URL of the file to probe.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        True if the final response status is 200, False otherwise (including
        when every attempt failed with a request error).
    """
    for attempt in range(max_retries):
        try:
            # requests.head() does NOT follow redirects by default, so a file
            # served through a redirect would come back as 3xx and be reported
            # as missing. NOTE(review): presumably the dataset host redirects
            # large files to a CDN -- confirm against a real chunk URL.
            response = requests.head(file_url, timeout=3, allow_redirects=True)
        except requests.RequestException as e:
            if attempt < max_retries - 1:
                # Exponential backoff: 1s, 2s, 4s, etc.
                wait_time = 2 ** attempt
                debug_log(f"Request failed, retrying in {wait_time}s: {str(e)}")
                time.sleep(wait_time)
            else:
                debug_log(f"Request failed after {max_retries} attempts: {str(e)}")
        else:
            return response.status_code == 200
    return False
64
+
65
+ # Function to check if a specific chunk exists
66
def check_chunk_exists(video_id, chunk_num):
    """Return whether the WAV file for chunk `chunk_num` of `video_id` exists.

    The chunk number is zero-padded to four digits to form the file name
    `<video_id>_chunk_<NNNN>.wav` under HF_DATASET_URL.
    """
    return check_file_exists(
        f"{HF_DATASET_URL}{video_id}_chunk_{chunk_num:04d}.wav"
    )
73
+
74
+ # Function to find all chunks for a video by using binary search approach
75
def find_all_chunks_for_video(video_id, max_possible_chunks=500):
    """
    Find all available chunks for a video ID.

    A binary search first locates an upper bound on the chunk number, then
    every chunk from 1 up to that bound is verified individually. The binary
    search treats existence as monotonic in the chunk number, so if a video
    has gaps in its numbering the bound may come out too low.

    Every probe result is remembered, so chunks already checked during the
    binary search are not requested a second time in the verification pass.

    Args:
        video_id: The video ID to check
        max_possible_chunks: Upper limit for the binary search

    Returns:
        List of chunk numbers that exist, in ascending order
    """
    debug_log(f"Finding chunks for {video_id}...")

    # chunk number -> bool, filled lazily by chunk_exists()
    probed = {}

    def chunk_exists(chunk_num):
        if chunk_num not in probed:
            probed[chunk_num] = check_chunk_exists(video_id, chunk_num)
        return probed[chunk_num]

    # Binary search for the highest existing chunk number
    low, high = 1, max_possible_chunks
    while low <= high:
        mid = (low + high) // 2
        if chunk_exists(mid):
            low = mid + 1
        else:
            high = mid - 1

    # 'high' is 0 when nothing was found; still verify chunk 1 in that case
    highest_chunk = max(1, high)
    debug_log(f"Binary search found highest chunk: {highest_chunk}")

    existing_chunks = []
    for chunk_num in range(1, highest_chunk + 1):
        if chunk_num not in probed:
            # Throttle only real network requests (0.1s apart) to avoid rate limits
            time.sleep(0.1)
        if chunk_exists(chunk_num):
            existing_chunks.append(chunk_num)

    debug_log(f"Found {len(existing_chunks)} chunks for {video_id}")
    return existing_chunks
115
+
116
+ # Function to build a list of audio file paths from video IDs with dynamic chunk detection
117
def _make_file_entry(video_id, chunk_num):
    """Build the file-info dictionary for one chunk of one video."""
    file_id = f"{video_id}_chunk_{chunk_num:04d}"
    file_name = f"{file_id}.wav"
    return {
        "id": file_id,
        "name": file_name,
        "url": f"{HF_DATASET_URL}{file_name}",
        "video_id": video_id,
        "chunk_num": chunk_num,
    }


def build_file_list_from_video_ids(video_ids, check_existence=False, default_chunk_count=100):
    """
    Create the list of audio files for the provided video IDs.

    Args:
        video_ids: Iterable of video IDs. Repeated IDs are processed once, in
            first-seen order, so no file appears twice in the result.
        check_existence: When True, probe the repository to find which chunks
            exist for each video. When False, assume chunks
            1..default_chunk_count exist without checking.
        default_chunk_count: Number of chunks assumed per video when
            check_existence is False.

    Returns:
        List of dictionaries with keys "id", "name", "url", "video_id" and
        "chunk_num".
    """
    # dict.fromkeys keeps insertion order while dropping duplicates
    unique_ids = list(dict.fromkeys(video_ids))
    total_videos = len(unique_ids)

    files = []
    debug_log(f"Building file list for {total_videos} videos (check_existence={check_existence})...")

    # Progress bar advances once per video
    progress_bar = st.progress(0)

    for position, video_id in enumerate(unique_ids, start=1):
        progress_bar.progress(position / total_videos)

        if not check_existence:
            files.extend(
                _make_file_entry(video_id, chunk_num)
                for chunk_num in range(1, default_chunk_count + 1)
            )
            continue

        st.write(f"Finding chunks for video {video_id} ({position}/{total_videos})...")
        chunks = find_all_chunks_for_video(video_id)

        if chunks:
            st.write(f"Found {len(chunks)} chunks for video {video_id}")
            files.extend(_make_file_entry(video_id, chunk_num) for chunk_num in chunks)
        else:
            st.warning(f"No chunks found for video {video_id}")

    debug_log(f"Built file list with {len(files)} total files")
    return files
180
+
181
+ # Function to download file from Hugging Face with retry logic
182
def download_file_from_hf(file_url, max_retries=3):
    """
    Download a file and return its raw bytes.

    Request errors and transient HTTP statuses (408, 429, 5xx) are retried
    with exponential backoff (1s, 2s, ...). Other non-200 statuses -- e.g. a
    404 for a chunk that does not exist -- fail immediately, because waiting
    and retrying cannot change the answer.

    Args:
        file_url: Full URL of the file to download.
        max_retries: Maximum number of attempts.

    Returns:
        The response body as bytes, or None if the download failed (an error
        is shown on the page in that case).
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        wait_time = 2 ** attempt

        try:
            # 10s timeout: audio downloads are larger than the HEAD probes
            response = requests.get(file_url, timeout=10)
        except requests.RequestException as e:
            if is_last_attempt:
                st.error(f"Error downloading file: {e}")
                return None
            debug_log(f"Download error, retrying in {wait_time}s: {str(e)}")
            time.sleep(wait_time)
            continue

        status = response.status_code
        if status == 200:
            return response.content

        retryable = status in (408, 429) or status >= 500
        if is_last_attempt or not retryable:
            st.error(f"Failed to download file: HTTP {status}")
            return None

        debug_log(f"Download failed (HTTP {status}), retrying in {wait_time}s")
        time.sleep(wait_time)

    return None
205
+
206
+ # Create a unique ID for new annotators or retrieve existing
207
def get_annotator_id():
    """
    Return the annotator ID for this session, creating one if necessary.

    The ID is cached in st.session_state. On first use it is read from an ID
    file ('/data/.annotator_id' when '/data' exists, otherwise
    '.annotator_id'); if that file is missing, unreadable or empty, a new
    UUID4 is generated and written back on a best-effort basis.

    NOTE(review): the ID file lives on the machine running the app, so every
    session served from that machine picks up the same ID -- confirm this is
    the intended notion of "annotator".

    Returns:
        The annotator ID string.
    """
    debug_log("Getting annotator ID...")
    if 'annotator_id' not in st.session_state:
        if os.path.exists('/data'):
            annotator_id_file = '/data/.annotator_id'
        else:
            annotator_id_file = '.annotator_id'

        stored_id = ""
        try:
            with open(annotator_id_file, 'r', encoding='utf-8') as f:
                stored_id = f.read().strip()
        except FileNotFoundError:
            pass  # first run: nothing stored yet
        except (OSError, UnicodeDecodeError) as e:
            debug_log(f"Could not read annotator ID file: {e}")

        if stored_id:
            st.session_state.annotator_id = stored_id
            debug_log("Retrieved existing annotator ID")
        else:
            # Covers both "no file" and "file present but blank"
            st.session_state.annotator_id = str(uuid.uuid4())
            try:
                with open(annotator_id_file, 'w', encoding='utf-8') as f:
                    f.write(st.session_state.annotator_id)
            except OSError as e:
                # The session can still proceed with the in-memory ID
                debug_log(f"Could not persist annotator ID: {e}")
            debug_log("Created new annotator ID")
    return st.session_state.annotator_id
226
+
227
+ # Function to load annotation data from CSV
228
def load_annotations():
    """
    Load the annotation table from RESULTS_FILE.

    If the file does not exist yet, an empty table with the expected columns
    is written to RESULTS_FILE and returned. Any error is reported on the
    page and an empty table with the expected columns is returned instead.
    """
    column_names = ['file_id', 'file_name', 'Label', 'annotator_id', 'timestamp', 'video_id']

    debug_log(f"Loading annotations from {RESULTS_FILE}")
    try:
        if not os.path.exists(RESULTS_FILE):
            # First run: create the file so later loads find it
            debug_log("No existing annotations found, creating new file")
            empty_frame = pd.DataFrame(columns=column_names)
            empty_frame.to_csv(RESULTS_FILE, index=False)
            return empty_frame

        records = pd.read_csv(RESULTS_FILE)
        debug_log(f"Loaded {len(records)} annotation records")
        return records
    except Exception as e:
        st.error(f"Error loading annotations: {e}")
        debug_log(f"Error loading annotations: {str(e)}")
        return pd.DataFrame(columns=column_names)
245
+
246
+ # Function to save annotations to CSV
247
def save_annotation(df):
    """
    Write the annotation table to RESULTS_FILE.

    The table is first written to a uniquely named temporary file next to
    RESULTS_FILE and then moved over it with os.replace, so an interrupted
    write cannot leave a truncated results file behind.

    Args:
        df: DataFrame holding all annotation rows to persist.

    Returns:
        True on success, False if writing failed (an error is shown on the
        page in that case).
    """
    debug_log(f"Saving annotations to {RESULTS_FILE}")
    # Unique suffix so two sessions saving at once do not share a temp file
    temp_path = f"{RESULTS_FILE}.{uuid.uuid4().hex}.tmp"
    try:
        df.to_csv(temp_path, index=False)
        os.replace(temp_path, RESULTS_FILE)
        debug_log("Annotations saved successfully")
        return True
    except Exception as e:
        st.error(f"Error saving annotation: {e}")
        debug_log(f"Error saving annotations: {str(e)}")
        # Best-effort cleanup of a partially written temp file
        try:
            os.remove(temp_path)
        except OSError:
            pass
        return False
257
+
258
+ # Initialize application state
259
# Seed the session state once per browser session
if 'initialized' not in st.session_state:
    debug_log("Initializing application state")
    initial_state = {
        'initialized': False,
        'current_file_index': 0,
        'current_file': None,
        'annotation_df': None,
        'all_files': [],
        'pending_files': [],
        'hate_count': 0,
        'non_hate_count': 0,
        'discard_count': 0,
        'page': 1,
        'files_per_page': 50,
        'lite_mode': False,
    }
    for state_key, state_value in initial_state.items():
        st.session_state[state_key] = state_value
273
+
274
+ # Application title and header
275
+ st.markdown("""
276
+ <style>
277
+ .main-header {
278
+ font-size: 26px;
279
+ font-weight: bold;
280
+ color: #ff4b4b;
281
+ margin-bottom: 20px;
282
+ }
283
+ .sub-header {
284
+ font-size: 18px;
285
+ color: #555;
286
+ margin-bottom: 30px;
287
+ }
288
+ .progress-container {
289
+ margin: 20px 0;
290
+ padding: 15px;
291
+ background-color: #f9f9f9;
292
+ border-radius: 5px;
293
+ }
294
+ .stats-container {
295
+ display: flex;
296
+ justify-content: space-around;
297
+ margin-top: 20px;
298
+ text-align: center;
299
+ flex-wrap: wrap;
300
+ }
301
+ .stat-item {
302
+ padding: 10px;
303
+ min-width: 100px;
304
+ }
305
+ .stat-value {
306
+ font-size: 24px;
307
+ font-weight: bold;
308
+ color: #4CAF50;
309
+ }
310
+ .stat-label {
311
+ font-size: 14px;
312
+ color: #666;
313
+ }
314
+ .audio-container {
315
+ margin: 30px 0;
316
+ padding: 20px;
317
+ background-color: #f5f5f5;
318
+ border-radius: 10px;
319
+ text-align: center;
320
+ }
321
+ .file-info {
322
+ font-size: 14px;
323
+ color: #666;
324
+ margin-top: 5px;
325
+ }
326
+ </style>
327
+
328
+ <div class="main-header">Speech Hate Detection - Annotation Tool</div>
329
+ """, unsafe_allow_html=True)
330
+
331
+ # Quick start in lite mode (new feature)
332
# Quick start in lite mode: preload existing annotations, then let the user
# enter video IDs in the Configuration section.
if not st.session_state.initialized:
    if st.button("⚡ Quick Start (Lite Mode)"):
        debug_log("Starting in lite mode")
        st.session_state.lite_mode = True
        st.session_state.annotation_df = load_annotations()
        # Deliberately leave `initialized` False and do not rerun: no file
        # queue exists yet, so marking the app initialized would send the
        # rerun to the "all files annotated" screen, and an immediate rerun
        # would discard this message before it is shown.
        st.success("Started in lite mode. Enter video IDs and click Initialize.")
340
+
341
+ # App configuration section (collapsible)
342
def _initialize_annotation_session(raw_video_ids, check_files, only_new_files):
    """
    Build the queue of files to annotate and store it in session state.

    Args:
        raw_video_ids: Text-area content, one video ID per line.
        check_files: Whether to verify which chunks exist in the repository.
        only_new_files: When True, skip files annotated by anyone; when
            False, skip only files annotated by the current annotator.
    """
    annotator_id = get_annotator_id()

    # Split by line and drop blank lines
    video_ids = [line.strip() for line in raw_video_ids.splitlines() if line.strip()]
    if not video_ids:
        st.error("Please enter at least one video ID to annotate")
        return

    with st.spinner(f"Building file list for {len(video_ids)} videos..."):
        all_files = build_file_list_from_video_ids(video_ids, check_existence=check_files)

    if not all_files:
        st.error("No audio files found. Please check the video IDs and try again.")
        return

    st.session_state.all_files = all_files

    # Load existing annotation CSV
    annotation_df = load_annotations()
    st.session_state.annotation_df = annotation_df

    annotated_files = set()
    hate_count = non_hate_count = discard_count = 0
    if not annotation_df.empty:
        own_rows = annotation_df[annotation_df['annotator_id'] == annotator_id]
        # "Only new files" hides anything annotated by any annotator;
        # otherwise only this annotator's own work is hidden.
        excluded_rows = annotation_df if only_new_files else own_rows
        annotated_files = set(excluded_rows['file_id'].tolist())

        # Per-label totals for this annotator, computed in one pass
        label_counts = own_rows['Label'].value_counts()
        hate_count = int(label_counts.get('Hate', 0))
        non_hate_count = int(label_counts.get('Non-Hate', 0))
        discard_count = int(label_counts.get('Discard', 0))

    st.session_state.hate_count = hate_count
    st.session_state.non_hate_count = non_hate_count
    st.session_state.discard_count = discard_count

    # Queue of files not yet annotated
    pending_files = [f for f in all_files if f['id'] not in annotated_files]
    st.session_state.pending_files = pending_files

    if not pending_files:
        st.warning("All files have already been annotated. Try adding new video IDs or uncheck 'Only show new files'.")
        return

    st.session_state.current_file = pending_files[0]
    st.session_state.initialized = True
    st.success(f"Application initialized successfully! Found {len(pending_files)} files to annotate.")
    st.rerun()


# App configuration section (collapsed once the app is initialized)
with st.expander("Configuration", expanded=not st.session_state.initialized):
    st.markdown("""
    ### Configuration

    This tool loads audio files from the Hugging Face dataset at:
    https://huggingface.co/datasets/kcrl/Hs

    You can provide a list of video IDs for annotation by adding them in the text area below.
    """)

    # Default video IDs
    default_video_ids = "0hJ2JGhM7TY\n1PRABBSTpiE\n4ewRgBMP_AY"  # Reduced to just 3 for initial testing

    # Allow user to input video IDs
    user_video_ids = st.text_area(
        "Video IDs to annotate (one per line)",
        value=default_video_ids,
        height=150,
        help="Enter the YouTube video IDs, one per line. The app will look for chunks of these videos."
    )

    # Read again further down when the annotator header is rendered
    annotator_name = st.text_input("Your Name (Optional)",
                                   help="Your name for tracking purposes")

    # Off by default to speed up initial loading
    check_files = st.checkbox("Check if files exist (slower but more accurate)", value=False,
                              help="Verifies each file exists before adding it to the list")

    only_new_files = st.checkbox("Only show new files (not previously annotated)", value=True,
                                 help="Skip files that have already been annotated")

    col1, col2 = st.columns(2)
    with col1:
        if st.button("Initialize Application"):
            debug_log("Initialize button clicked")
            _initialize_annotation_session(user_video_ids, check_files, only_new_files)

    with col2:
        if st.button("Reset Application State"):
            # Clear the session state
            for key in list(st.session_state.keys()):
                del st.session_state[key]
            st.success("Application state has been reset. You can start fresh.")
            st.rerun()
447
+
448
+ # Main annotation interface
449
import html  # local to this section: escapes user-entered text placed in raw HTML


def _render_stats(stats):
    """Render (value, label) pairs as the row of statistic tiles."""
    tiles = "".join(
        f'<div class="stat-item"><div class="stat-value">{value}</div>'
        f'<div class="stat-label">{label}</div></div>'
        for value, label in stats
    )
    st.markdown(f'<div class="stats-container">{tiles}</div>', unsafe_allow_html=True)


def _advance_to_next_file(done_message):
    """Drop the current file from the queue and show the next one, if any."""
    st.session_state.pending_files.pop(0)
    if st.session_state.pending_files:
        st.session_state.current_file = st.session_state.pending_files[0]
        st.rerun()
    else:
        st.success(done_message)


# Main annotation interface
if st.session_state.initialized and st.session_state.pending_files:
    debug_log("Rendering main annotation interface")

    # Display current annotator (escaped: the name is free-form user input)
    annotator_label = annotator_name if annotator_name else st.session_state.annotator_id
    st.markdown(
        f'<div class="sub-header">Annotator: {html.escape(str(annotator_label))}</div>',
        unsafe_allow_html=True,
    )

    # Display progress
    total_files = len(st.session_state.all_files)
    remaining_files = len(st.session_state.pending_files)
    # Everything no longer in the queue counts here, including skipped files
    annotated_files = total_files - remaining_files
    progress_percentage = int((annotated_files / total_files) * 100) if total_files > 0 else 0

    st.markdown(
        '<div class="progress-container">'
        f'<div>Progress: {annotated_files}/{total_files} samples annotated ({progress_percentage}%)</div>'
        '<div style="margin-top: 10px; height: 10px; background-color: #eee; border-radius: 5px;">'
        f'<div style="height: 100%; width: {progress_percentage}%; background-color: #4CAF50; border-radius: 5px;"></div>'
        '</div>'
        '</div>',
        unsafe_allow_html=True,
    )

    # Display statistics
    _render_stats([
        (total_files, "Total Files"),
        (annotated_files, "Completed"),
        (remaining_files, "Remaining"),
        (st.session_state.hate_count, "Hate"),
        (st.session_state.non_hate_count, "Non-Hate"),
        (st.session_state.discard_count, "Discard"),
    ])

    # Audio player section
    current_file = st.session_state.current_file

    # Get video ID from the file data, falling back to the file name
    video_id = current_file.get('video_id', "Unknown")
    if video_id == "Unknown" and "_chunk_" in current_file['name']:
        video_id = current_file['name'].split("_chunk_")[0]

    # File names derive from user-entered video IDs, so escape them as well
    st.markdown(
        '<div class="audio-container">'
        f'<div style="font-weight: bold; margin-bottom: 15px;">Currently Playing: {html.escape(str(current_file["name"]))}</div>'
        f'<div class="file-info">Video ID: {html.escape(str(video_id))}</div>'
        '</div>',
        unsafe_allow_html=True,
    )

    # Get the audio file; entries without a stored URL fall back to the name
    audio_url = current_file.get('url') or f"{HF_DATASET_URL}{current_file['name']}"
    debug_log(f"Attempting to download audio from {audio_url}")
    with st.spinner("Loading audio file..."):
        audio_bytes = download_file_from_hf(audio_url)

    if audio_bytes:
        debug_log("Audio file downloaded successfully")
        st.audio(audio_bytes, format='audio/wav')

        # Annotation controls
        col1, col2 = st.columns([3, 1])

        with col1:
            annotation = st.selectbox(
                "Select classification:",
                ["-- Select --", "Hate", "Non-Hate", "Discard"],
                index=0,
                help="Select 'Discard' for unclear audio, background noise, or non-relevant content",
                # One widget per file: without a file-specific key the
                # previous file's choice would still be selected on the next.
                key=f"label_{current_file['id']}",
            )

        with col2:
            # Two empty writes push the button down to line up with the selectbox
            st.write("")
            st.write("")
            if st.button("Skip File"):
                debug_log("Skip file button clicked")
                _advance_to_next_file("All files have been processed!")

        if st.button("Submit & Load Next Sample", type="primary"):
            if annotation == "-- Select --":
                st.warning("Please select a classification before submitting.")
            else:
                debug_log(f"Submitting annotation: {annotation}")
                new_row = {
                    'file_id': current_file['id'],
                    'file_name': current_file['name'],
                    'Label': annotation,
                    'annotator_id': st.session_state.annotator_id,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'video_id': video_id
                }

                updated_df = pd.concat([
                    st.session_state.annotation_df,
                    pd.DataFrame([new_row])
                ], ignore_index=True)

                # Commit to session state only after the save succeeded, so a
                # failed save followed by a retry cannot duplicate the row or
                # inflate the counters.
                if save_annotation(updated_df):
                    st.session_state.annotation_df = updated_df
                    counter_key = {
                        "Hate": "hate_count",
                        "Non-Hate": "non_hate_count",
                    }.get(annotation, "discard_count")
                    st.session_state[counter_key] += 1
                    _advance_to_next_file("All files have been annotated! Great job!")
                else:
                    st.error("Failed to save annotation. Please try again.")
    else:
        debug_log(f"Failed to load audio file: {current_file['name']}")
        st.error(f"Failed to load audio file: {current_file['name']}. The file may not exist in the repository.")

        # Skip button for files that can't be loaded
        if st.button("Skip This File", type="primary"):
            debug_log("Skipping unloadable file")
            _advance_to_next_file("All files have been processed!")

elif st.session_state.initialized and not st.session_state.pending_files:
    debug_log("All files annotated, showing summary")
    st.success("All files have been annotated! Thank you for your contribution!")

    # Show summary statistics
    _render_stats([
        (len(st.session_state.all_files), "Total Files"),
        (st.session_state.hate_count, "Hate"),
        (st.session_state.non_hate_count, "Non-Hate"),
        (st.session_state.discard_count, "Discard"),
    ])

    # Option to download the results
    results_df = st.session_state.annotation_df
    if results_df is not None and not results_df.empty:
        st.download_button(
            label="Download Results CSV",
            data=results_df.to_csv(index=False),
            file_name="annotation_results.csv",
            mime="text/csv",
        )

    # Two columns for buttons
    col1, col2 = st.columns(2)

    with col1:
        if st.button("Reset and Start Over"):
            debug_log("Reset and start over clicked")
            st.session_state.clear()
            st.rerun()

    with col2:
        if st.button("Add More Videos"):
            debug_log("Add more videos clicked")
            # Keep the annotation data but reset the initialization
            st.session_state.initialized = False
            st.rerun()

else:
    debug_log("Showing initial configuration screen")
    st.info("Please configure and initialize the application using the Configuration section above.")

    # Example video IDs
    st.markdown("""
    ### Example Video IDs

    You can use the following format in the Video IDs text area:
    ```
    0hJ2JGhM7TY
    1PRABBSTpiE
    4ewRgBMP_AY
    ```

    The app will look for files like:
    - 0hJ2JGhM7TY_chunk_0001.wav
    - 0hJ2JGhM7TY_chunk_0002.wav
    - 1PRABBSTpiE_chunk_0001.wav
    - etc.
    """)
697
+
698
+ # Add a footer with instructions
699
+ st.markdown("""
700
+ ---
701
+ ### Instructions:
702
+ 1. Enter video IDs in the configuration section
703
+ 2. Set your name (optional) and click "Initialize Application" to start
704
+ 3. Listen to each audio sample
705
+ 4. Select the appropriate classification:
706
+ - **Hate**: Contains hate speech
707
+ - **Non-Hate**: Does not contain hate speech
708
+ - **Discard**: Poor audio quality, background noise, or irrelevant content
709
+ 5. Click "Submit & Load Next Sample" to continue
710
+ 6. Your progress is saved automatically
711
+ 7. When all samples are annotated, you can download the results
712
+
713
+ ### Adding New Data
714
+ When you add new data to the Hugging Face dataset:
715
+ 1. Click "Add More Videos" after completing current annotations
716
+ 2. Enter the new video IDs in the configuration
717
+ 3. Make sure "Only show new files" is checked
718
+ 4. Initialize the application again
719
+
720
+ This will only present files that haven't been annotated yet.
721
+
722
+ ### Dataset Information
723
+ The audio files are sourced from the Hugging Face dataset:
724
+ [kcrl/Hs](https://huggingface.co/datasets/kcrl/Hs)
725
+
726
+ File naming follows the pattern: `[VIDEO_ID]_chunk_[CHUNK_NUMBER].wav`
727
+ Example: `0hJ2JGhM7TY_chunk_0001.wav`
728
+ """)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ streamlit>=1.25.0
2
+ pandas>=1.5.0
3
+ requests>=2.28.0