Spaces:
Running
Running
script to assign labels to annotators
Browse files- scripts/distribute_workload.py +170 -0
scripts/distribute_workload.py
ADDED
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
|
4 |
+
# Add project root to Python path
|
5 |
+
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
6 |
+
if project_root not in sys.path:
|
7 |
+
sys.path.insert(0, project_root)
|
8 |
+
|
9 |
+
import math
|
10 |
+
import random # Added for password generation
|
11 |
+
from sqlalchemy.sql import func
|
12 |
+
|
13 |
+
from utils.database import get_db
|
14 |
+
from data.models import TTSData
|
15 |
+
from data.repository.annotator_repo import AnnotatorRepo
|
16 |
+
from data.repository.annotation_interval_repo import AnnotationIntervalRepo
|
17 |
+
from utils.logger import Logger
|
18 |
+
|
19 |
+
log = Logger()
|
20 |
+
|
21 |
+
# --- Configuration ---
|
22 |
+
# List of annotator names to ensure exist and assign work to
|
23 |
+
ANNOTATOR_NAMES = ["shahab", "amir", "mohsen", "mahya", "najmeh", "sepehr", "zahra", "moghim", "amin"]
|
24 |
+
# DEFAULT_PASSWORD is no longer used for new users, random password will be generated.
|
25 |
+
|
26 |
+
def generate_random_password():
|
27 |
+
"""Generates a random 4-digit numerical password."""
|
28 |
+
return str(random.randint(1000, 9999))
|
29 |
+
|
30 |
+
def distribute_workload():
|
31 |
+
log.info("Starting workload distribution script...")
|
32 |
+
processed_annotators_details = [] # Stores dicts: {'annotator_obj': obj, 'password_display': str, 'assigned_start': int, 'assigned_end': int}
|
33 |
+
|
34 |
+
try:
|
35 |
+
with get_db() as db:
|
36 |
+
annot_repo = AnnotatorRepo(db)
|
37 |
+
interval_repo = AnnotationIntervalRepo(db)
|
38 |
+
|
39 |
+
# 1. Ensure all annotators exist, create if not, and collect details
|
40 |
+
log.info("Processing annotators...")
|
41 |
+
for name in ANNOTATOR_NAMES:
|
42 |
+
annotator = annot_repo.get_annotator_by_name(name)
|
43 |
+
password_to_display = "(existing user)"
|
44 |
+
|
45 |
+
if not annotator:
|
46 |
+
try:
|
47 |
+
new_password = generate_random_password()
|
48 |
+
log.info(f"Annotator '{name}' not found, creating with new password...")
|
49 |
+
annotator = annot_repo.add_new_annotator(name, new_password)
|
50 |
+
log.info(f"Annotator '{name}' (id={annotator.id}) created successfully with password '{new_password}'.")
|
51 |
+
password_to_display = new_password
|
52 |
+
except ValueError as e:
|
53 |
+
log.warning(f"Could not create annotator '{name}' (likely already exists or other DB issue): {e}. Attempting to fetch again.")
|
54 |
+
annotator = annot_repo.get_annotator_by_name(name) # Try fetching again
|
55 |
+
if annotator:
|
56 |
+
log.info(f"Found existing annotator '{name}' (id={annotator.id}) after creation attempt.")
|
57 |
+
else:
|
58 |
+
log.error(f"Failed to create or find annotator '{name}'. Skipping.")
|
59 |
+
continue
|
60 |
+
else:
|
61 |
+
log.info(f"Found existing annotator '{name}' (id={annotator.id}).")
|
62 |
+
|
63 |
+
if annotator:
|
64 |
+
processed_annotators_details.append({
|
65 |
+
'annotator_obj': annotator,
|
66 |
+
'password_display': password_to_display,
|
67 |
+
'assigned_start': None,
|
68 |
+
'assigned_end': None
|
69 |
+
})
|
70 |
+
|
71 |
+
if not processed_annotators_details:
|
72 |
+
log.error("No annotators processed or found. Exiting.")
|
73 |
+
return
|
74 |
+
|
75 |
+
# 2. Get total number of TTSData items
|
76 |
+
total_tts_items = db.query(func.count(TTSData.id)).scalar()
|
77 |
+
if total_tts_items is None or total_tts_items == 0:
|
78 |
+
log.info("No TTSData items found in the database. Nothing to assign.")
|
79 |
+
# Still print annotator info even if no items to assign
|
80 |
+
log.info("\\n--- Workload Distribution Summary ---")
|
81 |
+
for details in processed_annotators_details:
|
82 |
+
log.info(f"Annotator: {details['annotator_obj'].name}, Assigned Range: N/A (No data items), Password: {details['password_display']}")
|
83 |
+
return
|
84 |
+
log.info(f"Total TTSData items found: {total_tts_items}")
|
85 |
+
|
86 |
+
# 3. Calculate distribution
|
87 |
+
num_annotators_for_assignment = len(processed_annotators_details)
|
88 |
+
if num_annotators_for_assignment == 0: # Should be caught by earlier check, but as a safeguard
|
89 |
+
log.error("No annotators available for assignment. Exiting.")
|
90 |
+
return
|
91 |
+
|
92 |
+
items_per_annotator_base = total_tts_items // num_annotators_for_assignment
|
93 |
+
remainder_items = total_tts_items % num_annotators_for_assignment
|
94 |
+
|
95 |
+
log.info(f"Distributing {total_tts_items} items among {num_annotators_for_assignment} annotators.")
|
96 |
+
log.info(f"Base items per annotator: {items_per_annotator_base}, Remainder: {remainder_items}")
|
97 |
+
|
98 |
+
# 4. Assign intervals
|
99 |
+
current_start_idx = 1 # Assuming TTSData IDs start from 1
|
100 |
+
for details_dict in processed_annotators_details:
|
101 |
+
annotator = details_dict['annotator_obj']
|
102 |
+
num_items_for_this_annotator = items_per_annotator_base
|
103 |
+
if remainder_items > 0:
|
104 |
+
num_items_for_this_annotator += 1
|
105 |
+
remainder_items -= 1
|
106 |
+
|
107 |
+
if num_items_for_this_annotator == 0:
|
108 |
+
log.info(f"Annotator '{annotator.name}' assigned 0 items (total items might be less than annotators or workload already distributed).")
|
109 |
+
continue
|
110 |
+
|
111 |
+
current_end_idx = current_start_idx + num_items_for_this_annotator - 1
|
112 |
+
|
113 |
+
if current_end_idx > total_tts_items:
|
114 |
+
current_end_idx = total_tts_items
|
115 |
+
|
116 |
+
if current_start_idx > current_end_idx:
|
117 |
+
log.info(f"No items to assign to '{annotator.name}' (start_idx {current_start_idx} > end_idx {current_end_idx}).")
|
118 |
+
continue
|
119 |
+
|
120 |
+
log.info(f"Attempting to assign interval [{current_start_idx}-{current_end_idx}] to '{annotator.name}' (id={annotator.id})")
|
121 |
+
try:
|
122 |
+
existing_intervals = interval_repo.get_intervals_by_annotator(annotator.id)
|
123 |
+
if existing_intervals:
|
124 |
+
log.warning(f"Annotator '{annotator.name}' already has existing intervals. Skipping assignment to avoid conflicts. Manual review/cleanup of old intervals might be needed.")
|
125 |
+
# current_start_idx = current_end_idx + 1 # This line should not be here if we skip the user for this round of assignment.
|
126 |
+
# The items for this user won't be assigned and won't be passed to the next.
|
127 |
+
# This means the total items might not be fully distributed if users are skipped.
|
128 |
+
# For a full distribution even with skips, a more complex item re-allocation would be needed.
|
129 |
+
# For now, skipped users mean their share is not re-distributed.
|
130 |
+
continue # Skip this annotator for assignment
|
131 |
+
|
132 |
+
assigned_interval = interval_repo.assign_interval_to_annotator(
|
133 |
+
annotator_id=annotator.id,
|
134 |
+
start_idx=current_start_idx,
|
135 |
+
end_idx=current_end_idx,
|
136 |
+
allow_overlap=False
|
137 |
+
)
|
138 |
+
details_dict['assigned_start'] = assigned_interval.start_index
|
139 |
+
details_dict['assigned_end'] = assigned_interval.end_index
|
140 |
+
log.info(
|
141 |
+
f"Successfully assigned interval [{details_dict['assigned_start']}-{details_dict['assigned_end']}] "
|
142 |
+
f"to '{annotator.name}' (id={annotator.id})"
|
143 |
+
)
|
144 |
+
except ValueError as e:
|
145 |
+
log.error(f"Could not assign interval [{current_start_idx}-{current_end_idx}] to '{annotator.name}': {e}")
|
146 |
+
except Exception as e:
|
147 |
+
log.error(f"An unexpected error occurred while assigning interval to '{annotator.name}': {e}")
|
148 |
+
|
149 |
+
# Only advance current_start_idx if items were potentially assignable to *this* annotator
|
150 |
+
# If an annotator was skipped due to existing intervals, their share of items is not processed further in this loop.
|
151 |
+
current_start_idx = current_end_idx + 1
|
152 |
+
if current_start_idx > total_tts_items:
|
153 |
+
break
|
154 |
+
|
155 |
+
# 5. Print summary
|
156 |
+
log.info("\\n--- Workload Distribution Summary ---")
|
157 |
+
for details in processed_annotators_details:
|
158 |
+
range_str = "N/A (assignment skipped or failed)"
|
159 |
+
if details['assigned_start'] is not None and details['assigned_end'] is not None:
|
160 |
+
range_str = f"[{details['assigned_start']}-{details['assigned_end']}]"
|
161 |
+
|
162 |
+
log.info(f"Annotator: {details['annotator_obj'].name}, Assigned Range: {range_str}, Password: {details['password_display']}")
|
163 |
+
|
164 |
+
log.info("Workload distribution script finished.")
|
165 |
+
|
166 |
+
except Exception as e:
|
167 |
+
log.error(f"An critical error occurred during workload distribution: {e}", exc_info=True)
|
168 |
+
|
169 |
+
if __name__ == "__main__":
|
170 |
+
distribute_workload()
|