vargha commited on
Commit
3c835a7
·
1 Parent(s): a5a2167

script to assign labels to annotators

Browse files
Files changed (1) hide show
  1. scripts/distribute_workload.py +170 -0
scripts/distribute_workload.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+
4
+ # Add project root to Python path
5
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
6
+ if project_root not in sys.path:
7
+ sys.path.insert(0, project_root)
8
+
9
+ import math
10
+ import random # Added for password generation
11
+ from sqlalchemy.sql import func
12
+
13
+ from utils.database import get_db
14
+ from data.models import TTSData
15
+ from data.repository.annotator_repo import AnnotatorRepo
16
+ from data.repository.annotation_interval_repo import AnnotationIntervalRepo
17
+ from utils.logger import Logger
18
+
19
+ log = Logger()
20
+
21
+ # --- Configuration ---
22
+ # List of annotator names to ensure exist and assign work to
23
+ ANNOTATOR_NAMES = ["shahab", "amir", "mohsen", "mahya", "najmeh", "sepehr", "zahra", "moghim", "amin"]
24
+ # DEFAULT_PASSWORD is no longer used for new users, random password will be generated.
25
+
26
+ def generate_random_password():
27
+ """Generates a random 4-digit numerical password."""
28
+ return str(random.randint(1000, 9999))
29
+
30
+ def distribute_workload():
31
+ log.info("Starting workload distribution script...")
32
+ processed_annotators_details = [] # Stores dicts: {'annotator_obj': obj, 'password_display': str, 'assigned_start': int, 'assigned_end': int}
33
+
34
+ try:
35
+ with get_db() as db:
36
+ annot_repo = AnnotatorRepo(db)
37
+ interval_repo = AnnotationIntervalRepo(db)
38
+
39
+ # 1. Ensure all annotators exist, create if not, and collect details
40
+ log.info("Processing annotators...")
41
+ for name in ANNOTATOR_NAMES:
42
+ annotator = annot_repo.get_annotator_by_name(name)
43
+ password_to_display = "(existing user)"
44
+
45
+ if not annotator:
46
+ try:
47
+ new_password = generate_random_password()
48
+ log.info(f"Annotator '{name}' not found, creating with new password...")
49
+ annotator = annot_repo.add_new_annotator(name, new_password)
50
+ log.info(f"Annotator '{name}' (id={annotator.id}) created successfully with password '{new_password}'.")
51
+ password_to_display = new_password
52
+ except ValueError as e:
53
+ log.warning(f"Could not create annotator '{name}' (likely already exists or other DB issue): {e}. Attempting to fetch again.")
54
+ annotator = annot_repo.get_annotator_by_name(name) # Try fetching again
55
+ if annotator:
56
+ log.info(f"Found existing annotator '{name}' (id={annotator.id}) after creation attempt.")
57
+ else:
58
+ log.error(f"Failed to create or find annotator '{name}'. Skipping.")
59
+ continue
60
+ else:
61
+ log.info(f"Found existing annotator '{name}' (id={annotator.id}).")
62
+
63
+ if annotator:
64
+ processed_annotators_details.append({
65
+ 'annotator_obj': annotator,
66
+ 'password_display': password_to_display,
67
+ 'assigned_start': None,
68
+ 'assigned_end': None
69
+ })
70
+
71
+ if not processed_annotators_details:
72
+ log.error("No annotators processed or found. Exiting.")
73
+ return
74
+
75
+ # 2. Get total number of TTSData items
76
+ total_tts_items = db.query(func.count(TTSData.id)).scalar()
77
+ if total_tts_items is None or total_tts_items == 0:
78
+ log.info("No TTSData items found in the database. Nothing to assign.")
79
+ # Still print annotator info even if no items to assign
80
+ log.info("\\n--- Workload Distribution Summary ---")
81
+ for details in processed_annotators_details:
82
+ log.info(f"Annotator: {details['annotator_obj'].name}, Assigned Range: N/A (No data items), Password: {details['password_display']}")
83
+ return
84
+ log.info(f"Total TTSData items found: {total_tts_items}")
85
+
86
+ # 3. Calculate distribution
87
+ num_annotators_for_assignment = len(processed_annotators_details)
88
+ if num_annotators_for_assignment == 0: # Should be caught by earlier check, but as a safeguard
89
+ log.error("No annotators available for assignment. Exiting.")
90
+ return
91
+
92
+ items_per_annotator_base = total_tts_items // num_annotators_for_assignment
93
+ remainder_items = total_tts_items % num_annotators_for_assignment
94
+
95
+ log.info(f"Distributing {total_tts_items} items among {num_annotators_for_assignment} annotators.")
96
+ log.info(f"Base items per annotator: {items_per_annotator_base}, Remainder: {remainder_items}")
97
+
98
+ # 4. Assign intervals
99
+ current_start_idx = 1 # Assuming TTSData IDs start from 1
100
+ for details_dict in processed_annotators_details:
101
+ annotator = details_dict['annotator_obj']
102
+ num_items_for_this_annotator = items_per_annotator_base
103
+ if remainder_items > 0:
104
+ num_items_for_this_annotator += 1
105
+ remainder_items -= 1
106
+
107
+ if num_items_for_this_annotator == 0:
108
+ log.info(f"Annotator '{annotator.name}' assigned 0 items (total items might be less than annotators or workload already distributed).")
109
+ continue
110
+
111
+ current_end_idx = current_start_idx + num_items_for_this_annotator - 1
112
+
113
+ if current_end_idx > total_tts_items:
114
+ current_end_idx = total_tts_items
115
+
116
+ if current_start_idx > current_end_idx:
117
+ log.info(f"No items to assign to '{annotator.name}' (start_idx {current_start_idx} > end_idx {current_end_idx}).")
118
+ continue
119
+
120
+ log.info(f"Attempting to assign interval [{current_start_idx}-{current_end_idx}] to '{annotator.name}' (id={annotator.id})")
121
+ try:
122
+ existing_intervals = interval_repo.get_intervals_by_annotator(annotator.id)
123
+ if existing_intervals:
124
+ log.warning(f"Annotator '{annotator.name}' already has existing intervals. Skipping assignment to avoid conflicts. Manual review/cleanup of old intervals might be needed.")
125
+ # current_start_idx = current_end_idx + 1 # This line should not be here if we skip the user for this round of assignment.
126
+ # The items for this user won't be assigned and won't be passed to the next.
127
+ # This means the total items might not be fully distributed if users are skipped.
128
+ # For a full distribution even with skips, a more complex item re-allocation would be needed.
129
+ # For now, skipped users mean their share is not re-distributed.
130
+ continue # Skip this annotator for assignment
131
+
132
+ assigned_interval = interval_repo.assign_interval_to_annotator(
133
+ annotator_id=annotator.id,
134
+ start_idx=current_start_idx,
135
+ end_idx=current_end_idx,
136
+ allow_overlap=False
137
+ )
138
+ details_dict['assigned_start'] = assigned_interval.start_index
139
+ details_dict['assigned_end'] = assigned_interval.end_index
140
+ log.info(
141
+ f"Successfully assigned interval [{details_dict['assigned_start']}-{details_dict['assigned_end']}] "
142
+ f"to '{annotator.name}' (id={annotator.id})"
143
+ )
144
+ except ValueError as e:
145
+ log.error(f"Could not assign interval [{current_start_idx}-{current_end_idx}] to '{annotator.name}': {e}")
146
+ except Exception as e:
147
+ log.error(f"An unexpected error occurred while assigning interval to '{annotator.name}': {e}")
148
+
149
+ # Only advance current_start_idx if items were potentially assignable to *this* annotator
150
+ # If an annotator was skipped due to existing intervals, their share of items is not processed further in this loop.
151
+ current_start_idx = current_end_idx + 1
152
+ if current_start_idx > total_tts_items:
153
+ break
154
+
155
+ # 5. Print summary
156
+ log.info("\\n--- Workload Distribution Summary ---")
157
+ for details in processed_annotators_details:
158
+ range_str = "N/A (assignment skipped or failed)"
159
+ if details['assigned_start'] is not None and details['assigned_end'] is not None:
160
+ range_str = f"[{details['assigned_start']}-{details['assigned_end']}]"
161
+
162
+ log.info(f"Annotator: {details['annotator_obj'].name}, Assigned Range: {range_str}, Password: {details['password_display']}")
163
+
164
+ log.info("Workload distribution script finished.")
165
+
166
+ except Exception as e:
167
+ log.error(f"An critical error occurred during workload distribution: {e}", exc_info=True)
168
+
169
+ if __name__ == "__main__":
170
+ distribute_workload()