Shuwei Hou committed on
Commit a213dac · 1 Parent(s): ed8d94c

update_speaker_id_to_json

app.py CHANGED
@@ -11,6 +11,7 @@ from segmentation import reorganize_transcription_c_unit
 from annotation import annotate_maze_for_mazewhisper
 from morpheme import stanza_v1
 from morpheme import annotate_morpheme
+from speaker import assign_speaker



@@ -43,13 +44,17 @@ def process_audio():
     result, session_id = translate_audio_file(model="mazeWhisper", audio_path = audio_path, device=device, original_filename=filename)

     cunit_count, ignored_count = reorganize_transcription_c_unit(session_id, segment_batchalign)
-    print(f"Created {cunit_count} C-units, ignored {ignored_count} boundaries")
+    # print(f"Created {cunit_count} C-units, ignored {ignored_count} boundaries")

+    print("Processing speaker identification ... ...")
+    assign_speaker(session_id = session_id)
+
+    print("Processing maze detection ... ...")
     annotate_maze_for_mazewhisper(session_id)

+    print("Processing morpheme detection ... ...")
     annotate_morpheme(session_id = session_id, morpheme_function = stanza_v1)

-
     # annotate_pauses(session_id, pause_threshold)
     # annotate_repetitions(session_id)
     # # annotate_syllables(session_id)
@@ -58,8 +63,6 @@ def process_audio():
     # annotate_morpheme(session_id)
     # annotate_morpheme_omission(session_id)

-
-
     json_path = f"session_data/{session_id}/transcription_cunit.json"
     if not os.path.isfile(json_path):
         return jsonify({'error': f"Annotation file {json_path} not found"}), 500
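
For quick local testing of the new flow, a minimal sketch that replays the same calls outside the Flask route, against a session directory that translate_audio_file has already produced; the function names and call order come from process_audio above, while the session id and the standalone-script framing are purely illustrative:

# Hypothetical standalone driver for an already-transcribed session
# (call order mirrors process_audio; "demo_session" is a made-up id
# that must already exist under session_data/).
from annotation import annotate_maze_for_mazewhisper
from morpheme import stanza_v1, annotate_morpheme
from speaker import assign_speaker

session_id = "demo_session"

assign_speaker(session_id=session_id)        # writes "speaker" into transcription_cunit.json
annotate_maze_for_mazewhisper(session_id)    # maze detection
annotate_morpheme(session_id=session_id, morpheme_function=stanza_v1)  # morpheme annotation
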
requirements.txt CHANGED
@@ -21,5 +21,3 @@ matplotlib>=3.3.0
 seaborn>=0.11.0

 # install ffmpeg
-librosa>=0.8.0
-transformers>=4.0.0

speaker/speaker_identification.py CHANGED
@@ -1,6 +1,6 @@
 from typing import List, Union, Optional
 import os
-
+import json
 import numpy as np
 import librosa
 from transformers import pipeline
@@ -21,6 +21,7 @@ def get_predictor():
     if _PREDICTOR_INSTANCE is None:
         _PREDICTOR_INSTANCE = Predictor()
     return _PREDICTOR_INSTANCE
+
 class Predictor:
     def __init__(self, model_path: Optional[str] = None):
         """
@@ -100,7 +101,7 @@ def assign_speaker_for_audio_list(audio_list: List[Union[str, np.ndarray]]) -> L

     Returns:
         List[str]: List of speaker IDs corresponding to each audio segment.
-                   "Speaker_id_0" for child, "Speaker_id_1" for adult.
+                   "Child" for child, "Examiner" for adult.
     """
     if not audio_list:
         return []
@@ -111,12 +112,85 @@ def assign_speaker_for_audio_list(audio_list: List[Union[str, np.ndarray]]) -> L
     # Get list of 0 (child) or 1 (adult)
     numeric_labels = predictor.predict(audio_list)

-    # Map to Speaker_id_0 and Speaker_id_1, preserving order
-    speaker_ids = [f"Speaker_id_{label}" if label in (0,1) else "Unknown" for label in numeric_labels]
+    # Map to Child and Examiner, preserving order
+    speaker_ids = ["Child" if label == 0 else "Examiner" if label == 1 else "Unknown" for label in numeric_labels]
     return speaker_ids


 # you don't have to implement this function
 def assign_speaker(session_id: str):

-    return
+    base_dir = os.path.join("session_data", session_id)
+    json_path = os.path.join(base_dir, "transcription_cunit.json")
+    wav_path = os.path.join(base_dir, "audio.wav")
+
+    with open(json_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    segments = data.get("segments", [])
+
+    if not segments:
+        return
+
+    audio, sr = librosa.load(wav_path, sr=DEFAULT_SAMPLE_RATE, mono=True)
+    n_samples = len(audio)
+    dur_sec = n_samples / float(DEFAULT_SAMPLE_RATE)
+
+    model_inputs: List[np.ndarray] = []
+    model_indices: List[int] = []
+    prefilled_unknown: List[int] = []
+
+    for i, seg in enumerate(segments):
+        start = seg.get("start")
+        end = seg.get("end")
+
+        if (
+            start is None or end is None
+            or not isinstance(start, (int, float))
+            or not isinstance(end, (int, float))
+            or end <= start
+            or start >= dur_sec
+        ):
+            prefilled_unknown.append(i)
+            continue
+
+        s = max(0.0, float(start))
+        e = min(float(end), dur_sec)
+
+        if e <= s:
+            prefilled_unknown.append(i)
+            continue
+
+        s_idx = int(round(s * DEFAULT_SAMPLE_RATE))
+        e_idx = int(round(e * DEFAULT_SAMPLE_RATE))
+
+        s_idx = max(0, min(s_idx, n_samples))
+        e_idx = max(0, min(e_idx, n_samples))
+
+        if e_idx <= s_idx:
+            prefilled_unknown.append(i)
+            continue
+
+        snippet = audio[s_idx:e_idx]
+
+        if snippet.size == 0:
+            prefilled_unknown.append(i)
+            continue
+
+        model_inputs.append(snippet)
+        model_indices.append(i)
+
+    speakers = ["Unknown"] * len(segments)
+    if model_inputs:
+        predicted = assign_speaker_for_audio_list(model_inputs)  # ["Child"/"Examiner"/"Unknown"]
+        for seg_idx, spk in zip(model_indices, predicted):
+            speakers[seg_idx] = spk
+
+    for seg_idx in prefilled_unknown:
+        speakers[seg_idx] = "Unknown"
+
+    for i, seg in enumerate(segments):
+        seg["speaker"] = speakers[i]
+
+
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
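
To make the commit title concrete, here is a hedged sketch of the per-segment shape that assign_speaker reads from and writes back to transcription_cunit.json; only the "segments" list, the "start"/"end" timings, and the new "speaker" value ("Child", "Examiner", or "Unknown") are taken from the code above, while the "text" key and all example values are invented for illustration:

# Illustrative shape of session_data/<session_id>/transcription_cunit.json
# after assign_speaker has run (values are made up).
example = {
    "segments": [
        {"start": 0.00, "end": 1.82, "text": "...", "speaker": "Child"},
        {"start": 1.82, "end": 3.10, "text": "...", "speaker": "Examiner"},
        {"start": None, "end": None, "text": "...", "speaker": "Unknown"},  # missing timing falls back to Unknown
    ]
}
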
transcription/transcription.py CHANGED
@@ -298,6 +298,11 @@ def translate_audio_file(model: str = "mazeWhisper", audio_path: str = "", devic

     audio = load_audio(audio_path)

+    # Save the entire audio as audio.wav in the session directory
+    audio_output_path = session_dir / "audio.wav"
+    sf.write(audio_output_path, audio, SAMPLE_RATE)
+    print(f"Audio saved: {audio_output_path}")
+
     print("Starting transcription...")
     result = pipeline.transcribe(audio_path, verbose=True)
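
The added block leans on names that are not visible in this hunk (sf, SAMPLE_RATE, session_dir), so they are presumably defined earlier in transcription.py. A hedged sketch of what that surrounding context might look like; the soundfile alias is a guess based on the conventional sf shorthand, and the 16 kHz value is an assumption rather than the module's actual constant:

# Assumed module-level context for the new lines (not part of this commit):
from pathlib import Path
import soundfile as sf       # guess: "sf" is the usual alias for the soundfile package

SAMPLE_RATE = 16000          # assumption: Whisper-style 16 kHz; the real constant lives outside this diff
session_dir = Path("session_data") / "demo_session"   # hypothetical session directory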