Shuwei Hou
committed on
Commit
·
a213dac
1
Parent(s):
ed8d94c
update_speaker_id_to_json
Browse files- app.py +7 -4
- requirements.txt +0 -2
- speaker/speaker_identification.py +79 -5
- transcription/transcription.py +5 -0
app.py
CHANGED
@@ -11,6 +11,7 @@ from segmentation import reorganize_transcription_c_unit
|
|
11 |
from annotation import annotate_maze_for_mazewhisper
|
12 |
from morpheme import stanza_v1
|
13 |
from morpheme import annotate_morpheme
|
|
|
14 |
|
15 |
|
16 |
|
@@ -43,13 +44,17 @@ def process_audio():
|
|
43 |
result, session_id = translate_audio_file(model="mazeWhisper", audio_path = audio_path, device=device, original_filename=filename)
|
44 |
|
45 |
cunit_count, ignored_count = reorganize_transcription_c_unit(session_id, segment_batchalign)
|
46 |
-
print(f"Created {cunit_count} C-units, ignored {ignored_count} boundaries")
|
47 |
|
|
|
|
|
|
|
|
|
48 |
annotate_maze_for_mazewhisper(session_id)
|
49 |
|
|
|
50 |
annotate_morpheme(session_id = session_id, morpheme_function = stanza_v1)
|
51 |
|
52 |
-
|
53 |
# annotate_pauses(session_id, pause_threshold)
|
54 |
# annotate_repetitions(session_id)
|
55 |
# # annotate_syllables(session_id)
|
@@ -58,8 +63,6 @@ def process_audio():
|
|
58 |
# annotate_morpheme(session_id)
|
59 |
# annotate_morpheme_omission(session_id)
|
60 |
|
61 |
-
|
62 |
-
|
63 |
json_path = f"session_data/{session_id}/transcription_cunit.json"
|
64 |
if not os.path.isfile(json_path):
|
65 |
return jsonify({'error': f"Annotation file {json_path} not found"}), 500
|
|
|
11 |
from annotation import annotate_maze_for_mazewhisper
|
12 |
from morpheme import stanza_v1
|
13 |
from morpheme import annotate_morpheme
|
14 |
+
from speaker import assign_speaker
|
15 |
|
16 |
|
17 |
|
|
|
44 |
result, session_id = translate_audio_file(model="mazeWhisper", audio_path = audio_path, device=device, original_filename=filename)
|
45 |
|
46 |
cunit_count, ignored_count = reorganize_transcription_c_unit(session_id, segment_batchalign)
|
47 |
+
# print(f"Created {cunit_count} C-units, ignored {ignored_count} boundaries")
|
48 |
|
49 |
+
print("Processing speaker identification ... ...")
|
50 |
+
assign_speaker(session_id = session_id)
|
51 |
+
|
52 |
+
print("Processing maze detection ... ...")
|
53 |
annotate_maze_for_mazewhisper(session_id)
|
54 |
|
55 |
+
print("Processing morpheme detection ... ...")
|
56 |
annotate_morpheme(session_id = session_id, morpheme_function = stanza_v1)
|
57 |
|
|
|
58 |
# annotate_pauses(session_id, pause_threshold)
|
59 |
# annotate_repetitions(session_id)
|
60 |
# # annotate_syllables(session_id)
|
|
|
63 |
# annotate_morpheme(session_id)
|
64 |
# annotate_morpheme_omission(session_id)
|
65 |
|
|
|
|
|
66 |
json_path = f"session_data/{session_id}/transcription_cunit.json"
|
67 |
if not os.path.isfile(json_path):
|
68 |
return jsonify({'error': f"Annotation file {json_path} not found"}), 500
|
requirements.txt
CHANGED
@@ -21,5 +21,3 @@ matplotlib>=3.3.0
|
|
21 |
seaborn>=0.11.0
|
22 |
|
23 |
# install ffmpeg
|
24 |
-
librosa>=0.8.0
|
25 |
-
transformers>=4.0.0
|
|
|
21 |
seaborn>=0.11.0
|
22 |
|
23 |
# install ffmpeg
|
|
|
|
speaker/speaker_identification.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
from typing import List, Union, Optional
|
2 |
import os
|
3 |
-
|
4 |
import numpy as np
|
5 |
import librosa
|
6 |
from transformers import pipeline
|
@@ -21,6 +21,7 @@ def get_predictor():
|
|
21 |
if _PREDICTOR_INSTANCE is None:
|
22 |
_PREDICTOR_INSTANCE = Predictor()
|
23 |
return _PREDICTOR_INSTANCE
|
|
|
24 |
class Predictor:
|
25 |
def __init__(self, model_path: Optional[str] = None):
|
26 |
"""
|
@@ -100,7 +101,7 @@ def assign_speaker_for_audio_list(audio_list: List[Union[str, np.ndarray]]) -> L
|
|
100 |
|
101 |
Returns:
|
102 |
List[str]: List of speaker IDs corresponding to each audio segment.
|
103 |
-
"
|
104 |
"""
|
105 |
if not audio_list:
|
106 |
return []
|
@@ -111,12 +112,85 @@ def assign_speaker_for_audio_list(audio_list: List[Union[str, np.ndarray]]) -> L
|
|
111 |
# Get list of 0 (child) or 1 (adult)
|
112 |
numeric_labels = predictor.predict(audio_list)
|
113 |
|
114 |
-
# Map to
|
115 |
-
speaker_ids = [
|
116 |
return speaker_ids
|
117 |
|
118 |
|
119 |
# you don't have to implement this function
|
120 |
def assign_speaker(session_id: str):
|
121 |
|
122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from typing import List, Union, Optional
|
2 |
import os
|
3 |
+
import json
|
4 |
import numpy as np
|
5 |
import librosa
|
6 |
from transformers import pipeline
|
|
|
21 |
if _PREDICTOR_INSTANCE is None:
|
22 |
_PREDICTOR_INSTANCE = Predictor()
|
23 |
return _PREDICTOR_INSTANCE
|
24 |
+
|
25 |
class Predictor:
|
26 |
def __init__(self, model_path: Optional[str] = None):
|
27 |
"""
|
|
|
101 |
|
102 |
Returns:
|
103 |
List[str]: List of speaker IDs corresponding to each audio segment.
|
104 |
+
"Child" for child, "Examiner" for adult.
|
105 |
"""
|
106 |
if not audio_list:
|
107 |
return []
|
|
|
112 |
# Get list of 0 (child) or 1 (adult)
|
113 |
numeric_labels = predictor.predict(audio_list)
|
114 |
|
115 |
+
# Map to Child and Examiner, preserving order
|
116 |
+
speaker_ids = ["Child" if label == 0 else "Examiner" if label == 1 else "Unknown" for label in numeric_labels]
|
117 |
return speaker_ids
|
118 |
|
119 |
|
120 |
# you don't have to implement this function
|
121 |
def assign_speaker(session_id: str):
    """Label every segment of a session's transcription with a speaker.

    Reads ``session_data/<session_id>/transcription_cunit.json``, cuts the
    session's ``audio.wav`` into one snippet per segment using the segment's
    ``start``/``end`` values (interpreted as seconds), classifies the snippets
    with ``assign_speaker_for_audio_list`` and stores the label under each
    segment's ``"speaker"`` key before rewriting the same JSON file.

    A segment is labelled ``"Unknown"`` without being sent to the model when
    it is not a JSON object, when ``start``/``end`` are missing, non-numeric,
    boolean or NaN, or when the span is empty or lies entirely past the end of
    the audio. Spans that only partly overlap the audio are clamped to it.

    Args:
        session_id: Name of the session directory under ``session_data``.

    Returns:
        None. The JSON file is rewritten in place; it is left untouched when
        it contains no segments.
    """

    def _is_timestamp(value) -> bool:
        # bool is a subclass of int, but a JSON true/false is not a timestamp.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return False
        # json.load accepts the literal NaN; NaN slips through every ordering
        # comparison below and would crash int(round(...)). NaN != NaN.
        return value == value

    base_dir = os.path.join("session_data", session_id)
    json_path = os.path.join(base_dir, "transcription_cunit.json")
    wav_path = os.path.join(base_dir, "audio.wav")

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    segments = data.get("segments", [])
    if not segments:
        return

    # Resample to the rate used for the index arithmetic below.
    audio, _ = librosa.load(wav_path, sr=DEFAULT_SAMPLE_RATE, mono=True)
    n_samples = len(audio)
    dur_sec = n_samples / float(DEFAULT_SAMPLE_RATE)

    model_inputs: List[np.ndarray] = []
    model_indices: List[int] = []  # position in `segments` of each model input

    for seg_idx, seg in enumerate(segments):
        if not isinstance(seg, dict):
            continue

        start = seg.get("start")
        end = seg.get("end")
        if not (_is_timestamp(start) and _is_timestamp(end)):
            continue
        if end <= start or start >= dur_sec:
            continue

        # Clamp the span to the audio that actually exists.
        s = max(0.0, float(start))
        e = min(float(end), dur_sec)
        if e <= s:
            continue

        s_idx = max(0, min(int(round(s * DEFAULT_SAMPLE_RATE)), n_samples))
        e_idx = max(0, min(int(round(e * DEFAULT_SAMPLE_RATE)), n_samples))
        # Rounding can collapse a very short span to zero samples.
        if e_idx <= s_idx:
            continue

        snippet = audio[s_idx:e_idx]
        if snippet.size == 0:
            continue

        model_inputs.append(snippet)
        model_indices.append(seg_idx)

    # Every segment starts as "Unknown"; only segments that produced a usable
    # snippet are overwritten with the model's label.
    speakers = ["Unknown"] * len(segments)
    if model_inputs:
        predicted = assign_speaker_for_audio_list(model_inputs)  # ["Child"/"Examiner"/"Unknown"]
        for seg_idx, spk in zip(model_indices, predicted):
            speakers[seg_idx] = spk

    for seg, spk in zip(segments, speakers):
        if isinstance(seg, dict):
            seg["speaker"] = spk

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
|
transcription/transcription.py
CHANGED
@@ -298,6 +298,11 @@ def translate_audio_file(model: str = "mazeWhisper", audio_path: str = "", devic
|
|
298 |
|
299 |
audio = load_audio(audio_path)
|
300 |
|
|
|
|
|
|
|
|
|
|
|
301 |
print("Starting transcription...")
|
302 |
result = pipeline.transcribe(audio_path, verbose=True)
|
303 |
|
|
|
298 |
|
299 |
audio = load_audio(audio_path)
|
300 |
|
301 |
+
# Save the entire audio as audio.wav in the session directory
|
302 |
+
audio_output_path = session_dir / "audio.wav"
|
303 |
+
sf.write(audio_output_path, audio, SAMPLE_RATE)
|
304 |
+
print(f"Audio saved: {audio_output_path}")
|
305 |
+
|
306 |
print("Starting transcription...")
|
307 |
result = pipeline.transcribe(audio_path, verbose=True)
|
308 |
|