import os
import json


def create_text_clean(text_token: str):
    """
    Strip annotation markers from a token string and build an index map.

    Words inside <REPSTART>/<REPEND> (repetition) or <REVSTART>/<REVEND>
    (revision) spans are dropped, as is any standalone <...> marker token.

    Returns:
        tuple: (text_clean, removed_count_map), where removed_count_map maps
        each word index in text_clean back to its word index in the original
        token stream.
    """
    tokens = text_token.split()

    final_tokens = []
    removed_count_map = []

    skip_rep = skip_rev = False
    word_index = 0

    for token in tokens:
        # Span markers toggle the skip flags and never count as words.
        if token == "<REPSTART>":
            skip_rep = True
            continue
        if token == "<REPEND>":
            skip_rep = False
            continue
        if token == "<REVSTART>":
            skip_rev = True
            continue
        if token == "<REVEND>":
            skip_rev = False
            continue

        # Any other <...> token is treated as a standalone marker, not a word.
        is_special = token.startswith("<") and token.endswith(">")

        if not is_special:
            if not (skip_rep or skip_rev):
                final_tokens.append(token)
                removed_count_map.append(word_index)
            # Count every real word, kept or skipped, so the map points
            # back into the original word sequence.
            word_index += 1

    text_clean = " ".join(final_tokens)
    return text_clean, removed_count_map
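
# A quick illustration of the marker stripping and the index map, assuming a
# token string in the format handled above (input invented for this sketch):
#
#     text_clean, idx_map = create_text_clean(
#         "the <REPSTART> the the <REPEND> dog ran"
#     )
#     # text_clean == "the dog ran"
#     # idx_map    == [0, 3, 4]  (words 1-2 sat inside the repetition span)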


def restore_index_to_text(cleaned_index, removed_count_map):
    """
    Restore an index from text_clean back to the original text index.

    Args:
        cleaned_index (int): Index in the cleaned text
        removed_count_map (list): Mapping from clean index to original text index

    Returns:
        int: Original text index
    """
    if 0 <= cleaned_index < len(removed_count_map):
        return removed_count_map[cleaned_index]
    else:
        # Out-of-range indices are passed through unchanged rather than raising.
        return cleaned_index
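
# With the hypothetical map from the sketch above, cleaned index 1 ("dog")
# maps back to original word index 3, and an out-of-range index passes
# through unchanged:
#
#     restore_index_to_text(1, [0, 3, 4])   # -> 3
#     restore_index_to_text(9, [0, 3, 4])   # -> 9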


def annotate_morpheme(session_id, morpheme_function, base_dir="session_data"):
    """
    Annotate morphemes in transcription data.

    Args:
        session_id (str): Session identifier
        morpheme_function (callable): Function to extract morphemes from text
        base_dir (str): Base directory containing session data
    """
    base_dir = base_dir or os.getcwd()
    json_file = os.path.join(base_dir, session_id, "transcription_cunit.json")

    if not os.path.exists(json_file):
        raise FileNotFoundError(
            f"{json_file} not found - make sure the transcription step ran first."
        )

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The file may hold either a bare list of segments or a dict with a
    # "segments" key; handle both shapes.
    segments = data.get("segments", data) if isinstance(data, dict) else data

    for seg in segments:
        text_token = seg.get("text_token", "")

        # Strip annotation markers before running morpheme extraction.
        text_clean, removed_count_map = create_text_clean(text_token)
        seg["text_clean"] = text_clean

        morphemes = morpheme_function(text_clean)

        # Remap each morpheme index from the cleaned text back to the
        # original token stream.
        for morpheme in morphemes:
            cleaned_index = morpheme["index"]
            original_index = restore_index_to_text(cleaned_index, removed_count_map)
            morpheme["index"] = original_index

        seg["morphemes"] = morphemes

    # Write the annotated data back to the same file.
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
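
# Illustrative before/after for a single segment (field names beyond
# "text_token", "text_clean", "morphemes", and "index" are assumptions; the
# real schema comes from the upstream transcription and morpheme steps):
#
#     {"text_token": "the <REPSTART> the the <REPEND> dog ran"}
# becomes
#     {"text_token": "...",
#      "text_clean": "the dog ran",
#      "morphemes": [{"index": 4, ...}, ...]}   # "ran": clean index 2 -> 4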


if __name__ == "__main__":
    from morpheme_stanza_v1 import extract_inflectional_morphemes

    session_id = "000367"
    try:
        annotate_morpheme(session_id, extract_inflectional_morphemes)
        print(f"Morpheme annotation completed for session {session_id}")
    except FileNotFoundError as e:
        print(f"Error: {e}")