# SATEv1.5 / morpheme / morpheme_annotation.py
# Author: Shuwei Hou
# initial_for_hf (commit 5806e12)
import os
import json
def create_text_clean(text_token: str):
    """Strip annotation markup from a whitespace-tokenized string.

    The <REPSTART>/<REPEND> and <REVSTART>/<REVEND> marker tokens are
    dropped, the tokens enclosed by either pair are skipped, and any other
    <...> special token is filtered out. Returns the cleaned string plus a
    list mapping each kept token's position in the cleaned text to its
    position in the original stream (the four marker tokens themselves are
    not counted toward original positions; other special tokens are).
    """
    marker_actions = {
        "<REPSTART>": ("rep", True),
        "<REPEND>": ("rep", False),
        "<REVSTART>": ("rev", True),
        "<REVEND>": ("rev", False),
    }
    skipping = {"rep": False, "rev": False}
    kept_tokens = []
    index_map = []
    position = 0

    for token in text_token.split():
        action = marker_actions.get(token)
        if action is not None:
            # Marker tokens toggle skip state and do not advance position.
            scope, state = action
            skipping[scope] = state
            continue
        is_special = token.startswith("<") and token.endswith(">")
        if not is_special and not any(skipping.values()):
            kept_tokens.append(token)
            index_map.append(position)
        position += 1

    return " ".join(kept_tokens), index_map
def restore_index_to_text(cleaned_index, removed_count_map):
    """Map a cleaned-text token index back to its original-text index.

    Args:
        cleaned_index (int): Position of a token in the cleaned text.
        removed_count_map (list): Clean-index -> original-index mapping
            produced by create_text_clean.

    Returns:
        int: The corresponding original-text index, or cleaned_index
            unchanged when it falls outside the mapping (fallback).
    """
    within_bounds = 0 <= cleaned_index < len(removed_count_map)
    return removed_count_map[cleaned_index] if within_bounds else cleaned_index
def annotate_morpheme(session_id, morpheme_function, base_dir="session_data"):
    """Annotate morphemes for every segment of a session's transcription.

    Reads <base_dir>/<session_id>/transcription_cunit.json, adds a
    "text_clean" field and a "morphemes" list to each segment, and writes
    the updated JSON back to the same file.

    Args:
        session_id (str): Session identifier (sub-directory name).
        morpheme_function (callable): Takes the cleaned text and returns a
            list of morpheme dicts, each carrying an "index" key that gives
            the token position within the cleaned text.
        base_dir (str): Base directory containing session data; falls back
            to the current working directory when falsy.

    Raises:
        FileNotFoundError: If the transcription JSON does not exist.
    """
    base_dir = base_dir or os.getcwd()
    # Pass path components separately instead of embedding "/" in an
    # f-string, so os.path.join uses the platform's separator throughout.
    json_file = os.path.join(base_dir, session_id, "transcription_cunit.json")
    if not os.path.exists(json_file):
        raise FileNotFoundError(f"{json_file} not found – make sure transcription step ran first.")

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Accept either {"segments": [...]} or a bare list of segments.
    segments = data.get("segments", data) if isinstance(data, dict) else data

    for seg in segments:
        text_token = seg.get("text_token", "")
        # Strip annotation markup before running morpheme extraction.
        text_clean, removed_count_map = create_text_clean(text_token)
        seg["text_clean"] = text_clean

        morphemes = morpheme_function(text_clean)
        # Morpheme indices refer to the cleaned text; translate them back
        # to positions in the original token stream.
        for morpheme in morphemes:
            morpheme["index"] = restore_index_to_text(morpheme["index"], removed_count_map)
        seg["morphemes"] = morphemes

    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
def _demo():
    """Script entry point: annotate one sample session with morpheme_stanza_v1."""
    from morpheme_stanza_v1 import extract_inflectional_morphemes

    session_id = "000367"  # replace with a real session ID
    try:
        annotate_morpheme(session_id, extract_inflectional_morphemes)
        print(f"Morpheme annotation completed for session {session_id}")
    except FileNotFoundError as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    _demo()