import os
import json


def create_text_clean(text_token: str):
    """Remove special annotation tokens from a transcription token string.

    A token is "special" when it is wrapped in angle brackets
    (``<...>``); such tokens are dropped from the cleaned text but still
    count toward original-text positions so indices can be mapped back.

    Args:
        text_token (str): Whitespace-separated tokens, possibly containing
            angle-bracketed annotation tokens.

    Returns:
        tuple[str, list[int]]: ``(text_clean, removed_count_map)`` where
        ``text_clean`` is the cleaned, space-joined text and
        ``removed_count_map[i]`` is the index in the ORIGINAL token
        sequence of the i-th token of ``text_clean``.
    """
    tokens = text_token.split()
    final_tokens = []
    removed_count_map = []
    skip_rep = skip_rev = False
    word_index = 0
    for token in tokens:
        # NOTE(review): these four comparisons are all against "" and can
        # never match -- str.split() never yields empty tokens, so the
        # skip_rep/skip_rev flags are effectively dead.  They look like
        # start/end marker strings (presumably angle-bracketed, e.g.
        # "<rep>"/"</rep>" and "<rev>"/"</rev>") that were garbled out of
        # the file -- TODO restore the real marker strings from the
        # original annotation scheme.  Kept as-is to preserve behavior.
        if token == "":
            skip_rep = True
            continue
        if token == "":
            skip_rep = False
            continue
        if token == "":
            skip_rev = True
            continue
        if token == "":
            skip_rev = False
            continue
        # Angle-bracketed tokens are annotations, not words: excluded from
        # the clean text, but they still advance word_index so the map
        # points at ORIGINAL positions.
        is_special = token.startswith("<") and token.endswith(">")
        if not is_special and not (skip_rep or skip_rev):
            final_tokens.append(token)
            removed_count_map.append(word_index)
        word_index += 1
    text_clean = " ".join(final_tokens)
    return text_clean, removed_count_map


def restore_index_to_text(cleaned_index, removed_count_map):
    """Map an index in the cleaned text back to the original text index.

    Args:
        cleaned_index (int): Token index in the cleaned text.
        removed_count_map (list): Mapping from clean index to original
            text index, as produced by :func:`create_text_clean`.

    Returns:
        int: Original text index; falls back to ``cleaned_index``
        unchanged when it is out of the map's bounds.
    """
    if 0 <= cleaned_index < len(removed_count_map):
        return removed_count_map[cleaned_index]
    # Fallback: out-of-range indices are passed through untouched so a
    # caller with a stale/foreign index does not crash.
    return cleaned_index


def annotate_morpheme(session_id, morpheme_function, base_dir="session_data"):
    """Annotate morphemes in transcription data for one session.

    Reads ``<base_dir>/<session_id>/transcription_cunit.json``, adds a
    ``text_clean`` field and a ``morphemes`` list (with indices remapped
    to original-text positions) to every segment, then writes the file
    back in place.

    Args:
        session_id (str): Session identifier (subdirectory name).
        morpheme_function (callable): Takes the cleaned text and returns a
            list of dicts each carrying an ``"index"`` key into that
            cleaned text.
        base_dir (str): Base directory containing session data; falls back
            to the current working directory when falsy.

    Raises:
        FileNotFoundError: If the transcription JSON does not exist.
    """
    base_dir = base_dir or os.getcwd()
    json_file = os.path.join(base_dir, session_id, "transcription_cunit.json")
    if not os.path.exists(json_file):
        raise FileNotFoundError(
            f"{json_file} not found – make sure transcription step ran first."
        )
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The file may be either {"segments": [...]} or a bare list of segments.
    segments = data.get("segments", data) if isinstance(data, dict) else data
    for seg in segments:
        text_token = seg.get("text_token", "")
        # Strip annotation tokens and remember where each clean word came from.
        text_clean, removed_count_map = create_text_clean(text_token)
        seg["text_clean"] = text_clean
        # Extract morphemes from the cleaned text, then rewrite each
        # morpheme's index so it points into the ORIGINAL token sequence.
        morphemes = morpheme_function(text_clean)
        for morpheme in morphemes:
            morpheme["index"] = restore_index_to_text(
                morpheme["index"], removed_count_map
            )
        seg["morphemes"] = morphemes

    # Write the annotated data back to the same file.
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    # Example usage with morpheme_stanza_v1 (project-local module).
    from morpheme_stanza_v1 import extract_inflectional_morphemes

    session_id = "000367"  # Replace with actual session ID
    try:
        annotate_morpheme(session_id, extract_inflectional_morphemes)
        print(f"Morpheme annotation completed for session {session_id}")
    except FileNotFoundError as e:
        print(f"Error: {e}")