File size: 3,472 Bytes
5806e12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import json


def create_text_clean(text_token: str):
    """Strip annotation markup from a whitespace-tokenized string.

    Tokens between <REPSTART>/<REPEND> (repetitions) and between
    <REVSTART>/<REVEND> (revisions) are dropped, as is any other
    angle-bracketed special token. Plain words that survive are joined
    with single spaces.

    Args:
        text_token: Raw annotated text, split on whitespace.

    Returns:
        tuple: (cleaned text, list mapping each kept word's position in
        the cleaned text to its word position in the original text,
        counting only non-special words).
    """
    in_repetition = False
    in_revision = False
    kept_words = []
    index_map = []
    original_position = 0  # counts plain (non-special) words in the original

    for piece in text_token.split():
        if piece == "<REPSTART>":
            in_repetition = True
        elif piece == "<REPEND>":
            in_repetition = False
        elif piece == "<REVSTART>":
            in_revision = True
        elif piece == "<REVEND>":
            in_revision = False
        elif piece.startswith("<") and piece.endswith(">"):
            # Other special tokens are dropped without advancing the word counter.
            pass
        else:
            if not in_repetition and not in_revision:
                kept_words.append(piece)
                index_map.append(original_position)
            original_position += 1

    return " ".join(kept_words), index_map



def restore_index_to_text(cleaned_index, removed_count_map):
    """Map an index in the cleaned text back to its original-text index.

    Args:
        cleaned_index (int): Word index within the cleaned text.
        removed_count_map (list): Per-position mapping from cleaned-text
            index to original-text index.

    Returns:
        int: The original-text index, or ``cleaned_index`` unchanged when
        it does not address a valid map entry.
    """
    if cleaned_index < 0:
        # Negative values are not valid map positions; pass them through.
        return cleaned_index
    try:
        return removed_count_map[cleaned_index]
    except IndexError:
        # Out of range: fall back to the index as given.
        return cleaned_index


def annotate_morpheme(session_id, morpheme_function, base_dir="session_data"):
    """Annotate morphemes in a session's transcription file, in place.

    Loads ``<base_dir>/<session_id>/transcription_cunit.json``, adds a
    ``text_clean`` field and a ``morphemes`` list to every segment, and
    writes the updated JSON back to the same file.

    Args:
        session_id (str): Session identifier (directory name).
        morpheme_function (callable): Takes cleaned text, returns a list of
            morpheme dicts each carrying an "index" into the cleaned text.
        base_dir (str): Base directory containing session data; falls back
            to the current working directory when falsy.

    Raises:
        FileNotFoundError: If the transcription JSON does not exist.
    """
    if not base_dir:
        base_dir = os.getcwd()
    json_file = os.path.join(base_dir, f"{session_id}/transcription_cunit.json")

    if not os.path.exists(json_file):
        raise FileNotFoundError(f"{json_file} not found – make sure transcription step ran first.")

    with open(json_file, "r", encoding="utf-8") as fh:
        data = json.load(fh)

    # The file may be either {"segments": [...]} or a bare list of segments.
    if isinstance(data, dict):
        segments = data.get("segments", data)
    else:
        segments = data

    for segment in segments:
        cleaned, index_map = create_text_clean(segment.get("text_token", ""))
        segment["text_clean"] = cleaned

        morphemes = morpheme_function(cleaned)
        # Rewrite each morpheme index so it points into the original token stream.
        for entry in morphemes:
            entry["index"] = restore_index_to_text(entry["index"], index_map)

        segment["morphemes"] = morphemes

    with open(json_file, "w", encoding="utf-8") as fh:
        json.dump(data, fh, ensure_ascii=False, indent=2)


def _demo() -> None:
    """Run the annotation pipeline on one hard-coded example session."""
    # Example usage with morpheme_stanza_v1
    from morpheme_stanza_v1 import extract_inflectional_morphemes

    session_id = "000367"  # Replace with actual session ID
    try:
        annotate_morpheme(session_id, extract_inflectional_morphemes)
        print(f"Morpheme annotation completed for session {session_id}")
    except FileNotFoundError as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    _demo()