# SATEv1.5 / morpheme / morpheme_annotation.py
# Author: Shuwei Hou
# initial_for_hf (commit 5806e12)
import os
import json
def create_text_clean(text_token: str):
    """Strip annotation markup from a whitespace-tokenized string.

    The <REPSTART>/<REPEND> and <REVSTART>/<REVEND> marker tokens are
    dropped, the tokens enclosed by either pair are skipped, and any other
    <...> special token is filtered out. Returns the cleaned string plus a
    list mapping each kept token's position in the cleaned text to its
    position in the original stream (the four marker tokens themselves are
    not counted toward original positions; other special tokens are).
    """
    marker_actions = {
        "<REPSTART>": ("rep", True),
        "<REPEND>": ("rep", False),
        "<REVSTART>": ("rev", True),
        "<REVEND>": ("rev", False),
    }
    skipping = {"rep": False, "rev": False}
    kept_tokens = []
    index_map = []
    position = 0

    for token in text_token.split():
        action = marker_actions.get(token)
        if action is not None:
            # Marker tokens toggle skip state and do not advance position.
            scope, state = action
            skipping[scope] = state
            continue
        is_special = token.startswith("<") and token.endswith(">")
        if not is_special and not any(skipping.values()):
            kept_tokens.append(token)
            index_map.append(position)
        position += 1

    return " ".join(kept_tokens), index_map
def restore_index_to_text(cleaned_index, removed_count_map):
    """Map a cleaned-text token index back to its original-text index.

    Args:
        cleaned_index (int): Position of a token in the cleaned text.
        removed_count_map (list): Clean-index -> original-index mapping
            produced by create_text_clean.

    Returns:
        int: The corresponding original-text index, or cleaned_index
            unchanged when it falls outside the mapping (fallback).
    """
    within_bounds = 0 <= cleaned_index < len(removed_count_map)
    return removed_count_map[cleaned_index] if within_bounds else cleaned_index
def annotate_morpheme(session_id, morpheme_function, base_dir="session_data"):
    """Annotate morphemes for every segment of a session's transcription.

    Reads <base_dir>/<session_id>/transcription_cunit.json, adds a
    "text_clean" field and a "morphemes" list to each segment, and writes
    the updated JSON back to the same file.

    Args:
        session_id (str): Session identifier (sub-directory name).
        morpheme_function (callable): Takes the cleaned text and returns a
            list of morpheme dicts, each carrying an "index" key that gives
            the token position within the cleaned text.
        base_dir (str): Base directory containing session data; falls back
            to the current working directory when falsy.

    Raises:
        FileNotFoundError: If the transcription JSON does not exist.
    """
    base_dir = base_dir or os.getcwd()
    # Pass path components separately instead of embedding "/" in an
    # f-string, so os.path.join uses the platform's separator throughout.
    json_file = os.path.join(base_dir, session_id, "transcription_cunit.json")
    if not os.path.exists(json_file):
        raise FileNotFoundError(f"{json_file} not found – make sure transcription step ran first.")

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Accept either {"segments": [...]} or a bare list of segments.
    segments = data.get("segments", data) if isinstance(data, dict) else data

    for seg in segments:
        text_token = seg.get("text_token", "")
        # Strip annotation markup before running morpheme extraction.
        text_clean, removed_count_map = create_text_clean(text_token)
        seg["text_clean"] = text_clean

        morphemes = morpheme_function(text_clean)
        # Morpheme indices refer to the cleaned text; translate them back
        # to positions in the original token stream.
        for morpheme in morphemes:
            morpheme["index"] = restore_index_to_text(morpheme["index"], removed_count_map)
        seg["morphemes"] = morphemes

    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
def _demo():
    """Script entry point: annotate one sample session with morpheme_stanza_v1."""
    from morpheme_stanza_v1 import extract_inflectional_morphemes

    session_id = "000367"  # replace with a real session ID
    try:
        annotate_morpheme(session_id, extract_inflectional_morphemes)
        print(f"Morpheme annotation completed for session {session_id}")
    except FileNotFoundError as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    _demo()