Spaces:

Sven33
/

SATEv1.5

Sleeping

SATEv1.5 / segmentation /models /SaT_cunit_with_maze /check_newlines.py

Shuwei Hou

initial_for_hf

5806e12 9 days ago

3.23 kB

	#!/usr/bin/env python3
	"""
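	Check for newline characters in the ENNI SALT dataset.

	Loads a .pth dataset file with torch.load and reports how many training and
	test sentences contain a line-feed or carriage-return character.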
	"""

	import torch
	import os


	def _scan_for_newlines(sentences, label: str, examples: list, max_examples: int = 5) -> tuple:
	    """Count the entries of *sentences* that contain a line break.

	    A sentence is flagged when it contains a line-feed or a carriage-return
	    character. Flagged sentences are appended to *examples* (formatted as
	    ``label[index]: repr(sentence)``) until *examples* holds *max_examples*
	    entries; the list is shared by the caller across several scans, so the
	    cap applies to the overall report rather than to each scan.

	    Args:
	        sentences: Iterable of sentences to inspect.
	        label: Prefix identifying the split in the stored examples.
	        examples: List that collects formatted examples; mutated in place.
	        max_examples: Maximum number of entries allowed in *examples*.

	    Returns:
	        A ``(checked, flagged)`` tuple: the number of sentences inspected
	        and the number of those containing a line break.
	    """
	    checked = 0
	    flagged = 0
	    for index, sentence in enumerate(sentences):
	        checked += 1
	        if "\n" in sentence or "\r" in sentence:
	            flagged += 1
	            if len(examples) < max_examples:
	                examples.append(f"{label}[{index}]: {sentence!r}")
	    return checked, flagged


	def check_newlines_in_dataset(dataset_path: str) -> None:
	    """Report how many sentences in a dataset file contain newline characters.

	    The file is loaded with ``torch.load`` and walked as
	    ``dataset[lang]["sentence"][name]``. For every such entry, the
	    sentences under ``["meta"]["train_data"]`` and under ``["data"]`` are
	    scanned when those keys are present. A summary (totals, percentage and
	    up to five offending examples) is printed to standard output.

	    Errors are reported by printing a message and returning early; nothing
	    is raised for a missing or unloadable file.

	    Args:
	        dataset_path: Path to the .pth dataset file.
	    """
	    if not os.path.exists(dataset_path):
	        print(f"Error: Dataset file not found at {dataset_path}")
	        return

	    print(f"Loading dataset from {dataset_path}")
	    try:
	        # NOTE(review): torch.load unpickles the file, which can execute
	        # arbitrary code. Only point this script at trusted dataset files.
	        dataset = torch.load(dataset_path)
	    except Exception as e:
	        # Deliberately broad: this is the script's top-level boundary and
	        # loading can fail in many ways (I/O, unpickling, format errors).
	        print(f"Error loading dataset: {e}")
	        return
	    print("Dataset loaded successfully!")

	    total_sentences = 0
	    sentences_with_newlines = 0
	    newline_examples = []  # capped at five entries across all splits

	    # Navigate through the dataset structure
	    for lang_code, lang_data in dataset.items():
	        print(f"\nChecking language: {lang_code}")

	        if "sentence" not in lang_data:
	            continue

	        for dataset_name, dataset_info in lang_data["sentence"].items():
	            print(f" Dataset: {dataset_name}")

	            # Check training data
	            if "meta" in dataset_info and "train_data" in dataset_info["meta"]:
	                train_data = dataset_info["meta"]["train_data"]
	                print(f" Training sentences: {len(train_data)}")
	                checked, flagged = _scan_for_newlines(train_data, "Train", newline_examples)
	                total_sentences += checked
	                sentences_with_newlines += flagged

	            # Check test data
	            if "data" in dataset_info:
	                test_data = dataset_info["data"]
	                print(f" Test sentences: {len(test_data)}")
	                checked, flagged = _scan_for_newlines(test_data, "Test", newline_examples)
	                total_sentences += checked
	                sentences_with_newlines += flagged

	    # Print results
	    separator = "=" * 50
	    print(f"\n{separator}")
	    print("NEWLINE CHECK RESULTS:")
	    print(separator)
	    print(f"Total sentences checked: {total_sentences}")
	    print(f"Sentences with newlines: {sentences_with_newlines}")
	    if total_sentences > 0:
	        percentage = sentences_with_newlines / total_sentences * 100
	        print(f"Percentage with newlines: {percentage:.2f}%")
	    else:
	        # Keep the label so the line is still readable when nothing was checked.
	        print("Percentage with newlines: N/A")

	    if sentences_with_newlines > 0:
	        print(f"\nWARNING: Found {sentences_with_newlines} sentences containing newline characters!")
	        print("Examples of sentences with newlines:")
	        for example in newline_examples:
	            print(f" {example}")
	    else:
	        print("\n✓ No newline characters found in the dataset!")


	if __name__ == "__main__":
	    import sys

	    # Script entry point: check the dataset given as the first command-line
	    # argument, falling back to the default file name in the current
	    # working directory when no argument is supplied.
	    dataset_path = sys.argv[1] if len(sys.argv) > 1 else "enni-salt-dataset.pth"
	    check_newlines_in_dataset(dataset_path)