#!/usr/bin/env python3
"""
Check for newline characters in the ENNI SALT dataset
"""

import os

import torch

# Maximum number of offending sentences kept for the final report.
_MAX_EXAMPLES = 5


def _count_newline_sentences(sentences, label, examples, max_examples=_MAX_EXAMPLES):
    """
    Count the entries of ``sentences`` that contain a line-break character.

    Args:
        sentences: Iterable of entries to inspect. Each entry is tested with
            ``'\\n' in entry`` / ``'\\r' in entry`` (presumably plain strings;
            verify against the dataset producer).
        label: Prefix used when recording an example, e.g. ``"Train"``.
        examples: List that receives formatted examples; it is appended to
            in place until it holds ``max_examples`` items.
        max_examples: Upper bound on the length of ``examples``.

    Returns:
        Number of entries containing ``'\\n'`` or ``'\\r'``.
    """
    hits = 0
    for index, sentence in enumerate(sentences):
        if '\n' in sentence or '\r' in sentence:
            hits += 1
            if len(examples) < max_examples:
                # repr() keeps the escape sequences visible in the report.
                examples.append(f"{label}[{index}]: {sentence!r}")
    return hits


def check_newlines_in_dataset(dataset_path: str) -> None:
    """
    Check for newline characters in the dataset

    Loads the file with ``torch.load`` and walks
    ``dataset[lang]["sentence"][name]``, inspecting ``["meta"]["train_data"]``
    and ``["data"]`` wherever those keys are present. A summary (totals,
    percentage, and up to five examples) is printed to stdout. Problems
    locating or loading the file are reported on stdout and the function
    returns without raising.

    Args:
        dataset_path: Path to the .pth dataset file
    """
    if not os.path.exists(dataset_path):
        print(f"Error: Dataset file not found at {dataset_path}")
        return

    print(f"Loading dataset from {dataset_path}")

    # NOTE: torch.load unpickles the file, so only point this script at
    # dataset files from a trusted source.
    try:
        dataset = torch.load(dataset_path)
    except Exception as e:  # top-level boundary: report the failure and stop
        print(f"Error loading dataset: {e}")
        return
    print("Dataset loaded successfully!")

    total_sentences = 0
    sentences_with_newlines = 0
    newline_examples = []

    # Navigate through the dataset structure
    for lang_code, lang_data in dataset.items():
        print(f"\nChecking language: {lang_code}")

        if "sentence" not in lang_data:
            continue

        for dataset_name, dataset_info in lang_data["sentence"].items():
            print(f" Dataset: {dataset_name}")

            # Collect (heading, example label, sentences) for each split that
            # is present, so both splits go through the same scanning code.
            splits = []
            if "meta" in dataset_info and "train_data" in dataset_info["meta"]:
                splits.append(
                    ("Training", "Train", dataset_info["meta"]["train_data"])
                )
            if "data" in dataset_info:
                splits.append(("Test", "Test", dataset_info["data"]))

            for heading, label, sentences in splits:
                print(f" {heading} sentences: {len(sentences)}")
                total_sentences += len(sentences)
                sentences_with_newlines += _count_newline_sentences(
                    sentences, label, newline_examples
                )

    # Print results
    separator = "=" * 50
    print(f"\n{separator}")
    print("NEWLINE CHECK RESULTS:")
    print(separator)
    print(f"Total sentences checked: {total_sentences}")
    print(f"Sentences with newlines: {sentences_with_newlines}")

    # Keep the label even when nothing was checked; only the value falls
    # back to "N/A" (this also avoids dividing by zero).
    if total_sentences > 0:
        percentage = f"{sentences_with_newlines / total_sentences * 100:.2f}%"
    else:
        percentage = "N/A"
    print(f"Percentage with newlines: {percentage}")

    if sentences_with_newlines > 0:
        print(
            f"\nWARNING: Found {sentences_with_newlines} sentences "
            "containing newline characters!"
        )
        print("Examples of sentences with newlines:")
        for example in newline_examples:
            print(f" {example}")
    else:
        print("\n✓ No newline characters found in the dataset!")


if __name__ == "__main__":
    dataset_path = "enni-salt-dataset.pth"
    check_newlines_in_dataset(dataset_path)