# Shuwei Hou
# initial_for_hf
# 5806e12
# raw
# history blame
# 3.23 kB
#!/usr/bin/env python3
"""
Check for newline characters in the ENNI SALT dataset
"""
import torch
import os
def _count_newline_sentences(sentences, label: str, examples: list, max_examples: int) -> int:
    """Count the entries of ``sentences`` that contain a line-break character.

    Args:
        sentences: Iterable of sentences to inspect.
        label: Prefix used when recording an example (e.g. "Train").
        examples: List that receives formatted examples; mutated in place.
        max_examples: Upper bound on the total length of ``examples``.

    Returns:
        Number of entries containing a newline or carriage-return character.
    """
    hits = 0
    for index, sentence in enumerate(sentences):
        if "\n" in sentence or "\r" in sentence:
            hits += 1
            # Keep only the first few offenders so the report stays readable.
            if len(examples) < max_examples:
                examples.append(f"{label}[{index}]: {sentence!r}")
    return hits


def check_newlines_in_dataset(dataset_path: str, max_examples: int = 5) -> None:
    """Report how many sentences in the dataset contain newline characters.

    The file is loaded with ``torch.load`` and walked as
    ``dataset[lang]["sentence"][name]``, where training sentences are read
    from ``["meta"]["train_data"]`` and test sentences from ``["data"]``.
    Languages or datasets missing those keys are skipped. All results are
    printed to stdout; nothing is returned.

    Args:
        dataset_path: Path to the .pth dataset file.
        max_examples: Maximum number of offending sentences to print
            (shared between training and test data).
    """
    if not os.path.exists(dataset_path):
        print(f"Error: Dataset file not found at {dataset_path}")
        return

    print(f"Loading dataset from {dataset_path}")
    try:
        # SECURITY NOTE: torch.load unpickles the file and can execute
        # arbitrary code; only run this check on dataset files you trust.
        dataset = torch.load(dataset_path)
    except Exception as e:
        # Script-level boundary: report any load failure and stop cleanly.
        print(f"Error loading dataset: {e}")
        return
    else:
        print("Dataset loaded successfully!")

    total_sentences = 0
    sentences_with_newlines = 0
    newline_examples = []

    # Navigate through the dataset structure
    for lang_code, lang_data in dataset.items():
        print(f"\nChecking language: {lang_code}")
        if "sentence" not in lang_data:
            continue

        for dataset_name, dataset_info in lang_data["sentence"].items():
            print(f" Dataset: {dataset_name}")

            # (example label, heading, sentences) for each split that exists,
            # training data first so the output order is train then test.
            splits = []
            if "meta" in dataset_info and "train_data" in dataset_info["meta"]:
                splits.append(("Train", "Training", dataset_info["meta"]["train_data"]))
            if "data" in dataset_info:
                splits.append(("Test", "Test", dataset_info["data"]))

            for label, heading, sentences in splits:
                print(f" {heading} sentences: {len(sentences)}")
                total_sentences += len(sentences)
                sentences_with_newlines += _count_newline_sentences(
                    sentences, label, newline_examples, max_examples
                )

    # Print results
    separator = "=" * 50
    print(f"\n{separator}")
    print("NEWLINE CHECK RESULTS:")
    print(separator)
    print(f"Total sentences checked: {total_sentences}")
    print(f"Sentences with newlines: {sentences_with_newlines}")

    # Guard the division and keep the label even when nothing was checked.
    if total_sentences > 0:
        percentage = f"{sentences_with_newlines / total_sentences * 100:.2f}%"
    else:
        percentage = "N/A"
    print(f"Percentage with newlines: {percentage}")

    if sentences_with_newlines > 0:
        print(f"\nWARNING: Found {sentences_with_newlines} sentences containing newline characters!")
        print("Examples of sentences with newlines:")
        for example in newline_examples:
            print(f" {example}")
    else:
        print("\n✓ No newline characters found in the dataset!")
if __name__ == "__main__":
    import sys

    # Check the default dataset file in the current directory unless a path
    # is given as the first command-line argument.
    dataset_path = sys.argv[1] if len(sys.argv) > 1 else "enni-salt-dataset.pth"
    check_newlines_in_dataset(dataset_path)