File size: 3,233 Bytes
5806e12 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
#!/usr/bin/env python3
"""
Check for newline characters in the ENNI SALT dataset
"""
import torch
import os
def _count_newline_sentences(sentences, label, examples, max_examples=5):
    """Return how many entries of *sentences* contain ``\\n`` or ``\\r``.

    For each hit, ``"<label>[<index>]: <repr of sentence>"`` is appended to
    *examples* until that list holds *max_examples* entries. The caller
    passes the same list for every split, so the cap is global.
    """
    hits = 0
    for index, sentence in enumerate(sentences):
        if '\n' in sentence or '\r' in sentence:
            hits += 1
            if len(examples) < max_examples:  # Store first 5 examples
                examples.append(f"{label}[{index}]: {sentence!r}")
    return hits


def check_newlines_in_dataset(dataset_path: str) -> None:
    """
    Check for newline characters in the dataset

    The file is loaded with ``torch.load`` and walked as
    ``dataset[lang]["sentence"][name]``. For every such entry the
    sentences under ``["meta"]["train_data"]`` (training) and ``["data"]``
    (test) are scanned when those keys are present. Progress, totals, the
    percentage of affected sentences and up to five examples are printed
    to stdout.

    A missing file or a load failure is reported with a printed message
    and the function returns without raising.

    Args:
        dataset_path: Path to the .pth dataset file

    Returns:
        None. All results are printed.
    """
    if not os.path.exists(dataset_path):
        print(f"Error: Dataset file not found at {dataset_path}")
        return

    print(f"Loading dataset from {dataset_path}")
    # NOTE(review): torch.load unpickles the file, which can execute
    # arbitrary code. Only run this on dataset files from a trusted source.
    try:
        dataset = torch.load(dataset_path)
    except Exception as e:
        # Broad on purpose: this is the script's top-level boundary and
        # any load failure should be reported instead of a traceback.
        print(f"Error loading dataset: {e}")
        return
    else:
        print("Dataset loaded successfully!")

    # Navigate through the dataset structure
    total_sentences = 0
    sentences_with_newlines = 0
    newline_examples = []

    for lang_code, lang_data in dataset.items():
        print(f"\nChecking language: {lang_code}")
        if "sentence" not in lang_data:
            continue

        for dataset_name, dataset_info in lang_data["sentence"].items():
            print(f"  Dataset: {dataset_name}")

            # Check training data
            if "meta" in dataset_info and "train_data" in dataset_info["meta"]:
                train_data = dataset_info["meta"]["train_data"]
                print(f"    Training sentences: {len(train_data)}")
                total_sentences += len(train_data)
                sentences_with_newlines += _count_newline_sentences(
                    train_data, "Train", newline_examples
                )

            # Check test data
            if "data" in dataset_info:
                test_data = dataset_info["data"]
                print(f"    Test sentences: {len(test_data)}")
                total_sentences += len(test_data)
                sentences_with_newlines += _count_newline_sentences(
                    test_data, "Test", newline_examples
                )

    # Print results
    print(f"\n{'='*50}")
    print("NEWLINE CHECK RESULTS:")
    print(f"{'='*50}")
    print(f"Total sentences checked: {total_sentences}")
    print(f"Sentences with newlines: {sentences_with_newlines}")
    # Keep the label even when nothing was checked, so the summary line is
    # never a bare "N/A" (and no division by zero can occur).
    if total_sentences > 0:
        percentage = f"{sentences_with_newlines/total_sentences*100:.2f}%"
    else:
        percentage = "N/A"
    print(f"Percentage with newlines: {percentage}")

    if sentences_with_newlines > 0:
        print(f"\nWARNING: Found {sentences_with_newlines} sentences containing newline characters!")
        print("Examples of sentences with newlines:")
        for example in newline_examples:
            print(f"  {example}")
    else:
        print("\n✓ No newline characters found in the dataset!")
if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default
    # dataset location; with no argument the behavior is unchanged.
    dataset_path = sys.argv[1] if len(sys.argv) > 1 else "enni-salt-dataset.pth"
    check_newlines_in_dataset(dataset_path)