File size: 3,233 Bytes
5806e12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
"""
Check for newline characters in the ENNI SALT dataset
"""

import torch
import os


def _scan_sentences(sentences, label: str, examples: list, max_examples: int) -> tuple:
    """Count entries of *sentences* that contain a line feed or carriage return.

    For each hit, a ``"<label>[<index>]: <repr>"`` string is appended to
    *examples* as long as *examples* holds fewer than *max_examples* entries.

    Args:
        sentences: Iterable of sentences to inspect.
        label: Prefix used when recording an example (e.g. ``"Train"``).
        examples: Shared list of example strings; mutated in place.
        max_examples: Upper bound on the length of *examples*.

    Returns:
        A ``(checked, with_newlines)`` pair of counts.
    """
    checked = 0
    with_newlines = 0
    for index, sentence in enumerate(sentences):
        checked += 1
        if '\n' in sentence or '\r' in sentence:
            with_newlines += 1
            if len(examples) < max_examples:
                examples.append(f"{label}[{index}]: {sentence!r}")
    return checked, with_newlines


def check_newlines_in_dataset(dataset_path: str, max_examples: int = 5) -> None:
    """
    Check for newline characters in the dataset and print a report.

    The loaded object is walked as
    ``dataset[lang]["sentence"][name]["meta"]["train_data"]`` (training
    sentences) and ``dataset[lang]["sentence"][name]["data"]`` (test
    sentences); any level that is absent is skipped. Load failures and a
    missing file are reported on stdout and the function returns early.

    Args:
        dataset_path: Path to the .pth dataset file
        max_examples: Maximum number of offending sentences to show in the
            report (defaults to 5).
    """
    if not os.path.exists(dataset_path):
        print(f"Error: Dataset file not found at {dataset_path}")
        return

    print(f"Loading dataset from {dataset_path}")
    try:
        # NOTE(security): torch.load unpickles the file, so only point this
        # script at dataset files from a trusted source.
        dataset = torch.load(dataset_path)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return
    else:
        print("Dataset loaded successfully!")

    total_sentences = 0
    sentences_with_newlines = 0
    newline_examples = []

    for lang_code, lang_data in dataset.items():
        print(f"\nChecking language: {lang_code}")

        if "sentence" not in lang_data:
            continue

        for dataset_name, dataset_info in lang_data["sentence"].items():
            print(f"  Dataset: {dataset_name}")

            # (heading used in the progress line, example label, sentences)
            splits = []
            if "meta" in dataset_info and "train_data" in dataset_info["meta"]:
                splits.append(("Training", "Train", dataset_info["meta"]["train_data"]))
            if "data" in dataset_info:
                splits.append(("Test", "Test", dataset_info["data"]))

            for heading, label, sentences in splits:
                print(f"    {heading} sentences: {len(sentences)}")
                checked, flagged = _scan_sentences(
                    sentences, label, newline_examples, max_examples
                )
                total_sentences += checked
                sentences_with_newlines += flagged

    # Print results
    separator = "=" * 50
    print(f"\n{separator}")
    print("NEWLINE CHECK RESULTS:")
    print(separator)
    print(f"Total sentences checked: {total_sentences}")
    print(f"Sentences with newlines: {sentences_with_newlines}")
    if total_sentences > 0:
        percentage = sentences_with_newlines / total_sentences * 100
        print(f"Percentage with newlines: {percentage:.2f}%")
    else:
        # Keep the label so the report line stays self-describing when
        # nothing was checked (avoids dividing by zero as well).
        print("Percentage with newlines: N/A")

    if sentences_with_newlines > 0:
        print(f"\nWARNING: Found {sentences_with_newlines} sentences containing newline characters!")
        print("Examples of sentences with newlines:")
        for example in newline_examples:
            print(f"  {example}")
    else:
        print("\n✓ No newline characters found in the dataset!")


if __name__ == "__main__":
    # Script entry point: scan the dataset file expected in the current
    # working directory.
    check_newlines_in_dataset("enni-salt-dataset.pth")