"""
Check for newline characters in the ENNI SALT dataset.
"""
|
|
|
import os
import sys

import torch
|
|
|
|
|
def _scan_sentences(sentences, label, examples, max_examples):
    """Count the entries of *sentences* that contain ``\\n`` or ``\\r``.

    Up to *max_examples* offending entries (shared across all calls that
    pass the same *examples* list) are appended to *examples*, formatted as
    ``"<label>[<index>]: <repr>"``.

    Returns:
        A ``(checked, flagged, skipped)`` tuple: entries seen, entries that
        contain a newline character, and non-string entries that could not
        be tested.
    """
    checked = flagged = skipped = 0
    for index, sentence in enumerate(sentences):
        checked += 1
        if not isinstance(sentence, str):
            # A substring test is meaningless (or raises TypeError) for
            # non-string entries, so tally them instead of crashing.
            skipped += 1
            continue
        if "\n" in sentence or "\r" in sentence:
            flagged += 1
            if len(examples) < max_examples:
                examples.append(f"{label}[{index}]: {sentence!r}")
    return checked, flagged, skipped


def check_newlines_in_dataset(dataset_path: str, max_examples: int = 5) -> None:
    """
    Check for newline characters in the dataset

    The file is loaded with ``torch.load`` and walked as
    ``dataset[lang]["sentence"][name]``. For every such entry the training
    split (``["meta"]["train_data"]``) and the test split (``["data"]``) are
    scanned, when present, for sentences containing ``\\n`` or ``\\r``.
    A summary is printed to stdout; nothing is returned and load errors are
    reported rather than raised.

    Args:
        dataset_path: Path to the .pth dataset file
        max_examples: Maximum number of offending sentences to print
    """
    if not os.path.exists(dataset_path):
        print(f"Error: Dataset file not found at {dataset_path}")
        return

    print(f"Loading dataset from {dataset_path}")
    try:
        # NOTE(security): torch.load unpickles the file, which can execute
        # arbitrary code. Only run this on dataset files you trust.
        dataset = torch.load(dataset_path)
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and stop.
        print(f"Error loading dataset: {e}")
        return
    else:
        print("Dataset loaded successfully!")

    if not hasattr(dataset, "items"):
        # Anything but a mapping would otherwise fail with an opaque
        # AttributeError on the loop below.
        print(
            "Error: expected a mapping of language codes at the top level, "
            f"got {type(dataset).__name__}"
        )
        return

    total_sentences = 0
    sentences_with_newlines = 0
    non_string_entries = 0
    newline_examples = []

    for lang_code, lang_data in dataset.items():
        print(f"\nChecking language: {lang_code}")

        if "sentence" not in lang_data:
            continue

        for dataset_name, dataset_info in lang_data["sentence"].items():
            print(f" Dataset: {dataset_name}")

            # Collect the splits that exist, training first, so both go
            # through the same scanning code.
            splits = []
            if "meta" in dataset_info and "train_data" in dataset_info["meta"]:
                splits.append(
                    ("Train", "Training", dataset_info["meta"]["train_data"])
                )
            if "data" in dataset_info:
                splits.append(("Test", "Test", dataset_info["data"]))

            for label, title, sentences in splits:
                print(f" {title} sentences: {len(sentences)}")
                checked, flagged, skipped = _scan_sentences(
                    sentences, label, newline_examples, max_examples
                )
                total_sentences += checked
                sentences_with_newlines += flagged
                non_string_entries += skipped

    separator = "=" * 50
    print(f"\n{separator}")
    print("NEWLINE CHECK RESULTS:")
    print(separator)
    print(f"Total sentences checked: {total_sentences}")
    print(f"Sentences with newlines: {sentences_with_newlines}")
    # Build the value separately so the label is printed even when there is
    # nothing to divide by.
    if total_sentences > 0:
        percentage = f"{sentences_with_newlines / total_sentences * 100:.2f}%"
    else:
        percentage = "N/A"
    print(f"Percentage with newlines: {percentage}")
    if non_string_entries:
        print(f"Non-string entries skipped: {non_string_entries}")

    if sentences_with_newlines > 0:
        print(
            f"\nWARNING: Found {sentences_with_newlines} sentences "
            "containing newline characters!"
        )
        print("Examples of sentences with newlines:")
        for example in newline_examples:
            print(f" {example}")
    else:
        print("\n✓ No newline characters found in the dataset!")
|
|
|
|
|
if __name__ == "__main__":
    # An optional first command-line argument overrides the default dataset
    # file, which is looked up relative to the current working directory.
    dataset_path = sys.argv[1] if len(sys.argv) > 1 else "enni-salt-dataset.pth"
    check_newlines_in_dataset(dataset_path)