| import json | |
def count_mismatch(file_path: str) -> int:
    """Count the JSONL records whose token and tag lists differ in length.

    Each non-blank line of the file is parsed as a JSON object, and the
    lengths of its ``tokens`` and ``tags_knowledge`` values are compared.

    Args:
        file_path: Path to a UTF-8 encoded JSON Lines file.

    Returns:
        The number of records where
        ``len(tokens) != len(tags_knowledge)``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record object lacks ``tokens`` or ``tags_knowledge``.
    """
    mismatches = 0
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing empty line at end of file) hold no
            # record, and json.loads would raise on them.
            if not line.strip():
                continue
            data = json.loads(line)
            tokens, tags = data['tokens'], data['tags_knowledge']
            if len(tokens) != len(tags):
                mismatches += 1
    return mismatches
def delete_mismatched_lines(file_path: str) -> None:
    """Rewrite a JSONL file in place, dropping mismatched records.

    A record is kept when the lengths of its ``tokens`` and
    ``tags_knowledge`` values are equal. Kept lines are written back
    exactly as they were read; blank lines are passed through unchanged.

    All lines are parsed and filtered before the file is reopened for
    writing, so if any line fails to parse the file is left untouched.

    Args:
        file_path: Path to a UTF-8 encoded JSON Lines file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record object lacks ``tokens`` or ``tags_knowledge``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Filter BEFORE reopening for writing: mode 'w' truncates the file, so a
    # parse error raised mid-loop would otherwise destroy every record that
    # had not yet been written back.
    kept = []
    for line in lines:
        if not line.strip():
            # Not a record; keep as-is rather than crash in json.loads.
            kept.append(line)
            continue
        data = json.loads(line)
        tokens, tags = data['tokens'], data['tags_knowledge']
        if len(tokens) == len(tags):
            kept.append(line)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(kept)
if __name__ == "__main__":
    # Script entry point: clean the dataset in place and report how many
    # records were removed. Nothing is rewritten or printed when every
    # record already has matching token/tag lengths.
    target_path = 'data/tags-04-01-2025.jsonl'
    count = count_mismatch(target_path)
    if count > 0:
        delete_mismatched_lines(target_path)
        print(f"Deleted {count} mismatched lines.")