|
import csv |
|
import os |
|
import sys |
|
import glob |
|
import tqdm |
|
|
|
|
|
def split_csv_files(input_files, output_dir, lines_per_file=100000):
    """Merge several CSV files and re-split the rows into fixed-size chunks.

    All input files are assumed to share the same header row. The header from
    the first non-empty file is written once at the top of every output chunk
    (000.csv, 001.csv, ...), and the header row of every subsequent input file
    is skipped. (The previous implementation only consumed the first file's
    header, so later files' headers leaked into the output as data rows.)

    Args:
        input_files: Iterable of paths to the source CSV files.
        output_dir: Directory for the numbered output chunks; created if
            missing.
        lines_per_file: Maximum number of data rows (header excluded) per
            output chunk.
    """
    # The progress bar is purely cosmetic — degrade to plain iteration if
    # tqdm is not installed instead of failing.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    os.makedirs(output_dir, exist_ok=True)

    total_lines = 0
    file_count = 0
    current_line_count = 0
    header = None

    output_file = os.path.join(output_dir, f"{str(file_count).zfill(3)}.csv")
    output_writer = open(output_file, "w", newline="")
    csv_writer = None

    try:
        for file_path in progress(input_files, desc="Processing files"):
            # newline="" is required by the csv module for correct handling
            # of embedded newlines/quoting.
            with open(file_path, "r", newline="") as csv_file:
                csv_reader = csv.reader(csv_file)

                # Consume this file's header so it is never emitted as data.
                # Completely empty files are skipped (the old code crashed
                # with StopIteration if the first file was empty).
                file_header = next(csv_reader, None)
                if file_header is None:
                    continue

                if csv_writer is None:
                    # First non-empty file: remember its header and start
                    # the first chunk with it.
                    header = file_header
                    csv_writer = csv.writer(output_writer)
                    csv_writer.writerow(header)

                for row in csv_reader:
                    if current_line_count >= lines_per_file:
                        # Chunk is full: rotate to a new numbered file and
                        # repeat the header so each chunk is self-contained.
                        output_writer.close()
                        file_count += 1
                        current_line_count = 0
                        output_file = os.path.join(
                            output_dir, f"{str(file_count).zfill(3)}.csv"
                        )
                        output_writer = open(output_file, "w", newline="")
                        csv_writer = csv.writer(output_writer)
                        csv_writer.writerow(header)

                    csv_writer.writerow(row)
                    current_line_count += 1
                    total_lines += 1

    finally:
        # Always close the currently open chunk, even on error.
        if output_writer:
            output_writer.close()

    print(f"Total lines processed: {total_lines}")
    print(f"Files created: {file_count + 1}")
|
|
|
|
|
if __name__ == "__main__": |
|
input_dir = "../datasets/YFCC100M/yfcc100m_dataset_with_gps_train" |
|
output_dir = "../datasets/YFCC100M/yfcc100m_dataset_with_gps_train_balanced" |
|
lines_per_file = 100000 |
|
|
|
|
|
input_files = glob.glob(os.path.join(input_dir, "*.csv")) |
|
|
|
if not input_files: |
|
print(f"No CSV files found in {input_dir}") |
|
sys.exit(1) |
|
|
|
print(f"Found {len(input_files)} CSV files") |
|
split_csv_files(input_files, output_dir, lines_per_file) |
|
|