|
import os |
|
import pandas as pd |
|
import shutil |
|
from sklearn.model_selection import train_test_split |
|
from tqdm.auto import tqdm |
|
|
|
df_2014 = pd.read_csv('data/CROHME/2014/caption.txt', sep='\t', header=None, names=['filenames', 'captions']) |
|
df_2016 = pd.read_csv('data/CROHME/2016/caption.txt', sep='\t', header=None, names=['filenames', 'captions']) |
|
df_2019 = pd.read_csv('data/CROHME/2019/caption.txt', sep='\t', header=None, names=['filenames', 'captions']) |
|
df_train = pd.read_csv('data/CROHME/train/caption.txt', sep='\t', header=None, names=['filenames', 'captions']) |
|
|
|
data = pd.concat( |
|
[ |
|
df_2014, |
|
df_2016, |
|
df_2019, |
|
df_train |
|
] |
|
) |
|
|
|
|
|
train, test = train_test_split(data, test_size=0.1, random_state=42) |
|
|
|
|
|
train, val = train_test_split(train, test_size=0.1, random_state=42) |
|
|
|
print("Train shape:", train.shape) |
|
print("Test shape:", test.shape) |
|
print("Validation shape:", val.shape) |
|
|
|
train_filenames = train['filenames'].tolist() |
|
train_captions = train['captions'].tolist() |
|
|
|
test_filenames = test['filenames'].tolist() |
|
test_captions = test['captions'].tolist() |
|
|
|
val_filenames = val['filenames'].tolist() |
|
val_captions = val['captions'].tolist() |
|
|
|
|
|
with open('data/CROHME_splitted/train/caption.txt', 'w', encoding='utf-8') as f: |
|
for filename, caption in zip(train_filenames, train_captions): |
|
f.write(f"{filename}\t{caption}\n") |
|
|
|
with open('data/CROHME_splitted/test/caption.txt', 'w', encoding='utf-8') as f: |
|
for filename, caption in zip(test_filenames, test_captions): |
|
f.write(f"{filename}\t{caption}\n") |
|
|
|
with open('data/CROHME_splitted/val/caption.txt', 'w', encoding='utf-8') as f: |
|
for filename, caption in zip(val_filenames, val_captions): |
|
f.write(f"{filename}\t{caption}\n") |
|
|
|
|
|
IMAGES_DIR = 'data/images' |
|
TRAIN_DIR = 'data/CROHME_splitted/train/img' |
|
TEST_DIR = 'data/CROHME_splitted/test/img' |
|
VAL_DIR = 'data/CROHME_splitted/val/img' |
|
|
|
os.makedirs(TRAIN_DIR, exist_ok=True) |
|
os.makedirs(TEST_DIR, exist_ok=True) |
|
os.makedirs(VAL_DIR, exist_ok=True) |
|
|
|
for train_filename in tqdm(train_filenames, desc="Copying train images"): |
|
src = os.path.join(IMAGES_DIR, train_filename) + '.bmp' |
|
dst = os.path.join(TRAIN_DIR, train_filename) + '.bmp' |
|
shutil.copy(src, dst) |
|
|
|
for test_filename in tqdm(test_filenames, desc="Copying test images"): |
|
src = os.path.join(IMAGES_DIR, test_filename) + '.bmp' |
|
dst = os.path.join(TEST_DIR, test_filename) + '.bmp' |
|
shutil.copy(src, dst) |
|
|
|
for val_filename in tqdm(val_filenames, desc="Copying validation images"): |
|
src = os.path.join(IMAGES_DIR, val_filename) + '.bmp' |
|
dst = os.path.join(VAL_DIR, val_filename) + '.bmp' |
|
shutil.copy(src, dst) |