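# NOTE(review): the next four lines appear to be web-page header text captured
# along with the script; they are kept verbatim as comments so the file parses.
# eawolf2357-git / utils /process_sakuga_dataset.py
# seawolf2357's picture
# Upload folder using huggingface_hub
# 321d89c verified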
import torch
import pandas as pd
import os
from tqdm import tqdm
# Filter the SakugaDataset "train_aesthetic" metadata table down to the rows
# whose per-video folder exists under `base_path` and is not an empty
# directory, then write the filtered table to a new parquet file.

# NOTE(review): this input path is relative to the current working directory,
# while `base_path` and `output_parquet_path` below are absolute -- confirm
# they point at the same dataset root.
df = pd.read_parquet("../../Datasets/SakugaDataset/parquet/train_aesthetic/sakugadataset_train_aesthetic.parquet")
df = df[['identifier', 'scene_start_time', 'scene_end_time', 'fps',"text_description","aesthetic_score","dynamic_score"]]
# Drop rows with NaN in any of the listed columns ('identifier' is not checked).
df = df.dropna(subset=['scene_start_time', 'scene_end_time', 'fps',"text_description","aesthetic_score","dynamic_score"])
# The text before the first ':' of 'identifier' is parsed as an integer and is
# used below as the name of the video's folder.
df['identifier_video'] = df['identifier'].apply(lambda x: int(x.split(':')[0]))
base_path = '/home/cn/Datasets/SakugaDataset/split/train_aesthetic'
print(df.shape)

# Rows that share an identifier_video map to the same folder, so probe each
# distinct folder once instead of hitting the filesystem for every row.
unusable_videos = set()
for video_id in df['identifier_video'].unique():
    folder_path = os.path.join(base_path, str(video_id))
    if not os.path.exists(folder_path):
        # Missing folder: report it (once per folder) and mark it for removal.
        print(folder_path)
        unusable_videos.add(video_id)
    elif os.path.isdir(folder_path) and not os.listdir(folder_path):
        # Folder exists but contains no files.
        unusable_videos.add(video_id)

# Index labels of every row whose folder is missing or empty.
rows_to_delete = df.index[df['identifier_video'].isin(unusable_videos)].tolist()
# Delete the rows that met the condition.
df.drop(rows_to_delete, inplace=True)
# Reset the index.
df.reset_index(drop=True, inplace=True)
print(df.shape)
output_parquet_path = '/home/cn/Datasets/SakugaDataset/parquet/fliter_aesthetic.parquet'
df.to_parquet(output_parquet_path, index=False)
# Numbers noted by the original author (presumably row counts from earlier runs):
#132337
# 132102
# After deleting the parts that could not be read, 132067 remained.
# Disabled scratch code for inspecting `df` by hand. It is wrapped in a bare
# string literal, so none of it is executed.
# NOTE(review): the body of the `for` loop inside has no indentation and would
# need re-indenting before this snippet could be re-enabled.
'''
print(df)
i=10
df.iloc[i]['identifier_video']
print(df.columns)
# 查看前几行数据
print(df.head())
print(df.iloc[1])
for index,row in df.iterrows():
print(f"Row {index}: {row.to_dict()}")
break
'''