"""Filter the SakugaDataset aesthetic-train metadata to clips present on disk.

Reads the training parquet file, keeps the columns listed in
``REQUIRED_COLUMNS``, drops rows with missing values, and removes every row
whose video folder under ``base_path`` is missing or empty.  The surviving
rows are written to ``output_parquet_path`` with a fresh 0..n-1 index.
"""
import os

import pandas as pd
import torch  # NOTE(review): not used in the code below; kept, not removed
from tqdm import tqdm  # NOTE(review): not used in the code below; kept

# NOTE(review): the input path is relative to the current working directory
# while the other two paths are absolute -- confirm the script is always
# launched from the directory this relative path assumes.
input_parquet_path = "../../Datasets/SakugaDataset/parquet/train_aesthetic/sakugadataset_train_aesthetic.parquet"
base_path = '/home/cn/Datasets/SakugaDataset/split/train_aesthetic'
# The "fliter" spelling is kept as-is so existing consumers of the file still
# find it under the same name.
output_parquet_path = '/home/cn/Datasets/SakugaDataset/parquet/fliter_aesthetic.parquet'

REQUIRED_COLUMNS = [
    'identifier',
    'scene_start_time',
    'scene_end_time',
    'fps',
    "text_description",
    "aesthetic_score",
    "dynamic_score",
]


def _folder_is_usable(folder_path: str) -> bool:
    """Return True when rows that point at *folder_path* should be kept.

    A path that does not exist is printed to stdout and rejected.  An existing
    directory is rejected when it contains no entries.
    """
    if not os.path.exists(folder_path):
        print(folder_path)
        return False
    if os.path.isdir(folder_path):
        # Stop at the first entry instead of materialising the whole listing.
        with os.scandir(folder_path) as entries:
            return any(True for _ in entries)
    # NOTE(review): a path that exists but is not a directory is kept, exactly
    # as before -- confirm whether such rows should be dropped instead.
    return True


df = pd.read_parquet(input_parquet_path)

# Drop rows with NaN in any required column.  'identifier' is part of the
# subset so that the ':' split below never sees a missing value.
df = df[REQUIRED_COLUMNS].dropna(subset=REQUIRED_COLUMNS)

# The video id is the integer before the first ':' of the identifier.
df['identifier_video'] = df['identifier'].apply(lambda x: int(x.split(':')[0]))

print(df.shape)

# Probe each video folder once (many rows can share a video id) rather than
# once per row; a missing folder is therefore printed once per folder.
video_ids = df['identifier_video']
usable_video_ids = [
    video_id
    for video_id in video_ids.unique()
    if _folder_is_usable(os.path.join(base_path, str(video_id)))
]

# A boolean mask selects rows by position, so the result is correct even if
# the index restored from the parquet file contains duplicate labels (a
# label-based drop would remove every row sharing a label).
df = df[video_ids.isin(usable_video_ids)].reset_index(drop=True)
print(df.shape)

df.to_parquet(output_parquet_path, index=False)

# Figures recorded by the original author: 132337, 132102, and "132067 rows
# were left after deleting the part that could not be read".
# NOTE(review): the first two figures are unlabelled -- presumably row counts
# from earlier runs; confirm.