# (Hosting-page residue, not part of the script: "Spaces: Configuration error / Configuration error")
# Filter the rows of the "train_aesthetic" parquet table down to those whose
# per-video folder exists under `base_path` and is not an empty directory,
# then write the surviving rows to a new parquet file.
import torch
import pandas as pd
import os
from tqdm import tqdm


def _folder_is_unusable(folder_path):
    """Return True if *folder_path* is missing or is an empty directory.

    A missing path is printed so it can be inspected afterwards. A path that
    exists but is not a directory is reported as usable (False), which keeps
    the rows that reference it.
    """
    if not os.path.exists(folder_path):
        print(folder_path)
        return True
    return os.path.isdir(folder_path) and not os.listdir(folder_path)


df = pd.read_parquet("../../Datasets/SakugaDataset/parquet/train_aesthetic/sakugadataset_train_aesthetic.parquet")

metadata_columns = ['scene_start_time', 'scene_end_time', 'fps', "text_description", "aesthetic_score", "dynamic_score"]
# Keep only the needed columns and drop rows with NaN in any metadata column.
# The explicit copy makes the result an independent frame, so adding a column
# below cannot trigger pandas' chained-assignment warning.
df = df[['identifier'] + metadata_columns].dropna(subset=metadata_columns).copy()

# The part of the identifier before the first ':' is used as the folder name.
df['identifier_video'] = df['identifier'].apply(lambda identifier: int(identifier.partition(':')[0]))

base_path = '/home/cn/Datasets/SakugaDataset/split/train_aesthetic'
print(df.shape)

# Many rows can point at the same folder, so every distinct folder is checked
# (and, if missing, printed) exactly once instead of once per row.
unusable_video_ids = [
    video_id
    for video_id in df['identifier_video'].unique()
    if _folder_is_unusable(os.path.join(base_path, str(video_id)))
]

# Index labels of all rows whose folder is missing or empty.
rows_to_delete = df.index[df['identifier_video'].isin(unusable_video_ids)].tolist()

# Drop the rows that matched.
df.drop(rows_to_delete, inplace=True)
# Reset the index so it is contiguous again.
df.reset_index(drop=True, inplace=True)
print(df.shape)

output_parquet_path = '/home/cn/Datasets/SakugaDataset/parquet/fliter_aesthetic.parquet'
df.to_parquet(output_parquet_path, index=False)
# 132337
# 132102
# After removing the entries that could not be read, 132067 remained.
'''
print(df)
i=10
df.iloc[i]['identifier_video']
print(df.columns)
# Show the first few rows
print(df.head())
print(df.iloc[1])
for index,row in df.iterrows():
    print(f"Row {index}: {row.to_dict()}")
    break
'''