Spaces:

seawolf2357
/

eawolf2357-git

Configuration error

App Files Files Community

eawolf2357-git / utils /process_sakuga_dataset.py

seawolf2357

Upload folder using huggingface_hub

321d89c verified about 1 month ago

raw

history blame contribute delete

1.77 kB

	import torch
	import pandas as pd
	import os
	from tqdm import tqdm
	df = pd.read_parquet("../../Datasets/SakugaDataset/parquet/train_aesthetic/sakugadataset_train_aesthetic.parquet")
	df = df[['identifier', 'scene_start_time', 'scene_end_time', 'fps',"text_description","aesthetic_score","dynamic_score"]]
	# drop rows with nan
	df = df.dropna(subset=['scene_start_time', 'scene_end_time', 'fps',"text_description","aesthetic_score","dynamic_score"])
	df['identifier_video'] = df['identifier'].apply(lambda x: int(x.split(':')[0]))

	base_path = '/home/cn/Datasets/SakugaDataset/split/train_aesthetic'
	rows_to_delete = []

	print(df.shape)

	# 遍历数据框的每一行
	for index, row in df.iterrows():
	folder_path = os.path.join(base_path, str(row['identifier_video']))
	#print(str(row['identifier_video']))
	# 检查文件夹是否存在
	#print(folder_path)
	if not os.path.exists(folder_path):
	print(folder_path)
	rows_to_delete.append(index)

	if os.path.exists(folder_path) and os.path.isdir(folder_path):
	# 检查文件夹中的文件数量
	if len(os.listdir(folder_path)) == 0:
	rows_to_delete.append(index)
	#print(index)

	# 删除满足条件的行
	df.drop(rows_to_delete, inplace=True)
	# 重置索引
	df.reset_index(drop=True, inplace=True)
	print(df.shape)

	output_parquet_path = '/home/cn/Datasets/SakugaDataset/parquet/fliter_aesthetic.parquet'
	df.to_parquet(output_parquet_path, index=False)



	#132337
	# 132102
	# 删完无法读取的部分之后剩下了132067个

	'''
	print(df)

	i=10
	df.iloc[i]['identifier_video']


	print(df.columns)
	# 查看前几行数据
	print(df.head())
	print(df.iloc[1])
	for index,row in df.iterrows():
	print(f"Row {index}: {row.to_dict()}")
	break
	'''