import torch
import pandas as pd
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import shutil
def from_time_2_second(time_str):
    """Convert an 'HH:MM:SS.ffffff' timestamp string to total seconds."""
    # Parse the timestamp string with strptime
    time_obj = datetime.strptime(time_str, '%H:%M:%S.%f')
    # Accumulate hours, minutes, seconds, and microseconds into seconds
    total_seconds = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1e6
    return total_seconds
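# Illustrative usage (values are examples, not from the dataset):
# from_time_2_second('00:01:23.500') -> 83.5  (1 * 60 + 23 + 500000 / 1e6)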
# Load the scene-level metadata and keep only the columns needed for filtering
df = pd.read_parquet("/home/cn/Datasets/SakugaDataset/parquet/test/sakugadataset_test.parquet")
df = df[['identifier', 'scene_start_time', 'scene_end_time', 'fps', "text_description", "aesthetic_score", "dynamic_score"]]
df = df.dropna(subset=['scene_start_time', 'scene_end_time', 'fps', "text_description", "aesthetic_score", "dynamic_score"])
# The part of the identifier before ':' is the source video id
df['identifier_video'] = df['identifier'].apply(lambda x: int(x.split(':')[0]))
base_path = '/private/task/cn/Datasets/SakugaDataset-main/download/split/test'
rows_to_delete = []
print(df.shape)
# Per-row check: return the row index if the scene has fewer than 300 frames
# (so it can be dropped later); otherwise return None.
def check_row(index, row):
    folder_path = os.path.join(base_path, str(row['identifier_video']))
    start_time = from_time_2_second(row['scene_start_time'])
    end_time = from_time_2_second(row['scene_end_time'])
    fps = row['fps']
    total_frame_num = (end_time - start_time) * fps
    if total_frame_num < 300:
        return index
    # if not os.path.exists(folder_path):
    #     return index
    # if os.path.exists(folder_path) and os.path.isdir(folder_path):
    #     if len(os.listdir(folder_path)) == 0:
    #         return index
    return None
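# Worked example (illustrative numbers): a scene from 00:00:10.000 to 00:00:20.000
# at 24 fps spans (20 - 10) * 24 = 240 frames, which is below the 300-frame cutoff
# and would be flagged for deletion.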
# Progress bar over the whole DataFrame
progress_dataset_bar = tqdm(total=df.shape[0], desc="Loading videos")
# Run the per-row check in a thread pool
with ThreadPoolExecutor(max_workers=24) as executor:
    futures = []
    for index, row in df.iterrows():
        futures.append(executor.submit(check_row, index, row))
    # Collect results; the futures list is iterated in submission order,
    # so each result corresponds to the DataFrame index passed to check_row
    for future in tqdm(futures, desc="Processing results"):
        result = future.result()
        if result is not None:
            rows_to_delete.append(result)
        progress_dataset_bar.update(1)
progress_dataset_bar.close()
# Drop the rows flagged as too short and reindex
df.drop(rows_to_delete, inplace=True)
df.reset_index(drop=True, inplace=True)
print(df.shape)
aesthetic_median = df['aesthetic_score'].median()
dynamic_median = df['dynamic_score'].median()
print(aesthetic_median)
print(dynamic_median)
# Override the computed medians with fixed thresholds
# aesthetic_median = df['aesthetic_score'].nlargest(len(df) // 3)
aesthetic_median = 0.8
# dynamic_median = df['dynamic_score'].nlargest(len(df) // 3)
dynamic_median = 0.4
print(f"Aesthetic Score Median: {aesthetic_median}")
print(f"Dynamic Score Median: {dynamic_median}")
# Keep only scenes that meet both score thresholds
filtered_df = df[(df['aesthetic_score'] >= aesthetic_median) & (df['dynamic_score'] >= dynamic_median)]
print(filtered_df.shape)
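# Example (illustrative scores): a scene with aesthetic_score 0.85 and dynamic_score 0.35
# passes the aesthetic threshold (0.8) but fails the dynamic one (0.4), so it is excluded.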
# Count the total number of frames across all filtered scenes
all_frame = 0
for index, row in filtered_df.iterrows():
    start_time = from_time_2_second(row['scene_start_time'])
    end_time = from_time_2_second(row['scene_end_time'])
    fps = row['fps']
    total_frame_num = (end_time - start_time) * fps
    print(total_frame_num)
    all_frame += total_frame_num
print(all_frame)
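# Rough sanity check (illustrative, assuming a typical 24 fps):
# 1,000,000 frames / 24 fps / 3600 s ≈ 11.6 hours of footage.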
# Save the filtered data
# output_parquet_path = "/private/task/cn/Datasets/SakugaDataset-main/download/parquet/fliter_196_test.parquet"
# df.to_parquet(output_parquet_path, index=False)
'''
# Base source and destination paths
base_path = '/private/task/cn/Datasets/SakugaDataset-main/download/split/test'
destination_path = '/private/task/cn/Datasets/SakugaDataset-main/download/test_dataset'
# Make sure the destination directory exists
os.makedirs(destination_path, exist_ok=True)
# Iterate over the filtered DataFrame
for index, row in filtered_df.iterrows():
    folder_path = os.path.join(base_path, str(row['identifier_video']))
    # Check whether the source folder exists
    if os.path.exists(folder_path):
        # Build the target path
        target_path = os.path.join(destination_path, str(row['identifier_video']))
        if not os.path.exists(target_path):
            # Copy the folder
            shutil.copytree(folder_path, target_path)
            print(f"Copied {folder_path} to {target_path}")
    else:
        print(f"Folder {folder_path} does not exist")
output_parquet_path = "/private/task/cn/Datasets/SakugaDataset-main/download/parquet/fliter_196_test_high_and.parquet"
filtered_df.to_parquet(output_parquet_path, index=False)
# 1054702 *8
'''