Spaces:

seawolf2357
/

eawolf2357-git

Configuration error

App Files Files Community

eawolf2357-git / utils /process_sakuga_dataset_multi_thread.py

seawolf2357

Upload folder using huggingface_hub

321d89c verified about 1 month ago

raw

history blame contribute delete

2.42 kB

	import torch
	import pandas as pd
	import os
	from tqdm import tqdm
	from concurrent.futures import ThreadPoolExecutor
	from datetime import datetime


	def from_time_2_second(time_str):
	# 使用 strptime 解析时间字符串
	time_obj = datetime.strptime(time_str, '%H:%M:%S.%f')
	# 计算总秒数
	total_seconds = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1e6
	#print(total_seconds)
	return total_seconds


	# 读取数据
	df = pd.read_parquet("//home/cn/Datasets/SakugaDataset/parquet/fliter_49_aesthetic.parquet")
	df = df[['identifier', 'scene_start_time', 'scene_end_time', 'fps', "text_description", "aesthetic_score", "dynamic_score"]]
	df = df.dropna(subset=['scene_start_time', 'scene_end_time', 'fps', "text_description", "aesthetic_score", "dynamic_score"])
	df['identifier_video'] = df['identifier'].apply(lambda x: int(x.split(':')[0]))

	base_path = '/home/cn/Datasets/SakugaDataset/split/train_aesthetic'
	rows_to_delete = []

	print(df.shape)

	# 定义检查函数
	def check_row(index, row):
	folder_path = os.path.join(base_path, str(row['identifier_video']))

	start_time=from_time_2_second(row['scene_start_time'])
	end_time=from_time_2_second(row['scene_end_time'])
	fps=row['fps']
	total_frame_num=(end_time-start_time)*fps
	if total_frame_num<81:
	return index

	if not os.path.exists(folder_path):
	return index
	if os.path.exists(folder_path) and os.path.isdir(folder_path):
	if len(os.listdir(folder_path)) == 0:
	return index
	return None

	# 设置进度条
	progress_dataset_bar = tqdm(total=df.shape[0], desc="Loading videos")

	# 使用多线程执行检查
	with ThreadPoolExecutor(max_workers=16) as executor:
	futures = []
	for index, row in df.iterrows():
	futures.append(executor.submit(check_row, index, row))

	# 收集结果
	for future in tqdm(futures, desc="Processing results"):
	result = future.result()
	if result is not None:
	rows_to_delete.append(result)
	progress_dataset_bar.update(1)

	progress_dataset_bar.close()

	# 删除满足条件的行
	df.drop(rows_to_delete, inplace=True)
	df.reset_index(drop=True, inplace=True)
	print(df.shape)

	# 保存过滤后的数据
	output_parquet_path ="/home/cn/Datasets/SakugaDataset/parquet/fliter_train_81_2.parquet"
	df.to_parquet(output_parquet_path, index=False)



	#1054702 *8