# eawolf2357-git / utils / process_sakuga_dataset_multi_thread_save.py
# Uploaded via huggingface_hub — commit 321d89c (verified), 4.38 kB.
import torch
import pandas as pd
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import shutil
def from_time_2_second(time_str):
    """Convert an 'HH:MM:SS.ffffff' timestamp string into total seconds.

    Parameters
    ----------
    time_str : str
        Timestamp in ``%H:%M:%S.%f`` form (fractional seconds required).

    Returns
    -------
    float
        Seconds elapsed since 00:00:00, including the fractional part.
    """
    parsed = datetime.strptime(time_str, '%H:%M:%S.%f')
    # Subtracting midnight of the same (dummy) date yields the elapsed
    # time as a timedelta, which already knows how to report seconds.
    midnight = parsed.replace(hour=0, minute=0, second=0, microsecond=0)
    return (parsed - midnight).total_seconds()
# Load the test split of the Sakuga dataset and keep only the columns
# the rest of the script needs.
df = pd.read_parquet("/home/cn/Datasets/SakugaDataset/parquet/test/sakugadataset_test.parquet")
needed_cols = ['identifier', 'scene_start_time', 'scene_end_time', 'fps',
               'text_description', 'aesthetic_score', 'dynamic_score']
df = df[needed_cols]
# Any row missing timing, fps, caption or either score is unusable.
df = df.dropna(subset=needed_cols[1:])
# 'identifier' looks like '<video_id>:<scene>'; keep the numeric video id.
df['identifier_video'] = df['identifier'].map(lambda ident: int(ident.partition(':')[0]))

base_path = '/private/task/cn/Datasets/SakugaDataset-main/download/split/test'
rows_to_delete = []
print(df.shape)
# Row-level check used by the thread pool below.
def check_row(index, row):
    """Return *index* when the scene is too short to keep, else None.

    The scene's frame count is reconstructed as
    ``(end_time - start_time) * fps``; scenes with fewer than 300 frames
    are flagged for deletion.

    Parameters
    ----------
    index : hashable
        The DataFrame index label of this row.
    row : pandas.Series (or mapping)
        Must provide 'scene_start_time', 'scene_end_time' (both
        '%H:%M:%S.%f' strings) and 'fps'.
    """
    start_time = from_time_2_second(row['scene_start_time'])
    end_time = from_time_2_second(row['scene_end_time'])
    total_frame_num = (end_time - start_time) * row['fps']
    # Fewer than 300 frames -> mark the row for removal.
    if total_frame_num < 300:
        return index
    return None
# Fan the per-row length check out over a thread pool, then collect the
# indices of scenes that should be dropped. A single tqdm bar over the
# futures is enough to track progress (the original kept a second,
# redundant bar updated in lockstep with this one).
with ThreadPoolExecutor(max_workers=24) as executor:
    futures = [executor.submit(check_row, index, row)
               for index, row in df.iterrows()]
    for future in tqdm(futures, desc="Processing results"):
        result = future.result()
        if result is not None:
            rows_to_delete.append(result)
# Drop the short scenes found above and renumber the remaining rows.
df.drop(rows_to_delete, inplace=True)
df.reset_index(drop=True, inplace=True)
print(df.shape)

# Report the actual score medians for reference...
aesthetic_median = df['aesthetic_score'].median()
dynamic_median = df['dynamic_score'].median()
print(aesthetic_median)
print(dynamic_median)

# ...then deliberately override them with fixed thresholds for the
# final filter (a top-third `nlargest` selection was tried earlier).
aesthetic_median = 0.8
dynamic_median = 0.4
print(f"Aesthetic Score Median: {aesthetic_median}")
print(f"Dynamic Score Median: {dynamic_median}")

# Keep only scenes clearing BOTH thresholds.
keep_mask = (df['aesthetic_score'] >= aesthetic_median) & (df['dynamic_score'] >= dynamic_median)
filtered_df = df[keep_mask]
print(filtered_df.shape)
# Sum the reconstructed frame counts of every surviving scene,
# printing each scene's count along the way.
all_frame = 0
for _, scene in filtered_df.iterrows():
    duration = from_time_2_second(scene['scene_end_time']) - from_time_2_second(scene['scene_start_time'])
    total_frame_num = duration * scene['fps']
    print(total_frame_num)
    all_frame += total_frame_num
print(all_frame)
# Save the filtered data (disabled).
# output_parquet_path ="/private/task/cn/Datasets/SakugaDataset-main/download/parquet/fliter_196_test.parquet"
# df.to_parquet(output_parquet_path, index=False)
# NOTE(review): the triple-quoted string below is disabled code kept as a
# manual toggle. It executes as a no-op string expression. When active it
# copies each surviving video folder into a dataset directory and writes
# the filtered parquet. Its text is left untouched here.
'''
#基础路径
base_path ='/private/task/cn/Datasets/SakugaDataset-main/download/split/test'
destination_path = '/private/task/cn/Datasets/SakugaDataset-main/download/test_dataset'
# 确保目标目录存在
os.makedirs(destination_path, exist_ok=True)
# 循环遍历过滤后的DataFrame
for index, row in filtered_df.iterrows():
folder_path = os.path.join(base_path, str(row['identifier_video']))
# 检查文件夹是否存在
if os.path.exists(folder_path):
# 构建目标路径
target_path = os.path.join(destination_path, str(row['identifier_video']))
if not os.path.exists(target_path):
# 复制文件夹
shutil.copytree(folder_path, target_path)
print(f"Copied {folder_path} to {target_path}")
else:
print(f"Folder {folder_path} does not exist")
output_parquet_path ="/private/task/cn/Datasets/SakugaDataset-main/download/parquet/fliter_196_test_high_and.parquet"
filtered_df.to_parquet(output_parquet_path, index=False)
#1054702 *8
'''