import os
import shutil
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pandas as pd
import torch
from tqdm import tqdm


def from_time_2_second(time_str):
    # Parse the time string with strptime
    time_obj = datetime.strptime(time_str, '%H:%M:%S.%f')
    # Compute the total number of seconds
    total_seconds = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1e6
    # print(total_seconds)
    return total_seconds


# Load the data
df = pd.read_parquet("/home/cn/Datasets/SakugaDataset/parquet/test/sakugadataset_test.parquet")
df = df[['identifier', 'scene_start_time', 'scene_end_time', 'fps', 'text_description', 'aesthetic_score', 'dynamic_score']]
df = df.dropna(subset=['scene_start_time', 'scene_end_time', 'fps', 'text_description', 'aesthetic_score', 'dynamic_score'])
df['identifier_video'] = df['identifier'].apply(lambda x: int(x.split(':')[0]))

base_path = '/private/task/cn/Datasets/SakugaDataset-main/download/split/test'
rows_to_delete = []
print(df.shape)


# Check a single row: return its index if the clip is shorter than 300 frames, else None
def check_row(index, row):
    folder_path = os.path.join(base_path, str(row['identifier_video']))
    start_time = from_time_2_second(row['scene_start_time'])
    end_time = from_time_2_second(row['scene_end_time'])
    fps = row['fps']
    total_frame_num = (end_time - start_time) * fps
    if total_frame_num < 300:
        return index
    # if not os.path.exists(folder_path):
    #     return index
    # if os.path.exists(folder_path) and os.path.isdir(folder_path):
    #     if len(os.listdir(folder_path)) == 0:
    #         return index
    return None


# Set up the progress bar
progress_dataset_bar = tqdm(total=df.shape[0], desc="Loading videos")

# Run the checks with a thread pool
with ThreadPoolExecutor(max_workers=24) as executor:
    futures = []
    for index, row in df.iterrows():
        futures.append(executor.submit(check_row, index, row))

    # Collect the results
    for future in tqdm(futures, desc="Processing results"):
        result = future.result()
        if result is not None:
            rows_to_delete.append(result)
        progress_dataset_bar.update(1)

progress_dataset_bar.close()

# Drop the rows that matched the condition
df.drop(rows_to_delete, inplace=True)
df.reset_index(drop=True, inplace=True)
print(df.shape)

aesthetic_median = df['aesthetic_score'].median()
dynamic_median = df['dynamic_score'].median()
print(aesthetic_median)
print(dynamic_median)

# Override the medians with fixed thresholds
# aesthetic_median = df['aesthetic_score'].nlargest(len(df) // 3)
aesthetic_median = 0.8
# dynamic_median = df['dynamic_score'].nlargest(len(df) // 3)
dynamic_median = 0.4

print(f"Aesthetic Score Median: {aesthetic_median}")
print(f"Dynamic Score Median: {dynamic_median}")

# Keep only clips that pass both score thresholds
filtered_df = df[(df['aesthetic_score'] >= aesthetic_median) & (df['dynamic_score'] >= dynamic_median)]
print(filtered_df.shape)

# Count the total number of frames in the filtered clips
all_frame = 0
for index, row in filtered_df.iterrows():
    start_time = from_time_2_second(row['scene_start_time'])
    end_time = from_time_2_second(row['scene_end_time'])
    fps = row['fps']
    total_frame_num = (end_time - start_time) * fps
    print(total_frame_num)
    all_frame += total_frame_num
print(all_frame)

# Save the filtered data
# output_parquet_path = "/private/task/cn/Datasets/SakugaDataset-main/download/parquet/fliter_196_test.parquet"
# df.to_parquet(output_parquet_path, index=False)

'''
# Base paths
base_path = '/private/task/cn/Datasets/SakugaDataset-main/download/split/test'
destination_path = '/private/task/cn/Datasets/SakugaDataset-main/download/test_dataset'

# Make sure the destination directory exists
os.makedirs(destination_path, exist_ok=True)

# Loop over the filtered DataFrame
for index, row in filtered_df.iterrows():
    folder_path = os.path.join(base_path, str(row['identifier_video']))
    # Check whether the source folder exists
    if os.path.exists(folder_path):
        # Build the target path
        target_path = os.path.join(destination_path, str(row['identifier_video']))
        if not os.path.exists(target_path):
            # Copy the folder
            shutil.copytree(folder_path, target_path)
            print(f"Copied {folder_path} to {target_path}")
    else:
        print(f"Folder {folder_path} does not exist")

output_parquet_path = "/private/task/cn/Datasets/SakugaDataset-main/download/parquet/fliter_196_test_high_and.parquet"
filtered_df.to_parquet(output_parquet_path, index=False)
# 1054702 *8
'''