# seawolf2357-git / utils / process_all_wrong_data_multi_thread.py
# seawolf2357: "Upload folder using huggingface_hub" (commit 321d89c, verified)
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import PosixPath

import decord
from tqdm import tqdm
# Check whether decord is installed
# Fail with an actionable message when decord is missing.
# NOTE(review): the import header at the top of this file already imports
# decord unconditionally, so a missing package fails there first; this guard
# only takes effect if that unconditional import is removed.
try:
    import decord
except ImportError as exc:
    # Chain the original error so the underlying import failure stays visible.
    raise ImportError(
        "The `decord` package is required for loading the video dataset. Install with `pip install decord`"
    ) from exc

# Set the bridge decord uses to "torch".
decord.bridge.set_bridge("torch")
# Read lines from a file
def read_lines_from_file(file_path):
    """Print every line of a text file with surrounding whitespace stripped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        None. Any exception raised while opening, reading or printing is
        caught and reported on stdout as ``An error occurred: ...``; nothing
        is re-raised.
    """
    try:
        with open(file_path, 'r') as file:
            # Stream the file line by line instead of loading it all with
            # readlines() first.
            for line in file:
                # strip() removes the trailing newline (and any other
                # leading/trailing whitespace).
                print(line.strip())
    except Exception as e:
        # Best-effort helper: report the problem rather than aborting.
        print(f"An error occurred: {e}")
# Read the lines of a file into a list
def read_lines_to_list(file_path):
    """Return the lines of a text file as a list of stripped strings.

    Args:
        file_path: Path of the text file to read.

    Returns:
        One entry per line, in file order, with leading/trailing whitespace
        (including the newline) removed. Blank lines are kept as empty
        strings. If opening or reading the file raises, the error is printed
        as ``An error occurred: ...`` and an empty list is returned.
    """
    try:
        with open(file_path, 'r') as file:
            # Iterate the file object directly so lines are read lazily.
            return [line.strip() for line in file]
    except Exception as e:
        # Best-effort helper: report the problem and hand back no lines.
        print(f"An error occurred: {e}")
        return []
# Process a single video file
# Serializes appends to the shared error log (and the matching console
# message) when process_video runs on several worker threads at once.
_ERROR_LOG_LOCK = threading.Lock()


def process_video(file, error_file_path):
    """Check that decord can open a video; record the path if it cannot.

    Args:
        file: Path of the video file to probe.
        error_file_path: Text file that receives one line per unreadable
            video. It is opened in append mode on every failure.

    Returns:
        None. A failure to open the video is not raised: the path is appended
        to ``error_file_path`` and a message is printed.

    Raises:
        Whatever ``open``/``write`` raise if ``error_file_path`` itself cannot
        be written; that happens inside the handler and is not caught here.
    """
    filename = PosixPath(file)
    try:
        # Only successful construction matters; the reader is not kept.
        decord.VideoReader(uri=filename.as_posix())
    except Exception as e:
        # The exception types decord raises are not visible from this file,
        # so any failure is treated as "unreadable".
        with _ERROR_LOG_LOCK:
            with open(error_file_path, 'a') as f:
                f.write(f"{file}\n")
            print(f"Could not read video: {file}. Error: {e}")
# Example usage
# NOTE(review): these statements run whenever the module is executed or
# imported, and the paths are hard-coded.
file_path = '/home/cn/Datasets/SakugaDataset/output_81.txt'
# Drop blank entries so empty lines in the list file are not probed and then
# logged as unreadable videos.
file_list = [entry for entry in read_lines_to_list(file_path) if entry]
error_file_path = 'unreadable_videos_81.txt'

# Probe the videos on a thread pool; adjust max_workers as needed.
with ThreadPoolExecutor(max_workers=16) as executor:
    # Map each future back to its input path so failures can name the file.
    futures = {
        executor.submit(process_video, file, error_file_path): file
        for file in file_list
    }
    for future in tqdm(as_completed(futures), total=len(file_list)):
        try:
            # result() re-raises anything process_video did not handle itself.
            future.result()
        except Exception as e:
            print(f"An error occurred while processing {futures[future]}: {e}")