Spaces:

YUNSUN7
/

Fbfbfhtthth

Runtime error

App Files Files Community

Fbfbfhtthth / app.py

YUNSUN7

Create app.py

9971dc1 verified over 1 year ago

raw

history blame contribute delete

17.7 kB

	# Ke Chen
	# [email protected]
	# Zero-shot Audio Source Separation via Query-based Learning from Weakly-labeled Data
	# The Main Script

	import os
	# Limit the OpenMP / OpenBLAS / MKL / vecLib / numexpr thread counts so the
	# sdr calculation does not occupy all cpus.
	os.environ.update({
	    "OMP_NUM_THREADS": "4",
	    "OPENBLAS_NUM_THREADS": "4",
	    "MKL_NUM_THREADS": "6",
	    "VECLIB_MAXIMUM_THREADS": "4",
	    "NUMEXPR_NUM_THREADS": "6",
	})

	import sys
	import librosa
	import numpy as np
	import argparse
	import logging

	import torch
	from torch.utils.data import DataLoader
	from torch.utils.data.distributed import DistributedSampler

	from utils import collect_fn, dump_config, create_folder, prepprocess_audio
	import musdb

	from models.asp_model import ZeroShotASP, SeparatorModel, AutoTaggingWarpper, WhitingWarpper
	from data_processor import LGSPDataset, MusdbDataset
	import config
	import htsat_config
	from models.htsat import HTSAT_Swin_Transformer
	from sed_model import SEDWrapper

	import pytorch_lightning as pl
	from pytorch_lightning.callbacks import ModelCheckpoint

	from htsat_utils import process_idc

	import warnings
	warnings.filterwarnings("ignore")



	class data_prep(pl.LightningDataModule):
	    """Lightning data module serving the training and evaluation datasets.

	    Args:
	        train_dataset: dataset returned by ``train_dataloader``.
	        eval_dataset: dataset returned by both ``val_dataloader`` and
	            ``test_dataloader``.
	        device_num: number of devices the global batch size is divided by.
	            A ``DistributedSampler`` is attached only when it is greater
	            than 1.
	        config: configuration object providing ``num_workers`` and
	            ``batch_size``.
	    """

	    def __init__(self, train_dataset, eval_dataset, device_num, config):
	        super().__init__()
	        self.train_dataset = train_dataset
	        self.eval_dataset = eval_dataset
	        self.device_num = device_num
	        self.config = config

	    def _build_loader(self, dataset):
	        """Return a non-shuffling ``DataLoader`` over ``dataset``."""
	        sampler = DistributedSampler(dataset, shuffle = False) if self.device_num > 1 else None
	        # device_num can be 0 (e.g. torch.cuda.device_count() on a host
	        # without GPUs); count it as one device so the per-device batch
	        # size does not raise ZeroDivisionError.
	        per_device_batch = self.config.batch_size // max(self.device_num, 1)
	        # Read the settings from the config handed to the constructor
	        # rather than from the module-level ``config``.
	        return DataLoader(
	            dataset = dataset,
	            num_workers = self.config.num_workers,
	            batch_size = per_device_batch,
	            shuffle = False,
	            sampler = sampler,
	            collate_fn = collect_fn
	        )

	    def train_dataloader(self):
	        """Return the loader over the training dataset."""
	        return self._build_loader(self.train_dataset)

	    def val_dataloader(self):
	        """Return the loader over the evaluation dataset."""
	        return self._build_loader(self.eval_dataset)

	    def test_dataloader(self):
	        """Return the loader over the evaluation dataset (same data as validation)."""
	        return self._build_loader(self.eval_dataset)

	def save_idc():
	    """Run ``process_idc`` on the training index file and on the eval index file.

	    Both index files live under ``<config.dataset_path>/hdf5s/indexes``. The
	    third argument given to ``process_idc`` is ``<config.index_type>_idc.npy``
	    and ``eval_idc.npy`` respectively (presumably the output file name;
	    confirm in ``htsat_utils``).
	    """
	    index_dir = os.path.join(config.dataset_path, "hdf5s", "indexes")
	    jobs = (
	        (config.index_type + ".h5", config.index_type + "_idc.npy"),
	        ("eval.h5", "eval_idc.npy"),
	    )
	    for index_name, idc_name in jobs:
	        process_idc(os.path.join(index_dir, index_name), config.classes_num, idc_name)

	# Convert the musdb tracks from their original 44100 Hz sample rate to 32000 Hz
	def process_musdb():
	    """Preprocess the musdb test subset and store it in ``musdb-32000fs.npy``.

	    Every track of the ``"test"`` subset under ``config.musdb_path`` becomes a
	    dict holding the processed ``"mixture"`` plus one processed entry per
	    name in ``config.test_key``. The list of dicts is written with
	    ``np.save``. All audio goes through ``prepprocess_audio`` with the
	    rate of the first track, ``config.sample_rate`` and ``config.test_type``.
	    """
	    # use musdb as testset
	    musdb_test = musdb.DB(
	        root = config.musdb_path,
	        download = False,
	        subsets = "test",
	        is_wav = True
	    )
	    tracks = musdb_test.tracks
	    print(len(tracks))
	    # in musdb, all fs is the same (44100)
	    source_rate = tracks[0].rate
	    print(source_rate)

	    def convert(audio):
	        # shared call for the mixture and for every target stem
	        return prepprocess_audio(
	            audio,
	            source_rate, config.sample_rate,
	            config.test_type
	        )

	    converted_tracks = []
	    for track in tracks:
	        entry = {"mixture": convert(track.audio)}
	        for stem in config.test_key:
	            entry[stem] = convert(track.targets[stem].audio)
	        print(track.audio.shape, len(entry.keys()), entry["mixture"].shape)
	        converted_tracks.append(entry)
	    print(len(converted_tracks))
	    # save the file to npy
	    np.save("musdb-32000fs.npy", converted_tracks)

	# Weight averaging is performed over the given folder.
	# It outputs one model checkpoint, which averages the weights of all models in the folder.
	def weight_average():
	    """Average the weights of every checkpoint found in ``config.wa_model_folder``.

	    Each file in the folder is loaded onto the CPU with ``torch.load`` and its
	    ``"state_dict"`` entry is collected. For every key of the first state
	    dict, the tensors of all checkpoints are cast to float and averaged
	    element-wise. The result is saved to ``config.wa_model_path`` as
	    ``{"state_dict": {...}}``.

	    Raises:
	        ValueError: if the folder contains no files.
	        AssertionError: if an averaged tensor does not have the shape of the
	            corresponding tensor in the first checkpoint.
	    """
	    model_files = os.listdir(config.wa_model_folder)
	    if not model_files:
	        raise ValueError("no checkpoint found in " + str(config.wa_model_folder))

	    # The names come from wa_model_folder, so the paths must be built from
	    # the same folder (the previous code joined them with a different one).
	    # NOTE(review): torch.load unpickles the files; only point this at
	    # trusted checkpoints.
	    model_ckpt = [
	        torch.load(os.path.join(config.wa_model_folder, model_file), map_location="cpu")["state_dict"]
	        for model_file in model_files
	    ]

	    wa_ckpt = {
	        "state_dict": {}
	    }
	    for key, reference in model_ckpt[0].items():
	        averaged = torch.stack([d[key].float() for d in model_ckpt]).mean(dim = 0)
	        # f-string: concatenating a str with torch.Size would itself raise
	        # TypeError and hide the real mismatch.
	        assert averaged.shape == reference.shape, f"the shape is unmatched {averaged.shape} {reference.shape}"
	        wa_ckpt["state_dict"][key] = averaged
	    torch.save(wa_ckpt, config.wa_model_path)


	# use the model to quickly separate a track given a query
	# it requires four variables in config.py:
	# inference_file: the track you want to separate
	# inference_query: a folder containing all samples from the same source
	# test_key: ["name"] indicate the source name (just a name for final output, no other functions)
	# wave_output_path: the output folder

	# make sure the query folder contains only samples from the same source
	# each run, the model separates one source from the track
	# to separate multiple sources, change the query folder or write a script that does it for you
	def inference():
	    """Separate one source from a single track, using a folder of query samples.

	    Driven entirely by ``config``:
	        inference_file: the track to separate.
	        inference_query: folder whose ``.wav`` files are used as queries.
	        test_key: list of source names, passed as ``target_keys``.
	        wave_output_path: output folder, passed to ``create_folder``.
	        resume_checkpoint: checkpoint of the separation model (required).
	        infer_type: when ``"mean"``, the queries are first run through
	            ``AutoTaggingWarpper`` and its ``avg_at`` is handed to the
	            separator; otherwise ``avg_at`` stays ``None``.

	    The audio tagging model is restored from ``htsat_config.resume_checkpoint``.
	    The trainer uses one GPU when CUDA is available and the CPU otherwise.

	    Raises:
	        AssertionError: if ``config.test_key`` or ``config.resume_checkpoint``
	            is ``None``.
	    """
	    # set exp settings
	    # Request a GPU only when one is present; gpus = 0 makes the trainer
	    # run on the CPU instead of failing on a host without CUDA.
	    gpu_count = 1 if torch.cuda.is_available() else 0
	    assert config.test_key is not None, "there should be a separate key"
	    create_folder(config.wave_output_path)

	    def load_stacked(path, verbose = False):
	        """Load ``path``, preprocess it and stack 1 + len(test_key) copies."""
	        audio, fs = librosa.load(path, sr = None)
	        audio = audio[:, None]
	        if verbose:
	            print(audio.shape)
	            print(fs)
	        # convert the audio into the 32000 Hz sample rate
	        audio = prepprocess_audio(
	            audio,
	            fs, config.sample_rate,
	            config.test_type
	        )
	        # one slot for the mixture plus one per target key, all holding
	        # the same audio so that MusdbDataset can be reused
	        return np.array([audio] * (1 + len(config.test_key)))

	    test_tracks = [load_stacked(config.inference_file, verbose = True)]
	    dataset = MusdbDataset(tracks = test_tracks) # the action is similar to musdbdataset, reuse it
	    loader = DataLoader(
	        dataset = dataset,
	        num_workers = 1,
	        batch_size = 1,
	        shuffle = False
	    )
	    # obtain the samples for query
	    queries = [
	        load_stacked(os.path.join(config.inference_query, query_file))
	        for query_file in os.listdir(config.inference_query)
	        if query_file.endswith(".wav")
	    ]

	    assert config.resume_checkpoint is not None, "there should be a saved model when inferring"

	    sed_model = HTSAT_Swin_Transformer(
	        spec_size=htsat_config.htsat_spec_size,
	        patch_size=htsat_config.htsat_patch_size,
	        in_chans=1,
	        num_classes=htsat_config.classes_num,
	        window_size=htsat_config.htsat_window_size,
	        config = htsat_config,
	        depths = htsat_config.htsat_depth,
	        embed_dim = htsat_config.htsat_dim,
	        patch_stride=htsat_config.htsat_stride,
	        num_heads=htsat_config.htsat_num_head
	    )
	    at_model = SEDWrapper(
	        sed_model = sed_model,
	        config = htsat_config,
	        dataset = None
	    )
	    ckpt = torch.load(htsat_config.resume_checkpoint, map_location="cpu")
	    at_model.load_state_dict(ckpt["state_dict"])

	    trainer = pl.Trainer(
	        gpus = gpu_count
	    )
	    avg_at = None
	    # obtain the latent embedding as query
	    if config.infer_type == "mean":
	        avg_dataset = MusdbDataset(tracks = queries)
	        avg_loader = DataLoader(
	            dataset = avg_dataset,
	            num_workers = 1,
	            batch_size = 1,
	            shuffle = False
	        )
	        at_wrapper = AutoTaggingWarpper(
	            at_model = at_model,
	            config = config,
	            target_keys = config.test_key
	        )
	        trainer.test(at_wrapper, test_dataloaders = avg_loader)
	        avg_at = at_wrapper.avg_at

	    # import separation model
	    model = ZeroShotASP(
	        channels = 1, config = config,
	        at_model = at_model,
	        dataset = dataset
	    )
	    # resume checkpoint
	    ckpt = torch.load(config.resume_checkpoint, map_location="cpu")
	    model.load_state_dict(ckpt["state_dict"], strict= False)
	    exp_model = SeparatorModel(
	        model = model,
	        config = config,
	        target_keys = config.test_key,
	        avg_at = avg_at,
	        using_wiener = False,
	        calc_sdr = False,
	        output_wav = True
	    )
	    trainer.test(exp_model, test_dataloaders = loader)

	# test the separation model, mainly in musdb
	def test():
	    """Evaluate the separation model on the preprocessed test set.

	    Reads from ``config``:
	        testset_path: ``.npy`` file of tracks; each track is indexed by
	            ``"mixture"`` and by every name in ``test_key``.
	        testavg_path: ``.npy`` file in the same layout; its first 90 entries
	            are used as queries when ``infer_type == "mean"``.
	        test_key: list of source names, passed as ``target_keys``.
	        wave_output_path: output folder, passed to ``create_folder``.
	        resume_checkpoint: checkpoint of the separation model (required).
	        using_wiener: forwarded to ``SeparatorModel``.

	    The audio tagging model is restored from ``htsat_config.resume_checkpoint``.
	    The trainer uses one GPU when CUDA is available and the CPU otherwise.

	    Raises:
	        AssertionError: if ``config.test_key`` or ``config.resume_checkpoint``
	            is ``None``.
	    """
	    # set exp settings
	    # Request a GPU only when one is present; gpus = 0 makes the trainer
	    # run on the CPU instead of failing on a host without CUDA.
	    gpu_count = 1 if torch.cuda.is_available() else 0
	    assert config.test_key is not None, "there should be a separate key"
	    create_folder(config.wave_output_path)

	    def stack_tracks(records):
	        """Turn each record into an array of [mixture, *test_key sources]."""
	        stacked = []
	        for record in records:
	            track = np.array([record["mixture"]] + [record[dickey] for dickey in config.test_key])
	            print(track.shape)
	            stacked.append(track)
	        print(len(stacked))
	        return stacked

	    # use musdb as testset
	    # NOTE(review): allow_pickle=True unpickles the file; only load trusted data.
	    test_data = np.load(config.testset_path, allow_pickle = True)
	    print(len(test_data))
	    # in musdb, all fs is the same (44100)
	    # load the dataset
	    mus_tracks = stack_tracks(test_data)
	    dataset = MusdbDataset(tracks = mus_tracks)
	    loader = DataLoader(
	        dataset = dataset,
	        num_workers = 1,
	        batch_size = 1,
	        shuffle = False
	    )
	    assert config.resume_checkpoint is not None, "there should be a saved model when inferring"

	    sed_model = HTSAT_Swin_Transformer(
	        spec_size=htsat_config.htsat_spec_size,
	        patch_size=htsat_config.htsat_patch_size,
	        in_chans=1,
	        num_classes=htsat_config.classes_num,
	        window_size=htsat_config.htsat_window_size,
	        config = htsat_config,
	        depths = htsat_config.htsat_depth,
	        embed_dim = htsat_config.htsat_dim,
	        patch_stride=htsat_config.htsat_stride,
	        num_heads=htsat_config.htsat_num_head
	    )
	    at_model = SEDWrapper(
	        sed_model = sed_model,
	        config = htsat_config,
	        dataset = None
	    )
	    ckpt = torch.load(htsat_config.resume_checkpoint, map_location="cpu")
	    at_model.load_state_dict(ckpt["state_dict"])
	    trainer = pl.Trainer(
	        gpus = gpu_count
	    )
	    avg_at = None
	    # obtain the query of four stems from the training set
	    if config.infer_type == "mean":
	        avg_data = np.load(config.testavg_path, allow_pickle = True)[:90]
	        print(len(avg_data))
	        # in musdb, all fs is the same (44100)
	        # load the dataset
	        avgmus_tracks = stack_tracks(avg_data)
	        avg_dataset = MusdbDataset(tracks = avgmus_tracks)
	        avg_loader = DataLoader(
	            dataset = avg_dataset,
	            num_workers = 1,
	            batch_size = 1,
	            shuffle = False
	        )
	        at_wrapper = AutoTaggingWarpper(
	            at_model = at_model,
	            config = config,
	            target_keys = config.test_key
	        )
	        trainer.test(at_wrapper, test_dataloaders = avg_loader)
	        avg_at = at_wrapper.avg_at

	    model = ZeroShotASP(
	        channels = 1, config = config,
	        at_model = at_model,
	        dataset = dataset
	    )
	    ckpt = torch.load(config.resume_checkpoint, map_location="cpu")
	    model.load_state_dict(ckpt["state_dict"], strict= False)
	    exp_model = SeparatorModel(
	        model = model,
	        config = config,
	        target_keys = config.test_key,
	        avg_at = avg_at,
	        using_wiener = config.using_wiener
	    )
	    trainer.test(exp_model, test_dataloaders = loader)

	def train():
	    """Train the separation model on the indexed training set.

	    Reads from ``config``: ``dataset_path``, ``index_type``, ``idc_path``,
	    ``workspace``, ``exp_name``, ``debug``, ``batch_size``, ``max_epoch`` and
	    ``resume_checkpoint``. Unless ``config.debug`` is set, the result and
	    checkpoint folders are passed to ``create_folder`` and the config is
	    dumped with ``dump_config``. The audio tagging model is restored from
	    ``htsat_config.resume_checkpoint``. When ``config.resume_checkpoint`` is
	    not ``None`` the separation model weights are loaded from it before
	    fitting.

	    The trainer is given every visible CUDA device and uses ``"ddp"`` when
	    there is more than one.
	    """
	    # set exp settings
	    device_num = torch.cuda.device_count()
	    # device_count() is 0 on a host without GPUs; count that as a single
	    # device for batch splitting so the divisions do not raise
	    # ZeroDivisionError. The trainer still receives the real device_num.
	    batch_devices = max(device_num, 1)
	    print("each batch size:", config.batch_size // batch_devices)

	    index_dir = os.path.join(config.dataset_path, "hdf5s", "indexes")
	    train_index_path = os.path.join(index_dir, config.index_type + ".h5")
	    train_idc = np.load(os.path.join(config.idc_path, config.index_type + "_idc.npy"), allow_pickle = True)

	    eval_index_path = os.path.join(index_dir, "eval.h5")
	    eval_idc = np.load(os.path.join(config.idc_path, "eval_idc.npy"), allow_pickle = True)

	    # set exp folder
	    exp_dir = os.path.join(config.workspace, "results", config.exp_name)
	    checkpoint_dir = os.path.join(config.workspace, "results", config.exp_name, "checkpoint")

	    if not config.debug:
	        create_folder(os.path.join(config.workspace, "results"))
	        create_folder(exp_dir)
	        create_folder(checkpoint_dir)
	        dump_config(config, os.path.join(exp_dir, config.exp_name), False)

	    # load data
	    # import dataset LGSPDataset (latent general source separation) and sampler
	    dataset = LGSPDataset(
	        index_path = train_index_path,
	        idc = train_idc,
	        config = config,
	        factor = 0.05,
	        eval_mode = False
	    )
	    eval_dataset = LGSPDataset(
	        index_path = eval_index_path,
	        idc = eval_idc,
	        config = config,
	        factor = 0.05,
	        eval_mode = True
	    )

	    audioset_data = data_prep(train_dataset=dataset,eval_dataset=eval_dataset,device_num=batch_devices, config=config)
	    checkpoint_callback = ModelCheckpoint(
	        monitor = "mixture_sdr",
	        filename='l-{epoch:d}-{mixture_sdr:.3f}-{clean_sdr:.3f}-{silence_sdr:.3f}',
	        save_top_k = 10,
	        mode = "max"
	    )
	    # infer at model
	    sed_model = HTSAT_Swin_Transformer(
	        spec_size=htsat_config.htsat_spec_size,
	        patch_size=htsat_config.htsat_patch_size,
	        in_chans=1,
	        num_classes=htsat_config.classes_num,
	        window_size=htsat_config.htsat_window_size,
	        config = htsat_config,
	        depths = htsat_config.htsat_depth,
	        embed_dim = htsat_config.htsat_dim,
	        patch_stride=htsat_config.htsat_stride,
	        num_heads=htsat_config.htsat_num_head
	    )
	    at_model = SEDWrapper(
	        sed_model = sed_model,
	        config = htsat_config,
	        dataset = None
	    )
	    # load the checkpoint
	    ckpt = torch.load(htsat_config.resume_checkpoint, map_location="cpu")
	    at_model.load_state_dict(ckpt["state_dict"])

	    trainer = pl.Trainer(
	        deterministic=True,
	        default_root_dir = checkpoint_dir,
	        gpus = device_num,
	        val_check_interval = 0.2,
	        # check_val_every_n_epoch = 1,
	        max_epochs = config.max_epoch,
	        auto_lr_find = True,
	        sync_batchnorm = True,
	        callbacks = [checkpoint_callback],
	        accelerator = "ddp" if device_num > 1 else None,
	        resume_from_checkpoint = None, #config.resume_checkpoint,
	        replace_sampler_ddp = False,
	        gradient_clip_val=1.0,
	        num_sanity_val_steps = 0,
	    )
	    model = ZeroShotASP(
	        channels = 1, config = config,
	        at_model = at_model,
	        dataset = dataset
	    )
	    if config.resume_checkpoint is not None:
	        # only the weights are restored here; the trainer itself starts fresh
	        ckpt = torch.load(config.resume_checkpoint, map_location="cpu")
	        model.load_state_dict(ckpt["state_dict"])
	    # trainer.test(model, datamodule = audioset_data)
	    trainer.fit(model, audioset_data)

	def main():
	    """Parse the sub-command from the command line and run the matching routine.

	    Logging is set to INFO and the random seed to ``config.random_seed``
	    before dispatching.

	    Raises:
	        Exception: if no known sub-command was given.
	    """
	    # sub-command name -> routine, in the order the sub-parsers are registered
	    routines = {
	        "train": train,
	        "test": test,
	        "musdb_process": process_musdb,
	        "save_idc": save_idc,
	        "weight_average": weight_average,
	        "inference": inference,
	    }
	    parser = argparse.ArgumentParser(description="latent genreal source separation parser")
	    subparsers = parser.add_subparsers(dest = "mode")
	    for mode_name in routines:
	        subparsers.add_parser(mode_name)
	    args = parser.parse_args()
	    # default settings
	    logging.basicConfig(level=logging.INFO)
	    pl.utilities.seed.seed_everything(seed = config.random_seed)

	    routine = routines.get(args.mode)
	    if routine is None:
	        raise Exception("Error Mode!")
	    routine()


	# Run the command-line entry point only when this file is executed as a script.
	if __name__ == "__main__":
	    main()