from transformers import LlamaTokenizerFast

# The Ultra-FineWeb classifier uses the "deepseek-ai/DeepSeek-V2" tokenizer
# path = "deepseek-ai/DeepSeek-V2"
path = "local_tokenizer"
tokenizer = LlamaTokenizerFast.from_pretrained(path, trust_remote_code=True)

# test the tokenizer
content = "MiniCPM4: Ultra-Efficient LLMs on End Devices"
token_ids = tokenizer.encode(content, add_special_tokens=False)
print(token_ids)

# decode each token id individually and print the corresponding text piece
for token_id in token_ids:
    print(tokenizer.decode([token_id]))
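
# Optional sanity checks (an addition, not part of the original snippet):
# decoding the full id sequence at once usually round-trips to the input text,
# and convert_ids_to_tokens shows the raw BPE pieces (including any whitespace
# markers), which can differ from the cleaned-up per-token decode above.
print(tokenizer.decode(token_ids))
print(tokenizer.convert_ids_to_tokens(token_ids))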