# babel_machine/utils.py
import os
import shutil
import subprocess
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import scan_cache_dir

from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap
from interfaces.emotion9 import languages as languages_emotion9
from interfaces.illframes import domains as domains_illframes

from interfaces.cap import build_huggingface_path as hf_cap_path
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.cap_media_demo import build_huggingface_path as hf_cap_media_path  # naming does not follow the usual template
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
from interfaces.illframes import build_huggingface_path as hf_illframes_path
from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path
# read-only token for private/gated models; a missing secret fails fast here
HF_TOKEN = os.environ["hf_read"]
# hard-coded registry of every model the app should pre-download
# (meant as a temporary solution)
models = [
    hf_manifesto_path(""),
    hf_sentiment_path(""),
    hf_emotion_path(""),
    hf_cap_minor_path("", ""),
    hf_ontolisst_path(""),
]
# CAP models exist per (language, domain) pair, so enumerate the full grid
cap_domains = list(domains_cap.values())
for language in languages_cap:
    for domain in cap_domains:
        models.append(hf_cap_path(language, domain))
# cap media
models.append(hf_cap_media_path("", ""))
# emotion9
for language in languages_emotion9:
models.append(hf_emotion9_path(language))
# illframes (domains is a dict for some reason?)
for domain in domains_illframes.values():
models.append(hf_illframes_path(domain))
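
# The loops above can emit duplicate repo ids; an order-preserving dedup
# (an illustrative addition, not in the original code):
models = list(dict.fromkeys(models))
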
# tokenizer repos to pre-download alongside the models
tokenizers = ["xlm-roberta-large"]
def download_hf_models():
    """Pre-download every registered model and tokenizer into the local HF cache."""
    for model_id in models:
        # loading the model once is enough to populate the cache
        AutoModelForSequenceClassification.from_pretrained(
            model_id, low_cpu_mem_usage=True, device_map="auto",
            offload_folder="offload", token=HF_TOKEN,
        )
    for tokenizer_id in tokenizers:
        AutoTokenizer.from_pretrained(tokenizer_id)
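
# A lighter-weight sketch for pre-filling the cache without instantiating any
# model in memory. snapshot_download is the standard huggingface_hub API for
# this; wiring it in here is an assumption, not part of the original flow.
def prefetch_hf_models():
    from huggingface_hub import snapshot_download

    for model_id in models:
        # fetches (or reuses) the repo files in the local HF cache
        snapshot_download(repo_id=model_id, token=HF_TOKEN)
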
def df_h():
    """Print human-readable disk usage for all mounted filesystems (`df -H`)."""
    result = subprocess.run(["df", "-H"], capture_output=True, text=True)
    print(result.stdout)
def scan_cache():
    """Print a per-repo size report for the local Hugging Face hub cache."""
    # scan_cache_dir expects the hub-style cache (default ~/.cache/huggingface/hub),
    # not the legacy transformers cache directory
    hf_home = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
    cache_dir = os.environ.get("HF_HUB_CACHE", os.path.join(hf_home, "hub"))
    scan_result = scan_cache_dir(cache_dir)
    print("=== Model Cache Report ===")
    print(f"Cache size: {scan_result.size_on_disk / 1e6:.2f} MB")
    print(f"Number of repos: {len(scan_result.repos)}")
    for repo in scan_result.repos:
        print(f"- {repo.repo_id} ({repo.repo_type}): {repo.size_on_disk / 1e6:.2f} MB")
def set_hf_cache_dir(path: str):
    """Point the HF/torch cache variables at `path` (must run before transformers reads them)."""
    os.environ["TRANSFORMERS_CACHE"] = path  # legacy name, kept for older transformers
    os.environ["HF_HOME"] = path
    os.environ["HF_DATASETS_CACHE"] = path
    os.environ["TORCH_HOME"] = path
def is_disk_full(min_free_space_in_GB=10):
    """Return True when free space on the root filesystem drops below the threshold."""
    total, used, free = shutil.disk_usage("/")
    free_gb = free / (1024 ** 3)
    return free_gb < min_free_space_in_GB
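
# Minimal manual smoke test tying the helpers together (an illustrative
# addition, not part of the original module).
if __name__ == "__main__":
    if is_disk_full():
        print("Warning: less than 10 GB free; skipping model downloads.")
    else:
        download_hf_models()
    scan_cache()
    df_h()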