# babel_machine/utils.py
import os
import shutil
import subprocess
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import scan_cache_dir

from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap
from interfaces.emotion9 import languages as languages_emotion9
from interfaces.illframes import domains as domains_illframes

from interfaces.cap import build_huggingface_path as hf_cap_path
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.cap_media_demo import build_huggingface_path as hf_cap_media_path  # naming does not follow the usual template
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
from interfaces.illframes import build_huggingface_path as hf_illframes_path
from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path
# read-only token for private/gated models; a missing secret fails fast here
HF_TOKEN = os.environ["hf_read"]
# hard-coded registry of every model the app should pre-download
# (meant as a temporary solution)
models = [
    hf_manifesto_path(""),
    hf_sentiment_path(""),
    hf_emotion_path(""),
    hf_cap_minor_path("", ""),
    hf_ontolisst_path(""),
]
# CAP models exist per (language, domain) pair, so enumerate the full grid
cap_domains = list(domains_cap.values())
for language in languages_cap:
    for domain in cap_domains:
        models.append(hf_cap_path(language, domain))
# cap media
models.append(hf_cap_media_path("", ""))
# emotion9
for language in languages_emotion9:
models.append(hf_emotion9_path(language))
# illframes (domains is a dict for some reason?)
for domain in domains_illframes.values():
models.append(hf_illframes_path(domain))
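
# The loops above can emit duplicate repo ids; an order-preserving dedup
# (an illustrative addition, not in the original code):
models = list(dict.fromkeys(models))
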
# tokenizer repos to pre-download alongside the models
tokenizers = ["xlm-roberta-large"]
def download_hf_models():
    """Pre-download every registered model and tokenizer into the local HF cache."""
    for model_id in models:
        # loading the model once is enough to populate the cache
        AutoModelForSequenceClassification.from_pretrained(
            model_id, low_cpu_mem_usage=True, device_map="auto",
            offload_folder="offload", token=HF_TOKEN,
        )
    for tokenizer_id in tokenizers:
        AutoTokenizer.from_pretrained(tokenizer_id)
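
# A lighter-weight sketch for pre-filling the cache without instantiating any
# model in memory. snapshot_download is the standard huggingface_hub API for
# this; wiring it in here is an assumption, not part of the original flow.
def prefetch_hf_models():
    from huggingface_hub import snapshot_download

    for model_id in models:
        # fetches (or reuses) the repo files in the local HF cache
        snapshot_download(repo_id=model_id, token=HF_TOKEN)
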
def df_h():
    """Print human-readable disk usage for all mounted filesystems (`df -H`)."""
    result = subprocess.run(["df", "-H"], capture_output=True, text=True)
    print(result.stdout)
def scan_cache():
    """Print a per-repo size report for the local Hugging Face hub cache."""
    # scan_cache_dir expects the hub-style cache (default ~/.cache/huggingface/hub),
    # not the legacy transformers cache directory
    hf_home = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
    cache_dir = os.environ.get("HF_HUB_CACHE", os.path.join(hf_home, "hub"))
    scan_result = scan_cache_dir(cache_dir)
    print("=== Model Cache Report ===")
    print(f"Cache size: {scan_result.size_on_disk / 1e6:.2f} MB")
    print(f"Number of repos: {len(scan_result.repos)}")
    for repo in scan_result.repos:
        print(f"- {repo.repo_id} ({repo.repo_type}): {repo.size_on_disk / 1e6:.2f} MB")
def set_hf_cache_dir(path: str):
    """Point the HF/torch cache variables at `path` (must run before transformers reads them)."""
    os.environ["TRANSFORMERS_CACHE"] = path  # legacy name, kept for older transformers
    os.environ["HF_HOME"] = path
    os.environ["HF_DATASETS_CACHE"] = path
    os.environ["TORCH_HOME"] = path
def is_disk_full(min_free_space_in_GB=10):
    """Return True when free space on the root filesystem drops below the threshold."""
    total, used, free = shutil.disk_usage("/")
    free_gb = free / (1024 ** 3)
    return free_gb < min_free_space_in_GB
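
# Minimal manual smoke test tying the helpers together (an illustrative
# addition, not part of the original module).
if __name__ == "__main__":
    if is_disk_full():
        print("Warning: less than 10 GB free; skipping model downloads.")
    else:
        download_hf_models()
    scan_cache()
    df_h()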