Spaces:

poltextlab
/

babel_machine

Running

File size: 3,314 Bytes

e390ccc
4bba8df
 
e390ccc
4bba8df
e390ccc
c554973
 
 
4bba8df
 
 
 
e390ccc
4bba8df
17ff73c
e390ccc
 
 
4bba8df
 
 
 
e390ccc
44d3c68
 
e390ccc
 
c554973
4bba8df
 
 
 
6d39e54
 
c554973
4bba8df
2926563
 
 
4bba8df
 
 
 
 
 
 
c554973
e390ccc
 
 
41bc8d2
4bba8df
e390ccc
41bc8d2
4bba8df
 
 
 
 
 
 
af77a1c
44d3c68
 
af77a1c
44d3c68
 
 
 
 
4bba8df
 
 
 
 
 
 
e390ccc
4bba8df

import os
import shutil
import subprocess

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap

from interfaces.emotion9 import languages as languages_emotion9

from interfaces.illframes import domains as domains_illframes

from interfaces.cap import build_huggingface_path as hf_cap_path
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.cap_media_demo import build_huggingface_path as hf_cap_media_path # why... just follow the name template the next time pls
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
from interfaces.ontolisst import build_huggingface_path as hf_ontlisst_path
from interfaces.illframes import build_huggingface_path as hf_illframes_path
from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path

from huggingface_hub import scan_cache_dir

# Read token for the Hugging Face Hub; a missing "hf_read" variable raises
# KeyError at import time.
HF_TOKEN = os.environ["hf_read"]

# Interfaces whose path builder is called with empty arguments.
# NOTE(review): presumably these builders ignore their arguments and always
# return one fixed repo id - confirm against the interfaces package.
# The original author flagged this hard-coded list as a temporary solution.
models = [
    hf_manifesto_path(""),
    hf_sentiment_path(""),
    hf_emotion_path(""),
    hf_cap_minor_path("", ""),
    hf_ontolisst_path(""),
]

# CAP: one path per (language, domain) pair. The imported mapping is rebound
# to a plain list of its values first.
domains_cap = list(domains_cap.values())
models.extend(
    hf_cap_path(language, domain)
    for language in languages_cap
    for domain in domains_cap
)

# CAP media
models.append(hf_cap_media_path("", ""))

# emotion9: one path per language
models.extend(hf_emotion9_path(language) for language in languages_emotion9)

# illframes: the domains object is a dict, and only its values are used
models.extend(hf_illframes_path(domain) for domain in domains_illframes.values())

tokenizers = ["xlm-roberta-large"]

def download_hf_models():
    """Load every entry of ``models`` and ``tokenizers`` once.

    Each model is loaded with ``AutoModelForSequenceClassification`` using the
    module-level ``HF_TOKEN``; each tokenizer is loaded with ``AutoTokenizer``
    without a token. The loaded objects are discarded immediately.
    NOTE(review): presumably the purpose is to warm the local Hugging Face
    cache so later loads need no download - confirm with the callers.
    """
    # Same loading options for every model.
    load_options = {
        "low_cpu_mem_usage": True,
        "device_map": "auto",
        "offload_folder": "offload",
        "token": HF_TOKEN,
    }

    for repo_id in models:
        AutoModelForSequenceClassification.from_pretrained(repo_id, **load_options)

    for tokenizer_name in tokenizers:
        AutoTokenizer.from_pretrained(tokenizer_name)
        
        
def df_h():
    """Run ``df -H`` and print its standard output.

    Standard error is captured but not shown, and the exit status is not
    checked.
    """
    completed = subprocess.run(
        ["df", "-H"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    print(completed.stdout)
    
def scan_cache():
    """Print a size report for the Hugging Face cache directory.

    The directory comes from the ``TRANSFORMERS_CACHE`` environment variable,
    falling back to ``~/.cache/huggingface/transformers``. The report lists
    the total size, the number of cached repos, and one line per repo. Sizes
    are shown in decimal megabytes (bytes / 1e6).
    """
    fallback_dir = os.path.expanduser("~/.cache/huggingface/transformers")
    target_dir = os.environ.get("TRANSFORMERS_CACHE", fallback_dir)
    report = scan_cache_dir(target_dir)

    bytes_per_mb = 1e6
    print("=== Model Cache Report ===")
    print(f"Cache size: {report.size_on_disk / bytes_per_mb:.2f} MB")
    print(f"Number of repos: {len(report.repos)}")
    for entry in report.repos:
        print(f"- {entry.repo_id} ({entry.repo_type}) — {entry.size_on_disk / bytes_per_mb:.2f} MB")
    
def set_hf_cache_dir(path:str):
    """Point the Hugging Face / Torch cache environment variables at *path*.

    Sets ``TRANSFORMERS_CACHE``, ``HF_HOME``, ``HF_DATASETS_CACHE`` and
    ``TORCH_HOME`` in this process's environment to the same directory.
    NOTE(review): libraries that read these variables at import time may not
    see a change made afterwards - confirm the call order at the call site.
    """
    cache_variables = (
        "TRANSFORMERS_CACHE",
        "HF_HOME",
        "HF_DATASETS_CACHE",
        "TORCH_HOME",
    )
    for variable in cache_variables:
        os.environ[variable] = path


def is_disk_full(min_free_space_in_GB=10, path="/"):
    """Return True when the filesystem holding *path* is short on free space.

    Args:
        min_free_space_in_GB: Minimum free space required. Despite the name,
            the comparison uses binary gigabytes (GiB, 1024**3 bytes).
        path: Any path on the filesystem to inspect. Defaults to ``"/"``,
            the previously hard-coded value; pass the cache directory when it
            lives on a different mount.

    Returns:
        True if the free space is strictly below the threshold, else False.

    Raises:
        OSError: Propagated from ``shutil.disk_usage`` if *path* cannot be
            inspected.
    """
    free_bytes = shutil.disk_usage(path).free
    free_gb = free_bytes / (1024 ** 3)
    return free_gb < min_free_space_in_GB