File size: 5,166 Bytes
e390ccc
4bba8df
 
e390ccc
ca62943
 
4bba8df
e390ccc
c554973
 
 
4bba8df
 
 
 
e390ccc
4bba8df
af68a82
17ff73c
e390ccc
 
 
4bba8df
 
 
 
e390ccc
44d3c68
 
caa0374
d68fe8b
e390ccc
 
c554973
4bba8df
 
 
 
6d39e54
 
c554973
4bba8df
2926563
 
af68a82
 
 
2926563
4bba8df
 
 
 
 
 
 
c554973
e390ccc
 
3abd99d
caa0374
 
 
41bc8d2
caa0374
4bba8df
caa0374
 
 
3abd99d
caa0374
 
654bf8b
 
caa0374
 
654bf8b
fb1a253
654bf8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4bba8df
 
 
 
 
af77a1c
0a394ee
44d3c68
 
af77a1c
0a394ee
44d3c68
 
 
 
0a394ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4bba8df
 
 
 
 
 
 
e390ccc
4bba8df
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os
import shutil
import subprocess

import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap

from interfaces.emotion9 import languages as languages_emotion9

from interfaces.illframes import domains as domains_illframes

from interfaces.cap import build_huggingface_path as hf_cap_path
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
from interfaces.cap_media_demo import build_huggingface_path as hf_cap_media_path # why... just follow the name template the next time pls
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
from interfaces.ontolisst import build_huggingface_path as hf_ontlisst_path
from interfaces.illframes import build_huggingface_path as hf_illframes_path
from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path

from huggingface_hub import scan_cache_dir

JIT_DIR = "/data/jit_models"

HF_TOKEN = os.environ["hf_read"]

# should be a temporary solution
models = [hf_manifesto_path(""), hf_sentiment_path(""), hf_emotion_path(""), hf_cap_minor_path("", ""), hf_ontolisst_path("")]

# it gets more difficult with cap
domains_cap = list(domains_cap.values())
for language in languages_cap:
    for domain in domains_cap:
        models.append(hf_cap_path(language, domain))
        
# cap media
models.append(hf_cap_media_path("", ""))

# cap minor media
models.append(hf_cap_minor_media_path("", "", False))
        
# emotion9
for language in languages_emotion9:
    models.append(hf_emotion9_path(language))
    
# illframes (domains is a dict for some reason?)
for domain in domains_illframes.values():
    models.append(hf_illframes_path(domain))

tokenizers = ["xlm-roberta-large"]

def download_hf_models():
    # Ensure the JIT model directory exists
    os.makedirs(JIT_DIR, exist_ok=True)

    for model_id in models:
        print(f"Downloading + JIT tracing model: {model_id}")
        
        # Load model and tokenizer
        model = AutoModelForSequenceClassification.from_pretrained(
            model_id,
            token=HF_TOKEN,
            device_map="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
        
        safe_model_name = model_id.replace("/", "_")
        traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")
        
        if os.path.exists(traced_model_path):
            print(f"⏩ Skipping JIT β€” already exists: {traced_model_path}")
        else:
            print(f"βš™οΈ  Tracing and saving: {traced_model_path}")

            model.eval()

            # Dummy input for tracing
            dummy_input = tokenizer(
                "Hello, world!",
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=256
            )

            # JIT trace
            traced_model = torch.jit.trace(
                model,
                (dummy_input["input_ids"], dummy_input["attention_mask"]),
                strict=False
            )

            # Save traced model
            traced_model.save(traced_model_path)
            print(f"βœ”οΈ Saved JIT model to: {traced_model_path}")
        
def df_h():
    result = subprocess.run(["df", "-H"], capture_output=True, text=True)
    print(result.stdout)
    
def scan_cache():
    # Scan Hugging Face model cache
    cache_dir = os.environ.get("TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/huggingface/transformers"))
    scan_result = scan_cache_dir(cache_dir)
    
    print("=== πŸ€— Hugging Face Model Cache ===")
    print(f"Cache size: {scan_result.size_on_disk / 1e6:.2f} MB")
    print(f"Number of repos: {len(scan_result.repos)}")
    for repo in scan_result.repos:
        print(f"- {repo.repo_id} ({repo.repo_type}) β€” {repo.size_on_disk / 1e6:.2f} MB")

    print("\n=== 🧊 TorchScript JIT Cache ===")
    if not os.path.exists(JIT_DIR):
        print(f"(Directory does not exist: {JIT_DIR})")
        return

    total_size = 0
    for filename in os.listdir(JIT_DIR):
        if filename.endswith(".pt"):
            path = os.path.join(JIT_DIR, filename)
            size = os.path.getsize(path)
            total_size += size
            print(f"- {filename}: {size / 1e6:.2f} MB")
    
    print(f"Total JIT cache size: {total_size / 1e6:.2f} MB")
    
def set_hf_cache_dir(path:str):
    os.environ['TRANSFORMERS_CACHE'] = path
    os.environ['HF_HOME'] = path
    os.environ['HF_DATASETS_CACHE'] = path
    os.environ['TORCH_HOME'] = path


def is_disk_full(min_free_space_in_GB=10):
    total, used, free = shutil.disk_usage("/")
    free_gb = free / (1024 ** 3)
    
    if free_gb >= min_free_space_in_GB:
        return False
    else:
        return True