Spaces:

poltextlab
/

babel_machine

Running

File size: 7,772 Bytes

e390ccc
4bba8df
b1b87fb
4bba8df
3f77878
e390ccc
ca62943
7e0dad9
 
 
 
 
ca62943
4bba8df
e390ccc
c554973
 
 
4bba8df
 
 
 
e390ccc
4bba8df
af68a82
7e0dad9
0c9d7b1
e390ccc
 
 
4bba8df
 
 
 
e390ccc
44d3c68
 
caa0374
d68fe8b
e390ccc
 
c554973
0c08f54
 
 
 
9b2fea4
0c08f54
 
4bba8df
 
 
6d39e54
 
c554973
0c08f54
2926563
 
af68a82
0c9d7b1
 
 
af68a82
 
0c08f54
4bba8df
 
 
0c08f54
4bba8df
 
 
c554973
e390ccc
 
0c08f54
3abd99d
caa0374
 
41bc8d2
caa0374
0c08f54
caa0374
 
0c08f54
fb1a253
8453705
654bf8b
 
 
0c08f54
8453705
0c08f54
8453705
 
654bf8b
 
 
 
 
 
 
 
 
0c08f54
654bf8b
 
 
 
 
 
0c08f54
654bf8b
 
 
 
 
0c08f54
 
4bba8df
04d7b9c
 
 
 
0c08f54
 
 
04d7b9c
 
0c08f54
8453705
 
 
bf07f99
65e6711
10307a1
0c08f54
bf07f99
 
 
 
 
 
 
 
8d3cc6e
 
 
 
 
 
0c08f54
 
3f77878
b1b87fb
 
 
 
 
0c08f54
 
3f77878
 
 
 
 
 
 
0c08f54
 
af77a1c
0a394ee
0c08f54
 
 
44d3c68
0c08f54
0a394ee
44d3c68
 
 
 
0a394ee
 
 
 
 
 
 
 
 
 
 
 
 
0c08f54
0a394ee
0c08f54
 
 
 
 
 
 
 
 
8027e9b
 
 
 
4bba8df
e390ccc
4bba8df
 
0c08f54
 
4bba8df
 
 
 
7e0dad9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410b6ef
 
7e0dad9

import os
import shutil
import glob
import subprocess
from contextlib import contextmanager

import torch
import pandas as pd

import json
from google.oauth2 import service_account
from pandas_gbq import to_gbq

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap

from interfaces.emotion9 import languages as languages_emotion9

from interfaces.illframes import domains as domains_illframes

from interfaces.cap import build_huggingface_path as hf_cap_path
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
from interfaces.cap_media import build_huggingface_path as hf_cap_media_path
from interfaces.cap_media2 import build_huggingface_path as hf_cap_media2_path
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
from interfaces.ontolisst import build_huggingface_path as hf_ontlisst_path
from interfaces.illframes import build_huggingface_path as hf_illframes_path
from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path

from huggingface_hub import scan_cache_dir

# Directory that holds the TorchScript (JIT-traced) model files.
JIT_DIR = "/data/jit_models"

# Hugging Face access token, read from the "hf_read" environment variable.
# Indexing os.environ directly means a missing variable raises KeyError at
# import time.
HF_TOKEN = os.environ["hf_read"]

# Flat list of model IDs built from the interfaces.* path builders.
# should be a temporary solution
# NOTE(review): the empty-string / False arguments presumably select each
# builder's default model -- confirm against the interfaces.* modules.
models = [
    hf_manifesto_path(""),
    hf_sentiment_path(""),
    hf_emotion_path(""),
    hf_cap_minor_path("", "", False), hf_cap_minor_path("", "social", False),
    hf_ontolisst_path(""),
]

# it gets more difficult with cap: one model per (language, domain) pair.
# The imported `domains_cap` name is rebound here to a list of its values.
domains_cap = list(domains_cap.values())
for language in languages_cap:
    for domain in domains_cap:
        models.append(hf_cap_path(language, domain))

# cap media
models.append(hf_cap_media_path("", ""))

# cap media2
models.append(hf_cap_media2_path("", ""))

# cap minor media
models.append(hf_cap_minor_media_path("", "", False))

# emotion9: one model per language
for language in languages_emotion9:
    models.append(hf_emotion9_path(language))

# illframes: one model per domain (here `domains` is iterated via .values(),
# so only the values are passed to the path builder)
for domain in domains_illframes.values():
    models.append(hf_illframes_path(domain))

# Tokenizer checkpoint names. In the code visible here the tracing routine
# passes the same "xlm-roberta-large" literal directly instead of reading
# this list.
tokenizers = ["xlm-roberta-large"]


def download_hf_models():
    """Make sure a TorchScript trace exists in JIT_DIR for every entry of ``models``.

    The traced file for a model is ``<JIT_DIR>/<id with "/" replaced by "_">.pt``.
    If that file is already there, delete_unused_bin_files() is called for the
    model and tracing is skipped. Otherwise the model is loaded, traced with a
    short dummy sentence and the trace is saved to that path.
    """
    os.makedirs(JIT_DIR, exist_ok=True)

    # Tokenizer settings for the dummy sentence used as the tracing example.
    dummy_encoding_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": 64,
    }

    for repo_id in models:
        print(f"Downloading + JIT tracing model: {repo_id}")

        jit_file = os.path.join(JIT_DIR, repo_id.replace("/", "_") + ".pt")

        # A trace already on disk: clean up the model's cache files and move on.
        if os.path.exists(jit_file):
            delete_unused_bin_files(repo_id)
            print(f"⏩ Skipping JIT — already exists: {jit_file}")
            continue

        print(f"⚙️  Tracing and saving: {jit_file}")

        classifier = AutoModelForSequenceClassification.from_pretrained(
            repo_id, token=HF_TOKEN, device_map="auto"
        )
        xlmr_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

        classifier.eval()

        # Example inputs handed to the tracer.
        sample = xlmr_tokenizer("Hello, world!", **dummy_encoding_options)
        example_inputs = (sample["input_ids"], sample["attention_mask"])

        scripted = torch.jit.trace(classifier, example_inputs, strict=False)

        scripted.save(jit_file)
        print(f"✔️ Saved JIT model to: {jit_file}")


def df_h():
    """Print the output of ``df -H`` followed by ``du -h --max-depth=2 /data/``.

    Each command's captured stdout is printed under its own banner line.
    Exit codes are not checked, so a failing command just prints empty output.
    """
    reports = (
        ("=== Disk Free Space (df -H) ===", ["df", "-H"]),
        (
            "=== Disk Usage for /data/ (du -h --max-depth=2) ===",
            ["du", "-h", "--max-depth=2", "/data/"],
        ),
    )

    for banner, command in reports:
        completed = subprocess.run(command, capture_output=True, text=True)
        print(banner)
        print(completed.stdout)


def delete_unused_bin_files(model_id: str):
    """Delete cached weight files for *model_id* below ``/data``.

    Looks in ``/data/models--poltextlab--<model_id>`` and removes everything
    under ``blobs/`` plus every ``*.bin`` entry under ``snapshots/``. Entries
    whose base name is ``config.json`` are always kept.

    Args:
        model_id: Suffix appended to ``/data/models--poltextlab--`` to build
            the cache folder path.

    Returns:
        None. Nothing happens when the folder does not exist or is empty.
    """
    # NOTE(review): other code in this file replaces "/" in model IDs, which
    # suggests IDs look like "<org>/<name>". If so, this folder name would
    # never match a real cache entry -- confirm the expected ID format.
    target_path = f"/data/models--poltextlab--{model_id}"

    # everything in blobs/
    blob_entries = glob.glob(f"{target_path}/blobs/**/*", recursive=True)

    # only .bin entries in snapshots/
    snapshot_bins = glob.glob(f"{target_path}/snapshots/**/*.bin", recursive=True)

    for file_path in blob_entries + snapshot_bins:
        if os.path.basename(file_path) == "config.json":
            continue
        # Check islink first: a symlink (even a dangling one, or one pointing
        # at a directory) must be unlinked, never passed to shutil.rmtree.
        if os.path.islink(file_path) or os.path.isfile(file_path):
            print(f"Deleting file: {file_path}")
            os.remove(file_path)
        elif os.path.isdir(file_path):
            print(f"Deleting directory: {file_path}")
            shutil.rmtree(file_path)
        # Anything else was already removed together with a parent directory.


def delete_http_folders():
    """Remove every directory matching ``/data/http*``, printing each one first.

    Matches that are not directories are left alone.
    """
    matches = glob.glob("/data/http*")
    # filter() is lazy, so each path is checked right before it is removed.
    for directory in filter(os.path.isdir, matches):
        print(f"Deleting: {directory}")
        shutil.rmtree(directory)


@contextmanager
def hf_cleanup():
    """Context manager that runs delete_http_folders() on entry and on exit.

    The exit-side cleanup sits in a ``finally`` clause, so it also runs when
    the wrapped block raises.
    """
    purge = delete_http_folders

    purge()  # before the wrapped block
    try:
        yield
    finally:
        purge()  # after the wrapped block, error or not


def scan_cache():
    """Print a size report for the Hugging Face cache and for JIT_DIR.

    The Hugging Face cache location comes from ``TRANSFORMERS_CACHE``, falling
    back to ``~/.cache/huggingface/transformers``. For JIT_DIR only files
    ending in ``.pt`` are listed and summed. Sizes are printed in units of
    1e6 bytes.
    """
    fallback_dir = os.path.expanduser("~/.cache/huggingface/transformers")
    hf_cache_root = os.environ.get("TRANSFORMERS_CACHE", fallback_dir)
    report = scan_cache_dir(hf_cache_root)

    print("=== 🤗 Hugging Face Model Cache ===")
    print(f"Cache size: {report.size_on_disk / 1e6:.2f} MB")
    print(f"Number of repos: {len(report.repos)}")
    for entry in report.repos:
        print(f"- {entry.repo_id} ({entry.repo_type}) — {entry.size_on_disk / 1e6:.2f} MB")

    print("\n=== 🧊 TorchScript JIT Cache ===")
    if not os.path.exists(JIT_DIR):
        print(f"(Directory does not exist: {JIT_DIR})")
        return

    jit_bytes = 0
    for name in os.listdir(JIT_DIR):
        if not name.endswith(".pt"):
            continue
        file_bytes = os.path.getsize(os.path.join(JIT_DIR, name))
        jit_bytes += file_bytes
        print(f"- {name}: {file_bytes / 1e6:.2f} MB")

    print(f"Total JIT cache size: {jit_bytes / 1e6:.2f} MB")


def set_hf_cache_dir(path: str):
    """Set the four cache-location environment variables to *path*.

    The variables are ``TRANSFORMERS_CACHE``, ``HF_HOME``,
    ``HF_DATASETS_CACHE`` and ``TORCH_HOME``.
    """
    cache_variables = (
        "TRANSFORMERS_CACHE",
        "HF_HOME",
        "HF_DATASETS_CACHE",
        "TORCH_HOME",
    )
    for variable in cache_variables:
        os.environ[variable] = path


def set_torch_threads():
    """Call ``torch.set_num_threads(1)`` and set OMP/MKL thread variables to "1"."""
    torch.set_num_threads(1)
    os.environ.update({"OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1"})


def is_disk_full(min_free_space_in_GB=10):
    """Report whether free space on ``/`` is below the given threshold.

    Args:
        min_free_space_in_GB: Required free space, compared against free bytes
            divided by 1024**3.

    Returns:
        False when at least that much space is free, True otherwise.
    """
    bytes_per_unit = 1024**3
    available = shutil.disk_usage("/").free / bytes_per_unit
    return not available >= min_free_space_in_GB


def update_bq_model_table():
    """Replace the BigQuery model table with the current contents of ``models``.

    Configuration comes from environment variables:

    * ``BQ_DATASET_ID`` and ``BQ_TABLE_ID`` -- required; joined into the
      ``dataset.table`` destination.
    * ``BQ_PROJECT_ID`` -- passed through as-is (may be unset).
    * ``GCP_SERVICE_ACCOUNT_JSON`` -- service-account info as JSON text.

    The update is best-effort: a missing setting or any ``Exception`` raised
    while uploading is printed and swallowed, so callers are never
    interrupted. Always returns None.
    """
    dataset_id = os.environ.get("BQ_DATASET_ID")
    table_id = os.environ.get("BQ_TABLE_ID")  # hf_space_models

    # Without this check the destination would silently become "None.None"
    # and only be rejected after credentials are built and the upload starts.
    missing = [
        name
        for name, value in (("BQ_DATASET_ID", dataset_id), ("BQ_TABLE_ID", table_id))
        if not value
    ]
    if missing:
        print(
            "BigQuery model table update failed: "
            f"missing environment variable(s): {', '.join(missing)}"
        )
        return

    try:
        # Load BQ credentials from HF secret
        service_account_info = json.loads(os.environ["GCP_SERVICE_ACCOUNT_JSON"])
        credentials = service_account.Credentials.from_service_account_info(
            service_account_info
        )

        # NOTE(review): project_id may be None here; presumably pandas-gbq
        # then falls back to the credentials' project -- confirm.
        project_id = os.environ.get("BQ_PROJECT_ID")
        full_table_id = f"{dataset_id}.{table_id}"

        to_gbq(
            pd.DataFrame({"model_id": models}),
            destination_table=full_table_id,
            project_id=project_id,
            if_exists="replace",
            credentials=credentials,
        )
        # TO-DO: add timestamp column?
        print("Updated BigQuery model table!")
    except Exception as e:
        # Deliberate catch-all: this is a non-critical bookkeeping step.
        print(f"BigQuery model table update failed: {e}")