import os
import shutil
import glob
import subprocess
from contextlib import contextmanager

import torch
import pandas as pd

import json
from google.oauth2 import service_account
from pandas_gbq import to_gbq

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap

from interfaces.emotion9 import languages as languages_emotion9

from interfaces.illframes import domains as domains_illframes

from interfaces.cap import build_huggingface_path as hf_cap_path
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
from interfaces.cap_media import build_huggingface_path as hf_cap_media_path
from interfaces.cap_media2 import build_huggingface_path as hf_cap_media2_path
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
from interfaces.ontolisst import build_huggingface_path as hf_ontlisst_path
from interfaces.illframes import build_huggingface_path as hf_illframes_path
from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path

from huggingface_hub import scan_cache_dir

JIT_DIR = "/data/jit_models"

HF_TOKEN = os.environ["hf_read"]

# should be a temporary solution
models = [
    hf_manifesto_path(""),
    hf_sentiment_path(""),
    hf_emotion_path(""),
    hf_cap_minor_path("", "", False), hf_cap_minor_path("", "social", False),
    hf_ontolisst_path(""),
]

# it gets more difficult with cap
domains_cap = list(domains_cap.values())
for language in languages_cap:
    for domain in domains_cap:
        models.append(hf_cap_path(language, domain))

# cap media
models.append(hf_cap_media_path("", ""))

# cap media2
models.append(hf_cap_media2_path("", ""))

# cap minor media
models.append(hf_cap_minor_media_path("", "", False))

# emotion9
for language in languages_emotion9:
    models.append(hf_emotion9_path(language))

# illframes (domains is a dict for some reason?)
for domain in domains_illframes.values():
    models.append(hf_illframes_path(domain))

tokenizers = ["xlm-roberta-large"]


def download_hf_models():
    os.makedirs(JIT_DIR, exist_ok=True)

    for model_id in models:
        print(f"Downloading + JIT tracing model: {model_id}")

        safe_model_name = model_id.replace("/", "_")
        traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")

        if os.path.exists(traced_model_path):
            delete_unused_bin_files(model_id)
            print(f"⏩ Skipping JIT — already exists: {traced_model_path}")
        else:
            print(f"⚙️  Tracing and saving: {traced_model_path}")

            model = AutoModelForSequenceClassification.from_pretrained(
                model_id, token=HF_TOKEN, device_map="auto"
            )
            tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

            model.eval()

            # Dummy input for tracing
            dummy_input = tokenizer(
                "Hello, world!",
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=64,
            )

            # JIT trace
            traced_model = torch.jit.trace(
                model,
                (dummy_input["input_ids"], dummy_input["attention_mask"]),
                strict=False,
            )

            # Save traced model
            traced_model.save(traced_model_path)
            print(f"✔️ Saved JIT model to: {traced_model_path}")


def df_h():
    df_result = subprocess.run(["df", "-H"], capture_output=True, text=True)
    print("=== Disk Free Space (df -H) ===")
    print(df_result.stdout)

    du_result = subprocess.run(
        ["du", "-h", "--max-depth=2", "/data/"], capture_output=True, text=True
    )
    print("=== Disk Usage for /data/ (du -h --max-depth=2) ===")
    print(du_result.stdout)


def delete_unused_bin_files(model_id: str):
    target_path = f"/data/models--poltextlab--{model_id}"

    # delete files in blobs/
    blob_bins = glob.glob(f"{target_path}/blobs/**/*", recursive=True)

    # delete .bin files in snapshots/, except config.json
    snapshot_bins = glob.glob(f"{target_path}/snapshots/**/*.bin", recursive=True)

    files_to_delete = blob_bins + snapshot_bins

    for file_path in files_to_delete:
        if os.path.basename(file_path) == "config.json":
            continue
        if os.path.isfile(path):
            print(f"Deleting file: {path}")
            os.remove(path)
        elif os.path.isdir(path):
            print(f"Deleting directory: {path}")
            shutil.rmtree(path)


def delete_http_folders():
    http_folders = glob.glob("/data/http*")
    for folder in http_folders:
        if os.path.isdir(folder):
            print(f"Deleting: {folder}")
            shutil.rmtree(folder)


@contextmanager
def hf_cleanup():
    delete_http_folders()
    try:
        yield
    finally:
        delete_http_folders()


def scan_cache():
    # Scan Hugging Face model cache
    cache_dir = os.environ.get(
        "TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/huggingface/transformers")
    )
    scan_result = scan_cache_dir(cache_dir)

    print("=== 🤗 Hugging Face Model Cache ===")
    print(f"Cache size: {scan_result.size_on_disk / 1e6:.2f} MB")
    print(f"Number of repos: {len(scan_result.repos)}")
    for repo in scan_result.repos:
        print(f"- {repo.repo_id} ({repo.repo_type}) — {repo.size_on_disk / 1e6:.2f} MB")

    print("\n=== 🧊 TorchScript JIT Cache ===")
    if not os.path.exists(JIT_DIR):
        print(f"(Directory does not exist: {JIT_DIR})")
        return

    total_size = 0
    for filename in os.listdir(JIT_DIR):
        if filename.endswith(".pt"):
            path = os.path.join(JIT_DIR, filename)
            size = os.path.getsize(path)
            total_size += size
            print(f"- {filename}: {size / 1e6:.2f} MB")

    print(f"Total JIT cache size: {total_size / 1e6:.2f} MB")


def set_hf_cache_dir(path: str):
    os.environ["TRANSFORMERS_CACHE"] = path
    os.environ["HF_HOME"] = path
    os.environ["HF_DATASETS_CACHE"] = path
    os.environ["TORCH_HOME"] = path


def set_torch_threads():
    torch.set_num_threads(1)
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"


def is_disk_full(min_free_space_in_GB=10):
    total, used, free = shutil.disk_usage("/")
    free_gb = free / (1024**3)

    if free_gb >= min_free_space_in_GB:
        return False
    else:
        return True


def update_bq_model_table():
    try:
        # Load BQ credentials from HF secret
        service_account_info = json.loads(os.environ["GCP_SERVICE_ACCOUNT_JSON"])
        credentials = service_account.Credentials.from_service_account_info(
            service_account_info
        )

        project_id = os.environ.get("BQ_PROJECT_ID", None)
        dataset_id = os.environ.get("BQ_DATASET_ID", None)
        table_id = os.environ.get("BQ_TABLE_ID", None)  # hf_space_models
        full_table_id = f"{dataset_id}.{table_id}"

        to_gbq(
            pd.DataFrame({"model_id": models}),
            destination_table=full_table_id,
            project_id=project_id,
            if_exists="replace",
            credentials=credentials,
        )
        # TO-DO: add timestamp column?
        print(f"Updated BigQuery model table!") 
    except Exception as e:
        print(f"BigQuery model table update failed: {e}")