run garbage collector to delete empty models periodically
Browse files- app.py +20 -1
- clean_community_org.py +36 -0
app.py
CHANGED
|
@@ -3,6 +3,8 @@ import pathlib
|
|
| 3 |
import random
|
| 4 |
import string
|
| 5 |
import tempfile
|
|
|
|
|
|
|
| 6 |
from typing import Iterable, List
|
| 7 |
|
| 8 |
import gradio as gr
|
|
@@ -12,6 +14,8 @@ import yaml
|
|
| 12 |
from gradio_logsview.logsview import Log, LogsView, LogsViewRunner
|
| 13 |
from mergekit.config import MergeConfiguration
|
| 14 |
|
|
|
|
|
|
|
| 15 |
has_gpu = torch.cuda.is_available()
|
| 16 |
|
| 17 |
# Running directly from Python doesn't work well with Gradio+run_process because of:
|
|
@@ -164,7 +168,7 @@ def merge(yaml_config: str, hf_token: str, repo_name: str) -> Iterable[List[Log]
|
|
| 164 |
return
|
| 165 |
|
| 166 |
# Set tmp HF_HOME to avoid filling up disk Space
|
| 167 |
-
tmp_env = os.environ.copy()
|
| 168 |
tmp_env["HF_HOME"] = f"{tmpdirname}/.cache"
|
| 169 |
yield from runner.run_command(cli.split(), cwd=merged_path, env=tmp_env)
|
| 170 |
|
|
@@ -215,4 +219,19 @@ with gr.Blocks() as demo:
|
|
| 215 |
|
| 216 |
button.click(fn=merge, inputs=[config, token, repo_name], outputs=[logs])
|
| 217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
demo.queue(default_concurrency_limit=1).launch()
|
|
|
|
| 3 |
import random
|
| 4 |
import string
|
| 5 |
import tempfile
|
| 6 |
+
import time
|
| 7 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 8 |
from typing import Iterable, List
|
| 9 |
|
| 10 |
import gradio as gr
|
|
|
|
| 14 |
from gradio_logsview.logsview import Log, LogsView, LogsViewRunner
|
| 15 |
from mergekit.config import MergeConfiguration
|
| 16 |
|
| 17 |
+
from clean_community_org import garbage_collect_empty_models
|
| 18 |
+
|
| 19 |
has_gpu = torch.cuda.is_available()
|
| 20 |
|
| 21 |
# Running directly from Python doesn't work well with Gradio+run_process because of:
|
|
|
|
| 168 |
return
|
| 169 |
|
| 170 |
# Set tmp HF_HOME to avoid filling up disk Space
|
| 171 |
+
tmp_env = os.environ.copy() # taken from https://stackoverflow.com/a/4453495
|
| 172 |
tmp_env["HF_HOME"] = f"{tmpdirname}/.cache"
|
| 173 |
yield from runner.run_command(cli.split(), cwd=merged_path, env=tmp_env)
|
| 174 |
|
|
|
|
| 219 |
|
| 220 |
button.click(fn=merge, inputs=[config, token, repo_name], outputs=[logs])
|
| 221 |
|
| 222 |
+
|
| 223 |
+
# Run garbage collection every hour to keep the community org clean.
|
| 224 |
+
# Empty models might exist if the merge fails abruptly (e.g. if the user leaves the Space).
|
| 225 |
+
def _garbage_collect_every_hour():
    """Run the community-org garbage collection in an endless hourly loop.

    Each pass calls ``garbage_collect_empty_models`` with the community
    token, then sleeps for 3600 seconds. This function never returns.
    Any exception raised by a pass is printed and the loop carries on.
    """
    while True:
        try:
            garbage_collect_empty_models(token=COMMUNITY_HF_TOKEN)
        except Exception as err:
            # Best-effort cleanup: a failed pass must not end the loop.
            print("Error running garbage collection", err)
        # Wait one hour before the next pass, whether or not this one failed.
        time.sleep(3600)
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
# Executor whose worker thread runs the hourly garbage-collection loop
# submitted just below.
# NOTE(review): that loop never returns, so the worker never finishes —
# confirm this does not prevent a clean interpreter shutdown.
pool = ThreadPoolExecutor()
|
| 235 |
+
pool.submit(_garbage_collect_every_hour)
|
| 236 |
+
|
| 237 |
demo.queue(default_concurrency_limit=1).launch()
|
clean_community_org.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Script to delete empty models from the community org.
|
| 2 |
+
# Can be run manually or scheduled to run periodically in the Space.
|
| 3 |
+
# Usage: python clean_community_org.py
|
| 4 |
+
#
|
| 5 |
+
# 1. List models from https://huggingface.co/mergekit-community
|
| 6 |
+
# 2. Skip models that have more than one file (they are not empty).
|
| 7 |
+
# 3. Filter out models that are newer than 1 hour.
|
| 8 |
+
# 4. Delete the remaining models.
|
| 9 |
+
from datetime import datetime, timezone
|
| 10 |
+
|
| 11 |
+
from huggingface_hub import HfApi
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def garbage_collect_empty_models(token: str | None = None):
    """Delete stale, empty model repos from the ``mergekit-community`` org.

    A repo is treated as empty when it lists at most one file
    (presumably the auto-generated ``.gitattributes`` -- TODO confirm).
    Empty repos are deleted only if their last modification is at least
    one hour old, so repos that are still being uploaded are left alone.
    Progress and errors are reported with ``print``.

    Args:
        token: Hugging Face token forwarded to ``HfApi``.
    """
    api = HfApi(token=token)
    now = datetime.now(timezone.utc)
    print("Running garbage collection on mergekit-community.")
    for model in api.list_models(author="mergekit-community", full=True):
        if model.siblings and len(model.siblings) > 1:
            # If model has files, then it's not empty
            print("Skipping", model.modelId, "(not empty)")
            continue
        if model.last_modified is None:
            # Age unknown: keep the repo rather than risk deleting an
            # in-progress upload, and avoid a TypeError on the subtraction
            # below that would abort the whole sweep.
            print("Skipping", model.modelId, "(unknown last modification date)")
            continue
        if (now - model.last_modified).total_seconds() < 3600:
            # If model was updated in the last hour, then keep it
            # to avoid deleting models that are being uploaded
            print("Skipping", model.modelId, "(recently updated)")
            continue
        try:
            print(f"Deleting {model.modelId}")
            api.delete_repo(model.modelId, missing_ok=True)
        except Exception as e:
            # One failed deletion must not stop the remaining repos
            # from being processed.
            print(f"Error deleting {model.modelId}: {e}")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
if __name__ == "__main__":
|
| 36 |
+
garbage_collect_empty_models()
|