cronjob / app.py
testdeep123's picture
Update app.py
474dae4 verified
raw
history blame
18.6 kB
# app.py
import os

# --- CONFIGURATION ---
# Point Hugging Face (and gdown) caches at the writable /tmp directory.
# This has to happen BEFORE `huggingface_hub` is imported: the library resolves
# HF_HOME (cache and stored-token locations) once, at import time, so assigning
# it after the import leaves the default, possibly read-only, location in use.
os.environ["HF_HOME"] = "/tmp/hf_home"
# NOTE(review): presumably consulted by gdown for its cache - confirm.
os.environ["GDOWN_CACHE_DIR"] = "/tmp/gdown_cache"

import shutil  # noqa: E402
import zipfile  # noqa: E402
import threading  # noqa: E402
import time  # noqa: E402
import logging  # noqa: E402
from flask import Flask, request, jsonify, render_template_string  # noqa: E402
import gdown  # noqa: E402
from huggingface_hub import HfApi, login  # noqa: E402
from huggingface_hub.utils import HfHubHTTPError  # noqa: E402

# Environment variables (set these in your Space secret settings)
# FOLDER_URL: Google Drive folder that holds the backup archives.
FOLDER_URL = os.getenv("FOLDER_URL", "YOUR_GOOGLE_DRIVE_FOLDER_URL_HERE")
# REPO_ID: target repository on the Hugging Face Hub ("user/name").
REPO_ID = os.getenv("REPO_ID", "your-hf-username/your-dataset-name")
# TOKEN: Hugging Face access token; None when HF_TOKEN is not set.
TOKEN = os.getenv("HF_TOKEN")
# Directories in writable /tmp
DOWNLOAD_DIR = "/tmp/backups"  # raw files fetched from Google Drive
EXTRACT_DIR = "/tmp/extracted_backups"  # contents of the extracted .zip archives
# --- HTML TEMPLATE WITH EMBEDDED CSS AND JAVASCRIPT ---
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en" data-theme="dark">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>HF Backup & Manager</title>
<!-- Pico.css for a clean, modern look -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@picocss/pico@1/css/pico.min.css">
<!-- Embedded Custom Styles -->
<style>
:root {
--pico-card-background-color: #1e2025;
--pico-card-border-color: #33363d;
}
body {
padding: 1rem;
}
main.container {
max-width: 1000px;
padding-top: 1rem;
}
header {
text-align: center;
margin-bottom: 2rem;
}
article {
padding: 1.5rem;
}
.grid {
grid-template-columns: 1fr;
gap: 1.5rem;
}
@media (min-width: 992px) {
.grid {
grid-template-columns: 1fr 1fr;
}
}
.log-box {
background-color: #111317;
border: 1px solid var(--pico-card-border-color);
border-radius: var(--pico-border-radius);
padding: 1rem;
height: 200px;
overflow-y: auto;
font-family: monospace;
font-size: 0.875em;
white-space: pre-wrap;
word-break: break-all;
}
#status-text.idle { color: var(--pico-color-green-400); }
#status-text.running { color: var(--pico-color-amber-400); }
#status-text.error { color: var(--pico-color-red-400); }
button {
display: flex;
align-items: center;
justify-content: center;
gap: 0.75rem;
}
.spinner {
border: 3px solid rgba(255, 255, 255, 0.2);
border-top: 3px solid var(--pico-primary);
border-radius: 50%;
width: 16px;
height: 16px;
animation: spin 1s linear infinite;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.file-manager-container {
max-height: 400px;
overflow-y: auto;
margin-top: 1rem;
}
.file-manager-container table button {
margin: 0;
padding: 0.25rem 0.5rem;
background-color: var(--pico-color-red-600);
border-color: var(--pico-color-red-600);
}
small {
display: block;
margin-top: -0.5rem;
margin-bottom: 1rem;
color: var(--pico-secondary-text);
}
</style>
</head>
<body>
<main class="container">
<header>
<hgroup>
<h1>Hugging Face Backup & Manager</h1>
<p>Automate server backups and manage your dataset on the Hub.</p>
</hgroup>
</header>
<div class="grid">
<article>
<hgroup>
<h2>Control Panel</h2>
<h3>Manage your backup tasks and schedule.</h3>
</hgroup>
<button id="run-now-btn" onclick="runNow()">
<span id="run-now-spinner" class="spinner" style="display: none;"></span>
Run Backup Now
</button>
<small>Manually trigger a full backup cycle.</small>
<form id="schedule-form" onsubmit="setSchedule(event)">
<label for="interval">Automatic Backup Interval (minutes)</label>
<input type="number" id="interval" name="interval" placeholder="0" min="0">
<small>Set to 0 to disable automatic backups.</small>
<button type="submit">Set Schedule</button>
</form>
</article>
<article>
<hgroup>
<h2>Live Status</h2>
<h3 id="status-text">Status: Fetching...</h3>
</hgroup>
<p><strong>Last Successful Backup:</strong> <span id="last-backup-time">Never</span></p>
<p><strong>Current Schedule:</strong> Every <span id="current-schedule">...</span> minutes</p>
<strong>Logs:</strong>
<pre id="logs" class="log-box"></pre>
</article>
</div>
<article>
<hgroup>
<h2>Dataset File Manager</h2>
<h3>Manage files in your repository: <a href="https://huggingface.co/datasets/{{ repo_id }}" target="_blank">{{ repo_id }}</a></h3>
</hgroup>
<button id="refresh-files-btn" onclick="fetchRepoFiles()" aria-busy="false">Refresh File List</button>
<div id="file-manager" class="file-manager-container">
<p>Loading files...</p>
</div>
</article>
</main>
<!-- Embedded JavaScript -->
<script>
const runNowBtn = document.getElementById('run-now-btn');
const runNowSpinner = document.getElementById('run-now-spinner');
const statusText = document.getElementById('status-text');
const lastBackupTime = document.getElementById('last-backup-time');
const currentSchedule = document.getElementById('current-schedule');
const scheduleInput = document.getElementById('interval');
const logsBox = document.getElementById('logs');
const fileManagerDiv = document.getElementById('file-manager');
const refreshFilesBtn = document.getElementById('refresh-files-btn');
// Fetch `url` and resolve with the parsed JSON body.
// Rejects with an Error whose message is the server-provided `error` field
// when one is available, otherwise a generic message with the HTTP status.
async function fetchAPI(url, options = {}) {
    try {
        const response = await fetch(url, options);
        if (!response.ok) {
            // Error responses are not guaranteed to be JSON (for example an
            // HTML error page), so a parse failure must not hide the status.
            let errorData = null;
            try {
                errorData = await response.json();
            } catch (parseError) {
                errorData = null;
            }
            const serverMessage = errorData && errorData.error;
            throw new Error(serverMessage || `HTTP error! status: ${response.status}`);
        }
        return await response.json();
    } catch (error) {
        // Log for debugging, then let the caller decide how to surface it.
        console.error(`API Error on ${url}:`, error);
        throw error;
    }
}
// Poll the backend status endpoint and refresh the status panel.
// Any failure (network or rendering) is shown as a connection error.
function fetchStatus() {
    return fetchAPI('/api/status')
        .then((payload) => {
            updateStatusUI(payload);
        })
        .catch(() => {
            statusText.textContent = "Status: Connection Error";
            statusText.className = "error";
        });
}
// Ask the backend to start a backup immediately.
// Does nothing while the button is disabled (a backup is already running).
async function runNow() {
    if (!runNowBtn.disabled) {
        try {
            const request = { method: 'POST' };
            await fetchAPI('/api/start-backup', request);
        } catch (err) {
            alert(`Failed to start backup: ${err.message}`);
        }
    }
}
// Submit handler for the schedule form: validates the interval locally and
// posts it to the backend as JSON.
async function setSchedule(event) {
    event.preventDefault();

    const minutes = Number.parseInt(scheduleInput.value, 10);
    const isValid = !Number.isNaN(minutes) && minutes >= 0;
    if (!isValid) {
        alert("Please enter a valid non-negative number for the interval.");
        return;
    }

    const request = {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ interval: minutes })
    };
    try {
        await fetchAPI('/api/set-schedule', request);
    } catch (err) {
        alert(`Error setting schedule: ${err.message}`);
    }
}
// Load the repository file list and render it in the file manager.
// The refresh button is marked busy for the duration of the request.
async function fetchRepoFiles() {
    refreshFilesBtn.setAttribute('aria-busy', 'true');
    try {
        const data = await fetchAPI('/api/repo-files');
        renderFileManager(data.files);
    } catch (error) {
        // The error text comes from the server and may echo arbitrary data,
        // so it is inserted as text rather than parsed as HTML.
        const notice = document.createElement('p');
        notice.setAttribute('style', 'color: var(--pico-color-red-500);');
        notice.textContent = `Error loading files: ${error.message}`;
        fileManagerDiv.textContent = '';
        fileManagerDiv.appendChild(notice);
    } finally {
        refreshFilesBtn.setAttribute('aria-busy', 'false');
    }
}
// Delete one repository file after an explicit confirmation, then reload the
// file list. Failures are reported with an alert.
async function deleteFile(path) {
    const confirmed = confirm(`Are you sure you want to permanently delete "${path}"? This cannot be undone.`);
    if (!confirmed) {
        return;
    }

    const request = {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ path: path })
    };
    try {
        await fetchAPI('/api/delete-file', request);
        await fetchRepoFiles();
    } catch (err) {
        alert(`Error deleting file: ${err.message}`);
    }
}
// Render a status payload from /api/status into the page: status label,
// run button state, last backup time, schedule and log output.
function updateStatusUI(data) {
    // Status label; the lower-cased status doubles as the CSS class.
    statusText.textContent = `Status: ${data.status}`;
    statusText.className = data.status.toLowerCase();

    // Disable the run button and show the spinner while a backup is running.
    const busy = data.status === 'Running';
    runNowBtn.disabled = busy;
    if (busy) {
        runNowSpinner.style.display = 'inline-block';
    } else {
        runNowSpinner.style.display = 'none';
    }

    lastBackupTime.textContent = data.last_backup_time;

    const minutes = data.schedule_interval;
    const scheduled = minutes > 0;
    currentSchedule.textContent = scheduled ? String(minutes) : '0 (disabled)';
    // Do not overwrite the field while the user is typing in it.
    if (document.activeElement !== scheduleInput) {
        scheduleInput.value = scheduled ? minutes : '';
    }

    // Only touch the log box (and its scroll position) when the text changed.
    const logText = data.logs.join('\\n');
    if (logsBox.textContent !== logText) {
        logsBox.textContent = logText;
        logsBox.scrollTop = logsBox.scrollHeight;
    }
}
// Render the repository file list as a table with one Delete button per file.
// `files` is the array of file paths returned by /api/repo-files.
function renderFileManager(files) {
    if (!files || files.length === 0) {
        fileManagerDiv.innerHTML = "<p>No files found in the repository.</p>";
        return;
    }

    const table = document.createElement('table');
    // Static markup only: no file data is ever parsed as HTML.
    table.innerHTML = '<thead><tr><th>File Path</th><th style="text-align: right;">Action</th></tr></thead>';
    const tbody = document.createElement('tbody');

    files.forEach(file => {
        const row = tbody.insertRow();

        // File paths are untrusted text; textContent prevents HTML injection.
        row.insertCell().textContent = file;

        const actionCell = row.insertCell();
        actionCell.setAttribute('style', 'text-align: right;');

        const button = document.createElement('button');
        button.className = 'outline secondary';
        button.textContent = 'Delete';
        // A real listener (instead of an inline onclick string) keeps paths
        // containing quotes or other special characters working and safe.
        button.addEventListener('click', () => deleteFile(file));
        actionCell.appendChild(button);
    });

    table.appendChild(tbody);
    fileManagerDiv.textContent = '';
    fileManagerDiv.appendChild(table);
}
document.addEventListener('DOMContentLoaded', () => {
fetchStatus();
fetchRepoFiles();
setInterval(fetchStatus, 3000);
});
</script>
</body>
</html>
"""
# --- FLASK APP & STATE MANAGEMENT ---
# The Flask application object that the route decorators below attach to.
app = Flask(__name__)
logging.basicConfig(level=logging.INFO)

# Process-wide mutable state shared by the request handlers, the backup job
# and the scheduler thread. Access is coordinated through the "lock" entry.
app_state = dict(
    status="Idle",  # one of: Idle, Running, Error
    logs=[],  # log lines of the current / most recent backup run
    last_backup_time="Never",
    schedule_interval=0,  # in minutes
    scheduler_thread=None,
    lock=threading.Lock(),
)
# --- HUGGING FACE HELPER CLASS ---
class HFManager:
    """Small wrapper around ``HfApi`` bound to one repository and one token.

    The token is handed to the ``HfApi`` client and to every call explicitly,
    so no global ``login()`` is performed: instances are created per request,
    and a login on each construction would add a network round-trip and write
    the token to disk every time.
    """

    def __init__(self, token, repo_id, repo_type="dataset"):
        """Create a manager for ``repo_id``.

        Args:
            token: Hugging Face access token; must be non-empty.
            repo_id: Repository identifier passed to the Hub API.
            repo_type: Repository type passed to the Hub API.

        Raises:
            ValueError: If ``token`` is empty or ``None``.
        """
        if not token:
            raise ValueError("Hugging Face token (HF_TOKEN) is not set.")
        self.token = token
        self.repo_id = repo_id
        self.repo_type = repo_type
        self.api = HfApi(token=token)

    def ensure_repo_exists(self):
        """Create the repository; ``exist_ok=True`` makes this a no-op if present."""
        self.api.create_repo(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            exist_ok=True,
            token=self.token,
        )

    def list_files(self):
        """Return the sorted list of file paths in the repository.

        An HTTP 404 from the Hub is reported as an empty list; any other Hub
        HTTP error is re-raised unchanged.
        """
        try:
            return sorted(
                self.api.list_repo_files(
                    repo_id=self.repo_id,
                    repo_type=self.repo_type,
                    token=self.token,
                )
            )
        except HfHubHTTPError as e:
            # The response object may be absent on some errors; treat that as
            # "not a 404" instead of failing with AttributeError.
            if getattr(e.response, "status_code", None) == 404:
                return []
            raise

    def delete_file(self, path_in_repo):
        """Delete ``path_in_repo`` from the repository in its own commit."""
        self.api.delete_file(
            path_in_repo,
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            token=self.token,
            commit_message=f"Delete file: {path_in_repo}",
        )

    def upload(self, folder_path, path_in_repo, commit_message):
        """Upload the local ``folder_path`` to ``path_in_repo`` in one commit."""
        self.api.upload_folder(
            repo_id=self.repo_id,
            folder_path=folder_path,
            repo_type=self.repo_type,
            token=self.token,
            path_in_repo=path_in_repo,
            commit_message=commit_message,
        )
# --- BACKUP LOGIC ---
def run_backup_job():
    """Run one full backup cycle: download, extract, and upload to the Hub.

    Steps:
      1. Mark the job as running (returns immediately if one is in progress).
      2. Wipe the temporary directories and download the Drive folder.
      3. Extract every .zip archive found in the download.
      4. Rename a misspelled ``world_nither`` folder to ``world_nether``.
      5. Upload each known folder that exists to the Hugging Face repository.

    Progress is appended to ``app_state["logs"]``. Any exception is caught,
    logged, and reflected as status ``"Error"``; nothing is raised to the
    caller.
    """
    with app_state['lock']:
        if app_state["status"] == "Running":
            app_state['logs'].append("Backup is already in progress. Skipping scheduled run.")
            return
        app_state["status"] = "Running"
        # Each run starts with a fresh log.
        app_state["logs"] = ["Starting backup process..."]

    def log_entry(msg):
        """Append a timestamped line to the shared log."""
        app_state['logs'].append(f"[{time.strftime('%H:%M:%S')}] {msg}")

    try:
        if not FOLDER_URL or "YOUR_GOOGLE_DRIVE" in FOLDER_URL:
            raise ValueError("FOLDER_URL is not set. Please set it in your Space secrets.")
        if not TOKEN:
            raise ValueError("HF_TOKEN is not set. Please set it in your Space secrets.")

        log_entry("Cleaning up temporary directories...")
        shutil.rmtree(DOWNLOAD_DIR, ignore_errors=True)
        shutil.rmtree(EXTRACT_DIR, ignore_errors=True)
        os.makedirs(EXTRACT_DIR, exist_ok=True)

        log_entry(f"Downloading from Google Drive...")
        downloaded = gdown.download_folder(url=FOLDER_URL, output=DOWNLOAD_DIR, use_cookies=False, quiet=True)
        # gdown signals a failed folder download by returning None rather than
        # raising; without this check a failed download would be reported as
        # a successful backup.
        if downloaded is None:
            raise RuntimeError("Google Drive download failed (gdown returned no files).")
        log_entry("Download finished.")

        extracted_files = False
        for root, _, files in os.walk(DOWNLOAD_DIR):
            for f in files:
                # Case-insensitive so archives named e.g. "BACKUP.ZIP" are included.
                if f.lower().endswith(".zip"):
                    zip_path = os.path.join(root, f)
                    with zipfile.ZipFile(zip_path, 'r') as z:
                        z.extractall(EXTRACT_DIR)
                    log_entry(f"Extracted: {f}")
                    extracted_files = True
        if not extracted_files:
            log_entry("Warning: No .zip files found to extract.")

        # Normalize a misspelled folder name so it is picked up for upload below.
        bad_path = os.path.join(EXTRACT_DIR, "world_nither")
        good_path = os.path.join(EXTRACT_DIR, "world_nether")
        if os.path.exists(bad_path):
            os.rename(bad_path, good_path)
            log_entry("Fixed 'world_nither' typo to 'world_nether'.")

        hf_manager = HFManager(TOKEN, REPO_ID)
        hf_manager.ensure_repo_exists()
        log_entry(f"Repo ready: {REPO_ID}")

        for name in ["world", "world_nether", "world_the_end", "plugins"]:
            local_path = os.path.join(EXTRACT_DIR, name)
            if os.path.exists(local_path):
                log_entry(f"Uploading '{name}'...")
                hf_manager.upload(local_path, name, f"Backup update for {name}")
                log_entry(f"Successfully uploaded '{name}'.")
            else:
                log_entry(f"Source folder '{name}' not found, skipping.")

        with app_state['lock']:
            app_state["last_backup_time"] = time.strftime('%Y-%m-%d %H:%M:%S %Z')
            log_entry(f"Backup complete!")
            app_state["status"] = "Idle"
    except Exception as e:
        # Top-level boundary of the background job: record the failure for the
        # UI and the server log instead of letting the thread die silently.
        error_message = f"An error occurred: {str(e)}"
        logging.error(error_message, exc_info=True)
        with app_state['lock']:
            app_state["logs"].append(f"ERROR: {error_message}")
            app_state["status"] = "Error"
# --- SCHEDULER THREAD ---
def scheduler_loop():
    """Run backups forever at the interval stored in ``app_state``.

    The configured interval (minutes) is re-read at least every 15 seconds,
    so a schedule change takes effect promptly instead of only after the
    previously scheduled sleep has fully elapsed. Runs are spaced from the
    start of the previous run; an interval of 0 disables scheduling. The
    first run after scheduling is enabled happens immediately.
    """
    poll_seconds = 15
    # Monotonic start time of the last scheduled run; None while disabled or
    # before the first run.
    last_started = None
    while True:
        with app_state['lock']:
            interval_minutes = app_state['schedule_interval']

        if interval_minutes <= 0:
            last_started = None
            time.sleep(poll_seconds)
            continue

        now = time.monotonic()
        due = now if last_started is None else last_started + interval_minutes * 60
        if now < due:
            # Sleep in short slices so interval changes are noticed.
            time.sleep(min(poll_seconds, due - now))
            continue

        last_started = now
        run_backup_job()
# --- FLASK ROUTES ---
@app.route("/")
def index():
    """Serve the dashboard page, rendering the repo id into the template."""
    page = render_template_string(HTML_TEMPLATE, repo_id=REPO_ID)
    return page
@app.route("/api/status")
def status():
    """Return the JSON-serializable part of the application state.

    Only these four keys are exposed; ``app_state`` also holds a lock and a
    thread object, which cannot be serialized to JSON.
    """
    exposed_keys = ("status", "logs", "last_backup_time", "schedule_interval")
    with app_state['lock']:
        snapshot = {key: app_state[key] for key in exposed_keys}
    return jsonify(snapshot)
@app.route("/api/start-backup", methods=["POST"])
def start_backup():
    """Start a backup on a background thread and respond immediately."""
    worker = threading.Thread(target=run_backup_job)
    worker.start()
    return jsonify(message="Backup process initiated.")
@app.route("/api/set-schedule", methods=["POST"])
def set_schedule():
    """Update the automatic backup interval (in minutes).

    Expects a JSON object such as ``{"interval": 30}``; a missing
    ``interval`` defaults to 0. Responds with HTTP 400 and a JSON error when
    the body is not a JSON object or the interval is not a non-negative
    integer.
    """
    invalid_message = "Invalid interval. Please provide a non-negative integer."

    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so every bad request gets the same JSON 400 response.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({"error": invalid_message}), 400

    try:
        interval = int(payload.get("interval", 0))
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers a non-finite float such as Infinity.
        return jsonify({"error": invalid_message}), 400
    if interval < 0:
        return jsonify({"error": invalid_message}), 400

    with app_state['lock']:
        app_state['schedule_interval'] = interval
    return jsonify({"message": f"Schedule updated to {interval} minutes."})
@app.route("/api/repo-files")
def get_repo_files():
    """Return the repository file list as JSON, or a JSON error with HTTP 500."""
    try:
        manager = HFManager(TOKEN, REPO_ID)
        listing = manager.list_files()
        return jsonify(files=listing)
    except Exception as err:
        return jsonify(error=str(err)), 500
@app.route("/api/delete-file", methods=["POST"])
def delete_repo_file():
    """Delete one file from the repository.

    Expects a JSON object such as ``{"path": "world/level.dat"}``. Responds
    with HTTP 400 when no usable path is supplied and HTTP 500 (with the
    error text) when the Hub operation fails.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so a bad request gets a JSON 400 rather than a server error.
    payload = request.get_json(silent=True)
    path = payload.get("path") if isinstance(payload, dict) else None
    if not path or not isinstance(path, str):
        return jsonify({"error": "File path not provided."}), 400
    try:
        hf_manager = HFManager(TOKEN, REPO_ID)
        hf_manager.delete_file(path)
        return jsonify({"message": f"Successfully deleted {path}"})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
# --- MAIN EXECUTION ---
if __name__ == "__main__":
    # Start the scheduler as a daemon thread so it never blocks shutdown,
    # keep a reference to it in the shared state, then serve the app.
    scheduler = threading.Thread(target=scheduler_loop, daemon=True)
    app_state["scheduler_thread"] = scheduler
    scheduler.start()
    app.run(host="0.0.0.0", port=7860)