Spaces:
Running
Running
Commit
·
76a0a50
1
Parent(s):
7f039e5
attempting salvage after Claude Code deleted all my precious data
Browse files
vms/ui/project/services/training.py
CHANGED
@@ -1118,32 +1118,22 @@ class TrainingService:
|
|
1118 |
step_num = int(checkpoint_dir.name.split("_")[-1])
|
1119 |
logger.info(f"Validating checkpoint at step {step_num}: {checkpoint_dir}")
|
1120 |
|
1121 |
-
# Check if the .metadata file exists
|
1122 |
metadata_file = checkpoint_dir / ".metadata"
|
1123 |
if not metadata_file.exists():
|
1124 |
logger.warning(f"Checkpoint {checkpoint_dir.name} is corrupted: missing .metadata file")
|
1125 |
corrupted_checkpoints.append(checkpoint_dir)
|
1126 |
continue
|
1127 |
|
1128 |
-
#
|
1129 |
-
|
1130 |
-
|
1131 |
-
|
1132 |
-
|
1133 |
-
|
1134 |
-
|
1135 |
-
|
1136 |
-
|
1137 |
-
# Clean up any corrupted checkpoints we found before this valid one
|
1138 |
-
if corrupted_checkpoints:
|
1139 |
-
self.cleanup_corrupted_checkpoints(corrupted_checkpoints)
|
1140 |
-
|
1141 |
-
return str(checkpoint_dir)
|
1142 |
-
|
1143 |
-
except (json.JSONDecodeError, IOError, ValueError) as e:
|
1144 |
-
logger.warning(f"Checkpoint {checkpoint_dir.name} is corrupted: failed to read .metadata: {e}")
|
1145 |
-
corrupted_checkpoints.append(checkpoint_dir)
|
1146 |
-
continue
|
1147 |
|
1148 |
# If we reach here, all checkpoints are corrupted
|
1149 |
if corrupted_checkpoints:
|
@@ -1684,4 +1674,22 @@ class TrainingService:
|
|
1684 |
return temp_zip_path
|
1685 |
except Exception as e:
|
1686 |
print(f"Failed to create zip: {str(e)}")
|
1687 |
-
raise gr.Error(f"Failed to create zip: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1118 |
step_num = int(checkpoint_dir.name.split("_")[-1])
|
1119 |
logger.info(f"Validating checkpoint at step {step_num}: {checkpoint_dir}")
|
1120 |
|
1121 |
+
# Check if the .metadata file exists (indicator of complete checkpoint)
|
1122 |
metadata_file = checkpoint_dir / ".metadata"
|
1123 |
if not metadata_file.exists():
|
1124 |
logger.warning(f"Checkpoint {checkpoint_dir.name} is corrupted: missing .metadata file")
|
1125 |
corrupted_checkpoints.append(checkpoint_dir)
|
1126 |
continue
|
1127 |
|
1128 |
+
# .metadata file exists, checkpoint is considered valid
|
1129 |
+
# We don't read the file contents to avoid encoding/parsing issues
|
1130 |
+
logger.info(f"Checkpoint {checkpoint_dir.name} is valid")
|
1131 |
+
|
1132 |
+
# Clean up any corrupted checkpoints we found before this valid one
|
1133 |
+
if corrupted_checkpoints:
|
1134 |
+
self.cleanup_corrupted_checkpoints(corrupted_checkpoints)
|
1135 |
+
|
1136 |
+
return str(checkpoint_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1137 |
|
1138 |
# If we reach here, all checkpoints are corrupted
|
1139 |
if corrupted_checkpoints:
|
|
|
1674 |
return temp_zip_path
|
1675 |
except Exception as e:
|
1676 |
print(f"Failed to create zip: {str(e)}")
|
1677 |
+
raise gr.Error(f"Failed to create zip: {str(e)}")
|
1678 |
+
|
1679 |
+
def create_output_directory_zip(self) -> str:
|
1680 |
+
"""Create a ZIP file containing all output data (checkpoints, models, etc.)
|
1681 |
+
|
1682 |
+
Returns:
|
1683 |
+
Path to created ZIP file
|
1684 |
+
"""
|
1685 |
+
# Create temporary zip file
|
1686 |
+
with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip:
|
1687 |
+
temp_zip_path = str(temp_zip.name)
|
1688 |
+
print(f"Creating zip file for {self.app.output_path}..")
|
1689 |
+
try:
|
1690 |
+
make_archive(self.app.output_path, temp_zip_path)
|
1691 |
+
print(f"Output zip file created!")
|
1692 |
+
return temp_zip_path
|
1693 |
+
except Exception as e:
|
1694 |
+
print(f"Failed to create output zip: {str(e)}")
|
1695 |
+
raise gr.Error(f"Failed to create output zip: {str(e)}")
|
vms/ui/project/tabs/manage_tab.py
CHANGED
@@ -75,6 +75,12 @@ class ManageTab(BaseTab):
|
|
75 |
variant="secondary",
|
76 |
size="lg"
|
77 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
with gr.Row():
|
79 |
with gr.Column():
|
80 |
gr.Markdown("## 📡 Publish your model")
|
@@ -213,6 +219,11 @@ class ManageTab(BaseTab):
|
|
213 |
outputs=[self.components["download_model_btn"]]
|
214 |
)
|
215 |
|
|
|
|
|
|
|
|
|
|
|
216 |
# Dataset deletion with modal
|
217 |
self.components["delete_dataset_btn"].click(
|
218 |
fn=lambda: Modal(visible=True),
|
|
|
75 |
variant="secondary",
|
76 |
size="lg"
|
77 |
)
|
78 |
+
|
79 |
+
self.components["download_output_btn"] = gr.DownloadButton(
|
80 |
+
"📁 Download output directory (.zip)",
|
81 |
+
variant="secondary",
|
82 |
+
size="lg"
|
83 |
+
)
|
84 |
with gr.Row():
|
85 |
with gr.Column():
|
86 |
gr.Markdown("## 📡 Publish your model")
|
|
|
219 |
outputs=[self.components["download_model_btn"]]
|
220 |
)
|
221 |
|
222 |
+
self.components["download_output_btn"].click(
|
223 |
+
fn=self.app.training.create_output_directory_zip,
|
224 |
+
outputs=[self.components["download_output_btn"]]
|
225 |
+
)
|
226 |
+
|
227 |
# Dataset deletion with modal
|
228 |
self.components["delete_dataset_btn"].click(
|
229 |
fn=lambda: Modal(visible=True),
|