jbilcke-hf HF Staff committed on
Commit
76a0a50
·
1 Parent(s): 7f039e5

attempting salvage after Claude Code deleted all my precious data

Browse files
vms/ui/project/services/training.py CHANGED
@@ -1118,32 +1118,22 @@ class TrainingService:
1118
  step_num = int(checkpoint_dir.name.split("_")[-1])
1119
  logger.info(f"Validating checkpoint at step {step_num}: {checkpoint_dir}")
1120
 
1121
- # Check if the .metadata file exists
1122
  metadata_file = checkpoint_dir / ".metadata"
1123
  if not metadata_file.exists():
1124
  logger.warning(f"Checkpoint {checkpoint_dir.name} is corrupted: missing .metadata file")
1125
  corrupted_checkpoints.append(checkpoint_dir)
1126
  continue
1127
 
1128
- # Try to read the metadata file to ensure it's not corrupted
1129
- try:
1130
- with open(metadata_file, 'r') as f:
1131
- metadata = json.load(f)
1132
- # Basic validation - metadata should have expected structure
1133
- if not isinstance(metadata, dict):
1134
- raise ValueError("Invalid metadata format")
1135
- logger.info(f"Checkpoint {checkpoint_dir.name} is valid")
1136
-
1137
- # Clean up any corrupted checkpoints we found before this valid one
1138
- if corrupted_checkpoints:
1139
- self.cleanup_corrupted_checkpoints(corrupted_checkpoints)
1140
-
1141
- return str(checkpoint_dir)
1142
-
1143
- except (json.JSONDecodeError, IOError, ValueError) as e:
1144
- logger.warning(f"Checkpoint {checkpoint_dir.name} is corrupted: failed to read .metadata: {e}")
1145
- corrupted_checkpoints.append(checkpoint_dir)
1146
- continue
1147
 
1148
  # If we reach here, all checkpoints are corrupted
1149
  if corrupted_checkpoints:
@@ -1684,4 +1674,22 @@ class TrainingService:
1684
  return temp_zip_path
1685
  except Exception as e:
1686
  print(f"Failed to create zip: {str(e)}")
1687
- raise gr.Error(f"Failed to create zip: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1118
  step_num = int(checkpoint_dir.name.split("_")[-1])
1119
  logger.info(f"Validating checkpoint at step {step_num}: {checkpoint_dir}")
1120
 
1121
+ # Check if the .metadata file exists (indicator of complete checkpoint)
1122
  metadata_file = checkpoint_dir / ".metadata"
1123
  if not metadata_file.exists():
1124
  logger.warning(f"Checkpoint {checkpoint_dir.name} is corrupted: missing .metadata file")
1125
  corrupted_checkpoints.append(checkpoint_dir)
1126
  continue
1127
 
1128
+ # .metadata file exists, checkpoint is considered valid
1129
+ # We don't read the file contents to avoid encoding/parsing issues
1130
+ logger.info(f"Checkpoint {checkpoint_dir.name} is valid")
1131
+
1132
+ # Clean up any corrupted checkpoints we found before this valid one
1133
+ if corrupted_checkpoints:
1134
+ self.cleanup_corrupted_checkpoints(corrupted_checkpoints)
1135
+
1136
+ return str(checkpoint_dir)
 
 
 
 
 
 
 
 
 
 
1137
 
1138
  # If we reach here, all checkpoints are corrupted
1139
  if corrupted_checkpoints:
 
1674
  return temp_zip_path
1675
  except Exception as e:
1676
  print(f"Failed to create zip: {str(e)}")
1677
+ raise gr.Error(f"Failed to create zip: {str(e)}")
1678
+
1679
def create_output_directory_zip(self) -> str:
    """Create a ZIP file from the application's output directory.

    A uniquely named temporary ``.zip`` path is reserved first, then
    ``make_archive`` is called with ``self.app.output_path`` and that path.

    Returns:
        Path to the created ZIP file. The file is created with
        ``delete=False``, so it is not removed automatically afterwards.

    Raises:
        gr.Error: If creating the archive fails for any reason. The
            temporary file reserved for the archive is removed first.
    """
    import os

    # Reserve a unique path; delete=False keeps the file once the handle closes.
    with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip:
        temp_zip_path = temp_zip.name

    print(f"Creating zip file for {self.app.output_path}..")
    try:
        make_archive(self.app.output_path, temp_zip_path)
    except Exception as e:
        # Best-effort cleanup so a failed attempt does not leave an
        # orphaned temp file behind on every retry.
        try:
            os.remove(temp_zip_path)
        except OSError:
            pass
        print(f"Failed to create output zip: {str(e)}")
        raise gr.Error(f"Failed to create output zip: {str(e)}") from e

    print("Output zip file created!")
    return temp_zip_path
vms/ui/project/tabs/manage_tab.py CHANGED
@@ -75,6 +75,12 @@ class ManageTab(BaseTab):
75
  variant="secondary",
76
  size="lg"
77
  )
 
 
 
 
 
 
78
  with gr.Row():
79
  with gr.Column():
80
  gr.Markdown("## 📡 Publish your model")
@@ -213,6 +219,11 @@ class ManageTab(BaseTab):
213
  outputs=[self.components["download_model_btn"]]
214
  )
215
 
 
 
 
 
 
216
  # Dataset deletion with modal
217
  self.components["delete_dataset_btn"].click(
218
  fn=lambda: Modal(visible=True),
 
75
  variant="secondary",
76
  size="lg"
77
  )
78
+
79
+ self.components["download_output_btn"] = gr.DownloadButton(
80
+ "📁 Download output directory (.zip)",
81
+ variant="secondary",
82
+ size="lg"
83
+ )
84
  with gr.Row():
85
  with gr.Column():
86
  gr.Markdown("## 📡 Publish your model")
 
219
  outputs=[self.components["download_model_btn"]]
220
  )
221
 
222
+ self.components["download_output_btn"].click(
223
+ fn=self.app.training.create_output_directory_zip,
224
+ outputs=[self.components["download_output_btn"]]
225
+ )
226
+
227
  # Dataset deletion with modal
228
  self.components["delete_dataset_btn"].click(
229
  fn=lambda: Modal(visible=True),