ryanoakley committed on
Commit
79da033
·
verified ·
1 Parent(s): 7dc4dbc

Update src/aibom-generator/generator.py

Browse files
Files changed (1) hide show
  1. src/aibom-generator/generator.py +115 -145
src/aibom-generator/generator.py CHANGED
@@ -81,12 +81,6 @@ class AIBOMGenerator:
81
  # Calculate final score with industry-neutral approach if enabled
82
  final_score = calculate_completeness_score(aibom, validate=True, use_best_practices=use_best_practices)
83
 
84
- # Ensure metadata.properties exists
85
- if "metadata" in aibom and "properties" not in aibom["metadata"]:
86
- aibom["metadata"]["properties"] = []
87
-
88
- # Note: Quality score information is no longer added to the AIBOM metadata
89
- # This was removed as requested by the user
90
 
91
  if output_file:
92
  with open(output_file, 'w') as f:
@@ -214,16 +208,17 @@ class AIBOMGenerator:
214
  ]
215
  }
216
 
217
- # Add downloadLocation if available
 
 
 
 
 
218
  if metadata and "commit_url" in metadata:
219
- # Add external reference for downloadLocation
220
- if "externalReferences" not in aibom:
221
- aibom["externalReferences"] = []
222
-
223
  aibom["externalReferences"].append({
224
- "type": "distribution",
225
- "url": f"https://huggingface.co/{model_id}"
226
- })
227
 
228
  return aibom
229
 
@@ -234,22 +229,30 @@ class AIBOMGenerator:
234
  model_card: Optional[ModelCard],
235
  ) -> Dict[str, Any]:
236
  metadata = {}
237
-
238
  if model_info:
239
  try:
 
 
 
 
 
 
 
 
240
  metadata.update({
241
- "name": model_info.modelId.split("/")[-1] if hasattr(model_info, "modelId") else model_id.split("/")[-1],
242
- "author": model_info.author if hasattr(model_info, "author") else None,
243
- "tags": model_info.tags if hasattr(model_info, "tags") else [],
244
- "pipeline_tag": model_info.pipeline_tag if hasattr(model_info, "pipeline_tag") else None,
245
- "downloads": model_info.downloads if hasattr(model_info, "downloads") else 0,
246
- "last_modified": model_info.lastModified if hasattr(model_info, "lastModified") else None,
247
- "commit": model_info.sha[:7] if hasattr(model_info, "sha") and model_info.sha else None,
248
- "commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if hasattr(model_info, "sha") and model_info.sha else None,
249
  })
250
  except Exception as e:
251
  print(f"Error extracting model info metadata: {e}")
252
-
253
  if model_card and hasattr(model_card, "data") and model_card.data:
254
  try:
255
  card_data = model_card.data.to_dict() if hasattr(model_card.data, "to_dict") else {}
@@ -267,104 +270,35 @@ class AIBOMGenerator:
267
  metadata["eval_results"] = model_card.data.eval_results
268
  except Exception as e:
269
  print(f"Error extracting model card metadata: {e}")
270
-
271
  metadata["ai:type"] = "Transformer"
272
  metadata["ai:task"] = metadata.get("pipeline_tag", "Text Generation")
273
  metadata["ai:framework"] = "PyTorch" if "transformers" in metadata.get("library_name", "") else "Unknown"
274
-
275
- # Add fields for industry-neutral scoring (silently aligned with SPDX)
276
- metadata["primaryPurpose"] = metadata.get("ai:task", "Text Generation")
277
- metadata["suppliedBy"] = metadata.get("author", "Unknown")
278
-
279
- # Add typeOfModel field
 
 
 
280
  metadata["typeOfModel"] = metadata.get("ai:type", "Transformer")
281
-
 
 
 
 
282
  return {k: v for k, v in metadata.items() if v is not None}
 
283
 
284
  def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
285
  """
286
- Extract additional metadata from model card using BERT model.
287
- This is a placeholder implementation that would be replaced with actual BERT inference.
288
-
289
- In a real implementation, this would:
290
- 1. Extract text from model card
291
- 2. Use BERT to identify key information
292
- 3. Structure the extracted information
293
-
294
- For now, we'll simulate this with some basic extraction logic.
295
  """
296
- enhanced_metadata = {}
297
 
298
- # In a real implementation, we would use a BERT model here
299
- # Since we can't install the required libraries due to space constraints,
300
- # we'll simulate the enhancement with a placeholder implementation
301
-
302
- if model_card and hasattr(model_card, "text") and model_card.text:
303
- try:
304
- card_text = model_card.text
305
-
306
- # Simulate BERT extraction with basic text analysis
307
- # In reality, this would be done with NLP models
308
-
309
- # Extract description if missing
310
- if card_text and "description" not in enhanced_metadata:
311
- # Take first paragraph that's longer than 20 chars as description
312
- paragraphs = [p.strip() for p in card_text.split('\n\n')]
313
- for p in paragraphs:
314
- if len(p) > 20 and not p.startswith('#'):
315
- enhanced_metadata["description"] = p
316
- break
317
-
318
- # Extract limitations if present
319
- if "limitations" not in enhanced_metadata:
320
- if "## Limitations" in card_text:
321
- limitations_section = card_text.split("## Limitations")[1].split("##")[0].strip()
322
- if limitations_section:
323
- enhanced_metadata["limitations"] = limitations_section
324
-
325
- # Extract ethical considerations if present
326
- if "ethical_considerations" not in enhanced_metadata:
327
- for heading in ["## Ethical Considerations", "## Ethics", "## Bias"]:
328
- if heading in card_text:
329
- section = card_text.split(heading)[1].split("##")[0].strip()
330
- if section:
331
- enhanced_metadata["ethical_considerations"] = section
332
- break
333
-
334
- # Extract risks if present
335
- if "risks" not in enhanced_metadata:
336
- if "## Risks" in card_text:
337
- risks_section = card_text.split("## Risks")[1].split("##")[0].strip()
338
- if risks_section:
339
- enhanced_metadata["risks"] = risks_section
340
-
341
- # Extract datasets if present
342
- if "datasets" not in enhanced_metadata:
343
- datasets = []
344
- if "## Dataset" in card_text or "## Datasets" in card_text:
345
- dataset_section = ""
346
- if "## Dataset" in card_text:
347
- dataset_section = card_text.split("## Dataset")[1].split("##")[0].strip()
348
- elif "## Datasets" in card_text:
349
- dataset_section = card_text.split("## Datasets")[1].split("##")[0].strip()
350
-
351
- if dataset_section:
352
- # Simple parsing to extract dataset names
353
- lines = dataset_section.split("\n")
354
- for line in lines:
355
- if line.strip() and not line.startswith("#"):
356
- datasets.append({
357
- "type": "dataset",
358
- "name": line.strip().split()[0] if line.strip().split() else "Unknown",
359
- "description": line.strip()
360
- })
361
-
362
- if datasets:
363
- enhanced_metadata["datasets"] = datasets
364
- except Exception as e:
365
- print(f"Error extracting unstructured metadata: {e}")
366
-
367
- return enhanced_metadata
368
 
369
  def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
370
  timestamp = datetime.datetime.utcnow().isoformat() + "Z"
@@ -419,10 +353,25 @@ class AIBOMGenerator:
419
  # Add copyright
420
  component["copyright"] = "NOASSERTION"
421
 
422
- # Create properties array for additional metadata
423
  properties = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  for key, value in metadata.items():
425
- if key not in ["name", "author", "license", "description", "commit"] and value is not None:
426
  if isinstance(value, (list, dict)):
427
  if not isinstance(value, str):
428
  value = json.dumps(value)
@@ -432,12 +381,10 @@ class AIBOMGenerator:
432
  metadata_section = {
433
  "timestamp": timestamp,
434
  "tools": tools,
435
- "component": component
 
436
  }
437
 
438
- if properties:
439
- metadata_section["properties"] = properties
440
-
441
  return metadata_section
442
 
443
  def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
@@ -465,18 +412,29 @@ class AIBOMGenerator:
465
  "purl": purl
466
  }
467
 
468
- # Add licenses if available
469
- if "license" in metadata:
470
  component["licenses"] = [{
471
  "license": {
472
  "id": metadata["license"],
473
  "url": self._get_license_url(metadata["license"])
474
  }
475
  }]
476
-
477
- # Add description if available
478
- if "description" in metadata:
479
- component["description"] = metadata["description"]
 
 
 
 
 
 
 
 
 
 
 
480
 
481
  # Add external references
482
  external_refs = [{
@@ -490,17 +448,18 @@ class AIBOMGenerator:
490
  })
491
  component["externalReferences"] = external_refs
492
 
493
- # Add authors, publisher, supplier, manufacturer
494
- if "author" in metadata and metadata["author"]:
495
- component["authors"] = [{"name": metadata["author"]}]
496
- component["publisher"] = metadata["author"]
 
497
  component["supplier"] = {
498
- "name": metadata["author"],
499
- "url": [f"https://huggingface.co/{metadata['author']}"]
500
  }
501
  component["manufacturer"] = {
502
- "name": metadata["author"],
503
- "url": [f"https://huggingface.co/{metadata['author']}"]
504
  }
505
 
506
  # Add copyright
@@ -593,19 +552,30 @@ class AIBOMGenerator:
593
  def _get_license_url(self, license_id: str) -> str:
594
  """Get the URL for a license based on its SPDX ID."""
595
  license_urls = {
596
- "Apache-2.0": "https://www.apache.org/licenses/LICENSE-2.0",
597
- "MIT": "https://opensource.org/licenses/MIT",
598
- "BSD-3-Clause": "https://opensource.org/licenses/BSD-3-Clause",
599
- "GPL-3.0": "https://www.gnu.org/licenses/gpl-3.0.en.html",
600
- "CC-BY-4.0": "https://creativecommons.org/licenses/by/4.0/",
601
- "CC-BY-SA-4.0": "https://creativecommons.org/licenses/by-sa/4.0/",
602
- "CC-BY-NC-4.0": "https://creativecommons.org/licenses/by-nc/4.0/",
603
- "CC-BY-ND-4.0": "https://creativecommons.org/licenses/by-nd/4.0/",
604
- "CC-BY-NC-SA-4.0": "https://creativecommons.org/licenses/by-nc-sa/4.0/",
605
- "CC-BY-NC-ND-4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/",
606
- "LGPL-3.0": "https://www.gnu.org/licenses/lgpl-3.0.en.html",
607
- "MPL-2.0": "https://www.mozilla.org/en-US/MPL/2.0/",
608
  }
609
 
610
- return license_urls.get(license_id, "https://spdx.org/licenses/")
611
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  # Calculate final score with industry-neutral approach if enabled
82
  final_score = calculate_completeness_score(aibom, validate=True, use_best_practices=use_best_practices)
83
 
 
 
 
 
 
 
84
 
85
  if output_file:
86
  with open(output_file, 'w') as f:
 
208
  ]
209
  }
210
 
211
+ # ALWAYS add root-level external references
212
+ aibom["externalReferences"] = [{
213
+ "type": "distribution",
214
+ "url": f"https://huggingface.co/{model_id}"
215
+ }]
216
+
217
  if metadata and "commit_url" in metadata:
 
 
 
 
218
  aibom["externalReferences"].append({
219
+ "type": "vcs",
220
+ "url": metadata["commit_url"]
221
+ } )
222
 
223
  return aibom
224
 
 
229
  model_card: Optional[ModelCard],
230
  ) -> Dict[str, Any]:
231
  metadata = {}
232
+
233
  if model_info:
234
  try:
235
+ author = getattr(model_info, "author", None)
236
+ if not author or author.strip() == "":
237
+ parts = model_id.split("/")
238
+ author = parts[0] if len(parts) > 1 else "unknown"
239
+ print(f"DEBUG: Fallback author used: {author}")
240
+ else:
241
+ print(f"DEBUG: Author from model_info: {author}")
242
+
243
  metadata.update({
244
+ "name": getattr(model_info, "modelId", model_id).split("/")[-1],
245
+ "author": author,
246
+ "tags": getattr(model_info, "tags", []),
247
+ "pipeline_tag": getattr(model_info, "pipeline_tag", None),
248
+ "downloads": getattr(model_info, "downloads", 0),
249
+ "last_modified": getattr(model_info, "lastModified", None),
250
+ "commit": getattr(model_info, "sha", None)[:7] if getattr(model_info, "sha", None) else None,
251
+ "commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if getattr(model_info, "sha", None) else None,
252
  })
253
  except Exception as e:
254
  print(f"Error extracting model info metadata: {e}")
255
+
256
  if model_card and hasattr(model_card, "data") and model_card.data:
257
  try:
258
  card_data = model_card.data.to_dict() if hasattr(model_card.data, "to_dict") else {}
 
270
  metadata["eval_results"] = model_card.data.eval_results
271
  except Exception as e:
272
  print(f"Error extracting model card metadata: {e}")
273
+
274
  metadata["ai:type"] = "Transformer"
275
  metadata["ai:task"] = metadata.get("pipeline_tag", "Text Generation")
276
  metadata["ai:framework"] = "PyTorch" if "transformers" in metadata.get("library_name", "") else "Unknown"
277
+
278
+ metadata["primaryPurpose"] = metadata.get("ai:task", "text-generation")
279
+
280
+ # Use model owner as fallback for suppliedBy if no author
281
+ if not metadata.get("author"):
282
+ parts = model_id.split("/")
283
+ metadata["author"] = parts[0] if len(parts) > 1 else "unknown"
284
+
285
+ metadata["suppliedBy"] = metadata.get("author", "unknown")
286
  metadata["typeOfModel"] = metadata.get("ai:type", "Transformer")
287
+
288
+ print(f"DEBUG: Final metadata['author'] = {metadata.get('author')}")
289
+ print(f"DEBUG: Adding primaryPurpose = {metadata.get('ai:task', 'Text Generation')}")
290
+ print(f"DEBUG: Adding suppliedBy = {metadata.get('suppliedBy')}")
291
+
292
  return {k: v for k, v in metadata.items() if v is not None}
293
+
294
 
295
  def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
296
  """
297
+ Placeholder for future AI enhancement.
298
+ Currently returns empty dict since AI enhancement is not implemented.
 
 
 
 
 
 
 
299
  """
300
+ return {}
301
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
304
  timestamp = datetime.datetime.utcnow().isoformat() + "Z"
 
353
  # Add copyright
354
  component["copyright"] = "NOASSERTION"
355
 
356
+ # Create properties array for additional metadata (ALWAYS include critical fields)
357
  properties = []
358
+
359
+ # ALWAYS add critical fields for scoring
360
+ critical_fields = {
361
+ "primaryPurpose": metadata.get("primaryPurpose", metadata.get("ai:task", "text-generation")),
362
+ "suppliedBy": metadata.get("suppliedBy", metadata.get("author", "unknown")),
363
+ "typeOfModel": metadata.get("ai:type", "transformer")
364
+ }
365
+
366
+ # Add critical fields first
367
+ for key, value in critical_fields.items():
368
+ if value and value != "unknown":
369
+ properties.append({"name": key, "value": str(value)})
370
+
371
+ # Add other metadata fields (excluding basic component fields)
372
+ excluded_fields = ["name", "author", "license", "description", "commit", "primaryPurpose", "suppliedBy", "typeOfModel"]
373
  for key, value in metadata.items():
374
+ if key not in excluded_fields and value is not None:
375
  if isinstance(value, (list, dict)):
376
  if not isinstance(value, str):
377
  value = json.dumps(value)
 
381
  metadata_section = {
382
  "timestamp": timestamp,
383
  "tools": tools,
384
+ "component": component,
385
+ "properties": properties # ALWAYS include properties
386
  }
387
 
 
 
 
388
  return metadata_section
389
 
390
  def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
 
412
  "purl": purl
413
  }
414
 
415
+ # ALWAYS add licenses (use default if not available)
416
+ if metadata and "license" in metadata and metadata["license"]:
417
  component["licenses"] = [{
418
  "license": {
419
  "id": metadata["license"],
420
  "url": self._get_license_url(metadata["license"])
421
  }
422
  }]
423
+ else:
424
+ # Add default license structure for consistency
425
+ component["licenses"] = [{
426
+ "license": {
427
+ "id": "unknown",
428
+ "url": "https://spdx.org/licenses/"
429
+ }
430
+ }]
431
+ # Debug
432
+ print(f"DEBUG: License in metadata: {'license' in metadata}" )
433
+ if "license" in metadata:
434
+ print(f"DEBUG: Adding licenses = {metadata['license']}")
435
+
436
+ # ALWAYS add description
437
+ component["description"] = metadata.get("description", f"AI model {model_id}")
438
 
439
  # Add external references
440
  external_refs = [{
 
448
  })
449
  component["externalReferences"] = external_refs
450
 
451
+ # ALWAYS add author information (use model owner if not available )
452
+ author_name = metadata.get("author", group if group else "unknown")
453
+ if author_name and author_name != "unknown":
454
+ component["authors"] = [{"name": author_name}]
455
+ component["publisher"] = author_name
456
  component["supplier"] = {
457
+ "name": author_name,
458
+ "url": [f"https://huggingface.co/{author_name}"]
459
  }
460
  component["manufacturer"] = {
461
+ "name": author_name,
462
+ "url": [f"https://huggingface.co/{author_name}"]
463
  }
464
 
465
  # Add copyright
 
552
  def _get_license_url(self, license_id: str) -> str:
553
  """Get the URL for a license based on its SPDX ID."""
554
  license_urls = {
555
+ "apache-2.0": "https://www.apache.org/licenses/LICENSE-2.0",
556
+ "mit": "https://opensource.org/licenses/MIT",
557
+ "bsd-3-clause": "https://opensource.org/licenses/BSD-3-Clause",
558
+ "gpl-3.0": "https://www.gnu.org/licenses/gpl-3.0.en.html",
559
+ "cc-by-4.0": "https://creativecommons.org/licenses/by/4.0/",
560
+ "cc-by-sa-4.0": "https://creativecommons.org/licenses/by-sa/4.0/",
561
+ "cc-by-nc-4.0": "https://creativecommons.org/licenses/by-nc/4.0/",
562
+ "cc-by-nd-4.0": "https://creativecommons.org/licenses/by-nd/4.0/",
563
+ "cc-by-nc-sa-4.0": "https://creativecommons.org/licenses/by-nc-sa/4.0/",
564
+ "cc-by-nc-nd-4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/",
565
+ "lgpl-3.0": "https://www.gnu.org/licenses/lgpl-3.0.en.html",
566
+ "mpl-2.0": "https://www.mozilla.org/en-US/MPL/2.0/",
567
  }
568
 
569
+ return license_urls.get(license_id.lower(), "https://spdx.org/licenses/" )
570
 
571
+ def _fetch_with_retry(self, fetch_func, *args, max_retries=3, **kwargs):
572
+ """Fetch data with retry logic for network failures."""
573
+ for attempt in range(max_retries):
574
+ try:
575
+ return fetch_func(*args, **kwargs)
576
+ except Exception as e:
577
+ if attempt == max_retries - 1:
578
+ logger.warning(f"Failed to fetch after {max_retries} attempts: {e}")
579
+ return None
580
+ time.sleep(1 * (attempt + 1)) # Exponential backoff
581
+ return None