Spaces:
Running
Running
""" | |
Utility functions for the AIBOM Generator. | |
""" | |
import json | |
import logging | |
import os | |
import re | |
import uuid | |
from typing import Dict, List, Optional, Any, Union, Tuple | |
from enum import Enum | |
logger = logging.getLogger(__name__) | |
# Validation severity levels | |
class ValidationSeverity(Enum):
    """Severity levels attached to AIBOM validation issues.

    The string ``.value`` is what gets written into each issue dict's
    ``"severity"`` field and compared when tallying report summaries.
    """

    ERROR = "error"  # hard failure: the AIBOM is considered invalid
    WARNING = "warning"  # should be fixed, but not fatal
    INFO = "info"  # purely informational suggestion
def setup_logging(level=logging.INFO):
    """Initialise root logging with a timestamped, level-tagged line format.

    Args:
        level: Logging threshold to apply (defaults to ``logging.INFO``).
    """
    message_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    logging.basicConfig(level=level, format=message_format, datefmt=timestamp_format)
def ensure_directory(directory_path):
    """Create ``directory_path`` (including parents) if it does not exist.

    Uses ``exist_ok=True`` instead of the previous ``os.path.exists()``
    guard, which had a check-then-create race: another process could
    create the directory between the check and the ``makedirs`` call and
    trigger ``FileExistsError``.

    Args:
        directory_path: Path of the directory to ensure.

    Returns:
        The same ``directory_path``, for convenient chaining.
    """
    os.makedirs(directory_path, exist_ok=True)
    return directory_path
def generate_uuid():
    """Return a freshly generated random (version 4) UUID as a string."""
    return f"{uuid.uuid4()}"
def normalize_license_id(license_text):
    """Map free-form license text to a canonical SPDX identifier.

    Matching happens on a normalized form (lowercased, all punctuation
    except '-' removed): first an exact match, then a longest-key-first
    substring match.

    Two defects fixed versus the original implementation:
    1. Mapping keys containing '.' (e.g. "lgpl-3.0") could never match
       exactly, because the input had its dots stripped but the keys did
       not; keys are now normalized the same way as the input.
    2. The substring fallback scanned in insertion order, so the short
       "gpl" key matched before "lgpl" and LGPL inputs were returned as
       "GPL-3.0-only"; longer keys are now tried first.

    Args:
        license_text: Raw license string (may be None or empty).

    Returns:
        The SPDX id when recognized, None for falsy input, otherwise the
        original text unchanged.
    """
    license_mappings = {
        "mit": "MIT",
        "apache": "Apache-2.0",
        "apache 2": "Apache-2.0",
        "apache 2.0": "Apache-2.0",
        "apache-2": "Apache-2.0",
        "apache-2.0": "Apache-2.0",
        "gpl": "GPL-3.0-only",
        "gpl-3": "GPL-3.0-only",
        "gpl-3.0": "GPL-3.0-only",
        "gpl3": "GPL-3.0-only",
        "gpl v3": "GPL-3.0-only",
        "gpl-2": "GPL-2.0-only",
        "gpl-2.0": "GPL-2.0-only",
        "gpl2": "GPL-2.0-only",
        "gpl v2": "GPL-2.0-only",
        "lgpl": "LGPL-3.0-only",
        "lgpl-3": "LGPL-3.0-only",
        "lgpl-3.0": "LGPL-3.0-only",
        "bsd": "BSD-3-Clause",
        "bsd-3": "BSD-3-Clause",
        "bsd-3-clause": "BSD-3-Clause",
        "bsd-2": "BSD-2-Clause",
        "bsd-2-clause": "BSD-2-Clause",
        "cc": "CC-BY-4.0",
        "cc-by": "CC-BY-4.0",
        "cc-by-4.0": "CC-BY-4.0",
        "cc-by-sa": "CC-BY-SA-4.0",
        "cc-by-sa-4.0": "CC-BY-SA-4.0",
        "cc-by-nc": "CC-BY-NC-4.0",
        "cc-by-nc-4.0": "CC-BY-NC-4.0",
        "cc0": "CC0-1.0",
        "cc0-1.0": "CC0-1.0",
        "public domain": "CC0-1.0",
        "unlicense": "Unlicense",
        "proprietary": "NONE",
        "commercial": "NONE",
    }
    if not license_text:
        return None

    def _norm(text):
        # Keep word chars, whitespace and hyphens so "Apache 2.0" and
        # "apache 20" compare equal after normalization.
        return re.sub(r'[^\w\s-]', '', text.lower())

    normalized = _norm(license_text)
    normalized_mappings = {_norm(key): value for key, value in license_mappings.items()}
    if normalized in normalized_mappings:
        return normalized_mappings[normalized]
    # Substring fallback: longest keys first so "lgpl" wins over "gpl".
    for key in sorted(normalized_mappings, key=len, reverse=True):
        if key in normalized:
            return normalized_mappings[key]
    return license_text
def validate_spdx(license_entry):
    """Check that a license entry uses only recognized SPDX identifiers.

    Args:
        license_entry: Either a single identifier string or a list of
            identifier strings.

    Returns:
        True when every identifier is in the recognized set, else False.
    """
    # Kept in sync with the ids normalize_license_id can emit; the set
    # previously omitted "CC-BY-NC-4.0", so that mapping's own output was
    # rejected here. A set also gives O(1) membership tests.
    spdx_licenses = {
        "MIT", "Apache-2.0", "GPL-3.0-only", "GPL-2.0-only", "LGPL-3.0-only",
        "BSD-3-Clause", "BSD-2-Clause", "CC-BY-4.0", "CC-BY-SA-4.0",
        "CC-BY-NC-4.0", "CC0-1.0", "Unlicense", "NONE",
    }
    if isinstance(license_entry, list):
        return all(lic in spdx_licenses for lic in license_entry)
    return license_entry in spdx_licenses
def _validate_ai_requirements(aibom: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Validate AI-specific requirements for an AIBOM.

    Args:
        aibom: The AIBOM to validate

    Returns:
        List of validation issues
    """
    issues: List[Dict[str, Any]] = []

    def record(severity: ValidationSeverity, code: str, message: str, path: str) -> None:
        # Issues are flat dicts so the report stays JSON-serializable.
        issues.append({
            "severity": severity.value,
            "code": code,
            "message": message,
            "path": path,
        })

    # With no components there is nothing further worth validating.
    if not aibom.get("components"):
        record(
            ValidationSeverity.ERROR,
            "MISSING_COMPONENTS",
            "AIBOM must contain at least one component",
            "$.components",
        )
        return issues

    # CycloneDX author objects may carry 'email' but never 'url'.
    if "metadata" in aibom and "authors" in aibom["metadata"]:
        for i, author in enumerate(aibom["metadata"]["authors"]):
            if "url" in author:
                record(
                    ValidationSeverity.ERROR,
                    "INVALID_AUTHOR_PROPERTY",
                    "Author object contains 'url' property which is not allowed in CycloneDX schema. Use 'email' instead.",
                    f"$.metadata.authors[{i}].url",
                )

    for i, component in enumerate(aibom["components"]):
        component_path = f"$.components[{i}]"

        # --- component type -------------------------------------------
        if "type" not in component:
            record(ValidationSeverity.ERROR, "MISSING_COMPONENT_TYPE",
                   "Component must have a type", f"{component_path}.type")
        elif component.get("type") != "machine-learning-model":
            record(ValidationSeverity.WARNING, "INVALID_COMPONENT_TYPE",
                   "Component type should be 'machine-learning-model' for AI components",
                   f"{component_path}.type")

        # --- PURL ------------------------------------------------------
        if "purl" not in component:
            record(ValidationSeverity.ERROR, "MISSING_PURL",
                   "Component must have a PURL", f"{component_path}.purl")
        else:
            purl = component["purl"]
            if not purl.startswith("pkg:"):
                record(ValidationSeverity.ERROR, "INVALID_PURL_FORMAT",
                       "PURL must start with 'pkg:'", f"{component_path}.purl")
            elif "huggingface" in purl and "@" not in purl and "version" in component:
                # Component declares a version but the purl lacks one.
                record(ValidationSeverity.WARNING, "MISSING_VERSION_IN_PURL",
                       "PURL should include version information with '@' for versioned components",
                       f"{component_path}.purl")

        # --- model card ------------------------------------------------
        if "modelCard" not in component:
            record(ValidationSeverity.WARNING, "MISSING_MODEL_CARD",
                   "AI component should include a model card",
                   f"{component_path}.modelCard")
        else:
            model_card = component["modelCard"]
            model_card_path = f"{component_path}.modelCard"
            if "modelParameters" not in model_card:
                record(ValidationSeverity.WARNING, "MISSING_MODEL_PARAMETERS",
                       "Model card should include model parameters",
                       f"{model_card_path}.modelParameters")
            if "considerations" not in model_card:
                record(ValidationSeverity.INFO, "MISSING_CONSIDERATIONS",
                       "Model card should include considerations section for ethical considerations, limitations, etc.",
                       f"{model_card_path}.considerations")

    # --- metadata ------------------------------------------------------
    if "metadata" not in aibom:
        record(ValidationSeverity.ERROR, "MISSING_METADATA",
               "AIBOM must contain metadata", "$.metadata")
    else:
        metadata = aibom["metadata"]
        metadata_path = "$.metadata"
        # Missing-or-empty checks share one shape; keep them table-driven.
        for key, severity, code, message in (
            ("tools", ValidationSeverity.WARNING, "MISSING_TOOLS",
             "Metadata should include tools that generated the AIBOM"),
            ("authors", ValidationSeverity.INFO, "MISSING_AUTHORS",
             "Metadata should include authors information"),
            ("properties", ValidationSeverity.INFO, "MISSING_PROPERTIES",
             "Metadata should include properties for additional information"),
        ):
            if not metadata.get(key):
                record(severity, code, message, f"{metadata_path}.{key}")

    return issues
def _generate_validation_recommendations(issues: List[Dict[str, Any]]) -> List[str]: | |
""" | |
Generate recommendations based on validation issues. | |
Args: | |
issues: List of validation issues | |
Returns: | |
List of recommendations | |
""" | |
recommendations = [] | |
# Group issues by code | |
issue_codes = set(issue["code"] for issue in issues) | |
# Generate recommendations based on issue codes | |
if "MISSING_COMPONENTS" in issue_codes: | |
recommendations.append("Add at least one component to the AIBOM") | |
if "MISSING_COMPONENT_TYPE" in issue_codes or "INVALID_COMPONENT_TYPE" in issue_codes: | |
recommendations.append("Ensure all AI components have type 'machine-learning-model'") | |
if "MISSING_PURL" in issue_codes or "INVALID_PURL_FORMAT" in issue_codes: | |
recommendations.append("Ensure all components have a valid PURL starting with 'pkg:'") | |
if "MISSING_VERSION_IN_PURL" in issue_codes: | |
recommendations.append("Include version information in PURLs using '@' syntax (e.g., pkg:huggingface/org/model@version)") | |
if "MISSING_MODEL_CARD" in issue_codes: | |
recommendations.append("Add a model card section to AI components") | |
if "MISSING_MODEL_PARAMETERS" in issue_codes: | |
recommendations.append("Include model parameters in the model card section") | |
if "MISSING_CONSIDERATIONS" in issue_codes: | |
recommendations.append("Add ethical considerations, limitations, and risks to the model card") | |
if "MISSING_METADATA" in issue_codes: | |
recommendations.append("Add metadata section to the AIBOM") | |
if "MISSING_TOOLS" in issue_codes: | |
recommendations.append("Include tools information in the metadata section") | |
if "MISSING_AUTHORS" in issue_codes: | |
recommendations.append("Add authors information to the metadata section") | |
if "MISSING_PROPERTIES" in issue_codes: | |
recommendations.append("Include additional properties in the metadata section") | |
if "INVALID_AUTHOR_PROPERTY" in issue_codes: | |
recommendations.append("Remove 'url' property from author objects and use 'email' instead to comply with CycloneDX schema") | |
return recommendations | |
def validate_aibom(aibom: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate an AIBOM against AI-specific requirements.

    Args:
        aibom: The AIBOM to validate

    Returns:
        Validation report with issues and recommendations
    """
    report: Dict[str, Any] = {
        "valid": True,
        "ai_valid": True,
        "issues": [],
        "recommendations": [],
        "summary": {"error_count": 0, "warning_count": 0, "info_count": 0},
    }

    ai_issues = _validate_ai_requirements(aibom)
    if ai_issues:
        # Any AI-specific issue marks the whole document invalid.
        report["valid"] = False
        report["ai_valid"] = False
        report["issues"].extend(ai_issues)

    report["recommendations"] = _generate_validation_recommendations(report["issues"])

    # Tally issues by severity; severities outside the known three are
    # simply not counted (same as the original if/elif chain).
    bucket_for = {
        ValidationSeverity.ERROR.value: "error_count",
        ValidationSeverity.WARNING.value: "warning_count",
        ValidationSeverity.INFO.value: "info_count",
    }
    for issue in report["issues"]:
        bucket = bucket_for.get(issue["severity"])
        if bucket is not None:
            report["summary"][bucket] += 1

    return report
def get_validation_summary(report: Dict[str, Any]) -> str:
    """
    Get a human-readable summary of the validation report.

    Args:
        report: Validation report

    Returns:
        Human-readable summary
    """
    # NOTE(review): the "β" status glyphs below look like mojibake
    # (presumably originally two distinct pass/fail marks) — confirm
    # against the original file's encoding before changing them.
    counts = report["summary"]
    parts = []
    if report["valid"]:
        parts.append("β AIBOM is valid and complies with AI requirements.\n")
    else:
        parts.append("β AIBOM validation failed.\n")
    parts.append("\nSummary:\n")
    parts.append(f"- Errors: {counts['error_count']}\n")
    parts.append(f"- Warnings: {counts['warning_count']}\n")
    parts.append(f"- Info: {counts['info_count']}\n")
    if not report["valid"]:
        parts.append("\nIssues:\n")
        for issue in report["issues"]:
            parts.append(
                f"- [{issue['severity'].upper()}] {issue['code']}: "
                f"{issue['message']} (at {issue['path']})\n"
            )
        parts.append("\nRecommendations:\n")
        for index, recommendation in enumerate(report["recommendations"], 1):
            parts.append(f"{index}. {recommendation}\n")
    return "".join(parts)
def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True) -> Dict[str, Any]:
    """
    Calculate completeness score for an AIBOM and optionally validate against AI requirements.

    Args:
        aibom: The AIBOM to score and validate
        validate: Whether to perform validation

    Returns:
        Dictionary containing score and validation results
    """
    # Per-field pass/fail markers keyed by a dotted field path.
    # NOTE(review): the "β" glyph appears in both the present and absent
    # branches — almost certainly mojibake of two distinct marks
    # (e.g. check/cross); confirm against the original file encoding.
    field_checklist = {}
    # Maximum attainable points per scoring section.
    max_scores = {
        "required_fields": 20,
        "metadata": 20,
        "component_basic": 20,
        "component_model_card": 30,
        "external_references": 10
    }

    # Required Fields (20 points max): 5 points per present top-level field.
    required_fields = ["bomFormat", "specVersion", "serialNumber", "version"]
    required_score = sum([5 if aibom.get(field) else 0 for field in required_fields])
    for field in required_fields:
        field_checklist[field] = "β" if aibom.get(field) else "β"

    # Metadata (20 points max): 5 points per present metadata field.
    metadata = aibom.get("metadata", {})
    metadata_fields = ["timestamp", "tools", "authors", "component"]
    metadata_score = sum([5 if metadata.get(field) else 0 for field in metadata_fields])
    for field in metadata_fields:
        field_checklist[f"metadata.{field}"] = "β" if metadata.get(field) else "β"

    # Component Basic Info (20 points max) — scored on the FIRST component only.
    components = aibom.get("components", [])
    component_score = 0
    if components:
        # Use the first component as specified in the design
        comp = components[0]
        comp_fields = ["type", "name", "bom-ref", "purl", "description", "licenses"]
        component_score = sum([
            2 if comp.get("type") else 0,
            4 if comp.get("name") else 0,
            2 if comp.get("bom-ref") else 0,
            # purl only scores when it is a HuggingFace purl
            4 if comp.get("purl") and re.match(r'^pkg:huggingface/.+', comp["purl"]) else 0,
            # descriptions of 20 chars or fewer earn nothing
            4 if comp.get("description") and len(comp["description"]) > 20 else 0,
            4 if comp.get("licenses") and validate_spdx(comp["licenses"]) else 0
        ])
        for field in comp_fields:
            field_checklist[f"component.{field}"] = "β" if comp.get(field) else "β"
            # Downgrade the mark when the field is present but fails the
            # same quality check used for scoring above.
            if field == "purl" and comp.get(field) and not re.match(r'^pkg:huggingface/.+', comp["purl"]):
                field_checklist[f"component.{field}"] = "β"
            if field == "description" and comp.get(field) and len(comp["description"]) <= 20:
                field_checklist[f"component.{field}"] = "β"
            if field == "licenses" and comp.get(field) and not validate_spdx(comp["licenses"]):
                field_checklist[f"component.{field}"] = "β"

    # Model Card Section (30 points max) — also first component only.
    model_card_score = 0
    if components:
        # Use the first component's model card as specified in the design
        comp = components[0]
        card = comp.get("modelCard", {})
        card_fields = ["modelParameters", "quantitativeAnalysis", "considerations"]
        model_card_score = sum([
            10 if card.get("modelParameters") else 0,
            10 if card.get("quantitativeAnalysis") else 0,
            # considerations must be substantial text (> 50 chars) to score
            10 if card.get("considerations") and len(card["considerations"]) > 50 else 0
        ])
        for field in card_fields:
            field_checklist[f"modelCard.{field}"] = "β" if field in card else "β"
            if field == "considerations" and field in card and len(card["considerations"]) <= 50:
                field_checklist[f"modelCard.{field}"] = "β"

    # External References (10 points max), accumulated then capped.
    ext_refs = aibom.get("externalReferences", [])
    ext_score = 0
    for ref in ext_refs:
        url = ref.get("url", "").lower()
        if "modelcard" in url:
            ext_score += 4
        elif "huggingface.co" in url or "github.com" in url:
            ext_score += 3
        elif "dataset" in url:
            ext_score += 3
    ext_score = min(ext_score, 10)
    field_checklist["externalReferences"] = "β" if ext_refs else "β"

    # Weighted total.
    # NOTE(review): each section score is already on its max scale
    # (20/20/20/30/10), so multiplying by fractional weights caps the
    # total at 22 rather than 100 — verify this is the intended scale.
    total_score = (
        (required_score * 0.20) +
        (metadata_score * 0.20) +
        (component_score * 0.20) +
        (model_card_score * 0.30) +
        (ext_score * 0.10)
    )

    result = {
        "total_score": round(total_score, 2),
        "section_scores": {
            "required_fields": required_score,
            "metadata": metadata_score,
            "component_basic": component_score,
            "component_model_card": model_card_score,
            "external_references": ext_score
        },
        "max_scores": max_scores,
        "field_checklist": field_checklist
    }

    # Add validation if requested
    if validate:
        validation_result = validate_aibom(aibom)
        result["validation"] = validation_result

        # Adjust score based on validation results
        if not validation_result["valid"]:
            # Count errors and warnings
            error_count = validation_result["summary"]["error_count"]
            warning_count = validation_result["summary"]["warning_count"]

            # Apply penalties to the score
            if error_count > 0:
                # Severe penalty for errors (up to 50% reduction)
                error_penalty = min(0.5, error_count * 0.1)
                result["total_score"] = round(result["total_score"] * (1 - error_penalty), 2)
                result["validation_penalty"] = f"-{int(error_penalty * 100)}% due to {error_count} schema errors"
            elif warning_count > 0:
                # Minor penalty for warnings (up to 20% reduction)
                warning_penalty = min(0.2, warning_count * 0.05)
                result["total_score"] = round(result["total_score"] * (1 - warning_penalty), 2)
                result["validation_penalty"] = f"-{int(warning_penalty * 100)}% due to {warning_count} schema warnings"

    return result
def merge_metadata(primary: Dict[str, Any], secondary: Dict[str, Any]) -> Dict[str, Any]:
    """Overlay *primary* onto *secondary*, recursing into nested dicts.

    None values in primary are skipped so they never erase data from
    secondary; secondary itself is not mutated.
    """
    merged = dict(secondary)
    for key, value in primary.items():
        if value is None:
            continue  # never let a None from primary clobber real data
        current = merged.get(key)
        if isinstance(value, dict) and isinstance(current, dict):
            merged[key] = merge_metadata(value, current)
        else:
            merged[key] = value
    return merged
def extract_model_id_parts(model_id: str) -> Dict[str, str]:
    """Split a HuggingFace model id into owner and name parts.

    "org/model" -> {"owner": "org", "name": "model"}. An id without a
    slash has no owner (None), and everything after the FIRST slash —
    including any further slashes — belongs to the name.
    """
    owner, separator, name = model_id.partition("/")
    if not separator:
        # No slash at all: the whole id is the model name.
        return {"owner": None, "name": model_id}
    return {"owner": owner, "name": name}
def create_purl(model_id: str) -> str:
    """Build a package URL (purl) string for a HuggingFace model id.

    An empty owner segment (e.g. "/name") is treated the same as having
    no owner, matching extract_model_id_parts' truthiness check.
    """
    owner, separator, remainder = model_id.partition("/")
    name = remainder if separator else model_id
    if owner and separator:
        return f"pkg:huggingface/{owner}/{name}"
    return f"pkg:huggingface/{name}"