a1c00l's picture
Update src/aibom_generator/utils.py
95c032e verified
raw
history blame
20.4 kB
"""
Utility functions for the AIBOM Generator.
"""
import json
import logging
import os
import re
import uuid
from typing import Dict, List, Optional, Any, Union, Tuple
from enum import Enum
logger = logging.getLogger(__name__)
# Validation severity levels
class ValidationSeverity(Enum):
    """Severity levels for validation issues.

    The member values are the plain strings stored under the ``"severity"``
    key of each issue record.
    """

    ERROR = "error"
    WARNING = "warning"
    INFO = "info"
def setup_logging(level=logging.INFO):
    """
    Configure logging through ``logging.basicConfig``.
    Args:
        level: Logging threshold handed to ``basicConfig`` (default ``logging.INFO``)
    """
    record_layout = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    timestamp_layout = "%Y-%m-%d %H:%M:%S"
    logging.basicConfig(level=level, format=record_layout, datefmt=timestamp_layout)
def ensure_directory(directory_path):
    """
    Create a directory (including missing parents) if needed and return its path.
    Args:
        directory_path: Path of the directory that must exist
    Returns:
        The ``directory_path`` argument, unchanged
    Raises:
        FileExistsError: If the path exists but is not a directory
        OSError: If the directory cannot be created
    """
    # exist_ok=True replaces a separate existence check: there is no window in
    # which another process can create the directory between check and
    # creation, and a non-directory at that path is reported instead of being
    # silently accepted.
    os.makedirs(directory_path, exist_ok=True)
    return directory_path
def generate_uuid():
    """
    Generate a random identifier.
    Returns:
        String form of a new ``uuid.uuid4()`` value
    """
    fresh_id = uuid.uuid4()
    return str(fresh_id)
def normalize_license_id(license_text):
    """
    Map a free-form license name to the SPDX-style identifier used by this module.

    The text is lower-cased and stripped of punctuation other than ``-`` and
    ``.``; dots are kept so that versioned aliases such as ``"gpl-2.0"`` can
    match exactly. An exact alias match wins. Otherwise the longest alias that
    starts a word in the text is used, so specific aliases (``"lgpl"``,
    ``"gpl-2"``, ``"cc0"``) are not shadowed by shorter ones (``"gpl"``,
    ``"cc"``) and ``"mit"`` does not fire inside words such as ``"limited"``.
    Args:
        license_text: License name or identifier to normalize
    Returns:
        The mapped identifier, ``None`` when ``license_text`` is empty or
        ``None``, or ``license_text`` unchanged when no alias matches
    """
    license_mappings = {
        "mit": "MIT",
        "apache": "Apache-2.0",
        "apache 2": "Apache-2.0",
        "apache 2.0": "Apache-2.0",
        "apache-2": "Apache-2.0",
        "apache-2.0": "Apache-2.0",
        "gpl": "GPL-3.0-only",
        "gpl-3": "GPL-3.0-only",
        "gpl-3.0": "GPL-3.0-only",
        "gpl3": "GPL-3.0-only",
        "gpl v3": "GPL-3.0-only",
        "gpl-2": "GPL-2.0-only",
        "gpl-2.0": "GPL-2.0-only",
        "gpl2": "GPL-2.0-only",
        "gpl v2": "GPL-2.0-only",
        "lgpl": "LGPL-3.0-only",
        "lgpl-3": "LGPL-3.0-only",
        "lgpl-3.0": "LGPL-3.0-only",
        "bsd": "BSD-3-Clause",
        "bsd-3": "BSD-3-Clause",
        "bsd-3-clause": "BSD-3-Clause",
        "bsd-2": "BSD-2-Clause",
        "bsd-2-clause": "BSD-2-Clause",
        "cc": "CC-BY-4.0",
        "cc-by": "CC-BY-4.0",
        "cc-by-4.0": "CC-BY-4.0",
        "cc-by-sa": "CC-BY-SA-4.0",
        "cc-by-sa-4.0": "CC-BY-SA-4.0",
        "cc-by-nc": "CC-BY-NC-4.0",
        "cc-by-nc-4.0": "CC-BY-NC-4.0",
        "cc0": "CC0-1.0",
        "cc0-1.0": "CC0-1.0",
        "public domain": "CC0-1.0",
        "unlicense": "Unlicense",
        "proprietary": "NONE",
        "commercial": "NONE",
    }
    if not license_text:
        return None

    # Keep '.' (unlike a plain [^\w\s-] filter) so "apache-2.0" stays
    # "apache-2.0" instead of collapsing to "apache-20".
    cleaned = re.sub(r"[^\w\s.-]", "", license_text.lower())
    # Collapse runs of whitespace and drop sentence-style leading/trailing dots.
    normalized = " ".join(cleaned.split()).strip(".")

    if normalized in license_mappings:
        return license_mappings[normalized]

    # Longest alias first: otherwise "gpl" would claim "gpl-2.0 license" and
    # "cc" would claim "cc0 ..." before the specific alias is even tried.
    # sorted() is stable, so equally long aliases keep their table order.
    for key in sorted(license_mappings, key=len, reverse=True):
        # The alias must start a word; this stops "mit" matching "limited"
        # and "gpl" matching "agpl".
        if re.search(r"(?<![a-z0-9])" + re.escape(key), normalized):
            return license_mappings[key]
    return license_text
def validate_spdx(license_entry):
    """
    Check whether a license identifier is one of the identifiers this module recognizes.
    Args:
        license_entry: A single identifier, or a list of identifiers
    Returns:
        For a list, True when every element is recognized (an empty list
        yields True); otherwise True when the value itself is recognized
    """
    # Kept as a tuple rather than a set so that membership tests on
    # unhashable elements (e.g. dicts) return False instead of raising.
    recognized = (
        "MIT", "Apache-2.0", "GPL-3.0-only", "GPL-2.0-only", "LGPL-3.0-only",
        "BSD-3-Clause", "BSD-2-Clause", "CC-BY-4.0", "CC-BY-SA-4.0",
        # normalize_license_id() can emit this identifier, so it must validate.
        "CC-BY-NC-4.0",
        "CC0-1.0", "Unlicense", "NONE",
    )
    # NOTE(review): elements are compared as plain values, so a list of
    # dict-shaped license entries never validates; confirm which shape the
    # callers actually pass.
    if isinstance(license_entry, list):
        return all(lic in recognized for lic in license_entry)
    return license_entry in recognized
def _validate_ai_requirements(aibom: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Collect AI-specific validation findings for an AIBOM.

    Checks run in this order: presence of components (validation stops if
    there are none), disallowed 'url' keys on metadata authors, per-component
    type / PURL / model card checks, and finally the metadata section.
    Args:
        aibom: The AIBOM to validate
    Returns:
        List of issue records, each with "severity", "code", "message" and "path"
    """
    findings: List[Dict[str, Any]] = []

    def flag(level, code, message, path):
        # Every finding shares the same four-key record layout.
        findings.append({
            "severity": level.value,
            "code": code,
            "message": message,
            "path": path,
        })

    # Nothing else can be validated without at least one component.
    has_components = "components" in aibom and aibom["components"]
    if not has_components:
        flag(
            ValidationSeverity.ERROR,
            "MISSING_COMPONENTS",
            "AIBOM must contain at least one component",
            "$.components",
        )
        return findings

    # Authors carrying a 'url' key are reported as schema violations.
    if "metadata" in aibom and "authors" in aibom["metadata"]:
        for idx, author in enumerate(aibom["metadata"]["authors"]):
            if "url" not in author:
                continue
            flag(
                ValidationSeverity.ERROR,
                "INVALID_AUTHOR_PROPERTY",
                "Author object contains 'url' property which is not allowed in CycloneDX schema. Use 'email' instead.",
                f"$.metadata.authors[{idx}].url",
            )

    for idx, component in enumerate(aibom["components"]):
        base = f"$.components[{idx}]"

        # Component type
        if "type" not in component:
            flag(
                ValidationSeverity.ERROR,
                "MISSING_COMPONENT_TYPE",
                "Component must have a type",
                f"{base}.type",
            )
        elif component.get("type") != "machine-learning-model":
            flag(
                ValidationSeverity.WARNING,
                "INVALID_COMPONENT_TYPE",
                "Component type should be 'machine-learning-model' for AI components",
                f"{base}.type",
            )

        # PURL presence and format
        if "purl" in component:
            purl = component["purl"]
            if not purl.startswith("pkg:"):
                flag(
                    ValidationSeverity.ERROR,
                    "INVALID_PURL_FORMAT",
                    "PURL must start with 'pkg:'",
                    f"{base}.purl",
                )
            elif "huggingface" in purl and "@" not in purl and "version" in component:
                # Only warn about a missing '@' when the component declares a version.
                flag(
                    ValidationSeverity.WARNING,
                    "MISSING_VERSION_IN_PURL",
                    "PURL should include version information with '@' for versioned components",
                    f"{base}.purl",
                )
        else:
            flag(
                ValidationSeverity.ERROR,
                "MISSING_PURL",
                "Component must have a PURL",
                f"{base}.purl",
            )

        # Model card and its sub-sections
        if "modelCard" in component:
            card = component["modelCard"]
            card_path = f"{base}.modelCard"
            if "modelParameters" not in card:
                flag(
                    ValidationSeverity.WARNING,
                    "MISSING_MODEL_PARAMETERS",
                    "Model card should include model parameters",
                    f"{card_path}.modelParameters",
                )
            if "considerations" not in card:
                flag(
                    ValidationSeverity.INFO,
                    "MISSING_CONSIDERATIONS",
                    "Model card should include considerations section for ethical considerations, limitations, etc.",
                    f"{card_path}.considerations",
                )
        else:
            flag(
                ValidationSeverity.WARNING,
                "MISSING_MODEL_CARD",
                "AI component should include a model card",
                f"{base}.modelCard",
            )

    # Metadata section
    if "metadata" not in aibom:
        flag(
            ValidationSeverity.ERROR,
            "MISSING_METADATA",
            "AIBOM must contain metadata",
            "$.metadata",
        )
        return findings

    metadata = aibom["metadata"]
    # (key, severity, code, message) for each metadata entry that should be non-empty.
    expected_sections = (
        ("tools", ValidationSeverity.WARNING, "MISSING_TOOLS",
         "Metadata should include tools that generated the AIBOM"),
        ("authors", ValidationSeverity.INFO, "MISSING_AUTHORS",
         "Metadata should include authors information"),
        ("properties", ValidationSeverity.INFO, "MISSING_PROPERTIES",
         "Metadata should include properties for additional information"),
    )
    for key, level, code, message in expected_sections:
        if key not in metadata or not metadata[key]:
            flag(level, code, message, f"$.metadata.{key}")
    return findings
def _generate_validation_recommendations(issues: List[Dict[str, Any]]) -> List[str]:
"""
Generate recommendations based on validation issues.
Args:
issues: List of validation issues
Returns:
List of recommendations
"""
recommendations = []
# Group issues by code
issue_codes = set(issue["code"] for issue in issues)
# Generate recommendations based on issue codes
if "MISSING_COMPONENTS" in issue_codes:
recommendations.append("Add at least one component to the AIBOM")
if "MISSING_COMPONENT_TYPE" in issue_codes or "INVALID_COMPONENT_TYPE" in issue_codes:
recommendations.append("Ensure all AI components have type 'machine-learning-model'")
if "MISSING_PURL" in issue_codes or "INVALID_PURL_FORMAT" in issue_codes:
recommendations.append("Ensure all components have a valid PURL starting with 'pkg:'")
if "MISSING_VERSION_IN_PURL" in issue_codes:
recommendations.append("Include version information in PURLs using '@' syntax (e.g., pkg:huggingface/org/model@version)")
if "MISSING_MODEL_CARD" in issue_codes:
recommendations.append("Add a model card section to AI components")
if "MISSING_MODEL_PARAMETERS" in issue_codes:
recommendations.append("Include model parameters in the model card section")
if "MISSING_CONSIDERATIONS" in issue_codes:
recommendations.append("Add ethical considerations, limitations, and risks to the model card")
if "MISSING_METADATA" in issue_codes:
recommendations.append("Add metadata section to the AIBOM")
if "MISSING_TOOLS" in issue_codes:
recommendations.append("Include tools information in the metadata section")
if "MISSING_AUTHORS" in issue_codes:
recommendations.append("Add authors information to the metadata section")
if "MISSING_PROPERTIES" in issue_codes:
recommendations.append("Include additional properties in the metadata section")
if "INVALID_AUTHOR_PROPERTY" in issue_codes:
recommendations.append("Remove 'url' property from author objects and use 'email' instead to comply with CycloneDX schema")
return recommendations
def validate_aibom(aibom: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build a validation report for an AIBOM from the AI-specific checks.

    Any issue at all, regardless of severity, marks the report as not valid.
    Args:
        aibom: The AIBOM to validate
    Returns:
        Report with "valid", "ai_valid", "issues", "recommendations" and a
        "summary" of per-severity counts
    """
    found = _validate_ai_requirements(aibom)
    passed = not found
    issues = [*found] if found else []

    recommendations = _generate_validation_recommendations(issues)

    def count(level):
        # Number of issues whose severity equals the given enum member's value.
        return sum(1 for issue in issues if issue["severity"] == level.value)

    return {
        "valid": passed,
        "ai_valid": passed,
        "issues": issues,
        "recommendations": recommendations,
        "summary": {
            "error_count": count(ValidationSeverity.ERROR),
            "warning_count": count(ValidationSeverity.WARNING),
            "info_count": count(ValidationSeverity.INFO),
        },
    }
def get_validation_summary(report: Dict[str, Any]) -> str:
    """
    Get a human-readable summary of the validation report.
    Args:
        report: Validation report with "valid", "summary", "issues" and
            "recommendations" keys
    Returns:
        Multi-line summary ending in a newline; issues and numbered
        recommendations are listed only when the report is not valid
    """
    is_valid = report["valid"]
    counts = report["summary"]

    # The success marker is a real check mark; it was stored as mis-decoded
    # bytes while the failure marker was intact.
    headline = (
        "✅ AIBOM is valid and complies with AI requirements."
        if is_valid
        else "❌ AIBOM validation failed."
    )
    lines = [
        headline,
        "",
        "Summary:",
        f"- Errors: {counts['error_count']}",
        f"- Warnings: {counts['warning_count']}",
        f"- Info: {counts['info_count']}",
    ]

    if not is_valid:
        lines += ["", "Issues:"]
        for issue in report["issues"]:
            severity = issue["severity"].upper()
            lines.append(
                f"- [{severity}] {issue['code']}: {issue['message']} (at {issue['path']})"
            )
        lines += ["", "Recommendations:"]
        for number, recommendation in enumerate(report["recommendations"], 1):
            lines.append(f"{number}. {recommendation}")

    # Every line, including the last, is newline-terminated.
    return "\n".join(lines) + "\n"
def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True) -> Dict[str, Any]:
    """
    Calculate completeness score for an AIBOM and optionally validate against AI requirements.

    Five sections are scored in points: required fields (20), metadata (20),
    basic info of the first component (20), model card of the first component
    (30) and external references (10). The total is the sum of the section
    scores, i.e. a value between 0 and 100. When validation is requested and
    fails, the total is reduced by 10% per error (at most 50%), or, if there
    are no errors, by 5% per warning (at most 20%).
    Args:
        aibom: The AIBOM to score and validate
        validate: Whether to perform validation
    Returns:
        Dictionary with "total_score", "section_scores", "max_scores" and
        "field_checklist"; plus "validation" (and "validation_penalty" when
        a penalty was applied) if ``validate`` is true
    """
    present, absent = "✔", "✘"

    def mark(ok) -> str:
        # Checklist symbol for a field: present only when it earned its points.
        return present if ok else absent

    field_checklist: Dict[str, str] = {}
    max_scores = {
        "required_fields": 20,
        "metadata": 20,
        "component_basic": 20,
        "component_model_card": 30,
        "external_references": 10
    }

    # Required Fields (20 points max, 5 each)
    required_score = 0
    for field in ("bomFormat", "specVersion", "serialNumber", "version"):
        ok = bool(aibom.get(field))
        required_score += 5 if ok else 0
        field_checklist[field] = mark(ok)

    # Metadata (20 points max, 5 each); `or {}` tolerates an explicit None.
    metadata = aibom.get("metadata") or {}
    metadata_score = 0
    for field in ("timestamp", "tools", "authors", "component"):
        ok = bool(metadata.get(field))
        metadata_score += 5 if ok else 0
        field_checklist[f"metadata.{field}"] = mark(ok)

    components = aibom.get("components") or []
    component_score = 0
    model_card_score = 0
    if components:
        # Only the first component is scored.
        comp = components[0]

        # Component Basic Info (20 points max)
        purl = comp.get("purl")
        description = comp.get("description")
        licenses = comp.get("licenses")
        component_checks = (
            ("type", 2, bool(comp.get("type"))),
            ("name", 4, bool(comp.get("name"))),
            ("bom-ref", 2, bool(comp.get("bom-ref"))),
            # Must be a Hugging Face PURL with something after the prefix.
            ("purl", 4, isinstance(purl, str)
                and re.match(r"^pkg:huggingface/.+", purl) is not None),
            ("description", 4, bool(description) and len(description) > 20),
            ("licenses", 4, bool(licenses) and validate_spdx(licenses)),
        )
        for field, points, ok in component_checks:
            component_score += points if ok else 0
            field_checklist[f"component.{field}"] = mark(ok)

        # Model Card Section (30 points max, 10 each)
        card = comp.get("modelCard") or {}
        considerations = card.get("considerations")
        # NOTE(review): len() counts characters for a string but keys for a
        # mapping; confirm which shape "considerations" has in practice.
        card_checks = (
            ("modelParameters", bool(card.get("modelParameters"))),
            ("quantitativeAnalysis", bool(card.get("quantitativeAnalysis"))),
            # Truthiness is checked first so a None value cannot reach len().
            ("considerations", bool(considerations) and len(considerations) > 50),
        )
        for field, ok in card_checks:
            model_card_score += 10 if ok else 0
            field_checklist[f"modelCard.{field}"] = mark(ok)

    # External References (10 points max)
    ext_refs = aibom.get("externalReferences") or []
    ext_score = 0
    for ref in ext_refs:
        url = (ref.get("url") or "").lower()
        if "modelcard" in url:
            ext_score += 4
        elif "huggingface.co" in url or "github.com" in url:
            ext_score += 3
        elif "dataset" in url:
            ext_score += 3
    ext_score = min(ext_score, 10)
    field_checklist["externalReferences"] = mark(ext_refs)

    # Section scores are already points on the 100-point scale described by
    # max_scores, so the total is their plain sum. Multiplying each by its
    # weight again would cap the total at 22.
    total_score = float(
        required_score + metadata_score + component_score + model_card_score + ext_score
    )

    result = {
        "total_score": round(total_score, 2),
        "section_scores": {
            "required_fields": required_score,
            "metadata": metadata_score,
            "component_basic": component_score,
            "component_model_card": model_card_score,
            "external_references": ext_score
        },
        "max_scores": max_scores,
        "field_checklist": field_checklist
    }

    # Add validation if requested
    if validate:
        validation_result = validate_aibom(aibom)
        result["validation"] = validation_result
        if not validation_result["valid"]:
            error_count = validation_result["summary"]["error_count"]
            warning_count = validation_result["summary"]["warning_count"]
            if error_count > 0:
                # Severe penalty for errors (up to 50% reduction)
                error_penalty = min(0.5, error_count * 0.1)
                result["total_score"] = round(result["total_score"] * (1 - error_penalty), 2)
                result["validation_penalty"] = f"-{int(error_penalty * 100)}% due to {error_count} schema errors"
            elif warning_count > 0:
                # Minor penalty for warnings (up to 20% reduction); applied
                # only when there are no errors.
                warning_penalty = min(0.2, warning_count * 0.05)
                result["total_score"] = round(result["total_score"] * (1 - warning_penalty), 2)
                result["validation_penalty"] = f"-{int(warning_penalty * 100)}% due to {warning_count} schema warnings"
    return result
def merge_metadata(primary: Dict[str, Any], secondary: Dict[str, Any]) -> Dict[str, Any]:
    """
    Merge two metadata dictionaries, preferring values from ``primary``.

    Starts from a shallow copy of ``secondary``; each non-None value in
    ``primary`` overrides it, except that when both sides hold a dict under
    the same key the two dicts are merged recursively.
    Args:
        primary: Dictionary whose non-None values take precedence
        secondary: Dictionary providing fallback values
    Returns:
        New merged dictionary
    """
    merged = secondary.copy()
    for key, incoming in primary.items():
        # None in primary means "no opinion": keep whatever secondary has.
        if incoming is None:
            continue
        existing = merged.get(key)
        both_are_dicts = isinstance(incoming, dict) and isinstance(existing, dict)
        merged[key] = merge_metadata(incoming, existing) if both_are_dicts else incoming
    return merged
def extract_model_id_parts(model_id: str) -> Dict[str, str]:
    """
    Split a model ID at its first "/" into owner and name.
    Args:
        model_id: Model identifier such as "owner/name" or just "name"
    Returns:
        Dictionary with "owner" and "name"; "owner" is None when the ID
        contains no "/", and "name" keeps any further "/" characters
    """
    owner, slash, remainder = model_id.partition("/")
    if not slash:
        # No separator at all: the whole ID is the name.
        return {"owner": None, "name": owner}
    return {"owner": owner, "name": remainder}
def create_purl(model_id: str) -> str:
    """
    Build a "pkg:huggingface/..." package URL for a model ID.
    Args:
        model_id: Model identifier such as "owner/name" or just "name"
    Returns:
        "pkg:huggingface/<owner>/<name>" when the ID has a non-empty owner,
        otherwise "pkg:huggingface/<name>"
    """
    parts = extract_model_id_parts(model_id)
    owner, name = parts["owner"], parts["name"]
    # A missing or empty owner is left out of the path entirely.
    segments = [owner, name] if owner else [name]
    return "pkg:huggingface/" + "/".join(segments)