Spaces:

aetheris-ai
/

aibom-generator

Running

App Files Files

aibom-generator / src /aibom_generator /utils.py

a1c00l

Update src/aibom_generator/utils.py

95c032e verified about 2 months ago

raw

history blame

20.4 kB

	"""
	Utility functions for the AIBOM Generator.
	"""

	import json
	import logging
	import os
	import re
	import uuid
	from typing import Dict, List, Optional, Any, Union, Tuple
	from enum import Enum

	logger = logging.getLogger(__name__)

	# Validation severity levels
	class ValidationSeverity(Enum):
	    """Severity attached to each issue produced by AIBOM validation.

	    The string values are what ends up in the ``"severity"`` field of an
	    issue dictionary.
	    """

	    # Hard failure of a requirement.
	    ERROR = "error"
	    # Something is present but incomplete or not as recommended.
	    WARNING = "warning"
	    # Optional improvement.
	    INFO = "info"


	def setup_logging(level=logging.INFO):
	    """Apply the project's default logging configuration.

	    Args:
	        level: Logging threshold passed to ``logging.basicConfig``
	            (defaults to ``logging.INFO``).
	    """
	    log_options = {
	        "level": level,
	        "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
	        "datefmt": "%Y-%m-%d %H:%M:%S",
	    }
	    logging.basicConfig(**log_options)


	def ensure_directory(directory_path):
	    """Create ``directory_path`` (including parents) if it does not exist.

	    Args:
	        directory_path: Path of the directory that should exist.

	    Returns:
	        The same ``directory_path`` that was passed in.
	    """
	    if not os.path.exists(directory_path):
	        # exist_ok=True closes the window between the check above and the
	        # creation: if another process or thread creates the directory in
	        # between, makedirs no longer raises FileExistsError.
	        os.makedirs(directory_path, exist_ok=True)
	    return directory_path


	def generate_uuid():
	    """Return a newly generated random (version 4) UUID as a string."""
	    fresh_id = uuid.uuid4()
	    return str(fresh_id)


	def normalize_license_id(license_text):
	    """Map a free-form license string to the identifier used in the AIBOM.

	    The text is lower-cased and stripped of punctuation (dots and hyphens are
	    kept), then looked up in a table of known aliases. If there is no exact
	    match, the longest alias that occurs in the text at the start of a word
	    wins.

	    Args:
	        license_text: License name as found in the source metadata.

	    Returns:
	        The mapped identifier, ``None`` for empty input, or the original
	        ``license_text`` unchanged when no alias matches.
	    """
	    license_mappings = {
	        "mit": "MIT",
	        "apache": "Apache-2.0",
	        "apache 2": "Apache-2.0",
	        "apache 2.0": "Apache-2.0",
	        "apache-2": "Apache-2.0",
	        "apache-2.0": "Apache-2.0",
	        "gpl": "GPL-3.0-only",
	        "gpl-3": "GPL-3.0-only",
	        "gpl-3.0": "GPL-3.0-only",
	        "gpl3": "GPL-3.0-only",
	        "gpl v3": "GPL-3.0-only",
	        "gpl-2": "GPL-2.0-only",
	        "gpl-2.0": "GPL-2.0-only",
	        "gpl2": "GPL-2.0-only",
	        "gpl v2": "GPL-2.0-only",
	        "lgpl": "LGPL-3.0-only",
	        "lgpl-3": "LGPL-3.0-only",
	        "lgpl-3.0": "LGPL-3.0-only",
	        "bsd": "BSD-3-Clause",
	        "bsd-3": "BSD-3-Clause",
	        "bsd-3-clause": "BSD-3-Clause",
	        "bsd-2": "BSD-2-Clause",
	        "bsd-2-clause": "BSD-2-Clause",
	        "cc": "CC-BY-4.0",
	        "cc-by": "CC-BY-4.0",
	        "cc-by-4.0": "CC-BY-4.0",
	        "cc-by-sa": "CC-BY-SA-4.0",
	        "cc-by-sa-4.0": "CC-BY-SA-4.0",
	        "cc-by-nc": "CC-BY-NC-4.0",
	        "cc-by-nc-4.0": "CC-BY-NC-4.0",
	        "cc0": "CC0-1.0",
	        "cc0-1.0": "CC0-1.0",
	        "public domain": "CC0-1.0",
	        "unlicense": "Unlicense",
	        "proprietary": "NONE",
	        "commercial": "NONE",
	    }

	    if not license_text:
	        return None

	    # Dots are kept on purpose: stripping them turned "gpl-2.0" into "gpl-20",
	    # which made every dotted alias in the table unreachable by exact match.
	    normalized = re.sub(r'[^\w\s.-]', '', license_text.lower()).strip()

	    if normalized in license_mappings:
	        return license_mappings[normalized]

	    # Fallback: substring search. Longest aliases are tried first so that a
	    # specific alias ("gpl-2.0", "cc-by-sa") beats a generic one ("gpl", "cc"),
	    # and the alias must start at a word boundary so that "gpl" does not match
	    # inside "lgpl" nor "mit" inside "limited".
	    for key in sorted(license_mappings, key=len, reverse=True):
	        if re.search(r'(?<!\w)' + re.escape(key), normalized):
	            return license_mappings[key]

	    return license_text


	def validate_spdx(license_entry):
	    """Check whether a license entry uses one of the recognised identifiers.

	    Accepted shapes for an entry are a plain identifier string or a
	    CycloneDX-style mapping ``{"license": {"id": "<identifier>"}}``. A list is
	    valid when every element is valid (an empty list is therefore valid).

	    Args:
	        license_entry: A single entry or a list of entries.

	    Returns:
	        ``True`` if the entry (or every entry of the list) is recognised,
	        ``False`` otherwise.
	    """
	    # "CC-BY-NC-4.0" is included because normalize_license_id() can emit it.
	    spdx_licenses = (
	        "MIT", "Apache-2.0", "GPL-3.0-only", "GPL-2.0-only", "LGPL-3.0-only",
	        "BSD-3-Clause", "BSD-2-Clause", "CC-BY-4.0", "CC-BY-SA-4.0",
	        "CC-BY-NC-4.0", "CC0-1.0", "Unlicense", "NONE",
	    )

	    def license_id(entry):
	        # Unwrap {"license": {"id": ...}}; anything else is used as-is.
	        if isinstance(entry, dict):
	            inner = entry.get("license")
	            return inner.get("id") if isinstance(inner, dict) else None
	        return entry

	    def is_known(entry):
	        identifier = license_id(entry)
	        return isinstance(identifier, str) and identifier in spdx_licenses

	    if isinstance(license_entry, list):
	        return all(is_known(entry) for entry in license_entry)
	    return is_known(license_entry)


	def _validate_ai_requirements(aibom: Dict[str, Any]) -> List[Dict[str, Any]]:
	    """
	    Validate AI-specific requirements for an AIBOM.

	    Checks run in this order: presence of components, author objects in the
	    metadata, every component (type, PURL, model card), and finally the
	    metadata section (tools, authors, properties). Malformed values such as a
	    JSON ``null`` PURL, ``null`` authors or a non-object model card are
	    reported as issues (or skipped) instead of raising.

	    Args:
	        aibom: The AIBOM to validate

	    Returns:
	        List of validation issues; each issue is a dict with the keys
	        ``severity``, ``code``, ``message`` and ``path``.
	    """
	    issues: List[Dict[str, Any]] = []

	    def add_issue(severity: ValidationSeverity, code: str, message: str, path: str) -> None:
	        # Single place that fixes the shape of an issue record.
	        issues.append({
	            "severity": severity.value,
	            "code": code,
	            "message": message,
	            "path": path,
	        })

	    # Check basic structure
	    components = aibom.get("components")
	    if not components:
	        add_issue(
	            ValidationSeverity.ERROR,
	            "MISSING_COMPONENTS",
	            "AIBOM must contain at least one component",
	            "$.components",
	        )
	        return issues  # Can't continue validation without components

	    has_metadata = "metadata" in aibom
	    metadata = aibom.get("metadata")
	    if not isinstance(metadata, dict):
	        # A null / non-object metadata value is treated as an empty section.
	        metadata = {}

	    # Check for schema compliance issues with authors
	    for i, author in enumerate(metadata.get("authors") or []):
	        if isinstance(author, dict) and "url" in author:
	            add_issue(
	                ValidationSeverity.ERROR,
	                "INVALID_AUTHOR_PROPERTY",
	                "Author object contains 'url' property which is not allowed in CycloneDX schema. Use 'email' instead.",
	                f"$.metadata.authors[{i}].url",
	            )

	    # Validate each component
	    for i, component in enumerate(components):
	        component_path = f"$.components[{i}]"

	        # Check component type
	        if "type" not in component:
	            add_issue(
	                ValidationSeverity.ERROR,
	                "MISSING_COMPONENT_TYPE",
	                "Component must have a type",
	                f"{component_path}.type",
	            )
	        elif component.get("type") != "machine-learning-model":
	            add_issue(
	                ValidationSeverity.WARNING,
	                "INVALID_COMPONENT_TYPE",
	                "Component type should be 'machine-learning-model' for AI components",
	                f"{component_path}.type",
	            )

	        # Check PURL format
	        if "purl" not in component:
	            add_issue(
	                ValidationSeverity.ERROR,
	                "MISSING_PURL",
	                "Component must have a PURL",
	                f"{component_path}.purl",
	            )
	        else:
	            purl = component["purl"]
	            # A non-string PURL (e.g. null) cannot start with "pkg:", so it is
	            # reported as a format error rather than crashing on .startswith.
	            if not isinstance(purl, str) or not purl.startswith("pkg:"):
	                add_issue(
	                    ValidationSeverity.ERROR,
	                    "INVALID_PURL_FORMAT",
	                    "PURL must start with 'pkg:'",
	                    f"{component_path}.purl",
	                )
	            elif "huggingface" in purl and "@" not in purl and "version" in component:
	                add_issue(
	                    ValidationSeverity.WARNING,
	                    "MISSING_VERSION_IN_PURL",
	                    "PURL should include version information with '@' for versioned components",
	                    f"{component_path}.purl",
	                )

	        # Check model card
	        if "modelCard" not in component:
	            add_issue(
	                ValidationSeverity.WARNING,
	                "MISSING_MODEL_CARD",
	                "AI component should include a model card",
	                f"{component_path}.modelCard",
	            )
	        else:
	            model_card = component["modelCard"]
	            if not isinstance(model_card, dict):
	                # Present but unusable: report both sections as missing.
	                model_card = {}
	            model_card_path = f"{component_path}.modelCard"

	            # Check model parameters
	            if "modelParameters" not in model_card:
	                add_issue(
	                    ValidationSeverity.WARNING,
	                    "MISSING_MODEL_PARAMETERS",
	                    "Model card should include model parameters",
	                    f"{model_card_path}.modelParameters",
	                )

	            # Check considerations
	            if "considerations" not in model_card:
	                add_issue(
	                    ValidationSeverity.INFO,
	                    "MISSING_CONSIDERATIONS",
	                    "Model card should include considerations section for ethical considerations, limitations, etc.",
	                    f"{model_card_path}.considerations",
	                )

	    # Validate metadata
	    if not has_metadata:
	        add_issue(
	            ValidationSeverity.ERROR,
	            "MISSING_METADATA",
	            "AIBOM must contain metadata",
	            "$.metadata",
	        )
	    else:
	        metadata_path = "$.metadata"

	        # Check tools
	        if not metadata.get("tools"):
	            add_issue(
	                ValidationSeverity.WARNING,
	                "MISSING_TOOLS",
	                "Metadata should include tools that generated the AIBOM",
	                f"{metadata_path}.tools",
	            )

	        # Check authors
	        if not metadata.get("authors"):
	            add_issue(
	                ValidationSeverity.INFO,
	                "MISSING_AUTHORS",
	                "Metadata should include authors information",
	                f"{metadata_path}.authors",
	            )

	        # Check properties
	        if not metadata.get("properties"):
	            add_issue(
	                ValidationSeverity.INFO,
	                "MISSING_PROPERTIES",
	                "Metadata should include properties for additional information",
	                f"{metadata_path}.properties",
	            )

	    return issues


	def _generate_validation_recommendations(issues: List[Dict[str, Any]]) -> List[str]:
	"""
	Generate recommendations based on validation issues.

	Args:
	issues: List of validation issues

	Returns:
	List of recommendations
	"""
	recommendations = []

	# Group issues by code
	issue_codes = set(issue["code"] for issue in issues)

	# Generate recommendations based on issue codes
	if "MISSING_COMPONENTS" in issue_codes:
	recommendations.append("Add at least one component to the AIBOM")

	if "MISSING_COMPONENT_TYPE" in issue_codes or "INVALID_COMPONENT_TYPE" in issue_codes:
	recommendations.append("Ensure all AI components have type 'machine-learning-model'")

	if "MISSING_PURL" in issue_codes or "INVALID_PURL_FORMAT" in issue_codes:
	recommendations.append("Ensure all components have a valid PURL starting with 'pkg:'")

	if "MISSING_VERSION_IN_PURL" in issue_codes:
	recommendations.append("Include version information in PURLs using '@' syntax (e.g., pkg:huggingface/org/model@version)")

	if "MISSING_MODEL_CARD" in issue_codes:
	recommendations.append("Add a model card section to AI components")

	if "MISSING_MODEL_PARAMETERS" in issue_codes:
	recommendations.append("Include model parameters in the model card section")

	if "MISSING_CONSIDERATIONS" in issue_codes:
	recommendations.append("Add ethical considerations, limitations, and risks to the model card")

	if "MISSING_METADATA" in issue_codes:
	recommendations.append("Add metadata section to the AIBOM")

	if "MISSING_TOOLS" in issue_codes:
	recommendations.append("Include tools information in the metadata section")

	if "MISSING_AUTHORS" in issue_codes:
	recommendations.append("Add authors information to the metadata section")

	if "MISSING_PROPERTIES" in issue_codes:
	recommendations.append("Include additional properties in the metadata section")

	if "INVALID_AUTHOR_PROPERTY" in issue_codes:
	recommendations.append("Remove 'url' property from author objects and use 'email' instead to comply with CycloneDX schema")

	return recommendations


	def validate_aibom(aibom: Dict[str, Any]) -> Dict[str, Any]:
	    """
	    Run the AI-specific checks on an AIBOM and package the outcome.

	    Args:
	        aibom: The AIBOM to validate

	    Returns:
	        Report dict with "valid" / "ai_valid" flags (both False as soon as
	        any issue of any severity exists), the "issues", the derived
	        "recommendations" and a "summary" of per-severity counts
	    """
	    findings = list(_validate_ai_requirements(aibom))
	    passed = not findings
	    advice = _generate_validation_recommendations(findings)

	    # Tally issues per severity; unknown severities are not counted.
	    counts = {"error_count": 0, "warning_count": 0, "info_count": 0}
	    buckets = (
	        (ValidationSeverity.ERROR.value, "error_count"),
	        (ValidationSeverity.WARNING.value, "warning_count"),
	        (ValidationSeverity.INFO.value, "info_count"),
	    )
	    for finding in findings:
	        level = finding["severity"]
	        for severity_value, bucket in buckets:
	            if level == severity_value:
	                counts[bucket] += 1
	                break

	    return {
	        "valid": passed,
	        "ai_valid": passed,
	        "issues": findings,
	        "recommendations": advice,
	        "summary": counts,
	    }


	def get_validation_summary(report: Dict[str, Any]) -> str:
	    """
	    Render a validation report as plain text.

	    Args:
	        report: Validation report (reads "valid", "summary" and, when the
	            report is not valid, "issues" and "recommendations")

	    Returns:
	        Multi-line summary: a status line, the per-severity counts and, for
	        failed validations, the issue list followed by numbered
	        recommendations
	    """
	    is_valid = report["valid"]
	    counts = report["summary"]

	    chunks = [
	        "✅ AIBOM is valid and complies with AI requirements.\n"
	        if is_valid
	        else "❌ AIBOM validation failed.\n",
	        "\nSummary:\n",
	        f"- Errors: {counts['error_count']}\n",
	        f"- Warnings: {counts['warning_count']}\n",
	        f"- Info: {counts['info_count']}\n",
	    ]

	    # Details are only listed for reports that did not pass.
	    if not is_valid:
	        chunks.append("\nIssues:\n")
	        chunks.extend(
	            f"- [{issue['severity'].upper()}] {issue['code']}: {issue['message']} (at {issue['path']})\n"
	            for issue in report["issues"]
	        )

	        chunks.append("\nRecommendations:\n")
	        chunks.extend(
	            f"{position}. {tip}\n"
	            for position, tip in enumerate(report["recommendations"], 1)
	        )

	    return "".join(chunks)


	def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True) -> Dict[str, Any]:
	    """
	    Calculate completeness score for an AIBOM and optionally validate against AI requirements.

	    Five sections are scored; their maxima (20 + 20 + 20 + 30 + 10) add up to
	    100 and ``total_score`` is the plain sum of the section scores. Only the
	    first component (and its model card) is inspected. When ``validate`` is
	    true, the validation report is attached and the total is reduced: 10% per
	    error (capped at 50%), or, if there are no errors, 5% per warning (capped
	    at 20%).

	    Args:
	        aibom: The AIBOM to score and validate
	        validate: Whether to perform validation

	    Returns:
	        Dictionary with "total_score", "section_scores", "max_scores",
	        "field_checklist" and, when validating, "validation" plus an optional
	        "validation_penalty" description
	    """
	    def mark(ok: Any) -> str:
	        # Checklist symbol for a satisfied / unsatisfied field.
	        return "✔" if ok else "✘"

	    def longer_than(value: Any, minimum: int) -> bool:
	        # len()-based check that tolerates None and other unsized values.
	        try:
	            return len(value) > minimum
	        except TypeError:
	            return False

	    field_checklist: Dict[str, str] = {}
	    max_scores = {
	        "required_fields": 20,
	        "metadata": 20,
	        "component_basic": 20,
	        "component_model_card": 30,
	        "external_references": 10
	    }

	    # Required Fields (20 points max, 5 per field)
	    required_score = 0
	    for field in ("bomFormat", "specVersion", "serialNumber", "version"):
	        present = bool(aibom.get(field))
	        required_score += 5 if present else 0
	        field_checklist[field] = mark(present)

	    # Metadata (20 points max, 5 per field)
	    metadata = aibom.get("metadata")
	    if not isinstance(metadata, dict):
	        metadata = {}
	    metadata_score = 0
	    for field in ("timestamp", "tools", "authors", "component"):
	        present = bool(metadata.get(field))
	        metadata_score += 5 if present else 0
	        field_checklist[f"metadata.{field}"] = mark(present)

	    components = aibom.get("components") or []
	    component_score = 0
	    model_card_score = 0

	    if components:
	        # Use the first component as specified in the design
	        comp = components[0]

	        # Component Basic Info (20 points max)
	        purl = comp.get("purl")
	        licenses = comp.get("licenses")
	        component_checks = [
	            ("type", 2, bool(comp.get("type"))),
	            ("name", 4, bool(comp.get("name"))),
	            ("bom-ref", 2, bool(comp.get("bom-ref"))),
	            ("purl", 4, isinstance(purl, str)
	             and re.match(r'^pkg:huggingface/.+', purl) is not None),
	            ("description", 4, longer_than(comp.get("description"), 20)),
	            ("licenses", 4, bool(licenses) and bool(validate_spdx(licenses))),
	        ]
	        for field, points, ok in component_checks:
	            if ok:
	                component_score += points
	            field_checklist[f"component.{field}"] = mark(ok)

	        # Model Card Section (30 points max, 10 per field)
	        card = comp.get("modelCard")
	        if not isinstance(card, dict):
	            card = {}
	        # NOTE(review): the > 50 threshold reads like a text-length check; if
	        # "considerations" is an object, len() counts its keys instead --
	        # confirm the intended shape.
	        card_checks = [
	            ("modelParameters", bool(card.get("modelParameters"))),
	            ("quantitativeAnalysis", bool(card.get("quantitativeAnalysis"))),
	            ("considerations", longer_than(card.get("considerations"), 50)),
	        ]
	        for field, ok in card_checks:
	            if ok:
	                model_card_score += 10
	            # The checklist mirrors the scoring rule, so an empty or null
	            # value is shown as missing exactly when it earns no points.
	            field_checklist[f"modelCard.{field}"] = mark(ok)

	    # External References (10 points max)
	    ext_refs = aibom.get("externalReferences") or []
	    ext_score = 0
	    for ref in ext_refs:
	        raw_url = ref.get("url") if isinstance(ref, dict) else None
	        url = raw_url.lower() if isinstance(raw_url, str) else ""
	        if "modelcard" in url:
	            ext_score += 4
	        elif "huggingface.co" in url or "github.com" in url:
	            ext_score += 3
	        elif "dataset" in url:
	            ext_score += 3
	    ext_score = min(ext_score, 10)
	    field_checklist["externalReferences"] = mark(ext_refs)

	    # Calculate total score. The section scores are already scaled to their
	    # share of 100 (see max_scores), so they are summed directly; weighting
	    # them a second time would cap the total at 22.
	    total_score = float(
	        required_score
	        + metadata_score
	        + component_score
	        + model_card_score
	        + ext_score
	    )

	    result = {
	        "total_score": round(total_score, 2),
	        "section_scores": {
	            "required_fields": required_score,
	            "metadata": metadata_score,
	            "component_basic": component_score,
	            "component_model_card": model_card_score,
	            "external_references": ext_score
	        },
	        "max_scores": max_scores,
	        "field_checklist": field_checklist
	    }

	    # Add validation if requested
	    if validate:
	        validation_result = validate_aibom(aibom)
	        result["validation"] = validation_result

	        # Adjust score based on validation results
	        if not validation_result["valid"]:
	            # Count errors and warnings
	            error_count = validation_result["summary"]["error_count"]
	            warning_count = validation_result["summary"]["warning_count"]

	            # Apply penalties to the score
	            if error_count > 0:
	                # Severe penalty for errors (up to 50% reduction)
	                error_penalty = min(0.5, error_count * 0.1)
	                result["total_score"] = round(result["total_score"] * (1 - error_penalty), 2)
	                result["validation_penalty"] = f"-{round(error_penalty * 100)}% due to {error_count} schema errors"
	            elif warning_count > 0:
	                # Minor penalty for warnings (up to 20% reduction)
	                warning_penalty = min(0.2, warning_count * 0.05)
	                result["total_score"] = round(result["total_score"] * (1 - warning_penalty), 2)
	                result["validation_penalty"] = f"-{round(warning_penalty * 100)}% due to {warning_count} schema warnings"

	    return result


	def merge_metadata(primary: Dict[str, Any], secondary: Dict[str, Any]) -> Dict[str, Any]:
	    """Overlay ``primary`` on top of ``secondary`` and return the result.

	    Values from ``primary`` win unless they are ``None``. When both sides hold
	    a dict under the same key, the two dicts are merged recursively with the
	    same rule. The merge starts from a shallow copy of ``secondary``; neither
	    argument is modified at the top level.

	    Args:
	        primary: Preferred source of values.
	        secondary: Fallback source of values.

	    Returns:
	        The merged dictionary.
	    """
	    merged = secondary.copy()
	    for field, incoming in primary.items():
	        if incoming is None:
	            # A None in primary never overrides what secondary provides.
	            continue
	        both_dicts = isinstance(incoming, dict) and isinstance(merged.get(field), dict)
	        merged[field] = merge_metadata(incoming, merged[field]) if both_dicts else incoming
	    return merged


	def extract_model_id_parts(model_id: str) -> Dict[str, str]:
	    """Split a model id of the form ``owner/name`` at its first slash.

	    Args:
	        model_id: Model identifier, with or without an owner prefix.

	    Returns:
	        Dict with "owner" (``None`` when the id contains no slash) and
	        "name" (everything after the first slash, or the whole id).
	    """
	    owner, slash, remainder = model_id.partition("/")
	    if not slash:
	        return {"owner": None, "name": model_id}
	    return {"owner": owner, "name": remainder}


	def create_purl(model_id: str) -> str:
	    """Build the ``pkg:huggingface/...`` package URL for a model id.

	    Args:
	        model_id: Model identifier, optionally prefixed with ``owner/``.

	    Returns:
	        ``pkg:huggingface/<owner>/<name>`` when the id has a non-empty
	        owner, otherwise ``pkg:huggingface/<name>``.
	    """
	    id_parts = extract_model_id_parts(model_id)
	    segments = [id_parts["name"]]
	    if id_parts["owner"]:
	        segments.insert(0, id_parts["owner"])
	    return "pkg:huggingface/" + "/".join(segments)