Spaces:
Running
Running
Upload 5 files
Browse files- src/aibom-generator/enhanced_extractor.py +876 -0
- src/aibom-generator/field_registry.json +737 -0
- src/aibom-generator/field_registry_manager.py +648 -0
- src/aibom-generator/generator.py +442 -35
- src/aibom-generator/utils.py +335 -141
src/aibom-generator/enhanced_extractor.py
ADDED
@@ -0,0 +1,876 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Registry-Integrated (field_registry.json) Enhanced Multi-Layer Data Extraction for AI SBOM Generator
|
4 |
+
|
5 |
+
This module provides a fully configurable enhanced data extraction system that
|
6 |
+
automatically picks up new fields from the JSON registry (field_registry.json) without requiring code changes.
|
7 |
+
It includes comprehensive logging, fallback mechanisms, and confidence tracking.
|
8 |
+
|
9 |
+
Key Features:
|
10 |
+
- Automatically discovers all fields from the registry (field_registry.json)
|
11 |
+
- Attempts extraction for every registry field
|
12 |
+
- Provides detailed logging for each field attempt
|
13 |
+
- Graceful error handling for individual field failures
|
14 |
+
- Maintains backward compatibility with existing code
|
15 |
+
|
16 |
+
"""
|
17 |
+
|
18 |
+
import json
|
19 |
+
import logging
|
20 |
+
import re
|
21 |
+
import requests
|
22 |
+
from typing import Dict, Any, Optional, List, Tuple
|
23 |
+
from enum import Enum
|
24 |
+
from dataclasses import dataclass, field
|
25 |
+
from datetime import datetime
|
26 |
+
from urllib.parse import urlparse, urljoin
|
27 |
+
import time
|
28 |
+
|
29 |
+
# Import existing dependencies
|
30 |
+
from huggingface_hub import HfApi, ModelCard, hf_hub_download
|
31 |
+
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError
|
32 |
+
|
33 |
+
# Import field registry manager (field_registry_manager.py)
|
34 |
+
try:
|
35 |
+
from .field_registry_manager import get_field_registry_manager
|
36 |
+
REGISTRY_AVAILABLE = True
|
37 |
+
except ImportError:
|
38 |
+
try:
|
39 |
+
from field_registry_manager import get_field_registry_manager
|
40 |
+
REGISTRY_AVAILABLE = True
|
41 |
+
except ImportError:
|
42 |
+
REGISTRY_AVAILABLE = False
|
43 |
+
print("β οΈ Field registry manager not available, falling back to legacy extraction")
|
44 |
+
|
45 |
+
# Configure logging for this module
|
46 |
+
logger = logging.getLogger(__name__)
|
47 |
+
|
48 |
+
class DataSource(Enum):
    """Enumeration of data sources for provenance tracking.

    Each member names where an extracted metadata value came from, so
    consumers can weigh its reliability together with ConfidenceLevel.
    """
    HF_API = "huggingface_api"                   # direct Hugging Face API response
    MODEL_CARD = "model_card_yaml"               # YAML frontmatter of the model card
    README_TEXT = "readme_text"                  # regex match in free-form README text
    CONFIG_FILE = "config_file"                  # config.json / tokenizer_config.json
    REPOSITORY_FILES = "repository_files"        # other files in the model repository
    EXTERNAL_REFERENCE = "external_reference"    # generated reference URLs
    INTELLIGENT_DEFAULT = "intelligent_default"  # inferred from model id / conventions
    PLACEHOLDER = "placeholder"                  # fallback / placeholder value
    REGISTRY_DRIVEN = "registry_driven"          # produced by the registry-driven pipeline
|
59 |
+
|
60 |
+
class ConfidenceLevel(Enum):
    """Confidence levels for extracted data, paired with DataSource for provenance."""
    HIGH = "high"      # Direct API data, official sources
    MEDIUM = "medium"  # Inferred from reliable patterns
    LOW = "low"        # Weak inference or pattern matching
    NONE = "none"      # Placeholder values
|
66 |
+
|
67 |
+
@dataclass
class ExtractionResult:
    """Container for extraction results with full provenance.

    One instance is recorded per registry field in
    ``EnhancedExtractor.extraction_results`` so callers can audit how each
    value was obtained.
    """
    value: Any                   # the extracted value (None when extraction failed)
    source: DataSource           # where the value came from
    confidence: ConfidenceLevel  # how trustworthy the value is
    extraction_method: str       # name of the strategy that produced the value
    # NOTE(review): datetime.utcnow() is naive (no tzinfo) and deprecated in
    # Python 3.12+; datetime.now(timezone.utc) would be preferable — confirm
    # downstream consumers before changing the string format.
    timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat())
    # Strategies attempted before the final one; only populated on fallback
    # paths (see _extract_registry_field).
    fallback_chain: List[str] = field(default_factory=list)

    def __str__(self):
        return f"{self.value} (source: {self.source.value}, confidence: {self.confidence.value})"
|
79 |
+
|
80 |
+
class EnhancedExtractor:
|
81 |
+
"""
|
82 |
+
Registry-integrated enhanced extractor that automatically picks up new fields
|
83 |
+
from the JSON registry (field_registry.json) without requiring code changes.
|
84 |
+
"""
|
85 |
+
|
86 |
+
    def __init__(self, hf_api: Optional[HfApi] = None, field_registry_manager=None):
        """
        Initialize the enhanced extractor with registry integration
        (field_registry.json via field_registry_manager.py).

        Args:
            hf_api: Optional HuggingFace API instance (created if not provided).
            field_registry_manager: Optional registry manager instance; when
                omitted, one is obtained from get_field_registry_manager() if
                the module imported successfully (REGISTRY_AVAILABLE).
        """
        self.hf_api = hf_api or HfApi()
        # Per-call provenance map: field name -> ExtractionResult.
        self.extraction_results = {}

        # Initialize registry manager (field_registry_manager.py); a failure
        # here is non-fatal and downgrades us to legacy extraction.
        self.registry_manager = field_registry_manager
        if not self.registry_manager and REGISTRY_AVAILABLE:
            try:
                self.registry_manager = get_field_registry_manager()
                logger.info("✅ Registry manager initialized successfully")
            except Exception as e:
                logger.warning(f"⚠️ Could not initialize registry manager: {e}")
                self.registry_manager = None

        # Load field definitions from the registry; an empty dict triggers
        # the legacy extraction path in extract_metadata().
        self.registry_fields = {}
        if self.registry_manager:
            try:
                registry = self.registry_manager.registry
                self.registry_fields = registry.get('fields', {})
                logger.info(f"✅ Loaded {len(self.registry_fields)} fields from registry")
            except Exception as e:
                logger.error(f"❌ Error loading registry fields: {e}")
                self.registry_fields = {}

        # Configure logging so extraction progress is visible (e.g. HF Spaces).
        self._setup_logging()

        # Compile regex patterns for README text extraction once, up front.
        self._compile_patterns()

        logger.info(f"Enhanced extractor initialized (registry-driven: {bool(self.registry_fields)})")
|
125 |
+
|
126 |
+
def _setup_logging(self):
|
127 |
+
"""Setup logging configuration for detailed extraction tracking"""
|
128 |
+
# Ensure a logger that will show in HF Spaces
|
129 |
+
if not logger.handlers:
|
130 |
+
handler = logging.StreamHandler()
|
131 |
+
formatter = logging.Formatter(
|
132 |
+
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
133 |
+
)
|
134 |
+
handler.setFormatter(formatter)
|
135 |
+
logger.addHandler(handler)
|
136 |
+
logger.setLevel(logging.INFO)
|
137 |
+
|
138 |
+
def _compile_patterns(self):
|
139 |
+
"""Compile regex patterns for text extraction"""
|
140 |
+
self.patterns = {
|
141 |
+
'license': [
|
142 |
+
r'license[:\s]+([a-zA-Z0-9\-\.]+)',
|
143 |
+
r'licensed under[:\s]+([a-zA-Z0-9\-\.]+)',
|
144 |
+
r'released under[:\s]+([a-zA-Z0-9\-\.]+)',
|
145 |
+
],
|
146 |
+
'datasets': [
|
147 |
+
r'trained on[:\s]+([a-zA-Z0-9\-\_\/]+)',
|
148 |
+
r'dataset[:\s]+([a-zA-Z0-9\-\_\/]+)',
|
149 |
+
r'using[:\s]+([a-zA-Z0-9\-\_\/]+)\s+dataset',
|
150 |
+
],
|
151 |
+
'metrics': [
|
152 |
+
r'([a-zA-Z]+)[:\s]+([0-9\.]+)',
|
153 |
+
r'achieves[:\s]+([0-9\.]+)[:\s]+([a-zA-Z]+)',
|
154 |
+
],
|
155 |
+
'model_type': [
|
156 |
+
r'model type[:\s]+([a-zA-Z0-9\-]+)',
|
157 |
+
r'architecture[:\s]+([a-zA-Z0-9\-]+)',
|
158 |
+
],
|
159 |
+
'energy': [
|
160 |
+
r'energy[:\s]+([0-9\.]+)\s*([a-zA-Z]+)',
|
161 |
+
r'power[:\s]+([0-9\.]+)\s*([a-zA-Z]+)',
|
162 |
+
r'consumption[:\s]+([0-9\.]+)\s*([a-zA-Z]+)',
|
163 |
+
],
|
164 |
+
'limitations': [
|
165 |
+
r'limitation[s]?[:\s]+([^\.]+)',
|
166 |
+
r'known issue[s]?[:\s]+([^\.]+)',
|
167 |
+
r'constraint[s]?[:\s]+([^\.]+)',
|
168 |
+
],
|
169 |
+
'safety': [
|
170 |
+
r'safety[:\s]+([^\.]+)',
|
171 |
+
r'risk[s]?[:\s]+([^\.]+)',
|
172 |
+
r'bias[:\s]+([^\.]+)',
|
173 |
+
]
|
174 |
+
}
|
175 |
+
|
176 |
+
# Compile all patterns
|
177 |
+
for category, pattern_list in self.patterns.items():
|
178 |
+
self.patterns[category] = [re.compile(pattern, re.IGNORECASE) for pattern in pattern_list]
|
179 |
+
|
180 |
+
    def extract_metadata(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """
        Main extraction method with full registry integration.

        Automatically discovers all fields from the registry and attempts to
        extract them without requiring code changes when new fields are added.
        Falls back to the legacy layered extraction when no registry fields
        were loaded.

        Args:
            model_id: Hugging Face model identifier (e.g. "org/model").
            model_info: Model information object from the HF API.
            model_card: Model card object from HF (may be None).

        Returns:
            Dictionary of extracted metadata; fields whose extraction failed
            (value None) are omitted.
        """
        logger.info(f"🔍 Starting registry-driven extraction for model: {model_id}")

        # Reset per-call provenance tracking (one ExtractionResult per field).
        self.extraction_results = {}
        metadata = {}

        if self.registry_fields:
            # Registry-driven extraction
            logger.info(f"📋 Registry-driven mode: Attempting extraction for {len(self.registry_fields)} fields")
            metadata = self._registry_driven_extraction(model_id, model_info, model_card)
        else:
            # Fallback to legacy extraction
            logger.warning("⚠️ Registry not available, falling back to legacy extraction")
            metadata = self._legacy_extraction(model_id, model_info, model_card)

        # Log extraction summary (helper defined elsewhere in this file).
        self._log_extraction_summary(model_id, metadata)

        # Drop failed extractions so callers only see concrete values,
        # matching the original method's return format.
        return {k: v for k, v in metadata.items() if v is not None}
|
215 |
+
|
216 |
+
    def _registry_driven_extraction(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """
        Registry-driven extraction that automatically processes all registry fields.

        Builds a shared extraction context (README text, parsed config files)
        once, then runs the strategy cascade of _extract_registry_field for
        every field declared in field_registry.json. Individual field failures
        are logged and skipped; they never abort the run.
        """
        metadata = {}

        # Prepare extraction context once so per-field strategies don't
        # re-download the README or config files.
        extraction_context = {
            'model_id': model_id,
            'model_info': model_info,
            'model_card': model_card,
            'readme_content': self._get_readme_content(model_card, model_id),
            'config_data': self._download_and_parse_config(model_id, "config.json"),
            'tokenizer_config': self._download_and_parse_config(model_id, "tokenizer_config.json")
        }

        # Process each field from the registry, tallying outcomes for the log.
        successful_extractions = 0
        failed_extractions = 0

        for field_name, field_config in self.registry_fields.items():
            try:
                logger.info(f"🔍 Attempting extraction for field: {field_name}")

                # Extract field using registry configuration
                extracted_value = self._extract_registry_field(field_name, field_config, extraction_context)

                if extracted_value is not None:
                    metadata[field_name] = extracted_value
                    successful_extractions += 1
                    logger.info(f"✅ Successfully extracted {field_name}: {extracted_value}")
                else:
                    failed_extractions += 1
                    logger.info(f"❌ Failed to extract {field_name}")

            except Exception as e:
                failed_extractions += 1
                logger.error(f"❌ Error extracting {field_name}: {e}")
                # Continue with other fields - individual failures don't stop the process
                continue

        logger.info(f"📊 Registry extraction complete: {successful_extractions} successful, {failed_extractions} failed")

        # Add external references (repository/files/commit/dataset URLs).
        metadata.update(self._generate_external_references(model_id, metadata))

        return metadata
|
263 |
+
|
264 |
+
    def _extract_registry_field(self, field_name: str, field_config: Dict[str, Any], context: Dict[str, Any]) -> Any:
        """
        Extract a single field based on its registry configuration.

        Strategies are tried in order of preference; the first non-None
        result wins and is recorded (with provenance) in
        self.extraction_results:
        1. Direct API extraction
        2. Model card YAML extraction
        3. Configuration file extraction
        4. Text pattern matching
        5. Intelligent inference
        6. Fallback value from the field config / standard defaults

        Returns the extracted value, or None when every strategy failed.

        NOTE(review): extraction_methods is appended to only immediately
        before a successful return, so fallback_chain is effectively empty
        for successful strategies and only meaningful on the fallback /
        failure paths — confirm this is intended.
        """
        extraction_methods = []

        # Strategy 1: Direct API extraction (highest confidence).
        api_value = self._try_api_extraction(field_name, context)
        if api_value is not None:
            self.extraction_results[field_name] = ExtractionResult(
                value=api_value,
                source=DataSource.HF_API,
                confidence=ConfidenceLevel.HIGH,
                extraction_method="api_direct"
            )
            extraction_methods.append("api_direct")
            return api_value

        # Strategy 2: Model card YAML extraction.
        yaml_value = self._try_model_card_extraction(field_name, context)
        if yaml_value is not None:
            self.extraction_results[field_name] = ExtractionResult(
                value=yaml_value,
                source=DataSource.MODEL_CARD,
                confidence=ConfidenceLevel.HIGH,
                extraction_method="model_card_yaml"
            )
            extraction_methods.append("model_card_yaml")
            return yaml_value

        # Strategy 3: Configuration file extraction.
        config_value = self._try_config_extraction(field_name, context)
        if config_value is not None:
            self.extraction_results[field_name] = ExtractionResult(
                value=config_value,
                source=DataSource.CONFIG_FILE,
                confidence=ConfidenceLevel.HIGH,
                extraction_method="config_file"
            )
            extraction_methods.append("config_file")
            return config_value

        # Strategy 4: Text pattern extraction (medium confidence: regex on README).
        text_value = self._try_text_pattern_extraction(field_name, context)
        if text_value is not None:
            self.extraction_results[field_name] = ExtractionResult(
                value=text_value,
                source=DataSource.README_TEXT,
                confidence=ConfidenceLevel.MEDIUM,
                extraction_method="text_pattern"
            )
            extraction_methods.append("text_pattern")
            return text_value

        # Strategy 5: Intelligent inference from the model id / conventions.
        inferred_value = self._try_intelligent_inference(field_name, context)
        if inferred_value is not None:
            self.extraction_results[field_name] = ExtractionResult(
                value=inferred_value,
                source=DataSource.INTELLIGENT_DEFAULT,
                confidence=ConfidenceLevel.MEDIUM,
                extraction_method="intelligent_inference"
            )
            extraction_methods.append("intelligent_inference")
            return inferred_value

        # Strategy 6: Fallback value (if configured) — zero confidence placeholder.
        fallback_value = self._try_fallback_value(field_name, field_config)
        if fallback_value is not None:
            self.extraction_results[field_name] = ExtractionResult(
                value=fallback_value,
                source=DataSource.PLACEHOLDER,
                confidence=ConfidenceLevel.NONE,
                extraction_method="fallback_placeholder",
                fallback_chain=extraction_methods
            )
            return fallback_value

        # No extraction successful; record the failure for provenance.
        self.extraction_results[field_name] = ExtractionResult(
            value=None,
            source=DataSource.PLACEHOLDER,
            confidence=ConfidenceLevel.NONE,
            extraction_method="extraction_failed",
            fallback_chain=extraction_methods
        )
        return None
|
358 |
+
|
359 |
+
def _try_api_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
|
360 |
+
"""Try to extract field from HuggingFace API data"""
|
361 |
+
model_info = context.get('model_info')
|
362 |
+
if not model_info:
|
363 |
+
return None
|
364 |
+
|
365 |
+
# Field mapping for API extraction
|
366 |
+
api_mappings = {
|
367 |
+
'author': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
|
368 |
+
'name': lambda info: getattr(info, 'modelId', context['model_id']).split('/')[-1],
|
369 |
+
'tags': lambda info: getattr(info, 'tags', []),
|
370 |
+
'pipeline_tag': lambda info: getattr(info, 'pipeline_tag', None),
|
371 |
+
'downloads': lambda info: getattr(info, 'downloads', 0),
|
372 |
+
'commit': lambda info: getattr(info, 'sha', '')[:7] if getattr(info, 'sha', None) else None,
|
373 |
+
'suppliedBy': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
|
374 |
+
'primaryPurpose': lambda info: getattr(info, 'pipeline_tag', 'text-generation'),
|
375 |
+
'downloadLocation': lambda info: f"https://huggingface.co/{context['model_id']}/tree/main"
|
376 |
+
}
|
377 |
+
|
378 |
+
if field_name in api_mappings:
|
379 |
+
try:
|
380 |
+
return api_mappings[field_name](model_info)
|
381 |
+
except Exception as e:
|
382 |
+
logger.debug(f"API extraction failed for {field_name}: {e}")
|
383 |
+
return None
|
384 |
+
|
385 |
+
return None
|
386 |
+
|
387 |
+
    def _try_model_card_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
        """Try to extract a field from the model card's YAML frontmatter.

        Returns None when no card/data is present or the lookup fails.
        """
        model_card = context.get('model_card')
        if not model_card or not hasattr(model_card, 'data') or not model_card.data:
            return None

        try:
            card_data = model_card.data.to_dict() if hasattr(model_card.data, 'to_dict') else {}

            # Registry field -> YAML key(s). A list value means "first truthy
            # key wins"; a plain string is a single-key lookup.
            card_mappings = {
                'license': 'license',
                'language': 'language',
                'library_name': 'library_name',
                'base_model': 'base_model',
                'datasets': 'datasets',
                'description': ['model_summary', 'description'],
                'typeOfModel': 'model_type',
                'licenses': 'license'  # Alternative mapping
            }

            if field_name in card_mappings:
                mapping = card_mappings[field_name]
                if isinstance(mapping, list):
                    # Try multiple keys; note that if none is truthy we fall
                    # through to the direct field-name lookup below.
                    for key in mapping:
                        value = card_data.get(key)
                        if value:
                            return value
                else:
                    # Single key
                    return card_data.get(mapping)

            # Direct field name lookup
            return card_data.get(field_name)

        except Exception as e:
            logger.debug(f"Model card extraction failed for {field_name}: {e}")
            return None
|
426 |
+
|
427 |
+
def _try_config_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
|
428 |
+
"""Try to extract field from configuration files"""
|
429 |
+
config_data = context.get('config_data')
|
430 |
+
tokenizer_config = context.get('tokenizer_config')
|
431 |
+
|
432 |
+
# Config file mappings
|
433 |
+
config_mappings = {
|
434 |
+
'model_type': ('config_data', 'model_type'),
|
435 |
+
'architectures': ('config_data', 'architectures'),
|
436 |
+
'vocab_size': ('config_data', 'vocab_size'),
|
437 |
+
'tokenizer_class': ('tokenizer_config', 'tokenizer_class'),
|
438 |
+
'typeOfModel': ('config_data', 'model_type')
|
439 |
+
}
|
440 |
+
|
441 |
+
if field_name in config_mappings:
|
442 |
+
config_type, config_key = config_mappings[field_name]
|
443 |
+
config_source = context.get(config_type)
|
444 |
+
if config_source:
|
445 |
+
return config_source.get(config_key)
|
446 |
+
|
447 |
+
return None
|
448 |
+
|
449 |
+
    def _try_text_pattern_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
        """Try to extract a field by regex-matching the README text.

        Returns None when there is no README content or no pattern matched.

        NOTE(review): the return type is inconsistent — a single match comes
        back as a scalar, multiple matches as a list. Confirm downstream
        consumers handle both shapes.
        """
        readme_content = context.get('readme_content')
        if not readme_content:
            return None

        # Registry field -> key into self.patterns (compiled in _compile_patterns).
        pattern_mappings = {
            'license': 'license',
            'datasets': 'datasets',
            'energyConsumption': 'energy',
            'limitation': 'limitations',
            'safetyRiskAssessment': 'safety',
            'model_type': 'model_type'
        }

        if field_name in pattern_mappings:
            pattern_key = pattern_mappings[field_name]
            if pattern_key in self.patterns:
                # _find_pattern_matches is defined elsewhere in this file.
                matches = self._find_pattern_matches(readme_content, self.patterns[pattern_key])
                if matches:
                    return matches[0] if len(matches) == 1 else matches

        return None
|
473 |
+
|
474 |
+
def _try_intelligent_inference(self, field_name: str, context: Dict[str, Any]) -> Any:
|
475 |
+
"""Try to infer field value from other available data"""
|
476 |
+
model_id = context['model_id']
|
477 |
+
|
478 |
+
# Intelligent inference rules
|
479 |
+
inference_rules = {
|
480 |
+
'author': lambda: model_id.split('/')[0] if '/' in model_id else 'unknown',
|
481 |
+
'suppliedBy': lambda: model_id.split('/')[0] if '/' in model_id else 'unknown',
|
482 |
+
'name': lambda: model_id.split('/')[-1],
|
483 |
+
'primaryPurpose': lambda: 'text-generation', # Default for most HF models
|
484 |
+
'typeOfModel': lambda: 'transformer', # Default for most HF models
|
485 |
+
'downloadLocation': lambda: f"https://huggingface.co/{model_id}/tree/main",
|
486 |
+
'bomFormat': lambda: 'CycloneDX',
|
487 |
+
'specVersion': lambda: '1.6',
|
488 |
+
'serialNumber': lambda: f"urn:uuid:{model_id.replace('/', '-')}",
|
489 |
+
'version': lambda: '1.0.0'
|
490 |
+
}
|
491 |
+
|
492 |
+
if field_name in inference_rules:
|
493 |
+
try:
|
494 |
+
return inference_rules[field_name]()
|
495 |
+
except Exception as e:
|
496 |
+
logger.debug(f"Intelligent inference failed for {field_name}: {e}")
|
497 |
+
return None
|
498 |
+
|
499 |
+
return None
|
500 |
+
|
501 |
+
def _try_fallback_value(self, field_name: str, field_config: Dict[str, Any]) -> Any:
|
502 |
+
"""Try to get fallback value from field configuration"""
|
503 |
+
# Check if field config has fallback value
|
504 |
+
if isinstance(field_config, dict):
|
505 |
+
fallback = field_config.get('fallback_value')
|
506 |
+
if fallback:
|
507 |
+
return fallback
|
508 |
+
|
509 |
+
# Standard fallback values for common fields
|
510 |
+
standard_fallbacks = {
|
511 |
+
'license': 'NOASSERTION',
|
512 |
+
'description': 'No description available',
|
513 |
+
'version': '1.0.0',
|
514 |
+
'bomFormat': 'CycloneDX',
|
515 |
+
'specVersion': '1.6'
|
516 |
+
}
|
517 |
+
|
518 |
+
return standard_fallbacks.get(field_name)
|
519 |
+
|
520 |
+
    def _legacy_extraction(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """
        Fallback to legacy extraction when the registry is not available.
        This maintains backward compatibility with the pre-registry pipeline.

        Runs the five legacy layers in order; later layers see (and may
        overwrite) keys produced by earlier ones via dict.update.
        """
        logger.info("🔄 Executing legacy extraction mode")
        metadata = {}

        # Execute legacy extraction layers in fixed order.
        metadata.update(self._layer1_structured_api(model_id, model_info, model_card))
        metadata.update(self._layer2_repository_files(model_id))
        metadata.update(self._layer3_stp_extraction(model_card, model_id))
        metadata.update(self._layer4_external_references(model_id, metadata))
        metadata.update(self._layer5_intelligent_defaults(model_id, metadata))

        return metadata
|
536 |
+
|
537 |
+
    def _generate_external_references(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Generate CycloneDX-style external references for the model.

        Always emits the repository and file-tree URLs; adds a commit URL and
        per-dataset URLs when the corresponding metadata is present. Also
        records an ExtractionResult for provenance.

        Returns a dict with a single 'external_references' key.
        """
        external_refs = []

        # Model repository
        repo_url = f"https://huggingface.co/{model_id}"
        external_refs.append({
            "type": "website",
            "url": repo_url,
            "comment": "Model repository"
        })

        # Model files
        files_url = f"https://huggingface.co/{model_id}/tree/main"
        external_refs.append({
            "type": "distribution",
            "url": files_url,
            "comment": "Model files"
        })

        # Commit URL if available (short sha set by API extraction).
        if 'commit' in metadata:
            commit_url = f"https://huggingface.co/{model_id}/commit/{metadata['commit']}"
            external_refs.append({
                "type": "vcs",
                "url": commit_url,
                "comment": "Specific commit"
            })

        # Dataset references; only list-of-strings shapes are handled, other
        # shapes are silently skipped.
        if 'datasets' in metadata:
            datasets = metadata['datasets']
            if isinstance(datasets, list):
                for dataset in datasets:
                    if isinstance(dataset, str):
                        dataset_url = f"https://huggingface.co/datasets/{dataset}"
                        external_refs.append({
                            "type": "distribution",
                            "url": dataset_url,
                            "comment": f"Training dataset: {dataset}"
                        })

        result = {'external_references': external_refs}

        # Record provenance for the generated references.
        self.extraction_results['external_references'] = ExtractionResult(
            value=external_refs,
            source=DataSource.EXTERNAL_REFERENCE,
            confidence=ConfidenceLevel.HIGH,
            extraction_method="url_generation"
        )

        return result
|
589 |
+
|
590 |
+
# Legacy methods for backward compatibility
|
591 |
+
    def _layer1_structured_api(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """Legacy Layer 1: Enhanced structured data extraction from HF API and model card.

        Errors in either source are logged and swallowed so later layers can
        still run; the standard AI metadata keys are always set at the end.
        """
        logger.info("🔍 Executing Legacy Layer 1: Enhanced Structured API Extraction")
        metadata = {}

        # Enhanced model info extraction
        if model_info:
            try:
                # Extract author with fallback logic: use the org part of
                # "<org>/<model>" when the API gives no (or a blank) author.
                author = getattr(model_info, "author", None)
                if not author or author.strip() == "":
                    parts = model_id.split("/")
                    author = parts[0] if len(parts) > 1 else "unknown"

                metadata['author'] = author
                metadata['name'] = getattr(model_info, "modelId", model_id).split("/")[-1]
                metadata['tags'] = getattr(model_info, "tags", [])
                metadata['pipeline_tag'] = getattr(model_info, "pipeline_tag", None)
                metadata['downloads'] = getattr(model_info, "downloads", 0)

                # Commit information (short sha, 7 chars).
                commit_sha = getattr(model_info, "sha", None)
                if commit_sha:
                    metadata['commit'] = commit_sha[:7]

            except Exception as e:
                logger.error(f"❌ Legacy Layer 1: Error extracting from model_info: {e}")

        # Enhanced model card extraction (YAML frontmatter; values may be None
        # and are filtered by the caller's final None-stripping pass).
        if model_card and hasattr(model_card, "data") and model_card.data:
            try:
                card_data = model_card.data.to_dict() if hasattr(model_card.data, "to_dict") else {}

                metadata['license'] = card_data.get("license")
                metadata['language'] = card_data.get("language")
                metadata['library_name'] = card_data.get("library_name")
                metadata['base_model'] = card_data.get("base_model")
                metadata['datasets'] = card_data.get("datasets")
                metadata['description'] = card_data.get("model_summary") or card_data.get("description")

            except Exception as e:
                logger.error(f"❌ Legacy Layer 1: Error extracting from model card: {e}")

        # Add standard AI metadata derived from what was gathered above.
        metadata["primaryPurpose"] = metadata.get("pipeline_tag", "text-generation")
        metadata["suppliedBy"] = metadata.get("author", "unknown")
        metadata["typeOfModel"] = "transformer"

        return metadata
|
640 |
+
|
641 |
+
    def _layer2_repository_files(self, model_id: str) -> Dict[str, Any]:
        """Legacy Layer 2: Repository file analysis.

        Downloads and parses config.json and tokenizer_config.json; any
        failure is logged as a warning and yields a partial (or empty) result.
        """
        logger.info("🔧 Executing Legacy Layer 2: Repository File Analysis")
        metadata = {}

        try:
            config_data = self._download_and_parse_config(model_id, "config.json")
            if config_data:
                metadata['model_type'] = config_data.get("model_type")
                metadata['architectures'] = config_data.get("architectures", [])
                metadata['vocab_size'] = config_data.get("vocab_size")

            tokenizer_config = self._download_and_parse_config(model_id, "tokenizer_config.json")
            if tokenizer_config:
                metadata['tokenizer_class'] = tokenizer_config.get("tokenizer_class")

        except Exception as e:
            logger.warning(f"⚠️ Legacy Layer 2: Could not analyze repository files: {e}")

        return metadata
|
661 |
+
|
662 |
+
def _layer3_stp_extraction(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
    """Legacy Layer 3: Smart Text Parsing.

    Pulls the README text (from the card or the Hub) and mines it for
    structured metadata via regex-based extraction.

    Args:
        model_card: Already-loaded model card, if any.
        model_id: Hugging Face model identifier.

    Returns:
        Dict of fields parsed out of the README; empty when nothing found.
    """
    logger.info("π Executing Legacy Layer 3: Smart Text Parsing")
    parsed: Dict[str, Any] = {}

    try:
        readme_text = self._get_readme_content(model_card, model_id)
        if readme_text:
            parsed.update(self._extract_from_text(readme_text))
    except Exception as e:
        # Text parsing is best-effort; log and fall through with what we have.
        logger.warning(f"β οΈ Legacy Layer 3: Error in Smart Text Parsing: {e}")

    return parsed
def _layer4_external_references(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
678 |
+
"""Legacy Layer 4: External reference generation"""
|
679 |
+
logger.info("π Executing Legacy Layer 4: External Reference Generation")
|
680 |
+
return self._generate_external_references(model_id, metadata)
|
681 |
+
|
682 |
+
def _layer5_intelligent_defaults(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
683 |
+
"""Legacy Layer 5: Intelligent default generation"""
|
684 |
+
logger.info("π§ Executing Legacy Layer 5: Intelligent Default Generation")
|
685 |
+
|
686 |
+
if 'author' not in metadata or not metadata['author']:
|
687 |
+
parts = model_id.split("/")
|
688 |
+
metadata['author'] = parts[0] if len(parts) > 1 else "unknown"
|
689 |
+
|
690 |
+
if 'license' not in metadata or not metadata['license']:
|
691 |
+
metadata['license'] = "NOASSERTION"
|
692 |
+
|
693 |
+
return metadata
|
694 |
+
|
695 |
+
# Utility methods
|
696 |
+
def _download_and_parse_config(self, model_id: str, filename: str) -> Optional[Dict[str, Any]]:
    """Download and parse a JSON configuration file from the model repository.

    Args:
        model_id: Hugging Face repository identifier.
        filename: File to fetch (e.g. "config.json").

    Returns:
        The parsed dict, or None when the file is missing or unparseable.
    """
    try:
        file_path = hf_hub_download(repo_id=model_id, filename=filename)
        # Hub config files are UTF-8 JSON; be explicit about the encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (RepositoryNotFoundError, EntryNotFoundError, json.JSONDecodeError) as e:
        # Fix: log WHICH file/repo failed (message previously said "(unknown)").
        logger.debug(f"Could not download/parse {filename} from {model_id}: {e}")
        return None
    except Exception as e:
        logger.warning(f"Unexpected error downloading {filename} from {model_id}: {e}")
        return None
def _get_readme_content(self, model_card: Optional[ModelCard], model_id: str) -> Optional[str]:
    """Return README text for the model, or None when unavailable.

    Prefers the in-memory model card's content; falls back to downloading
    README.md from the Hub.
    """
    try:
        # Fast path: the loaded model card already carries the README body.
        if model_card and hasattr(model_card, 'content'):
            return model_card.content

        # Slow path: fetch README.md from the repository.
        local_readme = hf_hub_download(repo_id=model_id, filename="README.md")
        with open(local_readme, 'r', encoding='utf-8') as handle:
            return handle.read()

    except Exception as e:
        # README is optional input; absence is only worth a debug note.
        logger.debug(f"Could not get README content: {e}")
        return None
def _extract_from_text(self, text: str) -> Dict[str, Any]:
|
724 |
+
"""Extract structured information from unstructured text"""
|
725 |
+
metadata = {}
|
726 |
+
|
727 |
+
# Extract license information
|
728 |
+
license_matches = self._find_pattern_matches(text, self.patterns['license'])
|
729 |
+
if license_matches:
|
730 |
+
metadata['license_from_text'] = license_matches[0]
|
731 |
+
|
732 |
+
# Extract dataset information
|
733 |
+
dataset_matches = self._find_pattern_matches(text, self.patterns['datasets'])
|
734 |
+
if dataset_matches:
|
735 |
+
metadata['datasets_from_text'] = dataset_matches
|
736 |
+
|
737 |
+
# Extract performance metrics
|
738 |
+
metric_matches = self._extract_metrics(text)
|
739 |
+
if metric_matches:
|
740 |
+
metadata['performance_metrics'] = metric_matches
|
741 |
+
|
742 |
+
return metadata
|
743 |
+
|
744 |
+
def _find_pattern_matches(self, text: str, patterns: List[re.Pattern]) -> List[str]:
|
745 |
+
"""Find matches for a list of regex patterns in text"""
|
746 |
+
matches = []
|
747 |
+
for pattern in patterns:
|
748 |
+
found = pattern.findall(text)
|
749 |
+
matches.extend(found)
|
750 |
+
return list(set(matches)) # Remove duplicates
|
751 |
+
|
752 |
+
def _extract_metrics(self, text: str) -> Dict[str, float]:
|
753 |
+
"""Extract performance metrics from text"""
|
754 |
+
metrics = {}
|
755 |
+
|
756 |
+
metric_patterns = [
|
757 |
+
r'accuracy[:\s]+([0-9\.]+)',
|
758 |
+
r'f1[:\s]+([0-9\.]+)',
|
759 |
+
r'bleu[:\s]+([0-9\.]+)',
|
760 |
+
r'rouge[:\s]+([0-9\.]+)',
|
761 |
+
]
|
762 |
+
|
763 |
+
for pattern_str in metric_patterns:
|
764 |
+
pattern = re.compile(pattern_str, re.IGNORECASE)
|
765 |
+
matches = pattern.findall(text)
|
766 |
+
if matches:
|
767 |
+
metric_name = pattern_str.split('[')[0]
|
768 |
+
try:
|
769 |
+
metrics[metric_name] = float(matches[0])
|
770 |
+
except ValueError:
|
771 |
+
continue
|
772 |
+
|
773 |
+
return metrics
|
774 |
+
|
775 |
+
def _log_extraction_summary(self, model_id: str, metadata: Dict[str, Any]):
|
776 |
+
"""Log comprehensive extraction summary"""
|
777 |
+
logger.info("=" * 60)
|
778 |
+
logger.info(f"π REGISTRY-DRIVEN EXTRACTION SUMMARY FOR: {model_id}")
|
779 |
+
logger.info("=" * 60)
|
780 |
+
|
781 |
+
if self.registry_fields:
|
782 |
+
logger.info(f"π Registry fields available: {len(self.registry_fields)}")
|
783 |
+
logger.info(f"π Total fields extracted: {len(self.extraction_results)}")
|
784 |
+
|
785 |
+
# Count fields by confidence level
|
786 |
+
confidence_counts = {}
|
787 |
+
source_counts = {}
|
788 |
+
|
789 |
+
for field_name, result in self.extraction_results.items():
|
790 |
+
conf = result.confidence.value
|
791 |
+
source = result.source.value
|
792 |
+
confidence_counts[conf] = confidence_counts.get(conf, 0) + 1
|
793 |
+
source_counts[source] = source_counts.get(source, 0) + 1
|
794 |
+
|
795 |
+
logger.info("π Confidence distribution:")
|
796 |
+
for conf, count in confidence_counts.items():
|
797 |
+
logger.info(f" {conf}: {count} fields")
|
798 |
+
|
799 |
+
logger.info("π Source distribution:")
|
800 |
+
for source, count in source_counts.items():
|
801 |
+
logger.info(f" {source}: {count} fields")
|
802 |
+
|
803 |
+
# Log registry field coverage
|
804 |
+
extracted_fields = set(self.extraction_results.keys())
|
805 |
+
registry_field_names = set(self.registry_fields.keys())
|
806 |
+
coverage = len(extracted_fields & registry_field_names) / len(registry_field_names) * 100
|
807 |
+
logger.info(f"π Registry field coverage: {coverage:.1f}%")
|
808 |
+
|
809 |
+
# Log missing registry fields
|
810 |
+
missing_fields = registry_field_names - extracted_fields
|
811 |
+
if missing_fields:
|
812 |
+
logger.info(f"β Missing registry fields: {', '.join(sorted(missing_fields))}")
|
813 |
+
else:
|
814 |
+
logger.info(f"π Legacy extraction mode: {len(metadata)} fields extracted")
|
815 |
+
|
816 |
+
logger.info("=" * 60)
|
817 |
+
|
818 |
+
def get_extraction_results(self) -> Dict[str, ExtractionResult]:
    """Return a shallow copy of the per-field extraction results (with provenance)."""
    # dict(...) produces a shallow copy, equivalent to .copy(); callers
    # may mutate the returned mapping without affecting internal state.
    return dict(self.extraction_results)
# Convenience function for drop-in replacement
|
824 |
+
def extract_enhanced_metadata(model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard], hf_api: Optional[HfApi] = None) -> Dict[str, Any]:
    """
    Drop-in replacement function for _extract_structured_metadata with registry integration.

    This function automatically picks up new fields from the registry without code changes.

    Args:
        model_id: Hugging Face model identifier
        model_info: Model information from HF API
        model_card: Model card object from HF
        hf_api: Optional HuggingFace API instance

    Returns:
        Dictionary of extracted metadata
    """
    # One-shot convenience wrapper around EnhancedExtractor.
    return EnhancedExtractor(hf_api).extract_metadata(model_id, model_info, model_card)
+
|
843 |
+
if __name__ == "__main__":
    # Manual smoke test for the registry-integrated enhanced extractor.
    import sys

    # Model id may be supplied on the command line; default otherwise.
    test_model_id = sys.argv[1] if len(sys.argv) > 1 else "deepseek-ai/DeepSeek-R1"

    print(f"Testing registry-integrated enhanced extractor with model: {test_model_id}")

    # Initialize HF API
    hf_api = HfApi()

    try:
        # Fetch model info and card
        model_info = hf_api.model_info(test_model_id)
        model_card = ModelCard.load(test_model_id)

        # Run extraction and dump every recovered field.
        extractor = EnhancedExtractor(hf_api)
        metadata = extractor.extract_metadata(test_model_id, model_info, model_card)

        print(f"\nExtracted {len(metadata)} metadata fields:")
        for key, value in metadata.items():
            print(f"  {key}: {value}")

        print(f"\nExtraction results with provenance:")
        for field, result in extractor.get_extraction_results().items():
            print(f"  {field}: {result}")

    except Exception as e:
        # Smoke test only: report and exit quietly on any failure.
        print(f"Error testing extractor: {e}")
src/aibom-generator/field_registry.json
ADDED
@@ -0,0 +1,737 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"registry_metadata": {
|
3 |
+
"description": "Field registry for configurable AI SBOM generation and scoring"
|
4 |
+
},
|
5 |
+
"scoring_config": {
|
6 |
+
"tier_weights": {
|
7 |
+
"critical": 3,
|
8 |
+
"important": 2,
|
9 |
+
"supplementary": 1
|
10 |
+
},
|
11 |
+
"category_weights": {
|
12 |
+
"required_fields": 20,
|
13 |
+
"metadata": 20,
|
14 |
+
"component_basic": 20,
|
15 |
+
"component_model_card": 30,
|
16 |
+
"external_references": 10
|
17 |
+
},
|
18 |
+
"scoring_profiles": {
|
19 |
+
"basic": {
|
20 |
+
"description": "Minimal fields required for identification",
|
21 |
+
"required_categories": ["required_fields", "component_basic"],
|
22 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
|
23 |
+
"minimum_score": 40,
|
24 |
+
"weight_multiplier": 1.0
|
25 |
+
},
|
26 |
+
"standard": {
|
27 |
+
"description": "Comprehensive fields for proper documentation",
|
28 |
+
"required_categories": ["required_fields", "metadata", "component_basic"],
|
29 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name", "downloadLocation", "primaryPurpose", "suppliedBy"],
|
30 |
+
"minimum_score": 70,
|
31 |
+
"weight_multiplier": 1.0
|
32 |
+
},
|
33 |
+
"advanced": {
|
34 |
+
"description": "Extensive documentation for maximum transparency",
|
35 |
+
"required_categories": ["required_fields", "metadata", "component_basic", "component_model_card", "external_references"],
|
36 |
+
"required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name", "downloadLocation", "primaryPurpose", "suppliedBy", "type", "purl", "description", "licenses", "hyperparameter", "limitation", "energyConsumption", "safetyRiskAssessment", "typeOfModel"],
|
37 |
+
"minimum_score": 85,
|
38 |
+
"weight_multiplier": 1.0
|
39 |
+
}
|
40 |
+
},
|
41 |
+
"algorithm_config": {
|
42 |
+
"type": "weighted_sum",
|
43 |
+
"max_score": 100,
|
44 |
+
"normalization": "category_based",
|
45 |
+
"penalty_for_missing_critical": 0.5,
|
46 |
+
"bonus_for_complete_categories": 0.1
|
47 |
+
}
|
48 |
+
},
|
49 |
+
"aibom_config": {
|
50 |
+
"structure_template": "cyclonedx_1.6",
|
51 |
+
"generator_info": {
|
52 |
+
"name": "aetheris-aibom-generator",
|
53 |
+
"version": "1.0",
|
54 |
+
"manufacturer": "Aetheris AI"
|
55 |
+
},
|
56 |
+
"generation_rules": {
|
57 |
+
"include_metadata_properties": true,
|
58 |
+
"include_model_card": true,
|
59 |
+
"include_external_references": true,
|
60 |
+
"include_dependencies": true
|
61 |
+
},
|
62 |
+
"validation_rules": {
|
63 |
+
"require_critical_fields": true,
|
64 |
+
"validate_jsonpath_expressions": true,
|
65 |
+
"enforce_cyclonedx_schema": true
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"fields": {
|
69 |
+
"bomFormat": {
|
70 |
+
"tier": "critical",
|
71 |
+
"weight": 4.0,
|
72 |
+
"category": "required_fields",
|
73 |
+
"description": "Format identifier for the SBOM",
|
74 |
+
"jsonpath": "$.bomFormat",
|
75 |
+
"aibom_generation": {
|
76 |
+
"location": "$.bomFormat",
|
77 |
+
"rule": "always_include",
|
78 |
+
"source_fields": ["bomFormat"],
|
79 |
+
"validation": "required",
|
80 |
+
"data_type": "string"
|
81 |
+
},
|
82 |
+
"scoring": {
|
83 |
+
"points": 4.0,
|
84 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
85 |
+
"category_contribution": 0.2
|
86 |
+
},
|
87 |
+
"validation_message": {
|
88 |
+
"missing": "Missing critical field: bomFormat - essential for SBOM identification",
|
89 |
+
"recommendation": "Ensure bomFormat is set to 'CycloneDX'"
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"specVersion": {
|
93 |
+
"tier": "critical",
|
94 |
+
"weight": 4.0,
|
95 |
+
"category": "required_fields",
|
96 |
+
"description": "CycloneDX specification version",
|
97 |
+
"jsonpath": "$.specVersion",
|
98 |
+
"aibom_generation": {
|
99 |
+
"location": "$.specVersion",
|
100 |
+
"rule": "always_include",
|
101 |
+
"source_fields": ["specVersion"],
|
102 |
+
"validation": "required",
|
103 |
+
"data_type": "string"
|
104 |
+
},
|
105 |
+
"scoring": {
|
106 |
+
"points": 4.0,
|
107 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
108 |
+
"category_contribution": 0.2
|
109 |
+
},
|
110 |
+
"validation_message": {
|
111 |
+
"missing": "Missing critical field: specVersion - required for CycloneDX compliance",
|
112 |
+
"recommendation": "Set specVersion to '1.6' for CycloneDX 1.6 compliance"
|
113 |
+
}
|
114 |
+
},
|
115 |
+
"serialNumber": {
|
116 |
+
"tier": "critical",
|
117 |
+
"weight": 4.0,
|
118 |
+
"category": "required_fields",
|
119 |
+
"description": "Unique identifier for this SBOM instance",
|
120 |
+
"jsonpath": "$.serialNumber",
|
121 |
+
"aibom_generation": {
|
122 |
+
"location": "$.serialNumber",
|
123 |
+
"rule": "always_include",
|
124 |
+
"source_fields": ["serialNumber"],
|
125 |
+
"validation": "required",
|
126 |
+
"data_type": "string"
|
127 |
+
},
|
128 |
+
"scoring": {
|
129 |
+
"points": 4.0,
|
130 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
131 |
+
"category_contribution": 0.2
|
132 |
+
},
|
133 |
+
"validation_message": {
|
134 |
+
"missing": "Missing critical field: serialNumber - unique identifier required",
|
135 |
+
"recommendation": "Generate a UUID for the SBOM instance"
|
136 |
+
}
|
137 |
+
},
|
138 |
+
"version": {
|
139 |
+
"tier": "critical",
|
140 |
+
"weight": 4.0,
|
141 |
+
"category": "required_fields",
|
142 |
+
"description": "Version of this SBOM document",
|
143 |
+
"jsonpath": "$.version",
|
144 |
+
"aibom_generation": {
|
145 |
+
"location": "$.version",
|
146 |
+
"rule": "always_include",
|
147 |
+
"source_fields": ["version"],
|
148 |
+
"validation": "required",
|
149 |
+
"data_type": "integer"
|
150 |
+
},
|
151 |
+
"scoring": {
|
152 |
+
"points": 4.0,
|
153 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
154 |
+
"category_contribution": 0.2
|
155 |
+
},
|
156 |
+
"validation_message": {
|
157 |
+
"missing": "Missing critical field: version - document version required",
|
158 |
+
"recommendation": "Set version to 1 for initial SBOM generation"
|
159 |
+
}
|
160 |
+
},
|
161 |
+
"primaryPurpose": {
|
162 |
+
"tier": "critical",
|
163 |
+
"weight": 4.0,
|
164 |
+
"category": "metadata",
|
165 |
+
"description": "Primary purpose or task of the AI model",
|
166 |
+
"jsonpath": "$.metadata.properties[?(@.name=='primaryPurpose')].value",
|
167 |
+
"aibom_generation": {
|
168 |
+
"location": "$.metadata.properties",
|
169 |
+
"rule": "include_if_available",
|
170 |
+
"source_fields": ["primaryPurpose", "pipeline_tag", "ai:task"],
|
171 |
+
"validation": "recommended",
|
172 |
+
"data_type": "string"
|
173 |
+
},
|
174 |
+
"scoring": {
|
175 |
+
"points": 4.0,
|
176 |
+
"required_for_profiles": ["standard", "advanced"],
|
177 |
+
"category_contribution": 0.2
|
178 |
+
},
|
179 |
+
"validation_message": {
|
180 |
+
"missing": "Missing critical field: primaryPurpose - essential for understanding model intent",
|
181 |
+
"recommendation": "Add the primary task or purpose of the AI model"
|
182 |
+
}
|
183 |
+
},
|
184 |
+
"suppliedBy": {
|
185 |
+
"tier": "critical",
|
186 |
+
"weight": 4.0,
|
187 |
+
"category": "metadata",
|
188 |
+
"description": "Organization or individual that supplied the model",
|
189 |
+
"jsonpath": "$.metadata.properties[?(@.name=='suppliedBy')].value",
|
190 |
+
"aibom_generation": {
|
191 |
+
"location": "$.metadata.properties",
|
192 |
+
"rule": "include_if_available",
|
193 |
+
"source_fields": ["suppliedBy", "author", "publisher"],
|
194 |
+
"validation": "recommended",
|
195 |
+
"data_type": "string"
|
196 |
+
},
|
197 |
+
"scoring": {
|
198 |
+
"points": 4.0,
|
199 |
+
"required_for_profiles": ["standard", "advanced"],
|
200 |
+
"category_contribution": 0.2
|
201 |
+
},
|
202 |
+
"validation_message": {
|
203 |
+
"missing": "Missing critical field: suppliedBy - supplier identification required",
|
204 |
+
"recommendation": "Add the organization or individual who provided the model"
|
205 |
+
}
|
206 |
+
},
|
207 |
+
"standardCompliance": {
|
208 |
+
"tier": "supplementary",
|
209 |
+
"weight": 1.0,
|
210 |
+
"category": "metadata",
|
211 |
+
"description": "Standards or regulations the model complies with",
|
212 |
+
"jsonpath": "$.metadata.properties[?(@.name=='standardCompliance')].value",
|
213 |
+
"aibom_generation": {
|
214 |
+
"location": "$.metadata.properties",
|
215 |
+
"rule": "include_if_available",
|
216 |
+
"source_fields": ["standardCompliance", "compliance"],
|
217 |
+
"validation": "optional",
|
218 |
+
"data_type": "string"
|
219 |
+
},
|
220 |
+
"scoring": {
|
221 |
+
"points": 1.0,
|
222 |
+
"required_for_profiles": ["advanced"],
|
223 |
+
"category_contribution": 0.05
|
224 |
+
},
|
225 |
+
"validation_message": {
|
226 |
+
"missing": "Missing supplementary field: standardCompliance - compliance information helpful",
|
227 |
+
"recommendation": "Add any relevant standards or regulations the model complies with"
|
228 |
+
}
|
229 |
+
},
|
230 |
+
"domain": {
|
231 |
+
"tier": "supplementary",
|
232 |
+
"weight": 1.0,
|
233 |
+
"category": "metadata",
|
234 |
+
"description": "Domain or field of application",
|
235 |
+
"jsonpath": "$.metadata.properties[?(@.name=='domain')].value",
|
236 |
+
"aibom_generation": {
|
237 |
+
"location": "$.metadata.properties",
|
238 |
+
"rule": "include_if_available",
|
239 |
+
"source_fields": ["domain", "field", "application_area"],
|
240 |
+
"validation": "optional",
|
241 |
+
"data_type": "string"
|
242 |
+
},
|
243 |
+
"scoring": {
|
244 |
+
"points": 1.0,
|
245 |
+
"required_for_profiles": ["advanced"],
|
246 |
+
"category_contribution": 0.05
|
247 |
+
},
|
248 |
+
"validation_message": {
|
249 |
+
"missing": "Missing supplementary field: domain - application domain helpful for context",
|
250 |
+
"recommendation": "Add the domain or field where this model is typically applied"
|
251 |
+
}
|
252 |
+
},
|
253 |
+
"autonomyType": {
|
254 |
+
"tier": "supplementary",
|
255 |
+
"weight": 1.0,
|
256 |
+
"category": "metadata",
|
257 |
+
"description": "Level of autonomy or human involvement required",
|
258 |
+
"jsonpath": "$.metadata.properties[?(@.name=='autonomyType')].value",
|
259 |
+
"aibom_generation": {
|
260 |
+
"location": "$.metadata.properties",
|
261 |
+
"rule": "include_if_available",
|
262 |
+
"source_fields": ["autonomyType", "autonomy_level"],
|
263 |
+
"validation": "optional",
|
264 |
+
"data_type": "string"
|
265 |
+
},
|
266 |
+
"scoring": {
|
267 |
+
"points": 1.0,
|
268 |
+
"required_for_profiles": ["advanced"],
|
269 |
+
"category_contribution": 0.05
|
270 |
+
},
|
271 |
+
"validation_message": {
|
272 |
+
"missing": "Missing supplementary field: autonomyType - autonomy level information helpful",
|
273 |
+
"recommendation": "Add information about the level of human oversight required"
|
274 |
+
}
|
275 |
+
},
|
276 |
+
"name": {
|
277 |
+
"tier": "critical",
|
278 |
+
"weight": 4.0,
|
279 |
+
"category": "component_basic",
|
280 |
+
"description": "Name of the AI model component",
|
281 |
+
"jsonpath": "$.components[0].name",
|
282 |
+
"aibom_generation": {
|
283 |
+
"location": "$.components[0].name",
|
284 |
+
"rule": "always_include",
|
285 |
+
"source_fields": ["name", "model_name"],
|
286 |
+
"validation": "required",
|
287 |
+
"data_type": "string"
|
288 |
+
},
|
289 |
+
"scoring": {
|
290 |
+
"points": 4.0,
|
291 |
+
"required_for_profiles": ["basic", "standard", "advanced"],
|
292 |
+
"category_contribution": 0.2
|
293 |
+
},
|
294 |
+
"validation_message": {
|
295 |
+
"missing": "Missing critical field: name - essential for model identification",
|
296 |
+
"recommendation": "Add a descriptive name for the model"
|
297 |
+
}
|
298 |
+
},
|
299 |
+
"type": {
|
300 |
+
"tier": "important",
|
301 |
+
"weight": 3.0,
|
302 |
+
"category": "component_basic",
|
303 |
+
"description": "Type of component (machine-learning-model)",
|
304 |
+
"jsonpath": "$.components[0].type",
|
305 |
+
"aibom_generation": {
|
306 |
+
"location": "$.components[0].type",
|
307 |
+
"rule": "always_include",
|
308 |
+
"source_fields": ["type"],
|
309 |
+
"validation": "required",
|
310 |
+
"data_type": "string"
|
311 |
+
},
|
312 |
+
"scoring": {
|
313 |
+
"points": 3.0,
|
314 |
+
"required_for_profiles": ["standard", "advanced"],
|
315 |
+
"category_contribution": 0.15
|
316 |
+
},
|
317 |
+
"validation_message": {
|
318 |
+
"missing": "Missing important field: type - component type classification needed",
|
319 |
+
"recommendation": "Set type to 'machine-learning-model' for AI models"
|
320 |
+
}
|
321 |
+
},
|
322 |
+
"purl": {
|
323 |
+
"tier": "important",
|
324 |
+
"weight": 3.0,
|
325 |
+
"category": "component_basic",
|
326 |
+
"description": "Package URL identifier",
|
327 |
+
"jsonpath": "$.components[0].purl",
|
328 |
+
"aibom_generation": {
|
329 |
+
"location": "$.components[0].purl",
|
330 |
+
"rule": "include_if_available",
|
331 |
+
"source_fields": ["purl", "package_url"],
|
332 |
+
"validation": "recommended",
|
333 |
+
"data_type": "string"
|
334 |
+
},
|
335 |
+
"scoring": {
|
336 |
+
"points": 3.0,
|
337 |
+
"required_for_profiles": ["standard", "advanced"],
|
338 |
+
"category_contribution": 0.15
|
339 |
+
},
|
340 |
+
"validation_message": {
|
341 |
+
"missing": "Missing important field: purl - package URL for identification",
|
342 |
+
"recommendation": "Add a Package URL (PURL) for the model"
|
343 |
+
}
|
344 |
+
},
|
345 |
+
"description": {
|
346 |
+
"tier": "important",
|
347 |
+
"weight": 3.0,
|
348 |
+
"category": "component_basic",
|
349 |
+
"description": "Description of the AI model",
|
350 |
+
"jsonpath": "$.components[0].description",
|
351 |
+
"aibom_generation": {
|
352 |
+
"location": "$.components[0].description",
|
353 |
+
"rule": "include_if_available",
|
354 |
+
"source_fields": ["description", "summary"],
|
355 |
+
"validation": "recommended",
|
356 |
+
"data_type": "string"
|
357 |
+
},
|
358 |
+
"scoring": {
|
359 |
+
"points": 3.0,
|
360 |
+
"required_for_profiles": ["standard", "advanced"],
|
361 |
+
"category_contribution": 0.15
|
362 |
+
},
|
363 |
+
"validation_message": {
|
364 |
+
"missing": "Missing important field: description - model description helpful for understanding",
|
365 |
+
"recommendation": "Add a clear description of what the model does"
|
366 |
+
}
|
367 |
+
},
|
368 |
+
"licenses": {
|
369 |
+
"tier": "important",
|
370 |
+
"weight": 3.0,
|
371 |
+
"category": "component_basic",
|
372 |
+
"description": "License information for the model",
|
373 |
+
"jsonpath": "$.components[0].licenses",
|
374 |
+
"aibom_generation": {
|
375 |
+
"location": "$.components[0].licenses",
|
376 |
+
"rule": "include_if_available",
|
377 |
+
"source_fields": ["licenses", "license"],
|
378 |
+
"validation": "recommended",
|
379 |
+
"data_type": "array"
|
380 |
+
},
|
381 |
+
"scoring": {
|
382 |
+
"points": 3.0,
|
383 |
+
"required_for_profiles": ["standard", "advanced"],
|
384 |
+
"category_contribution": 0.15
|
385 |
+
},
|
386 |
+
"validation_message": {
|
387 |
+
"missing": "Missing important field: licenses - license information important for compliance",
|
388 |
+
"recommendation": "Add license information for the model"
|
389 |
+
}
|
390 |
+
},
|
391 |
+
"energyConsumption": {
|
392 |
+
"tier": "important",
|
393 |
+
"weight": 2.0,
|
394 |
+
"category": "component_model_card",
|
395 |
+
"description": "Energy consumption information",
|
396 |
+
"jsonpath": "$.metadata.properties[?(@.name=='energyConsumption')].value",
|
397 |
+
"aibom_generation": {
|
398 |
+
"location": "$.metadata.properties",
|
399 |
+
"rule": "include_if_available",
|
400 |
+
"source_fields": ["energyConsumption", "energy_usage"],
|
401 |
+
"validation": "optional",
|
402 |
+
"data_type": "string"
|
403 |
+
},
|
404 |
+
"scoring": {
|
405 |
+
"points": 2.0,
|
406 |
+
"required_for_profiles": ["advanced"],
|
407 |
+
"category_contribution": 0.067
|
408 |
+
},
|
409 |
+
"validation_message": {
|
410 |
+
"missing": "Missing important field: energyConsumption - energy usage information helpful for sustainability",
|
411 |
+
"recommendation": "Add information about the model's energy consumption"
|
412 |
+
}
|
413 |
+
},
|
414 |
+
"hyperparameter": {
|
415 |
+
"tier": "important",
|
416 |
+
"weight": 2.0,
|
417 |
+
"category": "component_model_card",
|
418 |
+
"description": "Key hyperparameters used in training",
|
419 |
+
"jsonpath": "$.metadata.properties[?(@.name=='hyperparameter')].value",
|
420 |
+
"aibom_generation": {
|
421 |
+
"location": "$.metadata.properties",
|
422 |
+
"rule": "include_if_available",
|
423 |
+
"source_fields": ["hyperparameter", "hyperparameters", "training_params"],
|
424 |
+
"validation": "optional",
|
425 |
+
"data_type": "string"
|
426 |
+
},
|
427 |
+
"scoring": {
|
428 |
+
"points": 2.0,
|
429 |
+
"required_for_profiles": ["advanced"],
|
430 |
+
"category_contribution": 0.067
|
431 |
+
},
|
432 |
+
"validation_message": {
|
433 |
+
"missing": "Missing important field: hyperparameter - training configuration helpful for reproducibility",
|
434 |
+
"recommendation": "Add key hyperparameters used during model training"
|
435 |
+
}
|
436 |
+
},
|
437 |
+
"limitation": {
|
438 |
+
"tier": "important",
|
439 |
+
"weight": 2.0,
|
440 |
+
"category": "component_model_card",
|
441 |
+
"description": "Known limitations of the model",
|
442 |
+
"jsonpath": "$.metadata.properties[?(@.name=='limitation')].value",
|
443 |
+
"aibom_generation": {
|
444 |
+
"location": "$.metadata.properties",
|
445 |
+
"rule": "include_if_available",
|
446 |
+
"source_fields": ["limitation", "limitations", "known_issues"],
|
447 |
+
"validation": "optional",
|
448 |
+
"data_type": "string"
|
449 |
+
},
|
450 |
+
"scoring": {
|
451 |
+
"points": 2.0,
|
452 |
+
"required_for_profiles": ["advanced"],
|
453 |
+
"category_contribution": 0.067
|
454 |
+
},
|
455 |
+
"validation_message": {
|
456 |
+
"missing": "Missing important field: limitation - known limitations important for responsible use",
|
457 |
+
"recommendation": "Add information about known limitations or constraints"
|
458 |
+
}
|
459 |
+
},
|
460 |
+
"safetyRiskAssessment": {
|
461 |
+
"tier": "important",
|
462 |
+
"weight": 2.0,
|
463 |
+
"category": "component_model_card",
|
464 |
+
"description": "Safety and risk assessment information",
|
465 |
+
"jsonpath": "$.metadata.properties[?(@.name=='safetyRiskAssessment')].value",
|
466 |
+
"aibom_generation": {
|
467 |
+
"location": "$.metadata.properties",
|
468 |
+
"rule": "include_if_available",
|
469 |
+
"source_fields": ["safetyRiskAssessment", "safety_assessment", "risk_analysis"],
|
470 |
+
"validation": "optional",
|
471 |
+
"data_type": "string"
|
472 |
+
},
|
473 |
+
"scoring": {
|
474 |
+
"points": 2.0,
|
475 |
+
"required_for_profiles": ["advanced"],
|
476 |
+
"category_contribution": 0.067
|
477 |
+
},
|
478 |
+
"validation_message": {
|
479 |
+
"missing": "Missing important field: safetyRiskAssessment - safety assessment important for responsible deployment",
|
480 |
+
"recommendation": "Add safety and risk assessment information"
|
481 |
+
}
|
482 |
+
},
|
483 |
+
"typeOfModel": {
|
484 |
+
"tier": "important",
|
485 |
+
"weight": 2.0,
|
486 |
+
"category": "component_model_card",
|
487 |
+
"description": "Type or architecture of the model",
|
488 |
+
"jsonpath": "$.metadata.properties[?(@.name=='typeOfModel')].value",
|
489 |
+
"aibom_generation": {
|
490 |
+
"location": "$.metadata.properties",
|
491 |
+
"rule": "include_if_available",
|
492 |
+
"source_fields": ["typeOfModel", "model_type", "architecture"],
|
493 |
+
"validation": "recommended",
|
494 |
+
"data_type": "string"
|
495 |
+
},
|
496 |
+
"scoring": {
|
497 |
+
"points": 2.0,
|
498 |
+
"required_for_profiles": ["advanced"],
|
499 |
+
"category_contribution": 0.067
|
500 |
+
},
|
501 |
+
"validation_message": {
|
502 |
+
"missing": "Missing important field: typeOfModel - model architecture information helpful",
|
503 |
+
"recommendation": "Add the type or architecture of the model (e.g., Transformer, CNN)"
|
504 |
+
}
|
505 |
+
},
|
506 |
+
"modelExplainability": {
|
507 |
+
"tier": "supplementary",
|
508 |
+
"weight": 1.0,
|
509 |
+
"category": "component_model_card",
|
510 |
+
"description": "Information about model explainability",
|
511 |
+
"jsonpath": "$.metadata.properties[?(@.name=='modelExplainability')].value",
|
512 |
+
"aibom_generation": {
|
513 |
+
"location": "$.metadata.properties",
|
514 |
+
"rule": "include_if_available",
|
515 |
+
"source_fields": ["modelExplainability", "explainability", "interpretability"],
|
516 |
+
"validation": "optional",
|
517 |
+
"data_type": "string"
|
518 |
+
},
|
519 |
+
"scoring": {
|
520 |
+
"points": 1.0,
|
521 |
+
"required_for_profiles": ["advanced"],
|
522 |
+
"category_contribution": 0.033
|
523 |
+
},
|
524 |
+
"validation_message": {
|
525 |
+
"missing": "Missing supplementary field: modelExplainability - explainability information helpful for transparency",
|
526 |
+
"recommendation": "Add information about model explainability or interpretability features"
|
527 |
+
}
|
528 |
+
},
|
529 |
+
"energyQuantity": {
|
530 |
+
"tier": "supplementary",
|
531 |
+
"weight": 1.0,
|
532 |
+
"category": "component_model_card",
|
533 |
+
"description": "Quantitative energy consumption data",
|
534 |
+
"jsonpath": "$.metadata.properties[?(@.name=='energyQuantity')].value",
|
535 |
+
"aibom_generation": {
|
536 |
+
"location": "$.metadata.properties",
|
537 |
+
"rule": "include_if_available",
|
538 |
+
"source_fields": ["energyQuantity", "energy_amount"],
|
539 |
+
"validation": "optional",
|
540 |
+
"data_type": "number"
|
541 |
+
},
|
542 |
+
"scoring": {
|
543 |
+
"points": 1.0,
|
544 |
+
"required_for_profiles": ["advanced"],
|
545 |
+
"category_contribution": 0.033
|
546 |
+
},
|
547 |
+
"validation_message": {
|
548 |
+
"missing": "Missing supplementary field: energyQuantity - quantitative energy data helpful for sustainability metrics",
|
549 |
+
"recommendation": "Add specific energy consumption quantities"
|
550 |
+
}
|
551 |
+
},
|
552 |
+
"energyUnit": {
|
553 |
+
"tier": "supplementary",
|
554 |
+
"weight": 1.0,
|
555 |
+
"category": "component_model_card",
|
556 |
+
"description": "Unit of measurement for energy consumption",
|
557 |
+
"jsonpath": "$.metadata.properties[?(@.name=='energyUnit')].value",
|
558 |
+
"aibom_generation": {
|
559 |
+
"location": "$.metadata.properties",
|
560 |
+
"rule": "include_if_available",
|
561 |
+
"source_fields": ["energyUnit", "energy_unit"],
|
562 |
+
"validation": "optional",
|
563 |
+
"data_type": "string"
|
564 |
+
},
|
565 |
+
"scoring": {
|
566 |
+
"points": 1.0,
|
567 |
+
"required_for_profiles": ["advanced"],
|
568 |
+
"category_contribution": 0.033
|
569 |
+
},
|
570 |
+
"validation_message": {
|
571 |
+
"missing": "Missing supplementary field: energyUnit - energy measurement unit helpful for standardization",
|
572 |
+
"recommendation": "Add the unit of measurement for energy consumption (e.g., kWh, Joules)"
|
573 |
+
}
|
574 |
+
},
|
575 |
+
"informationAboutTraining": {
|
576 |
+
"tier": "supplementary",
|
577 |
+
"weight": 1.0,
|
578 |
+
"category": "component_model_card",
|
579 |
+
"description": "Information about the training process",
|
580 |
+
"jsonpath": "$.metadata.properties[?(@.name=='informationAboutTraining')].value",
|
581 |
+
"aibom_generation": {
|
582 |
+
"location": "$.metadata.properties",
|
583 |
+
"rule": "include_if_available",
|
584 |
+
"source_fields": ["informationAboutTraining", "training_info", "training_details"],
|
585 |
+
"validation": "optional",
|
586 |
+
"data_type": "string"
|
587 |
+
},
|
588 |
+
"scoring": {
|
589 |
+
"points": 1.0,
|
590 |
+
"required_for_profiles": ["advanced"],
|
591 |
+
"category_contribution": 0.033
|
592 |
+
},
|
593 |
+
"validation_message": {
|
594 |
+
"missing": "Missing supplementary field: informationAboutTraining - training details helpful for understanding model development",
|
595 |
+
"recommendation": "Add information about the training process and methodology"
|
596 |
+
}
|
597 |
+
},
|
598 |
+
"informationAboutApplication": {
|
599 |
+
"tier": "supplementary",
|
600 |
+
"weight": 1.0,
|
601 |
+
"category": "component_model_card",
|
602 |
+
"description": "Information about intended applications",
|
603 |
+
"jsonpath": "$.metadata.properties[?(@.name=='informationAboutApplication')].value",
|
604 |
+
"aibom_generation": {
|
605 |
+
"location": "$.metadata.properties",
|
606 |
+
"rule": "include_if_available",
|
607 |
+
"source_fields": ["informationAboutApplication", "application_info", "intended_use"],
|
608 |
+
"validation": "optional",
|
609 |
+
"data_type": "string"
|
610 |
+
},
|
611 |
+
"scoring": {
|
612 |
+
"points": 1.0,
|
613 |
+
"required_for_profiles": ["advanced"],
|
614 |
+
"category_contribution": 0.033
|
615 |
+
},
|
616 |
+
"validation_message": {
|
617 |
+
"missing": "Missing supplementary field: informationAboutApplication - application guidance helpful for proper usage",
|
618 |
+
"recommendation": "Add information about intended applications and use cases"
|
619 |
+
}
|
620 |
+
},
|
621 |
+
"metric": {
|
622 |
+
"tier": "supplementary",
|
623 |
+
"weight": 1.0,
|
624 |
+
"category": "component_model_card",
|
625 |
+
"description": "Performance metrics and evaluation results",
|
626 |
+
"jsonpath": "$.metadata.properties[?(@.name=='metric')].value",
|
627 |
+
"aibom_generation": {
|
628 |
+
"location": "$.metadata.properties",
|
629 |
+
"rule": "include_if_available",
|
630 |
+
"source_fields": ["metric", "metrics", "performance"],
|
631 |
+
"validation": "optional",
|
632 |
+
"data_type": "string"
|
633 |
+
},
|
634 |
+
"scoring": {
|
635 |
+
"points": 1.0,
|
636 |
+
"required_for_profiles": ["advanced"],
|
637 |
+
"category_contribution": 0.033
|
638 |
+
},
|
639 |
+
"validation_message": {
|
640 |
+
"missing": "Missing supplementary field: metric - performance metrics helpful for evaluation",
|
641 |
+
"recommendation": "Add performance metrics and evaluation results"
|
642 |
+
}
|
643 |
+
},
|
644 |
+
"metricDecisionThreshold": {
|
645 |
+
"tier": "supplementary",
|
646 |
+
"weight": 1.0,
|
647 |
+
"category": "component_model_card",
|
648 |
+
"description": "Decision thresholds for metrics",
|
649 |
+
"jsonpath": "$.metadata.properties[?(@.name=='metricDecisionThreshold')].value",
|
650 |
+
"aibom_generation": {
|
651 |
+
"location": "$.metadata.properties",
|
652 |
+
"rule": "include_if_available",
|
653 |
+
"source_fields": ["metricDecisionThreshold", "decision_threshold", "threshold"],
|
654 |
+
"validation": "optional",
|
655 |
+
"data_type": "number"
|
656 |
+
},
|
657 |
+
"scoring": {
|
658 |
+
"points": 1.0,
|
659 |
+
"required_for_profiles": ["advanced"],
|
660 |
+
"category_contribution": 0.033
|
661 |
+
},
|
662 |
+
"validation_message": {
|
663 |
+
"missing": "Missing supplementary field: metricDecisionThreshold - decision thresholds helpful for operational guidance",
|
664 |
+
"recommendation": "Add decision thresholds for performance metrics"
|
665 |
+
}
|
666 |
+
},
|
667 |
+
"modelDataPreprocessing": {
|
668 |
+
"tier": "supplementary",
|
669 |
+
"weight": 1.0,
|
670 |
+
"category": "component_model_card",
|
671 |
+
"description": "Data preprocessing information",
|
672 |
+
"jsonpath": "$.metadata.properties[?(@.name=='modelDataPreprocessing')].value",
|
673 |
+
"aibom_generation": {
|
674 |
+
"location": "$.metadata.properties",
|
675 |
+
"rule": "include_if_available",
|
676 |
+
"source_fields": ["modelDataPreprocessing", "data_preprocessing", "preprocessing"],
|
677 |
+
"validation": "optional",
|
678 |
+
"data_type": "string"
|
679 |
+
},
|
680 |
+
"scoring": {
|
681 |
+
"points": 1.0,
|
682 |
+
"required_for_profiles": ["advanced"],
|
683 |
+
"category_contribution": 0.033
|
684 |
+
},
|
685 |
+
"validation_message": {
|
686 |
+
"missing": "Missing supplementary field: modelDataPreprocessing - preprocessing details helpful for reproducibility",
|
687 |
+
"recommendation": "Add information about data preprocessing steps"
|
688 |
+
}
|
689 |
+
},
|
690 |
+
"useSensitivePersonalInformation": {
|
691 |
+
"tier": "supplementary",
|
692 |
+
"weight": 1.0,
|
693 |
+
"category": "component_model_card",
|
694 |
+
"description": "Information about use of sensitive personal data",
|
695 |
+
"jsonpath": "$.metadata.properties[?(@.name=='useSensitivePersonalInformation')].value",
|
696 |
+
"aibom_generation": {
|
697 |
+
"location": "$.metadata.properties",
|
698 |
+
"rule": "include_if_available",
|
699 |
+
"source_fields": ["useSensitivePersonalInformation", "sensitive_data", "personal_data"],
|
700 |
+
"validation": "optional",
|
701 |
+
"data_type": "boolean"
|
702 |
+
},
|
703 |
+
"scoring": {
|
704 |
+
"points": 1.0,
|
705 |
+
"required_for_profiles": ["advanced"],
|
706 |
+
"category_contribution": 0.033
|
707 |
+
},
|
708 |
+
"validation_message": {
|
709 |
+
"missing": "Missing supplementary field: useSensitivePersonalInformation - privacy information important for compliance",
|
710 |
+
"recommendation": "Add information about use of sensitive or personal data"
|
711 |
+
}
|
712 |
+
},
|
713 |
+
"downloadLocation": {
|
714 |
+
"tier": "critical",
|
715 |
+
"weight": 4.0,
|
716 |
+
"category": "external_references",
|
717 |
+
"description": "Location where the model can be downloaded",
|
718 |
+
"jsonpath": "$.externalReferences[0].url",
|
719 |
+
"aibom_generation": {
|
720 |
+
"location": "$.externalReferences",
|
721 |
+
"rule": "include_if_available",
|
722 |
+
"source_fields": ["downloadLocation", "download_url", "repository_url"],
|
723 |
+
"validation": "recommended",
|
724 |
+
"data_type": "string"
|
725 |
+
},
|
726 |
+
"scoring": {
|
727 |
+
"points": 4.0,
|
728 |
+
"required_for_profiles": ["standard", "advanced"],
|
729 |
+
"category_contribution": 1.0
|
730 |
+
},
|
731 |
+
"validation_message": {
|
732 |
+
"missing": "Missing critical field: downloadLocation - download location essential for model access",
|
733 |
+
"recommendation": "Add the URL where the model can be downloaded or accessed"
|
734 |
+
}
|
735 |
+
}
|
736 |
+
}
|
737 |
+
}
|
src/aibom-generator/field_registry_manager.py
ADDED
@@ -0,0 +1,648 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Field Registry Manager for AI SBOM Generator
|
3 |
+
Combines registry loading, configuration generation, and field detection functionality
|
4 |
+
"""
|
5 |
+
|
6 |
+
import json
|
7 |
+
import os
|
8 |
+
import re
|
9 |
+
from typing import Dict, Any, Optional, List, Tuple
|
10 |
+
from functools import lru_cache
|
11 |
+
|
12 |
+
class FieldRegistryManager:
    """
    Field registry manager that handles:

    1. Registry loading and validation
    2. Configuration generation for utils.py compatibility
    3. Field detection and JSONPath-like parsing
    4. AIBOM completeness analysis
    5. Scoring calculations

    Derived views (classification, profiles, messages, weights) are cached
    per instance.  NOTE: ``functools.lru_cache`` is deliberately not used on
    instance methods — it would key the cache on ``self`` and keep every
    instance alive for the cache's lifetime (ruff B019); plain per-instance
    attribute caches are used instead, consistently for all getters.
    """

    def __init__(self, registry_path: Optional[str] = None):
        """
        Initialize the field registry manager.

        Args:
            registry_path: Path to the field registry JSON file.
                If None, auto-detects "field_registry.json" next to this module.

        Raises:
            FileNotFoundError: If the registry file does not exist.
            ValueError: If the registry is invalid JSON or structurally invalid.
        """
        if registry_path is None:
            # Auto-detect registry path relative to this file
            current_dir = os.path.dirname(os.path.abspath(__file__))
            registry_path = os.path.join(current_dir, "field_registry.json")

        self.registry_path = registry_path
        self.registry = self._load_registry()

        # Per-instance caches for derived configuration (lazily populated).
        self._scoring_config = None
        self._aibom_config = None
        self._field_definitions = None
        self._field_classification = None
        self._completeness_profiles = None
        self._validation_messages = None
        self._scoring_weights = None

    def _load_registry(self) -> Dict[str, Any]:
        """Load the complete field registry from the JSON file and validate its shape."""
        try:
            with open(self.registry_path, 'r', encoding='utf-8') as f:
                registry = json.load(f)
        except FileNotFoundError:
            raise FileNotFoundError(f"Field registry not found at: {self.registry_path}")
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in field registry: {e}")
        except Exception as e:
            raise Exception(f"Failed to load field registry: {e}")

        # Structural validation happens outside the try block so that these
        # ValueErrors propagate as-is instead of being re-wrapped generically.
        required_sections = ["fields"]
        missing_sections = [section for section in required_sections if section not in registry]
        if missing_sections:
            raise ValueError(f"Registry missing required sections: {missing_sections}")

        fields = registry.get('fields', {})
        if not fields:
            raise ValueError("Registry 'fields' section is empty")

        print(f"Field registry loaded: {len(fields)} fields from {self.registry_path}")
        return registry

    # =============================================================================
    # CONFIGURATION GENERATION
    # =============================================================================

    def get_scoring_config(self) -> Dict[str, Any]:
        """Get scoring configuration from registry (cached per instance)."""
        if self._scoring_config is None:
            self._scoring_config = self.registry.get('scoring_config', {})
        return self._scoring_config

    def get_aibom_config(self) -> Dict[str, Any]:
        """Get AIBOM generation configuration from registry (cached per instance)."""
        if self._aibom_config is None:
            self._aibom_config = self.registry.get('aibom_config', {})
        return self._aibom_config

    def get_field_definitions(self) -> Dict[str, Any]:
        """Get all field definitions from registry (cached per instance)."""
        if self._field_definitions is None:
            self._field_definitions = self.registry.get('fields', {})
        return self._field_definitions

    def generate_field_classification(self) -> Dict[str, Any]:
        """
        Generate the FIELD_CLASSIFICATION dictionary from the registry.

        Each entry maps a field name to its tier, weight, and category,
        with safe defaults for fields missing those properties.
        """
        if self._field_classification is None:
            self._field_classification = {
                field_name: {
                    "tier": field_config.get("tier", "supplementary"),
                    "weight": field_config.get("weight", 1),
                    "category": field_config.get("category", "unknown"),
                }
                for field_name, field_config in self.get_field_definitions().items()
            }
        return self._field_classification

    def generate_completeness_profiles(self) -> Dict[str, Any]:
        """
        Generate the COMPLETENESS_PROFILES dictionary from the registry.

        Profiles come from scoring_config.scoring_profiles; if the registry
        defines none, hard-coded basic/standard/advanced fallbacks are used.
        """
        if self._completeness_profiles is not None:
            return self._completeness_profiles

        scoring_config = self.get_scoring_config()
        profiles = scoring_config.get("scoring_profiles", {})

        # Convert registry profiles to the format expected by utils.py.
        completeness_profiles = {}
        for profile_name, profile_config in profiles.items():
            completeness_profiles[profile_name] = {
                "description": profile_config.get("description", f"{profile_name.title()} completeness profile"),
                "required_fields": profile_config.get("required_fields", []),
                "minimum_score": profile_config.get("minimum_score", 50)
            }

        # Fallback profiles if none defined in registry.
        if not completeness_profiles:
            completeness_profiles = {
                "basic": {
                    "description": "Minimal fields required for identification",
                    "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
                    "minimum_score": 40
                },
                "standard": {
                    "description": "Comprehensive fields for proper documentation",
                    "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
                                       "downloadLocation", "primaryPurpose", "suppliedBy"],
                    "minimum_score": 70
                },
                "advanced": {
                    "description": "Extensive documentation for maximum transparency",
                    "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
                                       "downloadLocation", "primaryPurpose", "suppliedBy",
                                       "type", "purl", "description", "licenses", "hyperparameter", "limitation",
                                       "energyConsumption", "safetyRiskAssessment", "typeOfModel"],
                    "minimum_score": 85
                }
            }

        self._completeness_profiles = completeness_profiles
        return completeness_profiles

    def generate_validation_messages(self) -> Dict[str, Any]:
        """
        Generate the VALIDATION_MESSAGES dictionary from the registry.

        Only fields that define a "validation_message" block are included,
        with defaults filled in for missing "missing"/"recommendation" texts.
        """
        if self._validation_messages is None:
            validation_messages = {}
            for field_name, field_config in self.get_field_definitions().items():
                validation_msg = field_config.get("validation_message", {})
                if validation_msg:
                    validation_messages[field_name] = {
                        "missing": validation_msg.get("missing", f"Missing field: {field_name}"),
                        "recommendation": validation_msg.get("recommendation", f"Consider adding {field_name} field")
                    }
            self._validation_messages = validation_messages
        return self._validation_messages

    def get_configurable_scoring_weights(self) -> Dict[str, Any]:
        """Get configurable scoring weights from registry, with built-in defaults."""
        if self._scoring_weights is not None:
            return self._scoring_weights

        scoring_config = self.get_scoring_config()

        self._scoring_weights = {
            "tier_weights": scoring_config.get("tier_weights", {
                "critical": 3,
                "important": 2,
                "supplementary": 1
            }),
            "category_weights": scoring_config.get("category_weights", {
                "required_fields": 20,
                "metadata": 20,
                "component_basic": 20,
                "component_model_card": 30,
                "external_references": 10
            }),
            "algorithm_config": scoring_config.get("algorithm_config", {
                "type": "weighted_sum",
                "max_score": 100,
                "normalization": "category_based"
            })
        }
        return self._scoring_weights

    # =============================================================================
    # FIELD DETECTION
    # =============================================================================

    def _get_nested_value(self, data: dict, path: str) -> Tuple[bool, Any]:
        """
        Get a value from a nested dictionary using dot notation and array access.

        Supports paths like ``$.components[0].name`` and the property-filter
        form ``$.metadata.properties[?(@.name=='primaryPurpose')].value``.

        Returns:
            (found, value) — found is False for missing paths and for values
            considered empty (None, "", []).
        """
        try:
            # Remove leading $. if present
            if path.startswith('$.'):
                path = path[2:]

            # Special JSONPath-like syntax for property arrays
            if '[?(@.name==' in path:
                return self._handle_property_array_path(data, path)

            current = data
            for part in self._split_path(path):
                if '[' in part and ']' in part:
                    # Array access like components[0]
                    key, index_str = part.split('[')
                    index = int(index_str.rstrip(']'))

                    if key and key in current:
                        current = current[key]

                    if isinstance(current, list) and 0 <= index < len(current):
                        current = current[index]
                    else:
                        return False, None
                else:
                    # Regular key access
                    if isinstance(current, dict) and part in current:
                        current = current[part]
                    else:
                        return False, None

            # Only report presence for meaningful (non-empty) values.
            if current is not None and current != "" and current != []:
                return True, current

            return False, None

        except Exception as e:
            print(f"Error getting value at path {path}: {e}")
            return False, None

    def _handle_property_array_path(self, data: dict, path: str) -> Tuple[bool, Any]:
        """
        Handle the JSONPath-like filter syntax for property arrays.

        Example: ``metadata.properties[?(@.name=='primaryPurpose')].value``
        looks up the first entry of ``metadata.properties`` whose ``name``
        equals the filter value and returns its ``value`` key.
        """
        try:
            # Extract base path, property name, and final key from the filter.
            match = re.match(r'(.+)\.properties\[\?\(@\.name==\'(.+)\'\)\]\.(.+)', path)
            if not match:
                return False, None

            base_path, prop_name, final_key = match.groups()

            # Resolve the properties array itself.
            base_found, base_value = self._get_nested_value(data, base_path + '.properties')
            if not base_found or not isinstance(base_value, list):
                return False, None

            # Find the first property entry with a matching name.
            for prop in base_value:
                if isinstance(prop, dict) and prop.get('name') == prop_name:
                    if final_key in prop:
                        value = prop[final_key]
                        if value is not None and value != "" and value != []:
                            return True, value

            return False, None

        except Exception as e:
            print(f"Error handling property array path {path}: {e}")
            return False, None

    def _split_path(self, path: str) -> List[str]:
        """Split a dotted path into parts, keeping bracketed indices attached."""
        parts = []
        current_part = ""
        in_brackets = False

        for char in path:
            if char == '[':
                in_brackets = True
                current_part += char
            elif char == ']':
                in_brackets = False
                current_part += char
            elif char == '.' and not in_brackets:
                # Dots inside brackets are part of the filter, not separators.
                if current_part:
                    parts.append(current_part)
                    current_part = ""
            else:
                current_part += char

        if current_part:
            parts.append(current_part)

        return parts

    def detect_field_presence(self, aibom: dict, field_path: str) -> Tuple[bool, Any]:
        """
        Detect whether a field exists at the given path in the AIBOM.

        Returns:
            (field_exists, field_value)
        """
        return self._get_nested_value(aibom, field_path)

    def analyze_aibom_completeness(self, aibom: dict) -> Dict[str, Any]:
        """
        Analyze AIBOM completeness against the enhanced field registry.

        Groups registry fields by category, checks each field's jsonpath in
        the document, and computes per-category and total weighted scores.
        Compatible with the enhanced registry structure registry['fields'][name].
        """
        results = {
            'category_scores': {},
            'total_score': 0,
            'field_details': {},
            'summary': {}
        }

        fields = self.get_field_definitions()
        if not fields:
            print("No fields found in registry")
            return results

        scoring_weights = self.get_configurable_scoring_weights()
        category_weights = scoring_weights.get('category_weights', {})

        # Group fields by category.
        categories = {}
        for field_name, field_config in fields.items():
            category = field_config.get('category', 'unknown')
            categories.setdefault(category, []).append((field_name, field_config))

        print(f"Analyzing {len(fields)} fields across {len(categories)} categories")

        total_weighted_score = 0

        for category_name, category_fields in categories.items():
            # Unknown categories default to a weight of 20 points.
            category_weight = category_weights.get(category_name, 20)

            present_fields = 0
            total_fields = len(category_fields)
            field_details = {}

            print(f"\nCategory: {category_name} (weight: {category_weight})")

            for field_name, field_config in category_fields:
                field_path = field_config.get('jsonpath', '')
                tier = field_config.get('tier', 'supplementary')
                weight = field_config.get('weight', 1)

                if not field_path:
                    print(f"WARNING: Field {field_name} has no jsonpath defined")
                    field_details[field_name] = {
                        'present': False,
                        'value': None,
                        'path': field_path,
                        'tier': tier,
                        'weight': weight,
                        'error': 'No jsonpath defined'
                    }
                    continue

                is_present, value = self.detect_field_presence(aibom, field_path)

                field_details[field_name] = {
                    'present': is_present,
                    'value': value,
                    'path': field_path,
                    'tier': tier,
                    'weight': weight
                }

                if is_present:
                    present_fields += 1
                    print(f"FOUND: {field_name} = {value} (tier: {tier}, weight: {weight})")
                else:
                    print(f"MISSING: {field_name} at {field_path} (tier: {tier})")

            # Category score: fraction of present fields scaled by category weight.
            category_percentage = (present_fields / total_fields) * 100 if total_fields > 0 else 0
            category_score = (category_percentage / 100) * category_weight

            results['category_scores'][category_name] = category_score
            results['field_details'][category_name] = field_details
            results['summary'][category_name] = {
                'present': present_fields,
                'total': total_fields,
                'percentage': category_percentage,
                'weight': category_weight
            }

            total_weighted_score += category_score

            print(f"{category_name}: {present_fields}/{total_fields} ({category_percentage:.1f}%) x {category_weight} = {category_score:.1f} pts")

        results['total_score'] = total_weighted_score

        print(f"\nTOTAL SCORE: {total_weighted_score:.1f}")

        return results

    # =============================================================================
    # UTILITY METHODS
    # =============================================================================

    def get_field_info(self, field_name: str) -> Optional[Dict[str, Any]]:
        """Get the complete registry entry for a specific field, or None."""
        return self.get_field_definitions().get(field_name)

    def get_field_jsonpath(self, field_name: str) -> Optional[str]:
        """Get the JSONPath expression for a specific field, or None."""
        field_info = self.get_field_info(field_name)
        return field_info.get("jsonpath") if field_info else None

    def get_fields_by_category(self, category: str) -> List[str]:
        """Get all field names in a specific category."""
        return [
            field_name for field_name, field_config in self.get_field_definitions().items()
            if field_config.get("category") == category
        ]

    def get_fields_by_tier(self, tier: str) -> List[str]:
        """Get all field names in a specific tier."""
        return [
            field_name for field_name, field_config in self.get_field_definitions().items()
            if field_config.get("tier") == tier
        ]

    def validate_registry_integrity(self) -> Dict[str, Any]:
        """
        Validate the integrity of the loaded registry.

        Returns a dict with "valid", "errors", "warnings", "field_count",
        and category/tier distribution counts. Never raises; failures are
        reported through the "errors" list.
        """
        validation_results = {
            "valid": True,
            "errors": [],
            "warnings": [],
            "field_count": 0,
            "category_distribution": {},
            "tier_distribution": {}
        }

        try:
            fields = self.get_field_definitions()
            validation_results["field_count"] = len(fields)

            categories = {}
            tiers = {}

            for field_name, field_config in fields.items():
                # Every field must declare these properties to be scoreable.
                required_props = ["tier", "weight", "category", "jsonpath"]
                missing_props = [prop for prop in required_props if prop not in field_config]

                if missing_props:
                    validation_results["errors"].append(
                        f"Field '{field_name}' missing properties: {missing_props}"
                    )
                    validation_results["valid"] = False

                category = field_config.get("category", "unknown")
                tier = field_config.get("tier", "unknown")

                categories[category] = categories.get(category, 0) + 1
                tiers[tier] = tiers.get(tier, 0) + 1

            validation_results["category_distribution"] = categories
            validation_results["tier_distribution"] = tiers

            # Scoring configuration completeness (warnings only — defaults exist).
            scoring_config = self.get_scoring_config()
            if not scoring_config.get("tier_weights"):
                validation_results["warnings"].append("Missing tier_weights in scoring_config")

            if not scoring_config.get("category_weights"):
                validation_results["warnings"].append("Missing category_weights in scoring_config")

        except Exception as e:
            validation_results["valid"] = False
            validation_results["errors"].append(f"Registry validation error: {e}")

        return validation_results
|
505 |
+
|
506 |
+
|
507 |
+
# =============================================================================
|
508 |
+
# GLOBAL INSTANCE AND CONVENIENCE FUNCTIONS
|
509 |
+
# =============================================================================
|
510 |
+
|
511 |
+
# Global registry manager instance (initialized on first import)
_registry_manager = None

def get_field_registry_manager() -> FieldRegistryManager:
    """Return the process-wide FieldRegistryManager, creating it on first use (singleton)."""
    global _registry_manager
    if _registry_manager is not None:
        return _registry_manager
    _registry_manager = FieldRegistryManager()
    return _registry_manager
|
520 |
+
|
521 |
+
# Convenience functions for backward compatibility with existing code
|
522 |
+
|
523 |
+
def load_field_registry() -> Dict[str, Any]:
    """Convenience wrapper: return the raw registry dict from the shared manager."""
    return get_field_registry_manager().registry
|
527 |
+
|
528 |
+
def generate_field_classification() -> Dict[str, Any]:
    """Convenience wrapper: build FIELD_CLASSIFICATION via the shared manager."""
    return get_field_registry_manager().generate_field_classification()
|
532 |
+
|
533 |
+
def generate_completeness_profiles() -> Dict[str, Any]:
    """Convenience wrapper: build COMPLETENESS_PROFILES via the shared manager."""
    return get_field_registry_manager().generate_completeness_profiles()
|
537 |
+
|
538 |
+
def generate_validation_messages() -> Dict[str, Any]:
    """Convenience wrapper: build VALIDATION_MESSAGES via the shared manager."""
    return get_field_registry_manager().generate_validation_messages()
|
542 |
+
|
543 |
+
def get_configurable_scoring_weights() -> Dict[str, Any]:
    """Convenience wrapper: fetch configurable scoring weights from the shared manager."""
    return get_field_registry_manager().get_configurable_scoring_weights()
|
547 |
+
|
548 |
+
# For compatibility with old DynamicFieldDetector usage
|
549 |
+
class DynamicFieldDetector:
    """Compatibility wrapper for old DynamicFieldDetector usage.

    Legacy call sites constructed a ``DynamicFieldDetector`` directly; this
    shim delegates every operation to a :class:`FieldRegistryManager` so those
    callers keep working after the registry consolidation.
    """

    def __init__(self, registry_path: str):
        """Initialize with a FieldRegistryManager loaded from ``registry_path``."""
        self.manager = FieldRegistryManager(registry_path)
        # Expose the raw registry dict directly, as the legacy class did.
        self.registry = self.manager.registry

    def detect_field_presence(self, aibom: dict, field_path: str) -> Tuple[bool, Any]:
        """Delegate field-presence detection (returns (present, value)) to the manager."""
        return self.manager.detect_field_presence(aibom, field_path)

    def analyze_aibom_completeness(self, aibom: dict) -> Dict[str, Any]:
        """Delegate AIBOM completeness analysis to the manager."""
        return self.manager.analyze_aibom_completeness(aibom)
|
564 |
+
|
565 |
+
# Validation function for testing
|
566 |
+
def validate_registry_setup() -> bool:
    """Validate that the registry is properly set up and accessible.

    Loads the global registry manager and runs its integrity check, printing a
    human-readable summary for the operator.

    Returns:
        bool: True when the registry loads and its integrity validation passes,
        False on any validation error or exception.
    """
    try:
        manager = get_field_registry_manager()
        validation_results = manager.validate_registry_integrity()

        # Guard-clause style: report failure details and bail out early.
        if not validation_results["valid"]:
            # NOTE: f-prefix removed from placeholder-free literals (flake8 F541).
            print("❌ Registry validation failed")
            for error in validation_results["errors"]:
                print(f"   Error: {error}")
            return False

        print("✅ Registry validation successful")
        print(f"   Fields loaded: {validation_results['field_count']}")
        print(f"   Categories: {list(validation_results['category_distribution'].keys())}")
        print(f"   Tiers: {list(validation_results['tier_distribution'].keys())}")
        return True

    except Exception as e:  # boundary: report any failure instead of crashing
        print(f"❌ Registry setup validation failed: {e}")
        return False
|
587 |
+
|
588 |
+
def test_field_registry_manager():
    """
    Smoke-test the consolidated field registry manager.

    This function is temporary (or optional later on).
    It serves the purpose of validating the field registry manager after refactoring
    such as replacing old files or methods within for field detection and score calculations
    and comes handy as a debugging tool.

    Returns:
        bool: True when every check completes, False on any exception.
    """
    try:
        print("🧪 Testing Consolidated Field Registry Manager...")

        # Test manager initialization
        manager = get_field_registry_manager()
        print(f"✅ Manager initialized with registry: {manager.registry_path}")

        # Test configuration generation
        field_classification = manager.generate_field_classification()
        print(f"✅ Generated FIELD_CLASSIFICATION with {len(field_classification)} fields")

        completeness_profiles = manager.generate_completeness_profiles()
        print(f"✅ Generated COMPLETENESS_PROFILES with {len(completeness_profiles)} profiles")

        validation_messages = manager.generate_validation_messages()
        print(f"✅ Generated VALIDATION_MESSAGES with {len(validation_messages)} messages")

        scoring_weights = manager.get_configurable_scoring_weights()
        print(f"✅ Generated SCORING_WEIGHTS with {len(scoring_weights)} sections")

        # Test field detection capabilities on a representative sample of fields
        test_fields = ['bomFormat', 'primaryPurpose', 'energyConsumption']
        for field_name in test_fields:
            field_info = manager.get_field_info(field_name)
            if field_info:
                jsonpath = field_info.get('jsonpath', 'N/A')
                category = field_info.get('category', 'N/A')
                tier = field_info.get('tier', 'N/A')
                print(f"✅ Field '{field_name}': {jsonpath} (category: {category}, tier: {tier})")
            else:
                print(f"❌ Field '{field_name}' not found in registry")

        # Test registry validation
        validation_results = manager.validate_registry_integrity()
        if validation_results["valid"]:
            print("✅ Registry integrity validation passed")
        else:
            print("⚠️ Registry integrity validation issues found")
            for error in validation_results["errors"]:
                print(f"   Error: {error}")

        print("🎉 Consolidated field registry manager test completed successfully!")
        return True

    except Exception as e:
        print(f"❌ Field registry manager test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
644 |
+
|
645 |
+
if __name__ == "__main__":
    # Run the smoke test for the consolidated manager when executed directly
    # (no effect when this module is imported by the generator).
    test_field_registry_manager()
|
648 |
+
|
src/aibom-generator/generator.py
CHANGED
@@ -1,13 +1,30 @@
|
|
1 |
import json
|
2 |
import uuid
|
3 |
import datetime
|
|
|
4 |
from typing import Dict, Optional, Any, List
|
5 |
|
6 |
-
|
7 |
from huggingface_hub import HfApi, ModelCard
|
|
|
8 |
from urllib.parse import urlparse
|
9 |
from .utils import calculate_completeness_score
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
class AIBOMGenerator:
|
13 |
def __init__(
|
@@ -16,7 +33,7 @@ class AIBOMGenerator:
|
|
16 |
inference_model_url: Optional[str] = None,
|
17 |
use_inference: bool = True,
|
18 |
cache_dir: Optional[str] = None,
|
19 |
-
use_best_practices: bool = True, #
|
20 |
):
|
21 |
self.hf_api = HfApi(token=hf_token)
|
22 |
self.inference_model_url = inference_model_url
|
@@ -24,13 +41,48 @@ class AIBOMGenerator:
|
|
24 |
self.cache_dir = cache_dir
|
25 |
self.enhancement_report = None # Store enhancement report as instance variable
|
26 |
self.use_best_practices = use_best_practices # Store best practices flag
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
def generate_aibom(
|
29 |
self,
|
30 |
model_id: str,
|
31 |
output_file: Optional[str] = None,
|
32 |
include_inference: Optional[bool] = None,
|
33 |
-
use_best_practices: Optional[bool] = None, #
|
34 |
) -> Dict[str, Any]:
|
35 |
try:
|
36 |
model_id = self._normalise_model_id(model_id)
|
@@ -43,12 +95,59 @@ class AIBOMGenerator:
|
|
43 |
|
44 |
# Store original metadata before any AI enhancement
|
45 |
original_metadata = self._extract_structured_metadata(model_id, model_info, model_card)
|
|
|
|
|
|
|
|
|
46 |
|
47 |
# Create initial AIBOM with original metadata
|
48 |
original_aibom = self._create_aibom_structure(model_id, original_metadata)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
# Calculate initial score with industry-neutral approach if enabled
|
51 |
-
original_score = calculate_completeness_score(original_aibom, validate=True, use_best_practices=use_best_practices)
|
|
|
52 |
|
53 |
# Final metadata starts with original metadata
|
54 |
final_metadata = original_metadata.copy() if original_metadata else {}
|
@@ -74,12 +173,19 @@ class AIBOMGenerator:
|
|
74 |
except Exception as e:
|
75 |
print(f"Error during AI enhancement: {e}")
|
76 |
# Continue with original metadata if enhancement fails
|
77 |
-
|
|
|
78 |
# Create final AIBOM with potentially enhanced metadata
|
79 |
aibom = self._create_aibom_structure(model_id, final_metadata)
|
80 |
|
81 |
-
# Calculate final score with
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
|
85 |
if output_file:
|
@@ -98,8 +204,8 @@ class AIBOMGenerator:
|
|
98 |
# Return only the AIBOM to maintain compatibility with existing code
|
99 |
return aibom
|
100 |
except Exception as e:
|
101 |
-
print(f"Error generating
|
102 |
-
# Return a minimal valid
|
103 |
return self._create_minimal_aibom(model_id)
|
104 |
|
105 |
def _create_minimal_aibom(self, model_id: str) -> Dict[str, Any]:
|
@@ -156,7 +262,7 @@ class AIBOMGenerator:
|
|
156 |
print(f"Error fetching model info for {model_id}: {e}")
|
157 |
return {}
|
158 |
|
159 |
-
|
160 |
@staticmethod
|
161 |
def _normalise_model_id(raw_id: str) -> str:
|
162 |
"""
|
@@ -171,7 +277,7 @@ class AIBOMGenerator:
|
|
171 |
return "/".join(parts[:2])
|
172 |
return path
|
173 |
return raw_id
|
174 |
-
|
175 |
|
176 |
def _fetch_model_card(self, model_id: str) -> Optional[ModelCard]:
|
177 |
try:
|
@@ -185,6 +291,12 @@ class AIBOMGenerator:
|
|
185 |
model_id: str,
|
186 |
metadata: Dict[str, Any],
|
187 |
) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
# Extract owner and model name from model_id
|
189 |
parts = model_id.split("/")
|
190 |
group = parts[0] if len(parts) > 1 else ""
|
@@ -192,6 +304,9 @@ class AIBOMGenerator:
|
|
192 |
|
193 |
# Get version from metadata or use default
|
194 |
version = metadata.get("commit", "1.0")
|
|
|
|
|
|
|
195 |
|
196 |
aibom = {
|
197 |
"bomFormat": "CycloneDX",
|
@@ -206,7 +321,10 @@ class AIBOMGenerator:
|
|
206 |
"dependsOn": [f"pkg:huggingface/{model_id.replace('/', '/')}@{version}"]
|
207 |
}
|
208 |
]
|
209 |
-
}
|
|
|
|
|
|
|
210 |
|
211 |
# ALWAYS add root-level external references
|
212 |
aibom["externalReferences"] = [{
|
@@ -220,6 +338,7 @@ class AIBOMGenerator:
|
|
220 |
"url": metadata["commit_url"]
|
221 |
} )
|
222 |
|
|
|
223 |
return aibom
|
224 |
|
225 |
def _extract_structured_metadata(
|
@@ -228,6 +347,48 @@ class AIBOMGenerator:
|
|
228 |
model_info: Dict[str, Any],
|
229 |
model_card: Optional[ModelCard],
|
230 |
) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
metadata = {}
|
232 |
|
233 |
if model_info:
|
@@ -248,7 +409,7 @@ class AIBOMGenerator:
|
|
248 |
"downloads": getattr(model_info, "downloads", 0),
|
249 |
"last_modified": getattr(model_info, "lastModified", None),
|
250 |
"commit": getattr(model_info, "sha", None)[:7] if getattr(model_info, "sha", None) else None,
|
251 |
-
"commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if getattr(model_info, "sha", None) else None,
|
252 |
})
|
253 |
except Exception as e:
|
254 |
print(f"Error extracting model info metadata: {e}")
|
@@ -290,6 +451,7 @@ class AIBOMGenerator:
|
|
290 |
print(f"DEBUG: Adding suppliedBy = {metadata.get('suppliedBy')}")
|
291 |
|
292 |
return {k: v for k, v in metadata.items() if v is not None}
|
|
|
293 |
|
294 |
|
295 |
def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
|
@@ -301,6 +463,9 @@ class AIBOMGenerator:
|
|
301 |
|
302 |
|
303 |
def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
|
304 |
timestamp = datetime.datetime.utcnow().isoformat() + "Z"
|
305 |
|
306 |
# Get version from metadata or use default
|
@@ -358,24 +523,43 @@ class AIBOMGenerator:
|
|
358 |
|
359 |
# ALWAYS add critical fields for scoring
|
360 |
critical_fields = {
|
361 |
-
"primaryPurpose": metadata.get("primaryPurpose",
|
362 |
-
"suppliedBy": metadata.get("suppliedBy",
|
363 |
-
"typeOfModel": metadata.get("
|
364 |
}
|
365 |
-
|
366 |
-
# Add critical fields first
|
367 |
for key, value in critical_fields.items():
|
368 |
-
|
369 |
-
properties.append({"name": key, "value": str(value)})
|
370 |
|
371 |
-
# Add
|
372 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
373 |
for key, value in metadata.items():
|
374 |
-
|
|
|
|
|
375 |
if isinstance(value, (list, dict)):
|
376 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
377 |
value = json.dumps(value)
|
|
|
378 |
properties.append({"name": key, "value": str(value)})
|
|
|
379 |
|
380 |
# Assemble metadata section
|
381 |
metadata_section = {
|
@@ -388,6 +572,9 @@ class AIBOMGenerator:
|
|
388 |
return metadata_section
|
389 |
|
390 |
def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
|
391 |
# Extract owner and model name from model_id
|
392 |
parts = model_id.split("/")
|
393 |
group = parts[0] if len(parts) > 1 else ""
|
@@ -412,7 +599,7 @@ class AIBOMGenerator:
|
|
412 |
"purl": purl
|
413 |
}
|
414 |
|
415 |
-
#
|
416 |
if metadata and "license" in metadata and metadata["license"]:
|
417 |
component["licenses"] = [{
|
418 |
"license": {
|
@@ -420,14 +607,48 @@ class AIBOMGenerator:
|
|
420 |
"url": self._get_license_url(metadata["license"])
|
421 |
}
|
422 |
}]
|
|
|
423 |
else:
|
424 |
-
# Add default license structure for consistency
|
425 |
component["licenses"] = [{
|
426 |
"license": {
|
427 |
-
"id": "
|
428 |
"url": "https://spdx.org/licenses/"
|
429 |
}
|
430 |
}]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
431 |
# Debug
|
432 |
print(f"DEBUG: License in metadata: {'license' in metadata}" )
|
433 |
if "license" in metadata:
|
@@ -435,6 +656,21 @@ class AIBOMGenerator:
|
|
435 |
|
436 |
# ALWAYS add description
|
437 |
component["description"] = metadata.get("description", f"AI model {model_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
438 |
|
439 |
# Add external references
|
440 |
external_refs = [{
|
@@ -470,26 +706,70 @@ class AIBOMGenerator:
|
|
470 |
|
471 |
return component
|
472 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
473 |
def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
|
474 |
model_card_section = {}
|
475 |
|
476 |
# Add quantitative analysis section
|
477 |
if "eval_results" in metadata:
|
478 |
model_card_section["quantitativeAnalysis"] = {
|
479 |
-
"performanceMetrics": metadata["eval_results"],
|
480 |
"graphics": {} # Empty graphics object as in the example
|
481 |
}
|
482 |
else:
|
483 |
model_card_section["quantitativeAnalysis"] = {"graphics": {}}
|
484 |
|
485 |
-
# Add properties section
|
486 |
properties = []
|
487 |
-
for key, value in metadata.items():
|
488 |
-
if key in ["author", "library_name", "license", "downloads", "likes", "tags", "created_at", "last_modified"]:
|
489 |
-
properties.append({"name": key, "value": str(value)})
|
490 |
|
491 |
-
|
492 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
493 |
|
494 |
# Create model parameters section
|
495 |
model_parameters = {}
|
@@ -538,6 +818,25 @@ class AIBOMGenerator:
|
|
538 |
|
539 |
# Add model parameters to model card section
|
540 |
model_card_section["modelParameters"] = model_parameters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
541 |
|
542 |
# Add considerations section
|
543 |
considerations = {}
|
@@ -578,4 +877,112 @@ class AIBOMGenerator:
|
|
578 |
logger.warning(f"Failed to fetch after {max_retries} attempts: {e}")
|
579 |
return None
|
580 |
time.sleep(1 * (attempt + 1)) # Exponential backoff
|
581 |
-
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import json
|
2 |
import uuid
|
3 |
import datetime
|
4 |
+
import json
|
5 |
from typing import Dict, Optional, Any, List
|
6 |
|
|
|
7 |
from huggingface_hub import HfApi, ModelCard
|
8 |
+
from huggingface_hub.repocard_data import EvalResult
|
9 |
from urllib.parse import urlparse
|
10 |
from .utils import calculate_completeness_score
|
11 |
|
12 |
+
# Import registry-aware enhanced extraction if available
|
13 |
+
try:
|
14 |
+
from .enhanced_extractor import EnhancedExtractor
|
15 |
+
from .field_registry_manager import get_field_registry_manager
|
16 |
+
ENHANCED_EXTRACTION_AVAILABLE = True
|
17 |
+
print("β
Registry-aware enhanced extraction module loaded successfully")
|
18 |
+
except ImportError:
|
19 |
+
try:
|
20 |
+
from enhanced_extractor import EnhancedExtractor
|
21 |
+
from field_registry_manager import get_field_registry_manager
|
22 |
+
ENHANCED_EXTRACTION_AVAILABLE = True
|
23 |
+
print("β
Registry-aware enhanced extraction module loaded successfully (direct import)")
|
24 |
+
except ImportError:
|
25 |
+
ENHANCED_EXTRACTION_AVAILABLE = False
|
26 |
+
print("β οΈ Registry-aware enhanced extraction not available, using basic extraction")
|
27 |
+
|
28 |
|
29 |
class AIBOMGenerator:
|
30 |
def __init__(
|
|
|
33 |
inference_model_url: Optional[str] = None,
|
34 |
use_inference: bool = True,
|
35 |
cache_dir: Optional[str] = None,
|
36 |
+
use_best_practices: bool = True, # parameter for industry-neutral scoring
|
37 |
):
|
38 |
self.hf_api = HfApi(token=hf_token)
|
39 |
self.inference_model_url = inference_model_url
|
|
|
41 |
self.cache_dir = cache_dir
|
42 |
self.enhancement_report = None # Store enhancement report as instance variable
|
43 |
self.use_best_practices = use_best_practices # Store best practices flag
|
44 |
+
self._setup_enhanced_logging()
|
45 |
+
|
46 |
+
self.extraction_results = {} # Store extraction results for scoring
|
47 |
+
|
48 |
+
# Initialize registry manager for enhanced extraction
|
49 |
+
self.registry_manager = None
|
50 |
+
if ENHANCED_EXTRACTION_AVAILABLE:
|
51 |
+
try:
|
52 |
+
self.registry_manager = get_field_registry_manager()
|
53 |
+
print("β
Registry manager initialized for generator")
|
54 |
+
except Exception as e:
|
55 |
+
print(f"β οΈ Could not initialize registry manager: {e}")
|
56 |
+
self.registry_manager = None
|
57 |
+
|
58 |
+
def get_extraction_results(self):
|
59 |
+
"""Return the enhanced extraction results from the last extraction"""
|
60 |
+
return getattr(self, 'extraction_results', {})
|
61 |
|
62 |
+
def _setup_enhanced_logging(self):
|
63 |
+
"""Setup enhanced logging for extraction tracking"""
|
64 |
+
import logging
|
65 |
+
|
66 |
+
# Configure logging to show in HF Spaces
|
67 |
+
logging.basicConfig(
|
68 |
+
level=logging.INFO,
|
69 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
70 |
+
force=True # Override any existing configuration
|
71 |
+
)
|
72 |
+
|
73 |
+
# Ensure logger shows up
|
74 |
+
logger = logging.getLogger('enhanced_extractor')
|
75 |
+
logger.setLevel(logging.INFO)
|
76 |
+
|
77 |
+
print("π§ Enhanced logging configured for AI SBOM generation")
|
78 |
+
|
79 |
+
|
80 |
def generate_aibom(
|
81 |
self,
|
82 |
model_id: str,
|
83 |
output_file: Optional[str] = None,
|
84 |
include_inference: Optional[bool] = None,
|
85 |
+
use_best_practices: Optional[bool] = None, # parameter for industry-neutral scoring
|
86 |
) -> Dict[str, Any]:
|
87 |
try:
|
88 |
model_id = self._normalise_model_id(model_id)
|
|
|
95 |
|
96 |
# Store original metadata before any AI enhancement
|
97 |
original_metadata = self._extract_structured_metadata(model_id, model_info, model_card)
|
98 |
+
print(f"π ENHANCED EXTRACTION DEBUG: Returned {len(original_metadata)} fields:")
|
99 |
+
for key, value in original_metadata.items():
|
100 |
+
print(f" {key}: {value}")
|
101 |
+
print(f"π EXTRACTION RESULTS: {len(self.extraction_results) if hasattr(self, 'extraction_results') and self.extraction_results else 0} extraction results available")
|
102 |
|
103 |
# Create initial AIBOM with original metadata
|
104 |
original_aibom = self._create_aibom_structure(model_id, original_metadata)
|
105 |
+
|
106 |
+
print(f"π AI SBOM CREATION DEBUG: Checking what made it into AIBOM:")
|
107 |
+
if 'components' in original_aibom and original_aibom['components']:
|
108 |
+
component = original_aibom['components'][0]
|
109 |
+
if 'properties' in component:
|
110 |
+
print(f" Found {len(component['properties'])} properties in AIBOM:")
|
111 |
+
for prop in component['properties']:
|
112 |
+
print(f" {prop.get('name')}: {prop.get('value')}")
|
113 |
+
else:
|
114 |
+
print(" No properties found in component")
|
115 |
+
else:
|
116 |
+
print(" No components found in AI SBOM")
|
117 |
+
print(f"π FIELD PRESERVATION VERIFICATION:")
|
118 |
+
print(f" Enhanced extraction returned: {len(original_metadata)} fields")
|
119 |
+
|
120 |
+
# Count fields in final AIBOM
|
121 |
+
aibom_field_count = 0
|
122 |
+
|
123 |
+
# Count component properties
|
124 |
+
if 'components' in original_aibom and original_aibom['components']:
|
125 |
+
component = original_aibom['components'][0]
|
126 |
+
if 'properties' in component:
|
127 |
+
aibom_field_count += len(component['properties'])
|
128 |
+
|
129 |
+
# Count model card properties
|
130 |
+
if 'modelCard' in component and 'properties' in component['modelCard']:
|
131 |
+
aibom_field_count += len(component['modelCard']['properties'])
|
132 |
+
|
133 |
+
# Count metadata properties
|
134 |
+
if 'metadata' in original_aibom and 'properties' in original_aibom['metadata']:
|
135 |
+
aibom_field_count += len(original_aibom['metadata']['properties'])
|
136 |
+
|
137 |
+
print(f" Final AIBOM contains: {aibom_field_count} fields")
|
138 |
+
print(f" Field preservation rate: {(aibom_field_count/len(original_metadata)*100):.1f}%")
|
139 |
+
|
140 |
+
if aibom_field_count >= len(original_metadata) * 0.9: # 90% or better
|
141 |
+
print("β
EXCELLENT: Field preservation successful!")
|
142 |
+
elif aibom_field_count >= len(original_metadata) * 0.7: # 70% or better
|
143 |
+
print("β οΈ GOOD: Most fields preserved, some optimization possible")
|
144 |
+
else:
|
145 |
+
print("β POOR: Significant field loss detected")
|
146 |
+
|
147 |
|
148 |
# Calculate initial score with industry-neutral approach if enabled
|
149 |
+
original_score = calculate_completeness_score(original_aibom, validate=True, use_best_practices=use_best_practices, extraction_results=self.extraction_results)
|
150 |
+
|
151 |
|
152 |
# Final metadata starts with original metadata
|
153 |
final_metadata = original_metadata.copy() if original_metadata else {}
|
|
|
173 |
except Exception as e:
|
174 |
print(f"Error during AI enhancement: {e}")
|
175 |
# Continue with original metadata if enhancement fails
|
176 |
+
print("π¨ FALLBACK: Using _create_minimal_aibom due to error!")
|
177 |
+
print(f"π¨ ERROR DETAILS: {str(e)}")
|
178 |
# Create final AIBOM with potentially enhanced metadata
|
179 |
aibom = self._create_aibom_structure(model_id, final_metadata)
|
180 |
|
181 |
+
# Calculate final score with enhanced extraction results
|
182 |
+
extraction_results = self.get_extraction_results()
|
183 |
+
final_score = calculate_completeness_score(
|
184 |
+
aibom,
|
185 |
+
validate=True,
|
186 |
+
use_best_practices=use_best_practices,
|
187 |
+
extraction_results=extraction_results # Pass enhanced results
|
188 |
+
)
|
189 |
|
190 |
|
191 |
if output_file:
|
|
|
204 |
# Return only the AIBOM to maintain compatibility with existing code
|
205 |
return aibom
|
206 |
except Exception as e:
|
207 |
+
print(f"Error generating AI SBOM: {e}")
|
208 |
+
# Return a minimal valid AI SBOM structure in case of error
|
209 |
return self._create_minimal_aibom(model_id)
|
210 |
|
211 |
def _create_minimal_aibom(self, model_id: str) -> Dict[str, Any]:
|
|
|
262 |
print(f"Error fetching model info for {model_id}: {e}")
|
263 |
return {}
|
264 |
|
265 |
+
|
266 |
@staticmethod
|
267 |
def _normalise_model_id(raw_id: str) -> str:
|
268 |
"""
|
|
|
277 |
return "/".join(parts[:2])
|
278 |
return path
|
279 |
return raw_id
|
280 |
+
|
281 |
|
282 |
def _fetch_model_card(self, model_id: str) -> Optional[ModelCard]:
|
283 |
try:
|
|
|
291 |
model_id: str,
|
292 |
metadata: Dict[str, Any],
|
293 |
) -> Dict[str, Any]:
|
294 |
+
# π CRASH DEBUG: troubleshoot where the process is crashing and falling back to minimal AIBOM
|
295 |
+
print(f"π CRASH_DEBUG: _create_aibom_structure called")
|
296 |
+
print(f"π CRASH_DEBUG: model_id = {model_id}")
|
297 |
+
print(f"π CRASH_DEBUG: metadata type = {type(metadata)}")
|
298 |
+
print(f"π CRASH_DEBUG: metadata keys = {list(metadata.keys()) if isinstance(metadata, dict) else 'NOT A DICT'}")
|
299 |
+
|
300 |
# Extract owner and model name from model_id
|
301 |
parts = model_id.split("/")
|
302 |
group = parts[0] if len(parts) > 1 else ""
|
|
|
304 |
|
305 |
# Get version from metadata or use default
|
306 |
version = metadata.get("commit", "1.0")
|
307 |
+
|
308 |
+
# π CRASH DEBUG: Check metadata before creating sections
|
309 |
+
print(f"π CRASH_DEBUG: About to create metadata section")
|
310 |
|
311 |
aibom = {
|
312 |
"bomFormat": "CycloneDX",
|
|
|
321 |
"dependsOn": [f"pkg:huggingface/{model_id.replace('/', '/')}@{version}"]
|
322 |
}
|
323 |
]
|
324 |
+
}
|
325 |
+
|
326 |
+
# π CRASH DEBUG: Check if we got this far
|
327 |
+
print(f"π CRASH_DEBUG: Successfully created basic AIBOM structure")
|
328 |
|
329 |
# ALWAYS add root-level external references
|
330 |
aibom["externalReferences"] = [{
|
|
|
338 |
"url": metadata["commit_url"]
|
339 |
} )
|
340 |
|
341 |
+
print(f"π CRASH_DEBUG: _create_aibom_structure completed successfully")
|
342 |
return aibom
|
343 |
|
344 |
def _extract_structured_metadata(
|
|
|
347 |
model_info: Dict[str, Any],
|
348 |
model_card: Optional[ModelCard],
|
349 |
) -> Dict[str, Any]:
|
350 |
+
|
351 |
+
# Use registry-aware enhanced extraction if available
|
352 |
+
if ENHANCED_EXTRACTION_AVAILABLE:
|
353 |
+
try:
|
354 |
+
print(f"π Using registry-aware enhanced extraction for: {model_id}")
|
355 |
+
|
356 |
+
# Create registry-aware enhanced extractor instance
|
357 |
+
extractor = EnhancedExtractor(self.hf_api, self.registry_manager)
|
358 |
+
|
359 |
+
# Get both metadata and extraction results
|
360 |
+
metadata = extractor.extract_metadata(model_id, model_info, model_card)
|
361 |
+
|
362 |
+
# Store extraction results for scoring
|
363 |
+
self.extraction_results = extractor.extraction_results
|
364 |
+
|
365 |
+
# Log extraction summary
|
366 |
+
if extractor.registry_fields:
|
367 |
+
registry_field_count = len(extractor.registry_fields)
|
368 |
+
extracted_count = len([k for k, v in metadata.items() if v is not None])
|
369 |
+
extraction_results_count = len(extractor.extraction_results)
|
370 |
+
|
371 |
+
print(f"β
Registry-driven extraction completed:")
|
372 |
+
print(f" π Registry fields available: {registry_field_count}")
|
373 |
+
print(f" π Fields attempted: {extraction_results_count}")
|
374 |
+
print(f" β
Fields extracted: {extracted_count}")
|
375 |
+
|
376 |
+
# Log field coverage
|
377 |
+
if registry_field_count > 0:
|
378 |
+
coverage = (extracted_count / registry_field_count) * 100
|
379 |
+
print(f" π Registry field coverage: {coverage:.1f}%")
|
380 |
+
else:
|
381 |
+
extracted_count = len([k for k, v in metadata.items() if v is not None])
|
382 |
+
print(f"β
Legacy extraction completed: {extracted_count} fields extracted")
|
383 |
+
|
384 |
+
return metadata
|
385 |
+
|
386 |
+
except Exception as e:
|
387 |
+
print(f"β Registry-aware enhanced extraction failed: {e}")
|
388 |
+
print("π Falling back to original extraction method")
|
389 |
+
# Fall back to original extraction code here
|
390 |
+
|
391 |
+
# ORIGINAL EXTRACTION METHOD (as fallback)
|
392 |
metadata = {}
|
393 |
|
394 |
if model_info:
|
|
|
409 |
"downloads": getattr(model_info, "downloads", 0),
|
410 |
"last_modified": getattr(model_info, "lastModified", None),
|
411 |
"commit": getattr(model_info, "sha", None)[:7] if getattr(model_info, "sha", None) else None,
|
412 |
+
"commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if getattr(model_info, "sha", None ) else None,
|
413 |
})
|
414 |
except Exception as e:
|
415 |
print(f"Error extracting model info metadata: {e}")
|
|
|
451 |
print(f"DEBUG: Adding suppliedBy = {metadata.get('suppliedBy')}")
|
452 |
|
453 |
return {k: v for k, v in metadata.items() if v is not None}
|
454 |
+
|
455 |
|
456 |
|
457 |
def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
|
|
|
463 |
|
464 |
|
465 |
def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
466 |
+
print(f"π CRASH_DEBUG: _create_metadata_section called")
|
467 |
+
print(f"π CRASH_DEBUG: metadata type in metadata_section = {type(metadata)}")
|
468 |
+
|
469 |
timestamp = datetime.datetime.utcnow().isoformat() + "Z"
|
470 |
|
471 |
# Get version from metadata or use default
|
|
|
523 |
|
524 |
# ALWAYS add critical fields for scoring
|
525 |
critical_fields = {
|
526 |
+
"primaryPurpose": metadata.get("primaryPurpose", "text-generation"),
|
527 |
+
"suppliedBy": metadata.get("suppliedBy", "unknown"),
|
528 |
+
"typeOfModel": metadata.get("typeOfModel", "Transformer")
|
529 |
}
|
|
|
|
|
530 |
for key, value in critical_fields.items():
|
531 |
+
properties.append({"name": key, "value": str(value)})
|
|
|
532 |
|
533 |
+
# Add enhanced extraction fields to properties
|
534 |
+
# Organize fields by category for better AIBOM structure
|
535 |
+
component_fields = ["name", "author", "description", "commit"] # These go in component section
|
536 |
+
critical_fields = ["primaryPurpose", "suppliedBy", "typeOfModel"] # Always include these
|
537 |
+
|
538 |
+
# Add all other enhanced extraction fields (preserve everything!)
|
539 |
+
enhanced_fields = ["model_type", "tokenizer_class", "architectures", "library_name",
|
540 |
+
"pipeline_tag", "tags", "datasets", "base_model", "language",
|
541 |
+
"downloads", "last_modified", "commit_url", "ai:type", "ai:task",
|
542 |
+
"ai:framework", "eval_results"]
|
543 |
+
|
544 |
+
print(f"π CRASH_DEBUG: About to call .items() on metadata")
|
545 |
+
print(f"π CRASH_DEBUG: metadata type before .items() = {type(metadata)}")
|
546 |
+
|
547 |
for key, value in metadata.items():
|
548 |
+
# Skip component fields (handled elsewhere) but include everything else
|
549 |
+
if key not in component_fields and value is not None:
|
550 |
+
# Handle different data types properly
|
551 |
if isinstance(value, (list, dict)):
|
552 |
+
if isinstance(value, list) and len(value) > 0:
|
553 |
+
# Convert list to comma-separated string for better display
|
554 |
+
if all(isinstance(item, str) for item in value):
|
555 |
+
value = ", ".join(value)
|
556 |
+
else:
|
557 |
+
value = json.dumps(value)
|
558 |
+
elif isinstance(value, dict):
|
559 |
value = json.dumps(value)
|
560 |
+
|
561 |
properties.append({"name": key, "value": str(value)})
|
562 |
+
print(f"β
METADATA: Added {key} = {value} to properties")
|
563 |
|
564 |
# Assemble metadata section
|
565 |
metadata_section = {
|
|
|
572 |
return metadata_section
|
573 |
|
574 |
def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
575 |
+
print(f"π CRASH_DEBUG: _create_component_section called")
|
576 |
+
print(f"π CRASH_DEBUG: metadata type in component_section = {type(metadata)}")
|
577 |
+
|
578 |
# Extract owner and model name from model_id
|
579 |
parts = model_id.split("/")
|
580 |
group = parts[0] if len(parts) > 1 else ""
|
|
|
599 |
"purl": purl
|
600 |
}
|
601 |
|
602 |
+
# Handle license
|
603 |
if metadata and "license" in metadata and metadata["license"]:
|
604 |
component["licenses"] = [{
|
605 |
"license": {
|
|
|
607 |
"url": self._get_license_url(metadata["license"])
|
608 |
}
|
609 |
}]
|
610 |
+
print(f"β
COMPONENT: Added license = {metadata['license']}")
|
611 |
else:
|
|
|
612 |
component["licenses"] = [{
|
613 |
"license": {
|
614 |
+
"id": "NOASSERTION",
|
615 |
"url": "https://spdx.org/licenses/"
|
616 |
}
|
617 |
}]
|
618 |
+
print(f"β οΈ COMPONENT: No license found, using NOASSERTION")
|
619 |
+
|
620 |
+
# ALWAYS add description
|
621 |
+
component["description"] = metadata.get("description", f"AI model {model_id}")
|
622 |
+
|
623 |
+
# Add enhanced technical properties to component
|
624 |
+
technical_properties = []
|
625 |
+
|
626 |
+
# Add model type information
|
627 |
+
if "model_type" in metadata:
|
628 |
+
technical_properties.append({"name": "model_type", "value": str(metadata["model_type"])})
|
629 |
+
print(f"β
COMPONENT: Added model_type = {metadata['model_type']}")
|
630 |
+
|
631 |
+
# Add tokenizer information
|
632 |
+
if "tokenizer_class" in metadata:
|
633 |
+
technical_properties.append({"name": "tokenizer_class", "value": str(metadata["tokenizer_class"])})
|
634 |
+
print(f"β
COMPONENT: Added tokenizer_class = {metadata['tokenizer_class']}")
|
635 |
+
|
636 |
+
# Add architecture information
|
637 |
+
if "architectures" in metadata:
|
638 |
+
arch_value = metadata["architectures"]
|
639 |
+
if isinstance(arch_value, list):
|
640 |
+
arch_value = ", ".join(arch_value)
|
641 |
+
technical_properties.append({"name": "architectures", "value": str(arch_value)})
|
642 |
+
print(f"β
COMPONENT: Added architectures = {arch_value}")
|
643 |
+
|
644 |
+
# Add library information
|
645 |
+
if "library_name" in metadata:
|
646 |
+
technical_properties.append({"name": "library_name", "value": str(metadata["library_name"])})
|
647 |
+
print(f"β
COMPONENT: Added library_name = {metadata['library_name']}")
|
648 |
+
|
649 |
+
# Add technical properties to component if any exist
|
650 |
+
if technical_properties:
|
651 |
+
component["properties"] = technical_properties
|
652 |
# Debug
|
653 |
print(f"DEBUG: License in metadata: {'license' in metadata}" )
|
654 |
if "license" in metadata:
|
|
|
656 |
|
657 |
# ALWAYS add description
|
658 |
component["description"] = metadata.get("description", f"AI model {model_id}")
|
659 |
+
if metadata.get("license"):
|
660 |
+
component["licenses"] = [{
|
661 |
+
"license": {
|
662 |
+
"id": metadata["license"],
|
663 |
+
"url": self._get_license_url(metadata["license"])
|
664 |
+
}
|
665 |
+
}]
|
666 |
+
else:
|
667 |
+
component["licenses"] = [{
|
668 |
+
"license": {
|
669 |
+
"id": "unknown",
|
670 |
+
"url": "https://spdx.org/licenses/"
|
671 |
+
}
|
672 |
+
}]
|
673 |
+
|
674 |
|
675 |
# Add external references
|
676 |
external_refs = [{
|
|
|
706 |
|
707 |
return component
|
708 |
|
709 |
+
def _eval_results_to_json(self, eval_results: List[EvalResult]) -> List[Dict[str, str]]:
    """Serialize evaluation results into JSON-ready performance-metric dicts.

    Each qualifying result becomes ``{"type": <metric_type>, "value": str(<metric_value>)}``.
    Entries lacking either a ``metric_type`` or ``metric_value`` attribute are
    silently skipped, preserving the defensive behavior of the original loop.
    """
    return [
        {"type": item.metric_type, "value": str(item.metric_value)}
        for item in eval_results
        if hasattr(item, "metric_type") and hasattr(item, "metric_value")
    ]
|
715 |
+
|
716 |
+
|
717 |
def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
718 |
+
print(f"π CRASH_DEBUG: _create_model_card_section called")
|
719 |
+
print(f"π CRASH_DEBUG: metadata type in model_card_section = {type(metadata)}")
|
720 |
+
|
721 |
model_card_section = {}
|
722 |
|
723 |
# Add quantitative analysis section
|
724 |
if "eval_results" in metadata:
|
725 |
model_card_section["quantitativeAnalysis"] = {
|
726 |
+
"performanceMetrics": self._eval_results_to_json(metadata["eval_results"]),
|
727 |
"graphics": {} # Empty graphics object as in the example
|
728 |
}
|
729 |
else:
|
730 |
model_card_section["quantitativeAnalysis"] = {"graphics": {}}
|
731 |
|
732 |
+
# Add properties section with enhanced extraction fields
|
733 |
properties = []
|
|
|
|
|
|
|
734 |
|
735 |
+
# Component-level fields that shouldn't be duplicated in model card
|
736 |
+
component_level_fields = ["name", "author", "license", "description", "commit"]
|
737 |
+
|
738 |
+
# DEBUG: troubleshooting AIBOM generation
|
739 |
+
print(f"π DEBUG: About to iterate metadata.items()")
|
740 |
+
print(f"π DEBUG: metadata type = {type(metadata)}")
|
741 |
+
if isinstance(metadata, dict):
|
742 |
+
print(f"π DEBUG: metadata keys = {list(metadata.keys())}")
|
743 |
+
else:
|
744 |
+
print(f"π DEBUG: metadata value = {metadata}")
|
745 |
+
print(f"π DEBUG: This is the problem - metadata should be a dict!")
|
746 |
+
|
747 |
+
# Add all enhanced extraction fields to model card properties
|
748 |
+
try:
|
749 |
+
for key, value in metadata.items():
|
750 |
+
if key not in component_level_fields and value is not None:
|
751 |
+
# Handle different data types properly
|
752 |
+
if isinstance(value, (list, dict)):
|
753 |
+
if isinstance(value, list) and len(value) > 0:
|
754 |
+
# Convert list to readable format
|
755 |
+
if all(isinstance(item, str) for item in value):
|
756 |
+
value = ", ".join(value)
|
757 |
+
else:
|
758 |
+
value = json.dumps(value)
|
759 |
+
elif isinstance(value, dict):
|
760 |
+
value = json.dumps(value)
|
761 |
+
|
762 |
+
properties.append({"name": key, "value": str(value)})
|
763 |
+
print(f"β
MODEL_CARD: Added {key} = {value}")
|
764 |
+
except AttributeError as e:
|
765 |
+
print(f"β FOUND THE ERROR: {e}")
|
766 |
+
print(f"β metadata type: {type(metadata)}")
|
767 |
+
print(f"β metadata value: {metadata}")
|
768 |
+
raise e
|
769 |
+
|
770 |
+
# Always include properties section (even if empty for consistency)
|
771 |
+
model_card_section["properties"] = properties
|
772 |
+
print(f"β
MODEL_CARD: Added {len(properties)} properties to model card")
|
773 |
|
774 |
# Create model parameters section
|
775 |
model_parameters = {}
|
|
|
818 |
|
819 |
# Add model parameters to model card section
|
820 |
model_card_section["modelParameters"] = model_parameters
|
821 |
+
# Add enhanced technical parameters
|
822 |
+
if "model_type" in metadata or "tokenizer_class" in metadata or "architectures" in metadata:
|
823 |
+
technical_details = {}
|
824 |
+
|
825 |
+
if "model_type" in metadata:
|
826 |
+
technical_details["modelType"] = metadata["model_type"]
|
827 |
+
|
828 |
+
if "tokenizer_class" in metadata:
|
829 |
+
technical_details["tokenizerClass"] = metadata["tokenizer_class"]
|
830 |
+
|
831 |
+
if "architectures" in metadata:
|
832 |
+
technical_details["architectures"] = metadata["architectures"]
|
833 |
+
|
834 |
+
# Add to model parameters
|
835 |
+
model_parameters.update(technical_details)
|
836 |
+
print(f"β
MODEL_CARD: Added technical details: {list(technical_details.keys())}")
|
837 |
+
|
838 |
+
# Update model parameters with enhanced details
|
839 |
+
model_card_section["modelParameters"] = model_parameters
|
840 |
|
841 |
# Add considerations section
|
842 |
considerations = {}
|
|
|
877 |
logger.warning(f"Failed to fetch after {max_retries} attempts: {e}")
|
878 |
return None
|
879 |
time.sleep(1 * (attempt + 1)) # Exponential backoff
|
880 |
+
return None
|
881 |
+
|
882 |
+
def validate_registry_integration(self) -> Dict[str, Any]:
    """
    Validate that the registry integration is working correctly.
    This method helps debug registry-related issues.

    Returns:
        Dict[str, Any] with keys:
            registry_manager_available (bool): whether ``self.registry_manager`` is set.
            enhanced_extraction_available (bool): module-level availability flag.
            registry_fields_count (int): number of entries under the registry's 'fields' key.
            registry_fields_loaded (bool): True when at least one field was found.
            validation_status (str): 'success' | 'no_fields' | 'no_registry_manager'
                | 'error' | 'unknown'.
            error (str): present only when an exception was caught.
    """
    # Pessimistic baseline; individual entries are upgraded as checks pass.
    validation_results = {
        'registry_manager_available': bool(self.registry_manager),
        'enhanced_extraction_available': ENHANCED_EXTRACTION_AVAILABLE,
        'registry_fields_count': 0,
        'registry_fields_loaded': False,
        'validation_status': 'unknown'
    }

    try:
        if self.registry_manager:
            # NOTE(review): assumes the manager exposes a `registry` dict with a
            # top-level 'fields' mapping — confirm against field_registry_manager.
            registry = self.registry_manager.registry
            registry_fields = registry.get('fields', {})
            validation_results['registry_fields_count'] = len(registry_fields)
            validation_results['registry_fields_loaded'] = len(registry_fields) > 0

            if len(registry_fields) > 0:
                validation_results['validation_status'] = 'success'
                # NOTE(review): emoji below reconstructed from a mojibake'd source
                # view — verify against the original file bytes.
                print(f"✅ Registry validation successful: {len(registry_fields)} fields loaded")

                # Log a handful of field names as a quick sanity check.
                sample_fields = list(registry_fields.keys())[:5]
                print(f"📋 Sample registry fields: {', '.join(sample_fields)}")
            else:
                validation_results['validation_status'] = 'no_fields'
                print("⚠️ Registry loaded but no fields found")
        else:
            validation_results['validation_status'] = 'no_registry_manager'
            print("❌ Registry manager not available")

    except Exception as e:
        # Diagnostic helper: capture the failure in the result instead of raising.
        validation_results['validation_status'] = 'error'
        validation_results['error'] = str(e)
        print(f"❌ Registry validation failed: {e}")

    return validation_results
|
922 |
+
|
923 |
+
def test_registry_integration():
    """
    Test function to validate registry integration is working correctly.
    This function can be called to debug registry-related issues.

    Side effects:
        Prints diagnostic output to stdout and performs NETWORK access
        (``HfApi.model_info`` and ``ModelCard.load`` against the Hugging Face
        Hub) — do not call this from import-time code paths.
    """
    # NOTE(review): emoji in the messages below reconstructed from a mojibake'd
    # source view — verify against the original file bytes.
    print("🧪 Testing Registry Integration...")
    print("=" * 50)

    try:
        # Test generator initialization
        generator = AIBOMGenerator()

        # Validate registry integration (see validate_registry_integration)
        validation_results = generator.validate_registry_integration()

        print("📊 Validation Results:")
        for key, value in validation_results.items():
            print(f"  {key}: {value}")

        # Test with a sample model (requires network connectivity)
        test_model = "deepseek-ai/DeepSeek-R1"
        print(f"\n🔍 Testing extraction with model: {test_model}")

        try:
            # Test model info retrieval from the Hub
            model_info = generator.hf_api.model_info(test_model)
            model_card = ModelCard.load(test_model)

            # Test extraction only when the registry-aware path is available
            if ENHANCED_EXTRACTION_AVAILABLE and generator.registry_manager:
                extractor = EnhancedExtractor(generator.hf_api, generator.registry_manager)
                metadata = extractor.extract_metadata(test_model, model_info, model_card)

                print(f"✅ Test extraction successful: {len(metadata)} fields extracted")

                # Show sample extracted fields (first five, in insertion order)
                sample_fields = dict(list(metadata.items())[:5])
                print("📋 Sample extracted fields:")
                for key, value in sample_fields.items():
                    print(f"  {key}: {value}")

                # Show extraction results summary: tally fields per confidence level
                extraction_results = extractor.get_extraction_results()
                confidence_counts = {}
                for result in extraction_results.values():
                    conf = result.confidence.value
                    confidence_counts[conf] = confidence_counts.get(conf, 0) + 1

                print("📊 Extraction confidence distribution:")
                for conf, count in confidence_counts.items():
                    print(f"  {conf}: {count} fields")

            else:
                print("⚠️ Registry-aware extraction not available for testing")

        except Exception as e:
            # Extraction failure is reported but does not abort the outer test.
            print(f"❌ Test extraction failed: {e}")

    except Exception as e:
        print(f"❌ Registry integration test failed: {e}")

    print("=" * 50)
    print("🧪 Registry Integration Test Complete")
|
986 |
+
|
987 |
+
# Uncomment the following line to run the test automatically when generator.py is imported.
# NOTE: the call was previously active, contradicting this comment and triggering
# network access (Hugging Face Hub requests) as an import-time side effect.
# test_registry_integration()
|
src/aibom-generator/utils.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
"""
|
2 |
-
|
3 |
"""
|
4 |
|
5 |
import json
|
@@ -9,6 +9,14 @@ import re
|
|
9 |
import uuid
|
10 |
from typing import Dict, List, Optional, Any, Union, Tuple
|
11 |
from enum import Enum
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
logger = logging.getLogger(__name__)
|
14 |
|
@@ -18,98 +26,123 @@ class ValidationSeverity(Enum):
|
|
18 |
WARNING = "warning"
|
19 |
INFO = "info"
|
20 |
|
21 |
-
#
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
"
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
"
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
}
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
"
|
63 |
-
"
|
64 |
-
"
|
65 |
-
|
66 |
-
|
67 |
-
"
|
68 |
-
"
|
69 |
-
|
70 |
-
"
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
}
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
"downloadLocation": {
|
89 |
-
"missing": "Missing critical field: downloadLocation - needed for artifact retrieval",
|
90 |
-
"recommendation": "Add information about where the model can be downloaded"
|
91 |
-
},
|
92 |
-
"primaryPurpose": {
|
93 |
-
"missing": "Missing critical field: primaryPurpose - important for understanding model intent",
|
94 |
-
"recommendation": "Add information about the primary purpose of this model"
|
95 |
-
},
|
96 |
-
"suppliedBy": {
|
97 |
-
"missing": "Missing critical field: suppliedBy - needed for provenance tracking",
|
98 |
-
"recommendation": "Add information about who supplied this model"
|
99 |
-
},
|
100 |
-
"energyConsumption": {
|
101 |
-
"missing": "Missing important field: energyConsumption - helpful for environmental impact assessment",
|
102 |
-
"recommendation": "Consider documenting energy consumption metrics for better transparency"
|
103 |
-
},
|
104 |
-
"hyperparameter": {
|
105 |
-
"missing": "Missing important field: hyperparameter - valuable for reproducibility",
|
106 |
-
"recommendation": "Document key hyperparameters used in training"
|
107 |
-
},
|
108 |
-
"limitation": {
|
109 |
-
"missing": "Missing important field: limitation - important for responsible use",
|
110 |
-
"recommendation": "Document known limitations of the model to guide appropriate usage"
|
111 |
}
|
112 |
-
}
|
113 |
|
114 |
|
115 |
def setup_logging(level=logging.INFO):
|
@@ -207,77 +240,53 @@ def check_field_in_aibom(aibom: Dict[str, Any], field: str) -> bool:
|
|
207 |
Returns:
|
208 |
True if the field is present, False otherwise
|
209 |
"""
|
210 |
-
# Check in root level
|
211 |
if field in aibom:
|
212 |
return True
|
213 |
-
|
214 |
-
# Check in metadata
|
215 |
if "metadata" in aibom:
|
216 |
metadata = aibom["metadata"]
|
217 |
if field in metadata:
|
218 |
return True
|
219 |
-
|
220 |
-
# Check in metadata properties
|
221 |
if "properties" in metadata:
|
222 |
for prop in metadata["properties"]:
|
223 |
-
|
|
|
224 |
return True
|
225 |
-
|
226 |
-
# Check in components
|
227 |
if "components" in aibom and aibom["components"]:
|
228 |
-
component = aibom["components"][0]
|
229 |
-
|
230 |
if field in component:
|
231 |
return True
|
232 |
-
|
233 |
-
# Check in component properties
|
234 |
if "properties" in component:
|
235 |
for prop in component["properties"]:
|
236 |
-
|
|
|
237 |
return True
|
238 |
-
|
239 |
-
# Check in model card
|
240 |
if "modelCard" in component:
|
241 |
model_card = component["modelCard"]
|
242 |
-
|
243 |
if field in model_card:
|
244 |
return True
|
245 |
-
|
246 |
-
|
247 |
-
if "modelParameters" in model_card:
|
248 |
-
if field in model_card["modelParameters"]:
|
249 |
-
return True
|
250 |
-
|
251 |
-
# Check in model parameters properties
|
252 |
-
if "properties" in model_card["modelParameters"]:
|
253 |
-
for prop in model_card["modelParameters"]["properties"]:
|
254 |
-
if prop.get("name") == f"spdx:{field}" or prop.get("name") == field:
|
255 |
-
return True
|
256 |
-
|
257 |
-
# Check in considerations
|
258 |
if "considerations" in model_card:
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
if
|
268 |
-
return True
|
269 |
-
if field == "energyConsumption" and section == "environmentalConsiderations":
|
270 |
return True
|
271 |
-
|
272 |
-
|
273 |
if field == "downloadLocation" and "externalReferences" in aibom:
|
274 |
for ref in aibom["externalReferences"]:
|
275 |
-
if ref.get("type") == "distribution":
|
276 |
return True
|
277 |
-
|
278 |
return False
|
279 |
|
280 |
|
|
|
281 |
def determine_completeness_profile(aibom: Dict[str, Any], score: float) -> Dict[str, Any]:
|
282 |
"""
|
283 |
Determine which completeness profile the AIBOM satisfies.
|
@@ -835,8 +844,113 @@ def get_validation_summary(report: Dict[str, Any]) -> str:
|
|
835 |
|
836 |
return summary
|
837 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
838 |
|
839 |
-
|
|
|
840 |
"""
|
841 |
Calculate completeness score using industry best practices with proper normalization and penalties.
|
842 |
|
@@ -875,8 +989,8 @@ def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
|
|
875 |
# Count total fields in this category
|
876 |
fields_by_category[category]["total"] += 1
|
877 |
|
878 |
-
#
|
879 |
-
is_present =
|
880 |
|
881 |
if is_present:
|
882 |
fields_by_category[category]["present"] += 1
|
@@ -898,6 +1012,19 @@ def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
|
|
898 |
category_scores[category] = round(raw_score, 1)
|
899 |
else:
|
900 |
category_scores[category] = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
901 |
|
902 |
# Calculate subtotal (sum of rounded category scores)
|
903 |
subtotal_score = sum(category_scores.values())
|
@@ -1033,7 +1160,7 @@ def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
|
|
1033 |
return result
|
1034 |
|
1035 |
|
1036 |
-
def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True, use_best_practices: bool = True) -> Dict[str, Any]:
|
1037 |
"""
|
1038 |
Calculate completeness score for an AIBOM and optionally validate against AI requirements.
|
1039 |
Enhanced with industry best practices scoring.
|
@@ -1046,9 +1173,16 @@ def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True, u
|
|
1046 |
Returns:
|
1047 |
Dictionary containing score and validation results
|
1048 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1049 |
# If using best practices scoring, use the enhanced industry-neutral approach
|
1050 |
if use_best_practices:
|
1051 |
-
result = calculate_industry_neutral_score(aibom)
|
1052 |
|
1053 |
# Add validation if requested
|
1054 |
if validate:
|
@@ -1525,4 +1659,64 @@ def format_score_summary(score_result: Dict[str, Any]) -> str:
|
|
1525 |
summary += f"\nCompleteness Profile: {profile['name']}\n"
|
1526 |
summary += f"Description: {profile['description']}\n"
|
1527 |
|
1528 |
-
return summary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
"""
|
2 |
+
Mostly score calculation functions for the AI SBOM Generator.
|
3 |
"""
|
4 |
|
5 |
import json
|
|
|
9 |
import uuid
|
10 |
from typing import Dict, List, Optional, Any, Union, Tuple
|
11 |
from enum import Enum
|
12 |
+
from .field_registry_manager import (
|
13 |
+
get_field_registry_manager,
|
14 |
+
generate_field_classification,
|
15 |
+
generate_completeness_profiles,
|
16 |
+
generate_validation_messages,
|
17 |
+
get_configurable_scoring_weights,
|
18 |
+
DynamicFieldDetector # Compatibility wrapper
|
19 |
+
)
|
20 |
|
21 |
logger = logging.getLogger(__name__)
|
22 |
|
|
|
26 |
WARNING = "warning"
|
27 |
INFO = "info"
|
28 |
|
29 |
+
# Registry-driven field definitions.
# Try to derive all scoring/validation configuration from the field registry;
# on any failure, fall back to the hardcoded definitions below so the module
# remains usable without a registry. REGISTRY_AVAILABLE records which path won.
# NOTE(review): emoji in the console messages reconstructed from a mojibake'd
# source view — verify against the original file bytes.
try:
    REGISTRY_MANAGER = get_field_registry_manager()
    FIELD_CLASSIFICATION = generate_field_classification()
    COMPLETENESS_PROFILES = generate_completeness_profiles()
    VALIDATION_MESSAGES = generate_validation_messages()
    SCORING_WEIGHTS = get_configurable_scoring_weights()

    print(f"✅ Registry-driven configuration loaded: {len(FIELD_CLASSIFICATION)} fields")
    REGISTRY_AVAILABLE = True

except Exception as e:
    print(f"❌ Failed to load registry configuration: {e}")
    print("🔄 Falling back to hardcoded definitions...")
    REGISTRY_AVAILABLE = False

    # Hardcoded definitions as fallback.
    # Each entry: tier drives severity/weighting, weight is the per-field score
    # contribution, category groups fields for per-category subtotals.
    FIELD_CLASSIFICATION = {
        # Critical fields (silently aligned with SPDX mandatory fields)
        "bomFormat": {"tier": "critical", "weight": 3, "category": "required_fields"},
        "specVersion": {"tier": "critical", "weight": 3, "category": "required_fields"},
        "serialNumber": {"tier": "critical", "weight": 3, "category": "required_fields"},
        "version": {"tier": "critical", "weight": 3, "category": "required_fields"},
        "name": {"tier": "critical", "weight": 4, "category": "component_basic"},
        "downloadLocation": {"tier": "critical", "weight": 4, "category": "external_references"},
        "primaryPurpose": {"tier": "critical", "weight": 3, "category": "metadata"},
        "suppliedBy": {"tier": "critical", "weight": 4, "category": "metadata"},

        # Important fields (aligned with key SPDX optional fields)
        "type": {"tier": "important", "weight": 2, "category": "component_basic"},
        "purl": {"tier": "important", "weight": 4, "category": "component_basic"},
        "description": {"tier": "important", "weight": 4, "category": "component_basic"},
        "licenses": {"tier": "important", "weight": 4, "category": "component_basic"},
        "energyConsumption": {"tier": "important", "weight": 3, "category": "component_model_card"},
        "hyperparameter": {"tier": "important", "weight": 3, "category": "component_model_card"},
        "limitation": {"tier": "important", "weight": 3, "category": "component_model_card"},
        "safetyRiskAssessment": {"tier": "important", "weight": 3, "category": "component_model_card"},
        "typeOfModel": {"tier": "important", "weight": 3, "category": "component_model_card"},

        # Supplementary fields (aligned with remaining SPDX optional fields)
        "modelExplainability": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
        "standardCompliance": {"tier": "supplementary", "weight": 2, "category": "metadata"},
        "domain": {"tier": "supplementary", "weight": 2, "category": "metadata"},
        "energyQuantity": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
        "energyUnit": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
        "informationAboutTraining": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
        "informationAboutApplication": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
        "metric": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
        "metricDecisionThreshold": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
        "modelDataPreprocessing": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
        "autonomyType": {"tier": "supplementary", "weight": 1, "category": "metadata"},
        "useSensitivePersonalInformation": {"tier": "supplementary", "weight": 2, "category": "component_model_card"}
    }

    # Completeness profiles (silently aligned with SPDX requirements).
    # A profile is satisfied when its required_fields are present and the
    # overall score reaches minimum_score.
    COMPLETENESS_PROFILES = {
        "basic": {
            "description": "Minimal fields required for identification",
            "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
            "minimum_score": 40
        },
        "standard": {
            "description": "Comprehensive fields for proper documentation",
            "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
                               "downloadLocation", "primaryPurpose", "suppliedBy"],
            "minimum_score": 70
        },
        "advanced": {
            "description": "Extensive documentation for maximum transparency",
            "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
                               "downloadLocation", "primaryPurpose", "suppliedBy",
                               "type", "purl", "description", "licenses", "hyperparameter", "limitation",
                               "energyConsumption", "safetyRiskAssessment", "typeOfModel"],
            "minimum_score": 85
        }
    }

    # Validation messages framed as best practices (keyed by field name).
    VALIDATION_MESSAGES = {
        "name": {
            "missing": "Missing critical field: name - essential for model identification",
            "recommendation": "Add a descriptive name for the model"
        },
        "downloadLocation": {
            "missing": "Missing critical field: downloadLocation - needed for artifact retrieval",
            "recommendation": "Add information about where the model can be downloaded"
        },
        "primaryPurpose": {
            "missing": "Missing critical field: primaryPurpose - important for understanding model intent",
            "recommendation": "Add information about the primary purpose of this model"
        },
        "suppliedBy": {
            "missing": "Missing critical field: suppliedBy - needed for provenance tracking",
            "recommendation": "Add information about who supplied this model"
        },
        "energyConsumption": {
            "missing": "Missing important field: energyConsumption - helpful for environmental impact assessment",
            "recommendation": "Consider documenting energy consumption metrics for better transparency"
        },
        "hyperparameter": {
            "missing": "Missing important field: hyperparameter - valuable for reproducibility",
            "recommendation": "Document key hyperparameters used in training"
        },
        "limitation": {
            "missing": "Missing important field: limitation - important for responsible use",
            "recommendation": "Document known limitations of the model to guide appropriate usage"
        }
    }

    # Scoring weights: tier multipliers, per-category maxima (sum to 100),
    # and the algorithm configuration consumed by the scoring functions.
    SCORING_WEIGHTS = {
        "tier_weights": {"critical": 3, "important": 2, "supplementary": 1},
        "category_weights": {
            "required_fields": 20, "metadata": 20, "component_basic": 20,
            "component_model_card": 30, "external_references": 10
        },
        "algorithm_config": {"type": "weighted_sum", "max_score": 100}
    }
|
|
|
146 |
|
147 |
|
148 |
def setup_logging(level=logging.INFO):
|
|
|
240 |
Returns:
|
241 |
True if the field is present, False otherwise
|
242 |
"""
|
|
|
243 |
if field in aibom:
|
244 |
return True
|
|
|
|
|
245 |
if "metadata" in aibom:
|
246 |
metadata = aibom["metadata"]
|
247 |
if field in metadata:
|
248 |
return True
|
|
|
|
|
249 |
if "properties" in metadata:
|
250 |
for prop in metadata["properties"]:
|
251 |
+
prop_name = prop.get("name", "")
|
252 |
+
if prop_name in {field, f"spdx:{field}"}:
|
253 |
return True
|
|
|
|
|
254 |
if "components" in aibom and aibom["components"]:
|
255 |
+
component = aibom["components"][0]
|
|
|
256 |
if field in component:
|
257 |
return True
|
|
|
|
|
258 |
if "properties" in component:
|
259 |
for prop in component["properties"]:
|
260 |
+
prop_name = prop.get("name", "")
|
261 |
+
if prop_name in {field, f"spdx:{field}"}:
|
262 |
return True
|
|
|
|
|
263 |
if "modelCard" in component:
|
264 |
model_card = component["modelCard"]
|
|
|
265 |
if field in model_card:
|
266 |
return True
|
267 |
+
if "modelParameters" in model_card and field in model_card["modelParameters"]:
|
268 |
+
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
if "considerations" in model_card:
|
270 |
+
considerations = model_card["considerations"]
|
271 |
+
field_mappings = {
|
272 |
+
"limitation": ["technicalLimitations", "limitations"],
|
273 |
+
"safetyRiskAssessment": ["ethicalConsiderations", "safetyRiskAssessment"],
|
274 |
+
"energyConsumption": ["environmentalConsiderations", "energyConsumption"]
|
275 |
+
}
|
276 |
+
if field in field_mappings:
|
277 |
+
for section in field_mappings[field]:
|
278 |
+
if section in considerations and considerations[section]:
|
|
|
|
|
279 |
return True
|
280 |
+
if field in considerations:
|
281 |
+
return True
|
282 |
if field == "downloadLocation" and "externalReferences" in aibom:
|
283 |
for ref in aibom["externalReferences"]:
|
284 |
+
if ref.get("type") == "distribution" and ref.get("url"):
|
285 |
return True
|
|
|
286 |
return False
|
287 |
|
288 |
|
289 |
+
|
290 |
def determine_completeness_profile(aibom: Dict[str, Any], score: float) -> Dict[str, Any]:
|
291 |
"""
|
292 |
Determine which completeness profile the AIBOM satisfies.
|
|
|
844 |
|
845 |
return summary
|
846 |
|
847 |
+
def check_field_with_enhanced_results(aibom: Dict[str, Any], field: str, extraction_results: Optional[Dict[str, Any]] = None) -> bool:
    """
    Enhanced field detection using the consolidated field registry manager.

    Detection proceeds in three stages, returning at the first conclusive one:
      1. Dynamic detection: look the field up in the registry and probe the
         AIBOM at the field's configured jsonpath.
      2. Extraction results: honor the extractor's confidence level ('none'
         confidence or placeholder values such as NOASSERTION do not count).
      3. Fallback: the original check_field_in_aibom() structural check.

    The DynamicFieldDetector is built once and cached as a function attribute
    (``_detector``); initialization failures degrade gracefully to stage 3.

    Args:
        aibom: The AIBOM to check.
        field: The field name to check (must match the field registry).
        extraction_results: Optional enhanced extraction results with
            confidence levels, keyed by field name.

    Returns:
        True if the field is present and should count toward the score,
        False otherwise.
    """
    try:
        # Initialize dynamic field detector once and cache it (function attribute).
        if not hasattr(check_field_with_enhanced_results, '_detector'):
            # BUGFIX: the fallback branch below needs `os` and `current_dir`;
            # previously `current_dir` was only defined inside the exception
            # handler, so the non-REGISTRY_AVAILABLE branch always raised
            # NameError and fell through to the emergency fallback.
            import os
            current_dir = os.path.dirname(os.path.abspath(__file__))
            registry_path = os.path.join(current_dir, "field_registry.json")
            try:
                if REGISTRY_AVAILABLE:
                    # Use the consolidated registry manager
                    registry_manager = get_field_registry_manager()
                    check_field_with_enhanced_results._detector = DynamicFieldDetector(registry_manager)
                    print(f"✅ Dynamic field detector initialized with registry manager")
                else:
                    # Create registry manager from path
                    from field_registry_manager import FieldRegistryManager
                    registry_manager = FieldRegistryManager(registry_path)
                    check_field_with_enhanced_results._detector = DynamicFieldDetector(registry_manager)
                    print(f"✅ Dynamic field detector initialized with fallback registry manager")

            except Exception as e:
                print(f"❌ Failed to initialize dynamic field detector: {e}")
                # Final fallback: hand the detector the raw registry path.
                try:
                    check_field_with_enhanced_results._detector = DynamicFieldDetector(registry_path)
                    print(f"🔄 Dynamic field detector initialized with emergency fallback")
                except Exception as final_error:
                    print(f"❌ Complete failure to initialize dynamic field detector: {final_error}")
                    check_field_with_enhanced_results._detector = None

        detector = check_field_with_enhanced_results._detector

        if detector is None:
            print(f"⚠️ No detector available, using fallback for {field}")
            return check_field_in_aibom(aibom, field)

        # Stage 1: dynamic detection from AIBOM structure using the enhanced
        # registry format (registry['fields'][field_name]).
        field_found_in_registry = False

        fields = detector.registry.get('fields', {})
        if field in fields:
            field_found_in_registry = True
            field_config = fields[field]
            field_path = field_config.get('jsonpath', '')

            if field_path:
                # Probe the AIBOM at the registry-configured jsonpath.
                is_present, value = detector.detect_field_presence(aibom, field_path)

                if is_present:
                    print(f"✅ DYNAMIC: Found {field} = {value}")
                    return True
                else:
                    print(f"❌ DYNAMIC: Missing {field} at {field_path}")
            else:
                print(f"⚠️ Field '{field}' has no jsonpath defined in registry")

        # If the field is not in the registry, log a warning but continue.
        if not field_found_in_registry:
            print(f"⚠️ WARNING: Field '{field}' not found in field registry")

        # Stage 2: consult extraction results, respecting confidence levels.
        if extraction_results and field in extraction_results:
            extraction_result = extraction_results[field]

            if hasattr(extraction_result, 'confidence'):
                # Don't count fields with 'none' confidence (placeholders like NOASSERTION).
                if extraction_result.confidence.value == 'none':
                    print(f"❌ EXTRACTION: {field} has 'none' confidence")
                    return False
                # Count fields with medium or high confidence.
                is_confident = extraction_result.confidence.value in ['medium', 'high']
                print(f"{'✅' if is_confident else '❌'} EXTRACTION: {field} confidence = {extraction_result.confidence.value}")
                return is_confident
            elif hasattr(extraction_result, 'value'):
                # For simple extraction results, check if the value is meaningful.
                value = extraction_result.value
                if value in ['NOASSERTION', 'NOT_FOUND', None, '']:
                    print(f"❌ EXTRACTION: {field} has placeholder value: {value}")
                    return False
                print(f"✅ EXTRACTION: {field} = {value}")
                return True

        # Stage 3: fall back to the original AIBOM detection.
        print(f"🔄 FALLBACK: Using original detection for {field}")
        return check_field_in_aibom(aibom, field)

    except Exception as e:
        print(f"❌ Error in enhanced field detection for {field}: {e}")
        return check_field_in_aibom(aibom, field)
|
951 |
|
952 |
+
|
953 |
+
def calculate_industry_neutral_score(aibom: Dict[str, Any], extraction_results: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
954 |
"""
|
955 |
Calculate completeness score using industry best practices with proper normalization and penalties.
|
956 |
|
|
|
989 |
# Count total fields in this category
|
990 |
fields_by_category[category]["total"] += 1
|
991 |
|
992 |
+
# Enhanced field detection using extraction results
|
993 |
+
is_present = check_field_with_enhanced_results(aibom, field, extraction_results)
|
994 |
|
995 |
if is_present:
|
996 |
fields_by_category[category]["present"] += 1
|
|
|
1012 |
category_scores[category] = round(raw_score, 1)
|
1013 |
else:
|
1014 |
category_scores[category] = 0.0
|
1015 |
+
|
1016 |
+
# Log field extraction summary
|
1017 |
+
total_fields = sum(counts["total"] for counts in fields_by_category.values())
|
1018 |
+
total_present = sum(counts["present"] for counts in fields_by_category.values())
|
1019 |
+
|
1020 |
+
print(f"π SCORING SUMMARY:")
|
1021 |
+
print(f" Total fields evaluated: {total_fields}")
|
1022 |
+
print(f" Fields successfully extracted: {total_present}")
|
1023 |
+
print(f" Extraction success rate: {round((total_present/total_fields)*100, 1)}%")
|
1024 |
+
print(f" Category breakdown:")
|
1025 |
+
for category, counts in fields_by_category.items():
|
1026 |
+
percentage = round((counts["present"]/counts["total"])*100, 1) if counts["total"] > 0 else 0
|
1027 |
+
print(f" {category}: {counts['present']}/{counts['total']} ({percentage}%)")
|
1028 |
|
1029 |
# Calculate subtotal (sum of rounded category scores)
|
1030 |
subtotal_score = sum(category_scores.values())
|
|
|
1160 |
return result
|
1161 |
|
1162 |
|
1163 |
+
def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True, use_best_practices: bool = True, extraction_results: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
1164 |
"""
|
1165 |
Calculate completeness score for an AIBOM and optionally validate against AI requirements.
|
1166 |
Enhanced with industry best practices scoring.
|
|
|
1173 |
Returns:
|
1174 |
Dictionary containing score and validation results
|
1175 |
"""
|
1176 |
+
print(f"π DEBUG: use_best_practices={use_best_practices}")
|
1177 |
+
print(f"π DEBUG: extraction_results is None: {extraction_results is None}")
|
1178 |
+
print(f"π DEBUG: extraction_results keys: {list(extraction_results.keys()) if extraction_results else 'None'}")
|
1179 |
+
|
1180 |
+
if use_best_practices:
|
1181 |
+
print("π DEBUG: Calling calculate_industry_neutral_score")
|
1182 |
+
result = calculate_industry_neutral_score(aibom, extraction_results)
|
1183 |
# If using best practices scoring, use the enhanced industry-neutral approach
|
1184 |
if use_best_practices:
|
1185 |
+
result = calculate_industry_neutral_score(aibom, extraction_results)
|
1186 |
|
1187 |
# Add validation if requested
|
1188 |
if validate:
|
|
|
1659 |
summary += f"\nCompleteness Profile: {profile['name']}\n"
|
1660 |
summary += f"Description: {profile['description']}\n"
|
1661 |
|
1662 |
+
return summary
|
1663 |
+
|
1664 |
+
def test_consolidated_integration():
    """Smoke-test the consolidated field registry manager integration.

    Prints a diagnostic line for each subsystem (registry manager, field
    classification, profiles, validation messages, scoring weights, the
    cached dynamic field detector, and a few sample field lookups).

    Returns:
        True when every check ran without raising, False otherwise.
    """
    try:
        print("\n🧪 Testing Consolidated Integration...")

        if not REGISTRY_AVAILABLE:
            print("⚠️ Consolidated registry manager not available, using hardcoded definitions")
        else:
            print("✅ Consolidated registry manager available")

            # Registry manager itself.
            manager = get_field_registry_manager()
            print(f"✅ Registry manager initialized: {manager.registry_path}")

            # Registry-derived module-level tables.
            print(f"✅ FIELD_CLASSIFICATION loaded: {len(FIELD_CLASSIFICATION)} fields")
            print(f"✅ COMPLETENESS_PROFILES loaded: {len(COMPLETENESS_PROFILES)} profiles")
            print(f"✅ VALIDATION_MESSAGES loaded: {len(VALIDATION_MESSAGES)} messages")

            # Scoring weight tables.
            tier_weights = SCORING_WEIGHTS.get("tier_weights", {})
            category_weights = SCORING_WEIGHTS.get("category_weights", {})
            print(f"✅ SCORING_WEIGHTS loaded: {len(tier_weights)} tiers, {len(category_weights)} categories")

        # Cached dynamic field detector (DynamicFieldDetector).
        detector = getattr(check_field_with_enhanced_results, '_detector', None)
        if detector:
            print(f"✅ Dynamic field detector ready")
        else:
            print(f"⚠️ Dynamic field detector not initialized")

        # Spot-check a few representative field lookups.
        for field in ("bomFormat", "primaryPurpose", "energyConsumption"):
            info = FIELD_CLASSIFICATION.get(field)
            if info is None:
                print(f"❌ Field '{field}' not found in FIELD_CLASSIFICATION")
            else:
                print(f"✅ Field '{field}': tier={info['tier']}, category={info['category']}")

        print("🎉 Consolidated integration test completed!")
        return True

    except Exception as e:
        print(f"❌ Consolidated integration test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
1720 |
+
|
1721 |
+
# Uncomment this line to run the test automatically when utils.py is imported
|
1722 |
+
test_consolidated_integration()
|