a1c00l committed on
Commit 58ff627 · verified
1 Parent(s): c7cebcc

Upload 5 files

src/aibom-generator/enhanced_extractor.py ADDED
@@ -0,0 +1,876 @@
#!/usr/bin/env python3
"""
Registry-Integrated (field_registry.json) Enhanced Multi-Layer Data Extraction for AI SBOM Generator

This module provides a fully configurable enhanced data extraction system that
automatically picks up new fields from the JSON registry (field_registry.json)
without requiring code changes. It includes comprehensive logging, fallback
mechanisms, and confidence tracking.

Key Features:
- Automatically discovers all fields from the registry (field_registry.json)
- Attempts extraction for every registry field
- Provides detailed logging for each field attempt
- Graceful error handling for individual field failures
- Maintains backward compatibility with existing code
"""
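
# Illustrative usage sketch (not part of the committed file): the intended
# drop-in call path, mirroring extract_enhanced_metadata() defined at the
# bottom of this module. Assumes network access to the Hugging Face Hub.
#
#     from huggingface_hub import HfApi, ModelCard
#     api = HfApi()
#     info = api.model_info("deepseek-ai/DeepSeek-R1")
#     card = ModelCard.load("deepseek-ai/DeepSeek-R1")
#     metadata = extract_enhanced_metadata("deepseek-ai/DeepSeek-R1", info, card, api)
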
import json
import logging
import re
import requests
from typing import Dict, Any, Optional, List, Tuple
from enum import Enum
from dataclasses import dataclass, field
from datetime import datetime
from urllib.parse import urlparse, urljoin
import time

# Import existing dependencies
from huggingface_hub import HfApi, ModelCard, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError

# Import field registry manager (field_registry_manager.py)
try:
    from .field_registry_manager import get_field_registry_manager
    REGISTRY_AVAILABLE = True
except ImportError:
    try:
        from field_registry_manager import get_field_registry_manager
        REGISTRY_AVAILABLE = True
    except ImportError:
        REGISTRY_AVAILABLE = False
        print("⚠️ Field registry manager not available, falling back to legacy extraction")

# Configure logging for this module
logger = logging.getLogger(__name__)

class DataSource(Enum):
    """Enumeration of data sources for provenance tracking"""
    HF_API = "huggingface_api"
    MODEL_CARD = "model_card_yaml"
    README_TEXT = "readme_text"
    CONFIG_FILE = "config_file"
    REPOSITORY_FILES = "repository_files"
    EXTERNAL_REFERENCE = "external_reference"
    INTELLIGENT_DEFAULT = "intelligent_default"
    PLACEHOLDER = "placeholder"
    REGISTRY_DRIVEN = "registry_driven"

class ConfidenceLevel(Enum):
    """Confidence levels for extracted data"""
    HIGH = "high"      # Direct API data, official sources
    MEDIUM = "medium"  # Inferred from reliable patterns
    LOW = "low"        # Weak inference or pattern matching
    NONE = "none"      # Placeholder values

@dataclass
class ExtractionResult:
    """Container for extraction results with full provenance"""
    value: Any
    source: DataSource
    confidence: ConfidenceLevel
    extraction_method: str
    timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat())
    fallback_chain: List[str] = field(default_factory=list)

    def __str__(self):
        return f"{self.value} (source: {self.source.value}, confidence: {self.confidence.value})"
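
# Illustrative sketch (not part of the committed file): how a provenance record
# reads once built. The values below are hypothetical.
#
#     result = ExtractionResult(
#         value="apache-2.0",
#         source=DataSource.MODEL_CARD,
#         confidence=ConfidenceLevel.HIGH,
#         extraction_method="model_card_yaml",
#     )
#     print(result)  # apache-2.0 (source: model_card_yaml, confidence: high)
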
class EnhancedExtractor:
    """
    Registry-integrated enhanced extractor that automatically picks up new fields
    from the JSON registry (field_registry.json) without requiring code changes.
    """

    def __init__(self, hf_api: Optional[HfApi] = None, field_registry_manager=None):
        """
        Initialize the enhanced extractor with registry integration (field_registry.json and field_registry_manager.py).

        Args:
            hf_api: Optional HuggingFace API instance (will create if not provided)
            field_registry_manager: Optional registry manager instance
        """
        self.hf_api = hf_api or HfApi()
        self.extraction_results = {}

        # Initialize registry manager (field_registry_manager.py)
        self.registry_manager = field_registry_manager
        if not self.registry_manager and REGISTRY_AVAILABLE:
            try:
                self.registry_manager = get_field_registry_manager()
                logger.info("✅ Registry manager initialized successfully")
            except Exception as e:
                logger.warning(f"⚠️ Could not initialize registry manager: {e}")
                self.registry_manager = None

        # Load registry fields
        self.registry_fields = {}
        if self.registry_manager:
            try:
                registry = self.registry_manager.registry
                self.registry_fields = registry.get('fields', {})
                logger.info(f"✅ Loaded {len(self.registry_fields)} fields from registry")
            except Exception as e:
                logger.error(f"❌ Error loading registry fields: {e}")
                self.registry_fields = {}

        # Configure logging
        self._setup_logging()

        # Compile regex patterns for text extraction
        self._compile_patterns()

        logger.info(f"Enhanced extractor initialized (registry-driven: {bool(self.registry_fields)})")

    def _setup_logging(self):
        """Setup logging configuration for detailed extraction tracking"""
        # Ensure a logger that will show in HF Spaces
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            )
            handler.setFormatter(formatter)
            logger.addHandler(handler)
            logger.setLevel(logging.INFO)

    def _compile_patterns(self):
        """Compile regex patterns for text extraction"""
        self.patterns = {
            'license': [
                r'license[:\s]+([a-zA-Z0-9\-\.]+)',
                r'licensed under[:\s]+([a-zA-Z0-9\-\.]+)',
                r'released under[:\s]+([a-zA-Z0-9\-\.]+)',
            ],
            'datasets': [
                r'trained on[:\s]+([a-zA-Z0-9\-\_\/]+)',
                r'dataset[:\s]+([a-zA-Z0-9\-\_\/]+)',
                r'using[:\s]+([a-zA-Z0-9\-\_\/]+)\s+dataset',
            ],
            'metrics': [
                r'([a-zA-Z]+)[:\s]+([0-9\.]+)',
                r'achieves[:\s]+([0-9\.]+)[:\s]+([a-zA-Z]+)',
            ],
            'model_type': [
                r'model type[:\s]+([a-zA-Z0-9\-]+)',
                r'architecture[:\s]+([a-zA-Z0-9\-]+)',
            ],
            'energy': [
                r'energy[:\s]+([0-9\.]+)\s*([a-zA-Z]+)',
                r'power[:\s]+([0-9\.]+)\s*([a-zA-Z]+)',
                r'consumption[:\s]+([0-9\.]+)\s*([a-zA-Z]+)',
            ],
            'limitations': [
                r'limitation[s]?[:\s]+([^\.]+)',
                r'known issue[s]?[:\s]+([^\.]+)',
                r'constraint[s]?[:\s]+([^\.]+)',
            ],
            'safety': [
                r'safety[:\s]+([^\.]+)',
                r'risk[s]?[:\s]+([^\.]+)',
                r'bias[:\s]+([^\.]+)',
            ]
        }

        # Compile all patterns
        for category, pattern_list in self.patterns.items():
            self.patterns[category] = [re.compile(pattern, re.IGNORECASE) for pattern in pattern_list]

    def extract_metadata(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """
        Main extraction method with full registry integration.

        This method automatically discovers all fields from the registry and attempts
        to extract them without requiring code changes when new fields are added.

        Args:
            model_id: Hugging Face model identifier
            model_info: Model information from HF API
            model_card: Model card object from HF

        Returns:
            Dictionary of extracted metadata
        """
        logger.info(f"🚀 Starting registry-driven extraction for model: {model_id}")

        # Initialize extraction results tracking
        self.extraction_results = {}
        metadata = {}

        if self.registry_fields:
            # Registry-driven extraction
            logger.info(f"📋 Registry-driven mode: Attempting extraction for {len(self.registry_fields)} fields")
            metadata = self._registry_driven_extraction(model_id, model_info, model_card)
        else:
            # Fallback to legacy extraction
            logger.warning("⚠️ Registry not available, falling back to legacy extraction")
            metadata = self._legacy_extraction(model_id, model_info, model_card)

        # Log extraction summary
        self._log_extraction_summary(model_id, metadata)

        # Return metadata in the same format as original method
        return {k: v for k, v in metadata.items() if v is not None}

    def _registry_driven_extraction(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """
        Registry-driven extraction that automatically processes all registry fields.
        """
        metadata = {}

        # Prepare extraction context
        extraction_context = {
            'model_id': model_id,
            'model_info': model_info,
            'model_card': model_card,
            'readme_content': self._get_readme_content(model_card, model_id),
            'config_data': self._download_and_parse_config(model_id, "config.json"),
            'tokenizer_config': self._download_and_parse_config(model_id, "tokenizer_config.json")
        }

        # Process each field from the registry
        successful_extractions = 0
        failed_extractions = 0

        for field_name, field_config in self.registry_fields.items():
            try:
                logger.info(f"🔍 Attempting extraction for field: {field_name}")

                # Extract field using registry configuration
                extracted_value = self._extract_registry_field(field_name, field_config, extraction_context)

                if extracted_value is not None:
                    metadata[field_name] = extracted_value
                    successful_extractions += 1
                    logger.info(f"✅ Successfully extracted {field_name}: {extracted_value}")
                else:
                    failed_extractions += 1
                    logger.info(f"❌ Failed to extract {field_name}")

            except Exception as e:
                failed_extractions += 1
                logger.error(f"❌ Error extracting {field_name}: {e}")
                # Continue with other fields - individual failures don't stop the process
                continue

        logger.info(f"📊 Registry extraction complete: {successful_extractions} successful, {failed_extractions} failed")

        # Add external references
        metadata.update(self._generate_external_references(model_id, metadata))

        return metadata

    def _extract_registry_field(self, field_name: str, field_config: Dict[str, Any], context: Dict[str, Any]) -> Any:
        """
        Extract a single field based on its registry configuration.

        This method uses multiple extraction strategies in order of preference:
        1. Direct API extraction
        2. Model card YAML extraction
        3. Configuration file extraction
        4. Text pattern matching
        5. Intelligent inference
        6. Fallback values
        """
        extraction_methods = []

        # Strategy 1: Direct API extraction
        api_value = self._try_api_extraction(field_name, context)
        if api_value is not None:
            self.extraction_results[field_name] = ExtractionResult(
                value=api_value,
                source=DataSource.HF_API,
                confidence=ConfidenceLevel.HIGH,
                extraction_method="api_direct"
            )
            extraction_methods.append("api_direct")
            return api_value

        # Strategy 2: Model card YAML extraction
        yaml_value = self._try_model_card_extraction(field_name, context)
        if yaml_value is not None:
            self.extraction_results[field_name] = ExtractionResult(
                value=yaml_value,
                source=DataSource.MODEL_CARD,
                confidence=ConfidenceLevel.HIGH,
                extraction_method="model_card_yaml"
            )
            extraction_methods.append("model_card_yaml")
            return yaml_value

        # Strategy 3: Configuration file extraction
        config_value = self._try_config_extraction(field_name, context)
        if config_value is not None:
            self.extraction_results[field_name] = ExtractionResult(
                value=config_value,
                source=DataSource.CONFIG_FILE,
                confidence=ConfidenceLevel.HIGH,
                extraction_method="config_file"
            )
            extraction_methods.append("config_file")
            return config_value

        # Strategy 4: Text pattern extraction
        text_value = self._try_text_pattern_extraction(field_name, context)
        if text_value is not None:
            self.extraction_results[field_name] = ExtractionResult(
                value=text_value,
                source=DataSource.README_TEXT,
                confidence=ConfidenceLevel.MEDIUM,
                extraction_method="text_pattern"
            )
            extraction_methods.append("text_pattern")
            return text_value

        # Strategy 5: Intelligent inference
        inferred_value = self._try_intelligent_inference(field_name, context)
        if inferred_value is not None:
            self.extraction_results[field_name] = ExtractionResult(
                value=inferred_value,
                source=DataSource.INTELLIGENT_DEFAULT,
                confidence=ConfidenceLevel.MEDIUM,
                extraction_method="intelligent_inference"
            )
            extraction_methods.append("intelligent_inference")
            return inferred_value

        # Strategy 6: Fallback value (if configured)
        fallback_value = self._try_fallback_value(field_name, field_config)
        if fallback_value is not None:
            self.extraction_results[field_name] = ExtractionResult(
                value=fallback_value,
                source=DataSource.PLACEHOLDER,
                confidence=ConfidenceLevel.NONE,
                extraction_method="fallback_placeholder",
                fallback_chain=extraction_methods
            )
            return fallback_value

        # No extraction successful
        self.extraction_results[field_name] = ExtractionResult(
            value=None,
            source=DataSource.PLACEHOLDER,
            confidence=ConfidenceLevel.NONE,
            extraction_method="extraction_failed",
            fallback_chain=extraction_methods
        )
        return None

    def _try_api_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
        """Try to extract field from HuggingFace API data"""
        model_info = context.get('model_info')
        if not model_info:
            return None

        # Field mapping for API extraction
        api_mappings = {
            'author': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
            'name': lambda info: getattr(info, 'modelId', context['model_id']).split('/')[-1],
            'tags': lambda info: getattr(info, 'tags', []),
            'pipeline_tag': lambda info: getattr(info, 'pipeline_tag', None),
            'downloads': lambda info: getattr(info, 'downloads', 0),
            'commit': lambda info: getattr(info, 'sha', '')[:7] if getattr(info, 'sha', None) else None,
            'suppliedBy': lambda info: getattr(info, 'author', None) or context['model_id'].split('/')[0],
            'primaryPurpose': lambda info: getattr(info, 'pipeline_tag', 'text-generation'),
            'downloadLocation': lambda info: f"https://huggingface.co/{context['model_id']}/tree/main"
        }

        if field_name in api_mappings:
            try:
                return api_mappings[field_name](model_info)
            except Exception as e:
                logger.debug(f"API extraction failed for {field_name}: {e}")
                return None

        return None

    def _try_model_card_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
        """Try to extract field from model card YAML frontmatter"""
        model_card = context.get('model_card')
        if not model_card or not hasattr(model_card, 'data') or not model_card.data:
            return None

        try:
            card_data = model_card.data.to_dict() if hasattr(model_card.data, 'to_dict') else {}

            # Field mapping for model card extraction
            card_mappings = {
                'license': 'license',
                'language': 'language',
                'library_name': 'library_name',
                'base_model': 'base_model',
                'datasets': 'datasets',
                'description': ['model_summary', 'description'],
                'typeOfModel': 'model_type',
                'licenses': 'license'  # Alternative mapping
            }

            if field_name in card_mappings:
                mapping = card_mappings[field_name]
                if isinstance(mapping, list):
                    # Try multiple keys
                    for key in mapping:
                        value = card_data.get(key)
                        if value:
                            return value
                else:
                    # Single key
                    return card_data.get(mapping)

            # Direct field name lookup
            return card_data.get(field_name)

        except Exception as e:
            logger.debug(f"Model card extraction failed for {field_name}: {e}")
            return None

    def _try_config_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
        """Try to extract field from configuration files"""
        config_data = context.get('config_data')
        tokenizer_config = context.get('tokenizer_config')

        # Config file mappings
        config_mappings = {
            'model_type': ('config_data', 'model_type'),
            'architectures': ('config_data', 'architectures'),
            'vocab_size': ('config_data', 'vocab_size'),
            'tokenizer_class': ('tokenizer_config', 'tokenizer_class'),
            'typeOfModel': ('config_data', 'model_type')
        }

        if field_name in config_mappings:
            config_type, config_key = config_mappings[field_name]
            config_source = context.get(config_type)
            if config_source:
                return config_source.get(config_key)

        return None

    def _try_text_pattern_extraction(self, field_name: str, context: Dict[str, Any]) -> Any:
        """Try to extract field using text pattern matching"""
        readme_content = context.get('readme_content')
        if not readme_content:
            return None

        # Pattern mappings for different fields
        pattern_mappings = {
            'license': 'license',
            'datasets': 'datasets',
            'energyConsumption': 'energy',
            'limitation': 'limitations',
            'safetyRiskAssessment': 'safety',
            'model_type': 'model_type'
        }

        if field_name in pattern_mappings:
            pattern_key = pattern_mappings[field_name]
            if pattern_key in self.patterns:
                matches = self._find_pattern_matches(readme_content, self.patterns[pattern_key])
                if matches:
                    return matches[0] if len(matches) == 1 else matches

        return None

    def _try_intelligent_inference(self, field_name: str, context: Dict[str, Any]) -> Any:
        """Try to infer field value from other available data"""
        model_id = context['model_id']

        # Intelligent inference rules
        inference_rules = {
            'author': lambda: model_id.split('/')[0] if '/' in model_id else 'unknown',
            'suppliedBy': lambda: model_id.split('/')[0] if '/' in model_id else 'unknown',
            'name': lambda: model_id.split('/')[-1],
            'primaryPurpose': lambda: 'text-generation',  # Default for most HF models
            'typeOfModel': lambda: 'transformer',  # Default for most HF models
            'downloadLocation': lambda: f"https://huggingface.co/{model_id}/tree/main",
            'bomFormat': lambda: 'CycloneDX',
            'specVersion': lambda: '1.6',
            'serialNumber': lambda: f"urn:uuid:{model_id.replace('/', '-')}",
            'version': lambda: '1.0.0'
        }

        if field_name in inference_rules:
            try:
                return inference_rules[field_name]()
            except Exception as e:
                logger.debug(f"Intelligent inference failed for {field_name}: {e}")
                return None

        return None

    def _try_fallback_value(self, field_name: str, field_config: Dict[str, Any]) -> Any:
        """Try to get fallback value from field configuration"""
        # Check if field config has fallback value
        if isinstance(field_config, dict):
            fallback = field_config.get('fallback_value')
            if fallback:
                return fallback

        # Standard fallback values for common fields
        standard_fallbacks = {
            'license': 'NOASSERTION',
            'description': 'No description available',
            'version': '1.0.0',
            'bomFormat': 'CycloneDX',
            'specVersion': '1.6'
        }

        return standard_fallbacks.get(field_name)
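
    # Illustrative sketch (not part of the committed file): a hypothetical
    # registry entry exercising the optional 'fallback_value' key read above.
    # The field_registry.json entries shipped in this commit do not set it.
    #
    #     field_config = {"tier": "important", "fallback_value": "NOASSERTION"}
    #     extractor._try_fallback_value("license", field_config)  # -> "NOASSERTION"
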
    def _legacy_extraction(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """
        Fallback to legacy extraction when registry is not available.
        This maintains backward compatibility.
        """
        logger.info("🔄 Executing legacy extraction mode")
        metadata = {}

        # Execute legacy extraction layers
        metadata.update(self._layer1_structured_api(model_id, model_info, model_card))
        metadata.update(self._layer2_repository_files(model_id))
        metadata.update(self._layer3_stp_extraction(model_card, model_id))
        metadata.update(self._layer4_external_references(model_id, metadata))
        metadata.update(self._layer5_intelligent_defaults(model_id, metadata))

        return metadata

    def _generate_external_references(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Generate external references for the model"""
        external_refs = []

        # Model repository
        repo_url = f"https://huggingface.co/{model_id}"
        external_refs.append({
            "type": "website",
            "url": repo_url,
            "comment": "Model repository"
        })

        # Model files
        files_url = f"https://huggingface.co/{model_id}/tree/main"
        external_refs.append({
            "type": "distribution",
            "url": files_url,
            "comment": "Model files"
        })

        # Commit URL if available
        if 'commit' in metadata:
            commit_url = f"https://huggingface.co/{model_id}/commit/{metadata['commit']}"
            external_refs.append({
                "type": "vcs",
                "url": commit_url,
                "comment": "Specific commit"
            })

        # Dataset references
        if 'datasets' in metadata:
            datasets = metadata['datasets']
            if isinstance(datasets, list):
                for dataset in datasets:
                    if isinstance(dataset, str):
                        dataset_url = f"https://huggingface.co/datasets/{dataset}"
                        external_refs.append({
                            "type": "distribution",
                            "url": dataset_url,
                            "comment": f"Training dataset: {dataset}"
                        })

        result = {'external_references': external_refs}

        self.extraction_results['external_references'] = ExtractionResult(
            value=external_refs,
            source=DataSource.EXTERNAL_REFERENCE,
            confidence=ConfidenceLevel.HIGH,
            extraction_method="url_generation"
        )

        return result

    # Legacy methods for backward compatibility
    def _layer1_structured_api(self, model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard]) -> Dict[str, Any]:
        """Legacy Layer 1: Enhanced structured data extraction from HF API and model card."""
        logger.info("📊 Executing Legacy Layer 1: Enhanced Structured API Extraction")
        metadata = {}

        # Enhanced model info extraction
        if model_info:
            try:
                # Extract author with fallback logic
                author = getattr(model_info, "author", None)
                if not author or author.strip() == "":
                    parts = model_id.split("/")
                    author = parts[0] if len(parts) > 1 else "unknown"

                metadata['author'] = author
                metadata['name'] = getattr(model_info, "modelId", model_id).split("/")[-1]
                metadata['tags'] = getattr(model_info, "tags", [])
                metadata['pipeline_tag'] = getattr(model_info, "pipeline_tag", None)
                metadata['downloads'] = getattr(model_info, "downloads", 0)

                # Commit information
                commit_sha = getattr(model_info, "sha", None)
                if commit_sha:
                    metadata['commit'] = commit_sha[:7]

            except Exception as e:
                logger.error(f"❌ Legacy Layer 1: Error extracting from model_info: {e}")

        # Enhanced model card extraction
        if model_card and hasattr(model_card, "data") and model_card.data:
            try:
                card_data = model_card.data.to_dict() if hasattr(model_card.data, "to_dict") else {}

                metadata['license'] = card_data.get("license")
                metadata['language'] = card_data.get("language")
                metadata['library_name'] = card_data.get("library_name")
                metadata['base_model'] = card_data.get("base_model")
                metadata['datasets'] = card_data.get("datasets")
                metadata['description'] = card_data.get("model_summary") or card_data.get("description")

            except Exception as e:
                logger.error(f"❌ Legacy Layer 1: Error extracting from model card: {e}")

        # Add standard AI metadata
        metadata["primaryPurpose"] = metadata.get("pipeline_tag", "text-generation")
        metadata["suppliedBy"] = metadata.get("author", "unknown")
        metadata["typeOfModel"] = "transformer"

        return metadata

    def _layer2_repository_files(self, model_id: str) -> Dict[str, Any]:
        """Legacy Layer 2: Repository file analysis"""
        logger.info("🔧 Executing Legacy Layer 2: Repository File Analysis")
        metadata = {}

        try:
            config_data = self._download_and_parse_config(model_id, "config.json")
            if config_data:
                metadata['model_type'] = config_data.get("model_type")
                metadata['architectures'] = config_data.get("architectures", [])
                metadata['vocab_size'] = config_data.get("vocab_size")

            tokenizer_config = self._download_and_parse_config(model_id, "tokenizer_config.json")
            if tokenizer_config:
                metadata['tokenizer_class'] = tokenizer_config.get("tokenizer_class")

        except Exception as e:
            logger.warning(f"⚠️ Legacy Layer 2: Could not analyze repository files: {e}")

        return metadata

    def _layer3_stp_extraction(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
        """Legacy Layer 3: Smart Text Parsing"""
        logger.info("🔍 Executing Legacy Layer 3: Smart Text Parsing")
        metadata = {}

        try:
            readme_content = self._get_readme_content(model_card, model_id)
            if readme_content:
                extracted_info = self._extract_from_text(readme_content)
                metadata.update(extracted_info)
        except Exception as e:
            logger.warning(f"⚠️ Legacy Layer 3: Error in Smart Text Parsing: {e}")

        return metadata

    def _layer4_external_references(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Legacy Layer 4: External reference generation"""
        logger.info("🔗 Executing Legacy Layer 4: External Reference Generation")
        return self._generate_external_references(model_id, metadata)

    def _layer5_intelligent_defaults(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Legacy Layer 5: Intelligent default generation"""
        logger.info("🧠 Executing Legacy Layer 5: Intelligent Default Generation")

        if 'author' not in metadata or not metadata['author']:
            parts = model_id.split("/")
            metadata['author'] = parts[0] if len(parts) > 1 else "unknown"

        if 'license' not in metadata or not metadata['license']:
            metadata['license'] = "NOASSERTION"

        return metadata

    # Utility methods
    def _download_and_parse_config(self, model_id: str, filename: str) -> Optional[Dict[str, Any]]:
        """Download and parse a configuration file from the model repository"""
        try:
            file_path = hf_hub_download(repo_id=model_id, filename=filename)
            with open(file_path, 'r') as f:
                return json.load(f)
        except (RepositoryNotFoundError, EntryNotFoundError, json.JSONDecodeError) as e:
            logger.debug(f"Could not download/parse {filename}: {e}")
            return None
        except Exception as e:
            logger.warning(f"Unexpected error downloading {filename}: {e}")
            return None

    def _get_readme_content(self, model_card: Optional[ModelCard], model_id: str) -> Optional[str]:
        """Get README content from model card or by downloading"""
        try:
            if model_card and hasattr(model_card, 'content'):
                return model_card.content

            readme_path = hf_hub_download(repo_id=model_id, filename="README.md")
            with open(readme_path, 'r', encoding='utf-8') as f:
                return f.read()

        except Exception as e:
            logger.debug(f"Could not get README content: {e}")
            return None

    def _extract_from_text(self, text: str) -> Dict[str, Any]:
        """Extract structured information from unstructured text"""
        metadata = {}

        # Extract license information
        license_matches = self._find_pattern_matches(text, self.patterns['license'])
        if license_matches:
            metadata['license_from_text'] = license_matches[0]

        # Extract dataset information
        dataset_matches = self._find_pattern_matches(text, self.patterns['datasets'])
        if dataset_matches:
            metadata['datasets_from_text'] = dataset_matches

        # Extract performance metrics
        metric_matches = self._extract_metrics(text)
        if metric_matches:
            metadata['performance_metrics'] = metric_matches

        return metadata

    def _find_pattern_matches(self, text: str, patterns: List[re.Pattern]) -> List[str]:
        """Find matches for a list of regex patterns in text"""
        matches = []
        for pattern in patterns:
            found = pattern.findall(text)
            matches.extend(found)
        return list(set(matches))  # Remove duplicates

    def _extract_metrics(self, text: str) -> Dict[str, float]:
        """Extract performance metrics from text"""
        metrics = {}

        metric_patterns = [
            r'accuracy[:\s]+([0-9\.]+)',
            r'f1[:\s]+([0-9\.]+)',
            r'bleu[:\s]+([0-9\.]+)',
            r'rouge[:\s]+([0-9\.]+)',
        ]

        for pattern_str in metric_patterns:
            pattern = re.compile(pattern_str, re.IGNORECASE)
            matches = pattern.findall(text)
            if matches:
                metric_name = pattern_str.split('[')[0]
                try:
                    metrics[metric_name] = float(matches[0])
                except ValueError:
                    continue

        return metrics
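
    # Illustrative sketch (not part of the committed file): expected behavior
    # of _extract_metrics on a small sample, per the patterns above.
    #
    #     extractor._extract_metrics("Reaches accuracy: 0.92 and f1: 0.88 on dev.")
    #     # -> {'accuracy': 0.92, 'f1': 0.88}
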
    def _log_extraction_summary(self, model_id: str, metadata: Dict[str, Any]):
        """Log comprehensive extraction summary"""
        logger.info("=" * 60)
        logger.info(f"📋 REGISTRY-DRIVEN EXTRACTION SUMMARY FOR: {model_id}")
        logger.info("=" * 60)

        if self.registry_fields:
            logger.info(f"📊 Registry fields available: {len(self.registry_fields)}")
            logger.info(f"📊 Total fields extracted: {len(self.extraction_results)}")

            # Count fields by confidence level
            confidence_counts = {}
            source_counts = {}

            for field_name, result in self.extraction_results.items():
                conf = result.confidence.value
                source = result.source.value
                confidence_counts[conf] = confidence_counts.get(conf, 0) + 1
                source_counts[source] = source_counts.get(source, 0) + 1

            logger.info("📈 Confidence distribution:")
            for conf, count in confidence_counts.items():
                logger.info(f"  {conf}: {count} fields")

            logger.info("🔍 Source distribution:")
            for source, count in source_counts.items():
                logger.info(f"  {source}: {count} fields")

            # Log registry field coverage
            extracted_fields = set(self.extraction_results.keys())
            registry_field_names = set(self.registry_fields.keys())
            coverage = len(extracted_fields & registry_field_names) / len(registry_field_names) * 100
            logger.info(f"📊 Registry field coverage: {coverage:.1f}%")

            # Log missing registry fields
            missing_fields = registry_field_names - extracted_fields
            if missing_fields:
                logger.info(f"❌ Missing registry fields: {', '.join(sorted(missing_fields))}")
        else:
            logger.info(f"📊 Legacy extraction mode: {len(metadata)} fields extracted")

        logger.info("=" * 60)

    def get_extraction_results(self) -> Dict[str, ExtractionResult]:
        """Get detailed extraction results with provenance"""
        return self.extraction_results.copy()


# Convenience function for drop-in replacement
def extract_enhanced_metadata(model_id: str, model_info: Dict[str, Any], model_card: Optional[ModelCard], hf_api: Optional[HfApi] = None) -> Dict[str, Any]:
    """
    Drop-in replacement function for _extract_structured_metadata with registry integration.

    This function automatically picks up new fields from the registry without code changes.

    Args:
        model_id: Hugging Face model identifier
        model_info: Model information from HF API
        model_card: Model card object from HF
        hf_api: Optional HuggingFace API instance

    Returns:
        Dictionary of extracted metadata
    """
    extractor = EnhancedExtractor(hf_api)
    return extractor.extract_metadata(model_id, model_info, model_card)


if __name__ == "__main__":
    # Test the registry-integrated enhanced extractor
    import sys

    if len(sys.argv) > 1:
        test_model_id = sys.argv[1]
    else:
        test_model_id = "deepseek-ai/DeepSeek-R1"

    print(f"Testing registry-integrated enhanced extractor with model: {test_model_id}")

    # Initialize HF API
    hf_api = HfApi()

    try:
        # Fetch model info and card
        model_info = hf_api.model_info(test_model_id)
        model_card = ModelCard.load(test_model_id)

        # Test extraction
        extractor = EnhancedExtractor(hf_api)
        metadata = extractor.extract_metadata(test_model_id, model_info, model_card)

        print(f"\nExtracted {len(metadata)} metadata fields:")
        for key, value in metadata.items():
            print(f"  {key}: {value}")

        print("\nExtraction results with provenance:")
        for field_name, result in extractor.get_extraction_results().items():
            print(f"  {field_name}: {result}")

    except Exception as e:
        print(f"Error testing extractor: {e}")
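
The extractor above is driven entirely by the registry file that follows: EnhancedExtractor iterates registry["fields"], so adding an entry to the JSON is enough for the next run to attempt it. A minimal sketch of that relationship (illustrative, not part of this commit; it assumes field_registry.json is readable from the working directory, whereas the module loads it through field_registry_manager):

import json

# Load the registry the way the extractor ultimately consumes it: a top-level
# "fields" object mapping field names to their tier, weights, and generation rules.
with open("field_registry.json") as f:
    registry = json.load(f)

fields = registry.get("fields", {})
critical = sorted(name for name, cfg in fields.items() if cfg.get("tier") == "critical")
print(f"{len(fields)} registry fields; critical: {', '.join(critical)}")
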
src/aibom-generator/field_registry.json ADDED
@@ -0,0 +1,737 @@
{
  "registry_metadata": {
    "description": "Field registry for configurable AI SBOM generation and scoring"
  },
  "scoring_config": {
    "tier_weights": {
      "critical": 3,
      "important": 2,
      "supplementary": 1
    },
    "category_weights": {
      "required_fields": 20,
      "metadata": 20,
      "component_basic": 20,
      "component_model_card": 30,
      "external_references": 10
    },
    "scoring_profiles": {
      "basic": {
        "description": "Minimal fields required for identification",
        "required_categories": ["required_fields", "component_basic"],
        "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
        "minimum_score": 40,
        "weight_multiplier": 1.0
      },
      "standard": {
        "description": "Comprehensive fields for proper documentation",
        "required_categories": ["required_fields", "metadata", "component_basic"],
        "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name", "downloadLocation", "primaryPurpose", "suppliedBy"],
        "minimum_score": 70,
        "weight_multiplier": 1.0
      },
      "advanced": {
        "description": "Extensive documentation for maximum transparency",
        "required_categories": ["required_fields", "metadata", "component_basic", "component_model_card", "external_references"],
        "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name", "downloadLocation", "primaryPurpose", "suppliedBy", "type", "purl", "description", "licenses", "hyperparameter", "limitation", "energyConsumption", "safetyRiskAssessment", "typeOfModel"],
        "minimum_score": 85,
        "weight_multiplier": 1.0
      }
    },
    "algorithm_config": {
      "type": "weighted_sum",
      "max_score": 100,
      "normalization": "category_based",
      "penalty_for_missing_critical": 0.5,
      "bonus_for_complete_categories": 0.1
    }
  },
  "aibom_config": {
    "structure_template": "cyclonedx_1.6",
    "generator_info": {
      "name": "aetheris-aibom-generator",
      "version": "1.0",
      "manufacturer": "Aetheris AI"
    },
    "generation_rules": {
      "include_metadata_properties": true,
      "include_model_card": true,
      "include_external_references": true,
      "include_dependencies": true
    },
    "validation_rules": {
      "require_critical_fields": true,
      "validate_jsonpath_expressions": true,
      "enforce_cyclonedx_schema": true
    }
  },
  "fields": {
    "bomFormat": {
      "tier": "critical",
      "weight": 4.0,
      "category": "required_fields",
      "description": "Format identifier for the SBOM",
      "jsonpath": "$.bomFormat",
      "aibom_generation": {
        "location": "$.bomFormat",
        "rule": "always_include",
        "source_fields": ["bomFormat"],
        "validation": "required",
        "data_type": "string"
      },
      "scoring": {
        "points": 4.0,
        "required_for_profiles": ["basic", "standard", "advanced"],
        "category_contribution": 0.2
      },
      "validation_message": {
        "missing": "Missing critical field: bomFormat - essential for SBOM identification",
        "recommendation": "Ensure bomFormat is set to 'CycloneDX'"
      }
    },
    "specVersion": {
      "tier": "critical",
      "weight": 4.0,
      "category": "required_fields",
      "description": "CycloneDX specification version",
      "jsonpath": "$.specVersion",
      "aibom_generation": {
        "location": "$.specVersion",
        "rule": "always_include",
        "source_fields": ["specVersion"],
        "validation": "required",
        "data_type": "string"
      },
      "scoring": {
        "points": 4.0,
        "required_for_profiles": ["basic", "standard", "advanced"],
        "category_contribution": 0.2
      },
      "validation_message": {
        "missing": "Missing critical field: specVersion - required for CycloneDX compliance",
        "recommendation": "Set specVersion to '1.6' for CycloneDX 1.6 compliance"
      }
    },
    "serialNumber": {
      "tier": "critical",
      "weight": 4.0,
      "category": "required_fields",
      "description": "Unique identifier for this SBOM instance",
      "jsonpath": "$.serialNumber",
      "aibom_generation": {
        "location": "$.serialNumber",
        "rule": "always_include",
        "source_fields": ["serialNumber"],
        "validation": "required",
        "data_type": "string"
      },
      "scoring": {
        "points": 4.0,
        "required_for_profiles": ["basic", "standard", "advanced"],
        "category_contribution": 0.2
      },
      "validation_message": {
        "missing": "Missing critical field: serialNumber - unique identifier required",
        "recommendation": "Generate a UUID for the SBOM instance"
      }
    },
    "version": {
      "tier": "critical",
      "weight": 4.0,
      "category": "required_fields",
      "description": "Version of this SBOM document",
      "jsonpath": "$.version",
      "aibom_generation": {
        "location": "$.version",
        "rule": "always_include",
        "source_fields": ["version"],
        "validation": "required",
        "data_type": "integer"
      },
      "scoring": {
        "points": 4.0,
        "required_for_profiles": ["basic", "standard", "advanced"],
        "category_contribution": 0.2
      },
      "validation_message": {
        "missing": "Missing critical field: version - document version required",
        "recommendation": "Set version to 1 for initial SBOM generation"
      }
    },
    "primaryPurpose": {
      "tier": "critical",
      "weight": 4.0,
      "category": "metadata",
      "description": "Primary purpose or task of the AI model",
      "jsonpath": "$.metadata.properties[?(@.name=='primaryPurpose')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["primaryPurpose", "pipeline_tag", "ai:task"],
        "validation": "recommended",
        "data_type": "string"
      },
      "scoring": {
        "points": 4.0,
        "required_for_profiles": ["standard", "advanced"],
        "category_contribution": 0.2
      },
      "validation_message": {
        "missing": "Missing critical field: primaryPurpose - essential for understanding model intent",
        "recommendation": "Add the primary task or purpose of the AI model"
      }
    },
    "suppliedBy": {
      "tier": "critical",
      "weight": 4.0,
      "category": "metadata",
      "description": "Organization or individual that supplied the model",
      "jsonpath": "$.metadata.properties[?(@.name=='suppliedBy')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["suppliedBy", "author", "publisher"],
        "validation": "recommended",
        "data_type": "string"
      },
      "scoring": {
        "points": 4.0,
        "required_for_profiles": ["standard", "advanced"],
        "category_contribution": 0.2
      },
      "validation_message": {
        "missing": "Missing critical field: suppliedBy - supplier identification required",
        "recommendation": "Add the organization or individual who provided the model"
      }
    },
    "standardCompliance": {
      "tier": "supplementary",
      "weight": 1.0,
      "category": "metadata",
      "description": "Standards or regulations the model complies with",
      "jsonpath": "$.metadata.properties[?(@.name=='standardCompliance')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["standardCompliance", "compliance"],
        "validation": "optional",
        "data_type": "string"
      },
      "scoring": {
        "points": 1.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.05
      },
      "validation_message": {
        "missing": "Missing supplementary field: standardCompliance - compliance information helpful",
        "recommendation": "Add any relevant standards or regulations the model complies with"
      }
    },
    "domain": {
      "tier": "supplementary",
      "weight": 1.0,
      "category": "metadata",
      "description": "Domain or field of application",
      "jsonpath": "$.metadata.properties[?(@.name=='domain')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["domain", "field", "application_area"],
        "validation": "optional",
        "data_type": "string"
      },
      "scoring": {
        "points": 1.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.05
      },
      "validation_message": {
        "missing": "Missing supplementary field: domain - application domain helpful for context",
        "recommendation": "Add the domain or field where this model is typically applied"
      }
    },
    "autonomyType": {
      "tier": "supplementary",
      "weight": 1.0,
      "category": "metadata",
      "description": "Level of autonomy or human involvement required",
      "jsonpath": "$.metadata.properties[?(@.name=='autonomyType')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["autonomyType", "autonomy_level"],
        "validation": "optional",
        "data_type": "string"
      },
      "scoring": {
        "points": 1.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.05
      },
      "validation_message": {
        "missing": "Missing supplementary field: autonomyType - autonomy level information helpful",
        "recommendation": "Add information about the level of human oversight required"
      }
    },
    "name": {
      "tier": "critical",
      "weight": 4.0,
      "category": "component_basic",
      "description": "Name of the AI model component",
      "jsonpath": "$.components[0].name",
      "aibom_generation": {
        "location": "$.components[0].name",
        "rule": "always_include",
        "source_fields": ["name", "model_name"],
        "validation": "required",
        "data_type": "string"
      },
      "scoring": {
        "points": 4.0,
        "required_for_profiles": ["basic", "standard", "advanced"],
        "category_contribution": 0.2
      },
      "validation_message": {
        "missing": "Missing critical field: name - essential for model identification",
        "recommendation": "Add a descriptive name for the model"
      }
    },
    "type": {
      "tier": "important",
      "weight": 3.0,
      "category": "component_basic",
      "description": "Type of component (machine-learning-model)",
      "jsonpath": "$.components[0].type",
      "aibom_generation": {
        "location": "$.components[0].type",
        "rule": "always_include",
        "source_fields": ["type"],
        "validation": "required",
        "data_type": "string"
      },
      "scoring": {
        "points": 3.0,
        "required_for_profiles": ["standard", "advanced"],
        "category_contribution": 0.15
      },
      "validation_message": {
        "missing": "Missing important field: type - component type classification needed",
        "recommendation": "Set type to 'machine-learning-model' for AI models"
      }
    },
    "purl": {
      "tier": "important",
      "weight": 3.0,
      "category": "component_basic",
      "description": "Package URL identifier",
      "jsonpath": "$.components[0].purl",
      "aibom_generation": {
        "location": "$.components[0].purl",
        "rule": "include_if_available",
        "source_fields": ["purl", "package_url"],
        "validation": "recommended",
        "data_type": "string"
      },
      "scoring": {
        "points": 3.0,
        "required_for_profiles": ["standard", "advanced"],
        "category_contribution": 0.15
      },
      "validation_message": {
        "missing": "Missing important field: purl - package URL for identification",
        "recommendation": "Add a Package URL (PURL) for the model"
      }
    },
    "description": {
      "tier": "important",
      "weight": 3.0,
      "category": "component_basic",
      "description": "Description of the AI model",
      "jsonpath": "$.components[0].description",
      "aibom_generation": {
        "location": "$.components[0].description",
        "rule": "include_if_available",
        "source_fields": ["description", "summary"],
        "validation": "recommended",
        "data_type": "string"
      },
      "scoring": {
        "points": 3.0,
        "required_for_profiles": ["standard", "advanced"],
        "category_contribution": 0.15
      },
      "validation_message": {
        "missing": "Missing important field: description - model description helpful for understanding",
        "recommendation": "Add a clear description of what the model does"
      }
    },
    "licenses": {
      "tier": "important",
      "weight": 3.0,
      "category": "component_basic",
      "description": "License information for the model",
      "jsonpath": "$.components[0].licenses",
      "aibom_generation": {
        "location": "$.components[0].licenses",
        "rule": "include_if_available",
        "source_fields": ["licenses", "license"],
        "validation": "recommended",
        "data_type": "array"
      },
      "scoring": {
        "points": 3.0,
        "required_for_profiles": ["standard", "advanced"],
        "category_contribution": 0.15
      },
      "validation_message": {
        "missing": "Missing important field: licenses - license information important for compliance",
        "recommendation": "Add license information for the model"
      }
    },
    "energyConsumption": {
      "tier": "important",
      "weight": 2.0,
      "category": "component_model_card",
      "description": "Energy consumption information",
      "jsonpath": "$.metadata.properties[?(@.name=='energyConsumption')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["energyConsumption", "energy_usage"],
        "validation": "optional",
        "data_type": "string"
      },
      "scoring": {
        "points": 2.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.067
      },
      "validation_message": {
        "missing": "Missing important field: energyConsumption - energy usage information helpful for sustainability",
        "recommendation": "Add information about the model's energy consumption"
      }
    },
    "hyperparameter": {
      "tier": "important",
      "weight": 2.0,
      "category": "component_model_card",
      "description": "Key hyperparameters used in training",
      "jsonpath": "$.metadata.properties[?(@.name=='hyperparameter')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["hyperparameter", "hyperparameters", "training_params"],
        "validation": "optional",
        "data_type": "string"
      },
      "scoring": {
        "points": 2.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.067
      },
      "validation_message": {
        "missing": "Missing important field: hyperparameter - training configuration helpful for reproducibility",
        "recommendation": "Add key hyperparameters used during model training"
      }
    },
    "limitation": {
      "tier": "important",
      "weight": 2.0,
      "category": "component_model_card",
      "description": "Known limitations of the model",
      "jsonpath": "$.metadata.properties[?(@.name=='limitation')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["limitation", "limitations", "known_issues"],
        "validation": "optional",
        "data_type": "string"
      },
      "scoring": {
        "points": 2.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.067
      },
      "validation_message": {
        "missing": "Missing important field: limitation - known limitations important for responsible use",
        "recommendation": "Add information about known limitations or constraints"
      }
    },
    "safetyRiskAssessment": {
      "tier": "important",
      "weight": 2.0,
      "category": "component_model_card",
      "description": "Safety and risk assessment information",
      "jsonpath": "$.metadata.properties[?(@.name=='safetyRiskAssessment')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["safetyRiskAssessment", "safety_assessment", "risk_analysis"],
        "validation": "optional",
        "data_type": "string"
      },
      "scoring": {
        "points": 2.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.067
      },
      "validation_message": {
        "missing": "Missing important field: safetyRiskAssessment - safety assessment important for responsible deployment",
        "recommendation": "Add safety and risk assessment information"
      }
    },
    "typeOfModel": {
      "tier": "important",
      "weight": 2.0,
      "category": "component_model_card",
      "description": "Type or architecture of the model",
      "jsonpath": "$.metadata.properties[?(@.name=='typeOfModel')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["typeOfModel", "model_type", "architecture"],
        "validation": "recommended",
        "data_type": "string"
      },
      "scoring": {
        "points": 2.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.067
      },
      "validation_message": {
        "missing": "Missing important field: typeOfModel - model architecture information helpful",
        "recommendation": "Add the type or architecture of the model (e.g., Transformer, CNN)"
      }
    },
    "modelExplainability": {
      "tier": "supplementary",
      "weight": 1.0,
      "category": "component_model_card",
      "description": "Information about model explainability",
      "jsonpath": "$.metadata.properties[?(@.name=='modelExplainability')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["modelExplainability", "explainability", "interpretability"],
        "validation": "optional",
        "data_type": "string"
      },
      "scoring": {
        "points": 1.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.033
      },
      "validation_message": {
        "missing": "Missing supplementary field: modelExplainability - explainability information helpful for transparency",
        "recommendation": "Add information about model explainability or interpretability features"
      }
    },
    "energyQuantity": {
      "tier": "supplementary",
      "weight": 1.0,
      "category": "component_model_card",
      "description": "Quantitative energy consumption data",
      "jsonpath": "$.metadata.properties[?(@.name=='energyQuantity')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["energyQuantity", "energy_amount"],
        "validation": "optional",
        "data_type": "number"
      },
      "scoring": {
        "points": 1.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.033
      },
      "validation_message": {
        "missing": "Missing supplementary field: energyQuantity - quantitative energy data helpful for sustainability metrics",
        "recommendation": "Add specific energy consumption quantities"
      }
    },
    "energyUnit": {
      "tier": "supplementary",
      "weight": 1.0,
      "category": "component_model_card",
      "description": "Unit of measurement for energy consumption",
      "jsonpath": "$.metadata.properties[?(@.name=='energyUnit')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["energyUnit", "energy_unit"],
        "validation": "optional",
        "data_type": "string"
      },
      "scoring": {
        "points": 1.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.033
      },
      "validation_message": {
        "missing": "Missing supplementary field: energyUnit - energy measurement unit helpful for standardization",
        "recommendation": "Add the unit of measurement for energy consumption (e.g., kWh, Joules)"
      }
    },
    "informationAboutTraining": {
      "tier": "supplementary",
      "weight": 1.0,
      "category": "component_model_card",
      "description": "Information about the training process",
      "jsonpath": "$.metadata.properties[?(@.name=='informationAboutTraining')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["informationAboutTraining", "training_info", "training_details"],
        "validation": "optional",
        "data_type": "string"
      },
      "scoring": {
        "points": 1.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.033
      },
      "validation_message": {
        "missing": "Missing supplementary field: informationAboutTraining - training details helpful for understanding model development",
        "recommendation": "Add information about the training process and methodology"
      }
    },
    "informationAboutApplication": {
      "tier": "supplementary",
      "weight": 1.0,
      "category": "component_model_card",
      "description": "Information about intended applications",
      "jsonpath": "$.metadata.properties[?(@.name=='informationAboutApplication')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["informationAboutApplication", "application_info", "intended_use"],
        "validation": "optional",
        "data_type": "string"
      },
      "scoring": {
        "points": 1.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.033
      },
      "validation_message": {
        "missing": "Missing supplementary field: informationAboutApplication - application guidance helpful for proper usage",
        "recommendation": "Add information about intended applications and use cases"
      }
    },
    "metric": {
      "tier": "supplementary",
      "weight": 1.0,
      "category": "component_model_card",
      "description": "Performance metrics and evaluation results",
      "jsonpath": "$.metadata.properties[?(@.name=='metric')].value",
      "aibom_generation": {
        "location": "$.metadata.properties",
        "rule": "include_if_available",
        "source_fields": ["metric", "metrics", "performance"],
        "validation": "optional",
        "data_type": "string"
      },
      "scoring": {
        "points": 1.0,
        "required_for_profiles": ["advanced"],
        "category_contribution": 0.033
      },
      "validation_message": {
        "missing": "Missing supplementary field: metric - performance metrics helpful for evaluation",
        "recommendation": "Add performance metrics and evaluation results"
      }
    },
    "metricDecisionThreshold": {
      "tier": "supplementary",
646
+ "weight": 1.0,
647
+ "category": "component_model_card",
648
+ "description": "Decision thresholds for metrics",
649
+ "jsonpath": "$.metadata.properties[?(@.name=='metricDecisionThreshold')].value",
650
+ "aibom_generation": {
651
+ "location": "$.metadata.properties",
652
+ "rule": "include_if_available",
653
+ "source_fields": ["metricDecisionThreshold", "decision_threshold", "threshold"],
654
+ "validation": "optional",
655
+ "data_type": "number"
656
+ },
657
+ "scoring": {
658
+ "points": 1.0,
659
+ "required_for_profiles": ["advanced"],
660
+ "category_contribution": 0.033
661
+ },
662
+ "validation_message": {
663
+ "missing": "Missing supplementary field: metricDecisionThreshold - decision thresholds helpful for operational guidance",
664
+ "recommendation": "Add decision thresholds for performance metrics"
665
+ }
666
+ },
667
+ "modelDataPreprocessing": {
668
+ "tier": "supplementary",
669
+ "weight": 1.0,
670
+ "category": "component_model_card",
671
+ "description": "Data preprocessing information",
672
+ "jsonpath": "$.metadata.properties[?(@.name=='modelDataPreprocessing')].value",
673
+ "aibom_generation": {
674
+ "location": "$.metadata.properties",
675
+ "rule": "include_if_available",
676
+ "source_fields": ["modelDataPreprocessing", "data_preprocessing", "preprocessing"],
677
+ "validation": "optional",
678
+ "data_type": "string"
679
+ },
680
+ "scoring": {
681
+ "points": 1.0,
682
+ "required_for_profiles": ["advanced"],
683
+ "category_contribution": 0.033
684
+ },
685
+ "validation_message": {
686
+ "missing": "Missing supplementary field: modelDataPreprocessing - preprocessing details helpful for reproducibility",
687
+ "recommendation": "Add information about data preprocessing steps"
688
+ }
689
+ },
690
+ "useSensitivePersonalInformation": {
691
+ "tier": "supplementary",
692
+ "weight": 1.0,
693
+ "category": "component_model_card",
694
+ "description": "Information about use of sensitive personal data",
695
+ "jsonpath": "$.metadata.properties[?(@.name=='useSensitivePersonalInformation')].value",
696
+ "aibom_generation": {
697
+ "location": "$.metadata.properties",
698
+ "rule": "include_if_available",
699
+ "source_fields": ["useSensitivePersonalInformation", "sensitive_data", "personal_data"],
700
+ "validation": "optional",
701
+ "data_type": "boolean"
702
+ },
703
+ "scoring": {
704
+ "points": 1.0,
705
+ "required_for_profiles": ["advanced"],
706
+ "category_contribution": 0.033
707
+ },
708
+ "validation_message": {
709
+ "missing": "Missing supplementary field: useSensitivePersonalInformation - privacy information important for compliance",
710
+ "recommendation": "Add information about use of sensitive or personal data"
711
+ }
712
+ },
713
+ "downloadLocation": {
714
+ "tier": "critical",
715
+ "weight": 4.0,
716
+ "category": "external_references",
717
+ "description": "Location where the model can be downloaded",
718
+ "jsonpath": "$.externalReferences[0].url",
719
+ "aibom_generation": {
720
+ "location": "$.externalReferences",
721
+ "rule": "include_if_available",
722
+ "source_fields": ["downloadLocation", "download_url", "repository_url"],
723
+ "validation": "recommended",
724
+ "data_type": "string"
725
+ },
726
+ "scoring": {
727
+ "points": 4.0,
728
+ "required_for_profiles": ["standard", "advanced"],
729
+ "category_contribution": 1.0
730
+ },
731
+ "validation_message": {
732
+ "missing": "Missing critical field: downloadLocation - download location essential for model access",
733
+ "recommendation": "Add the URL where the model can be downloaded or accessed"
734
+ }
735
+ }
736
+ }
737
+ }
src/aibom-generator/field_registry_manager.py ADDED
@@ -0,0 +1,648 @@
+"""
+Field Registry Manager for AI SBOM Generator
+Combines registry loading, configuration generation, and field detection functionality
+"""
+
+import json
+import os
+import re
+from typing import Dict, Any, Optional, List, Tuple
+from functools import lru_cache
+
+class FieldRegistryManager:
+    """
+    Field registry manager that handles:
+    1. Registry loading and validation
+    2. Configuration generation for utils.py compatibility
+    3. Field detection and JSONPath parsing
+    4. AIBOM completeness analysis
+    5. Scoring calculations
+    """
+
+    def __init__(self, registry_path: Optional[str] = None):
+        """
+        Initialize the field registry manager
+
+        Args:
+            registry_path: Path to the field registry JSON file. If None, auto-detects.
+        """
+        if registry_path is None:
+            # Auto-detect registry path relative to this file
+            current_dir = os.path.dirname(os.path.abspath(__file__))
+            registry_path = os.path.join(current_dir, "field_registry.json")
+
+        self.registry_path = registry_path
+        self.registry = self._load_registry()
+
+        # Cache for performance
+        self._field_classification = None
+        self._completeness_profiles = None
+        self._validation_messages = None
+        self._scoring_weights = None
+
+    def _load_registry(self) -> Dict[str, Any]:
+        """Load the complete field registry from JSON file"""
+        try:
+            with open(self.registry_path, 'r', encoding='utf-8') as f:
+                registry = json.load(f)
+
+            # Validate basic structure
+            required_sections = ["fields"]
+            missing_sections = [section for section in required_sections if section not in registry]
+
+            if missing_sections:
+                raise ValueError(f"Registry missing required sections: {missing_sections}")
+
+            # Validate fields structure
+            fields = registry.get('fields', {})
+            if not fields:
+                raise ValueError("Registry 'fields' section is empty")
+
+            print(f"βœ… Field registry loaded: {len(fields)} fields from {self.registry_path}")
+            return registry
+
+        except FileNotFoundError:
+            raise FileNotFoundError(f"Field registry not found at: {self.registry_path}")
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in field registry: {e}")
+        except Exception as e:
+            raise Exception(f"Failed to load field registry: {e}")
+
+    # =============================================================================
+    # CONFIGURATION GENERATION
+    # =============================================================================
+
+    @lru_cache(maxsize=1)
+    def get_scoring_config(self) -> Dict[str, Any]:
+        """Get scoring configuration from registry"""
+        return self.registry.get('scoring_config', {})
+
+    @lru_cache(maxsize=1)
+    def get_aibom_config(self) -> Dict[str, Any]:
+        """Get AIBOM generation configuration from registry"""
+        return self.registry.get('aibom_config', {})
+
+    @lru_cache(maxsize=1)
+    def get_field_definitions(self) -> Dict[str, Any]:
+        """Get all field definitions from registry"""
+        return self.registry.get('fields', {})
+
+    def generate_field_classification(self) -> Dict[str, Any]:
+        """
+        Generate FIELD_CLASSIFICATION dictionary from registry
+        """
+        if self._field_classification is not None:
+            return self._field_classification
+
+        fields = self.get_field_definitions()
+        classification = {}
+
+        for field_name, field_config in fields.items():
+            classification[field_name] = {
+                "tier": field_config.get("tier", "supplementary"),
+                "weight": field_config.get("weight", 1),
+                "category": field_config.get("category", "unknown")
+            }
+
+        self._field_classification = classification
+        return classification
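+    # Illustrative sketch (not part of the original commit): for a registry entry
+    # such as "typeOfModel" in field_registry.json, the generated classification
+    # entry would look like:
+    #
+    #   {"typeOfModel": {"tier": "important", "weight": 2.0,
+    #                    "category": "component_model_card"}}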
+
+    def generate_completeness_profiles(self) -> Dict[str, Any]:
+        """
+        Generate COMPLETENESS_PROFILES dictionary from registry
+        """
+        if self._completeness_profiles is not None:
+            return self._completeness_profiles
+
+        scoring_config = self.get_scoring_config()
+        profiles = scoring_config.get("scoring_profiles", {})
+
+        # Convert to utils.py format
+        completeness_profiles = {}
+        for profile_name, profile_config in profiles.items():
+            completeness_profiles[profile_name] = {
+                "description": profile_config.get("description", f"{profile_name.title()} completeness profile"),
+                "required_fields": profile_config.get("required_fields", []),
+                "minimum_score": profile_config.get("minimum_score", 50)
+            }
+
+        # Fallback profiles if none defined in registry
+        if not completeness_profiles:
+            completeness_profiles = {
+                "basic": {
+                    "description": "Minimal fields required for identification",
+                    "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
+                    "minimum_score": 40
+                },
+                "standard": {
+                    "description": "Comprehensive fields for proper documentation",
+                    "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
+                                        "downloadLocation", "primaryPurpose", "suppliedBy"],
+                    "minimum_score": 70
+                },
+                "advanced": {
+                    "description": "Extensive documentation for maximum transparency",
+                    "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
+                                        "downloadLocation", "primaryPurpose", "suppliedBy",
+                                        "type", "purl", "description", "licenses", "hyperparameter", "limitation",
+                                        "energyConsumption", "safetyRiskAssessment", "typeOfModel"],
+                    "minimum_score": 85
+                }
+            }
+
+        self._completeness_profiles = completeness_profiles
+        return completeness_profiles
+
+    def generate_validation_messages(self) -> Dict[str, Any]:
+        """
+        Generate VALIDATION_MESSAGES dictionary from registry
+        """
+        if self._validation_messages is not None:
+            return self._validation_messages
+
+        fields = self.get_field_definitions()
+        validation_messages = {}
+
+        for field_name, field_config in fields.items():
+            validation_msg = field_config.get("validation_message", {})
+            if validation_msg:
+                validation_messages[field_name] = {
+                    "missing": validation_msg.get("missing", f"Missing field: {field_name}"),
+                    "recommendation": validation_msg.get("recommendation", f"Consider adding {field_name} field")
+                }
+
+        self._validation_messages = validation_messages
+        return validation_messages
+
+    def get_configurable_scoring_weights(self) -> Dict[str, Any]:
+        """Get configurable scoring weights from registry"""
+        if self._scoring_weights is not None:
+            return self._scoring_weights
+
+        scoring_config = self.get_scoring_config()
+
+        weights = {
+            "tier_weights": scoring_config.get("tier_weights", {
+                "critical": 3,
+                "important": 2,
+                "supplementary": 1
+            }),
+            "category_weights": scoring_config.get("category_weights", {
+                "required_fields": 20,
+                "metadata": 20,
+                "component_basic": 20,
+                "component_model_card": 30,
+                "external_references": 10
+            }),
+            "algorithm_config": scoring_config.get("algorithm_config", {
+                "type": "weighted_sum",
+                "max_score": 100,
+                "normalization": "category_based"
+            })
+        }
+
+        self._scoring_weights = weights
+        return weights
+
+    # =============================================================================
+    # FIELD DETECTION
+    # =============================================================================
+
+    def _get_nested_value(self, data: dict, path: str) -> Tuple[bool, Any]:
+        """
+        Get value from nested dictionary using dot notation and array filters
+        Supports paths like: $.components[0].name, $.metadata.properties[?(@.name=='primaryPurpose')].value
+        """
+        try:
+            # Remove leading $. if present
+            if path.startswith('$.'):
+                path = path[2:]
+
+            # Handle special JSONPath-like syntax for property arrays
+            if '[?(@.name==' in path:
+                return self._handle_property_array_path(data, path)
+
+            # Split path and traverse
+            parts = self._split_path(path)
+            current = data
+
+            for part in parts:
+                if '[' in part and ']' in part:
+                    # Handle array access like components[0]
+                    key, index_str = part.split('[')
+                    index = int(index_str.rstrip(']'))
+
+                    if key and key in current:
+                        current = current[key]
+
+                    if isinstance(current, list) and 0 <= index < len(current):
+                        current = current[index]
+                    else:
+                        return False, None
+                else:
+                    # Regular key access
+                    if isinstance(current, dict) and part in current:
+                        current = current[part]
+                    else:
+                        return False, None
+
+            # Check if value is meaningful
+            if current is not None and current != "" and current != []:
+                return True, current
+
+            return False, None
+
+        except Exception as e:
+            print(f"Error getting value at path {path}: {e}")
+            return False, None
+
+    def _handle_property_array_path(self, data: dict, path: str) -> Tuple[bool, Any]:
+        """
+        Handle JSONPath-like syntax for property arrays
+        Example: metadata.properties[?(@.name=='primaryPurpose')].value
+        """
+        try:
+            # Extract the base path, property name, and final key
+            match = re.match(r'(.+)\.properties\[\?\(@\.name==\'(.+)\'\)\]\.(.+)', path)
+            if not match:
+                return False, None
+
+            base_path, prop_name, final_key = match.groups()
+
+            # Get to the properties array
+            base_found, base_value = self._get_nested_value(data, base_path + '.properties')
+            if not base_found or not isinstance(base_value, list):
+                return False, None
+
+            # Find the property with matching name
+            for prop in base_value:
+                if isinstance(prop, dict) and prop.get('name') == prop_name:
+                    if final_key in prop:
+                        value = prop[final_key]
+                        if value is not None and value != "" and value != []:
+                            return True, value
+
+            return False, None
+
+        except Exception as e:
+            print(f"Error handling property array path {path}: {e}")
+            return False, None
+
+    def _split_path(self, path: str) -> List[str]:
+        """Split path into parts, handling array notation"""
+        parts = []
+        current_part = ""
+        in_brackets = False
+
+        for char in path:
+            if char == '[':
+                in_brackets = True
+                current_part += char
+            elif char == ']':
+                in_brackets = False
+                current_part += char
+            elif char == '.' and not in_brackets:
+                if current_part:
+                    parts.append(current_part)
+                    current_part = ""
+            else:
+                current_part += char
+
+        if current_part:
+            parts.append(current_part)
+
+        return parts
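+    # Illustrative sketch (not part of the original commit): the helpers above
+    # split a path on dots that fall outside brackets, so for example
+    #   _split_path("components[0].name")           -> ["components[0]", "name"]
+    #   _split_path("metadata.properties[3].value") -> ["metadata", "properties[3]", "value"]
+    # while _handle_property_array_path resolves the JSONPath-like filter form
+    #   "metadata.properties[?(@.name=='primaryPurpose')].value"
+    # by scanning the properties array for an entry whose "name" matches.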
+
+    def detect_field_presence(self, aibom: dict, field_path: str) -> Tuple[bool, Any]:
+        """
+        Detect if a field exists at the given path in the AIBOM
+        Returns: (field_exists, field_value)
+        """
+        return self._get_nested_value(aibom, field_path)
+
+    def analyze_aibom_completeness(self, aibom: dict) -> Dict[str, Any]:
+        """
+        Analyze AIBOM completeness against the enhanced field registry
+        Compatible with enhanced registry structure: registry['fields'][field_name]
+        """
+        results = {
+            'category_scores': {},
+            'total_score': 0,
+            'field_details': {},
+            'summary': {}
+        }
+
+        # Get fields from enhanced registry structure
+        fields = self.get_field_definitions()
+        if not fields:
+            print("❌ No fields found in registry")
+            return results
+
+        # Get scoring configuration
+        scoring_weights = self.get_configurable_scoring_weights()
+        category_weights = scoring_weights.get('category_weights', {})
+
+        # Group fields by category
+        categories = {}
+        for field_name, field_config in fields.items():
+            category = field_config.get('category', 'unknown')
+            if category not in categories:
+                categories[category] = []
+            categories[category].append((field_name, field_config))
+
+        print(f"πŸ” Analyzing {len(fields)} fields across {len(categories)} categories")
+
+        total_weighted_score = 0
+
+        for category_name, category_fields in categories.items():
+            category_weight = category_weights.get(category_name, 20)
+
+            present_fields = 0
+            total_fields = len(category_fields)
+            field_details = {}
+
+            print(f"\nπŸ“‚ Category: {category_name} (weight: {category_weight})")
+
+            for field_name, field_config in category_fields:
+                field_path = field_config.get('jsonpath', '')
+                tier = field_config.get('tier', 'supplementary')
+                weight = field_config.get('weight', 1)
+
+                if not field_path:
+                    print(f"⚠️ Field {field_name} has no jsonpath defined")
+                    field_details[field_name] = {
+                        'present': False,
+                        'value': None,
+                        'path': field_path,
+                        'tier': tier,
+                        'weight': weight,
+                        'error': 'No jsonpath defined'
+                    }
+                    continue
+
+                is_present, value = self.detect_field_presence(aibom, field_path)
+
+                field_details[field_name] = {
+                    'present': is_present,
+                    'value': value,
+                    'path': field_path,
+                    'tier': tier,
+                    'weight': weight
+                }
+
+                if is_present:
+                    present_fields += 1
+                    print(f"βœ… FOUND: {field_name} = {value} (tier: {tier}, weight: {weight})")
+                else:
+                    print(f"❌ MISSING: {field_name} at {field_path} (tier: {tier})")
+
+            # Calculate category score
+            category_percentage = (present_fields / total_fields) * 100 if total_fields > 0 else 0
+            category_score = (category_percentage / 100) * category_weight
+
+            results['category_scores'][category_name] = category_score
+            results['field_details'][category_name] = field_details
+            results['summary'][category_name] = {
+                'present': present_fields,
+                'total': total_fields,
+                'percentage': category_percentage,
+                'weight': category_weight
+            }
+
+            total_weighted_score += category_score
+
+            print(f"πŸ“Š {category_name}: {present_fields}/{total_fields} ({category_percentage:.1f}%) Γ— {category_weight} = {category_score:.1f} pts")
+
+        results['total_score'] = total_weighted_score
+
+        print(f"\n🎯 TOTAL SCORE: {total_weighted_score:.1f}")
+
+        return results
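+    # Illustrative arithmetic (not part of the original commit): with the default
+    # category weights above, a category with 3 of 4 fields present contributes
+    #   (3 / 4) * 100 = 75%  ->  0.75 * category_weight
+    # e.g. 0.75 * 30 = 22.5 points for component_model_card; total_score is the
+    # sum of these weighted category scores across all categories.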
+
+    # =============================================================================
+    # UTILITY METHODS
+    # =============================================================================
+
+    def get_field_info(self, field_name: str) -> Optional[Dict[str, Any]]:
+        """Get complete information for a specific field"""
+        fields = self.get_field_definitions()
+        return fields.get(field_name)
+
+    def get_field_jsonpath(self, field_name: str) -> Optional[str]:
+        """Get JSONPath expression for a specific field"""
+        field_info = self.get_field_info(field_name)
+        return field_info.get("jsonpath") if field_info else None
+
+    def get_fields_by_category(self, category: str) -> List[str]:
+        """Get all field names in a specific category"""
+        fields = self.get_field_definitions()
+        return [
+            field_name for field_name, field_config in fields.items()
+            if field_config.get("category") == category
+        ]
+
+    def get_fields_by_tier(self, tier: str) -> List[str]:
+        """Get all field names in a specific tier"""
+        fields = self.get_field_definitions()
+        return [
+            field_name for field_name, field_config in fields.items()
+            if field_config.get("tier") == tier
+        ]
+
+    def validate_registry_integrity(self) -> Dict[str, Any]:
+        """Validate the integrity of the loaded registry"""
+        validation_results = {
+            "valid": True,
+            "errors": [],
+            "warnings": [],
+            "field_count": 0,
+            "category_distribution": {},
+            "tier_distribution": {}
+        }
+
+        try:
+            fields = self.get_field_definitions()
+            validation_results["field_count"] = len(fields)
+
+            # Check category and tier distribution
+            categories = {}
+            tiers = {}
+
+            for field_name, field_config in fields.items():
+                # Check required field properties
+                required_props = ["tier", "weight", "category", "jsonpath"]
+                missing_props = [prop for prop in required_props if prop not in field_config]
+
+                if missing_props:
+                    validation_results["errors"].append(
+                        f"Field '{field_name}' missing properties: {missing_props}"
+                    )
+                    validation_results["valid"] = False
+
+                # Count categories and tiers
+                category = field_config.get("category", "unknown")
+                tier = field_config.get("tier", "unknown")
+
+                categories[category] = categories.get(category, 0) + 1
+                tiers[tier] = tiers.get(tier, 0) + 1
+
+            validation_results["category_distribution"] = categories
+            validation_results["tier_distribution"] = tiers
+
+            # Check scoring configuration
+            scoring_config = self.get_scoring_config()
+            if not scoring_config.get("tier_weights"):
+                validation_results["warnings"].append("Missing tier_weights in scoring_config")
+
+            if not scoring_config.get("category_weights"):
+                validation_results["warnings"].append("Missing category_weights in scoring_config")
+
+        except Exception as e:
+            validation_results["valid"] = False
+            validation_results["errors"].append(f"Registry validation error: {e}")
+
+        return validation_results
+
+
+# =============================================================================
+# GLOBAL INSTANCE AND CONVENIENCE FUNCTIONS
+# =============================================================================
+
+# Global registry manager instance (initialized lazily on first use)
+_registry_manager = None
+
+def get_field_registry_manager() -> FieldRegistryManager:
+    """Get the global field registry manager instance (singleton pattern)"""
+    global _registry_manager
+    if _registry_manager is None:
+        _registry_manager = FieldRegistryManager()
+    return _registry_manager
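+# Illustrative usage sketch (not part of the original commit); `aibom` stands in
+# for any CycloneDX document dict:
+#
+#   manager = get_field_registry_manager()           # loads field_registry.json once
+#   info = manager.get_field_info("typeOfModel")     # full registry entry
+#   found, value = manager.detect_field_presence(aibom, info["jsonpath"])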
+
+# Convenience functions for backward compatibility with existing code
+
+def load_field_registry() -> Dict[str, Any]:
+    """Load the complete field registry (convenience function)"""
+    manager = get_field_registry_manager()
+    return manager.registry
+
+def generate_field_classification() -> Dict[str, Any]:
+    """Generate FIELD_CLASSIFICATION from registry (convenience function)"""
+    manager = get_field_registry_manager()
+    return manager.generate_field_classification()
+
+def generate_completeness_profiles() -> Dict[str, Any]:
+    """Generate COMPLETENESS_PROFILES from registry (convenience function)"""
+    manager = get_field_registry_manager()
+    return manager.generate_completeness_profiles()
+
+def generate_validation_messages() -> Dict[str, Any]:
+    """Generate VALIDATION_MESSAGES from registry (convenience function)"""
+    manager = get_field_registry_manager()
+    return manager.generate_validation_messages()
+
+def get_configurable_scoring_weights() -> Dict[str, Any]:
+    """Get configurable scoring weights from registry"""
+    manager = get_field_registry_manager()
+    return manager.get_configurable_scoring_weights()
+
+# For compatibility with old DynamicFieldDetector usage
+class DynamicFieldDetector:
+    """Compatibility wrapper for old DynamicFieldDetector usage"""
+
+    def __init__(self, registry_path: str):
+        """Initialize with field registry manager"""
+        self.manager = FieldRegistryManager(registry_path)
+        self.registry = self.manager.registry
+
+    def detect_field_presence(self, aibom: dict, field_path: str) -> Tuple[bool, Any]:
+        """Detect field presence using the manager"""
+        return self.manager.detect_field_presence(aibom, field_path)
+
+    def analyze_aibom_completeness(self, aibom: dict) -> Dict[str, Any]:
+        """Analyze AIBOM completeness using the manager"""
+        return self.manager.analyze_aibom_completeness(aibom)
+
+# Validation function for testing
+def validate_registry_setup() -> bool:
+    """Validate that the registry is properly set up and accessible"""
+    try:
+        manager = get_field_registry_manager()
+        validation_results = manager.validate_registry_integrity()
+
+        if validation_results["valid"]:
+            print("βœ… Registry validation successful")
+            print(f"   Fields loaded: {validation_results['field_count']}")
+            print(f"   Categories: {list(validation_results['category_distribution'].keys())}")
+            print(f"   Tiers: {list(validation_results['tier_distribution'].keys())}")
+            return True
+        else:
+            print("❌ Registry validation failed")
+            for error in validation_results["errors"]:
+                print(f"   Error: {error}")
+            return False
+
+    except Exception as e:
+        print(f"❌ Registry setup validation failed: {e}")
+        return False
+
+def test_field_registry_manager():
+    """
+    Temporary (or, later, optional) smoke test.
+
+    It validates the field registry manager after refactoring, such as replacing
+    old files or methods for field detection and score calculation, and is handy
+    as a debugging tool.
+    """
+    try:
+        print("πŸ§ͺ Testing Consolidated Field Registry Manager...")
+
+        # Test manager initialization
+        manager = get_field_registry_manager()
+        print(f"βœ… Manager initialized with registry: {manager.registry_path}")
+
+        # Test configuration generation
+        field_classification = manager.generate_field_classification()
+        print(f"βœ… Generated FIELD_CLASSIFICATION with {len(field_classification)} fields")
+
+        completeness_profiles = manager.generate_completeness_profiles()
+        print(f"βœ… Generated COMPLETENESS_PROFILES with {len(completeness_profiles)} profiles")
+
+        validation_messages = manager.generate_validation_messages()
+        print(f"βœ… Generated VALIDATION_MESSAGES with {len(validation_messages)} messages")
+
+        scoring_weights = manager.get_configurable_scoring_weights()
+        print(f"βœ… Generated SCORING_WEIGHTS with {len(scoring_weights)} sections")
+
+        # Test field detection capabilities
+        test_fields = ['bomFormat', 'primaryPurpose', 'energyConsumption']
+        for field_name in test_fields:
+            field_info = manager.get_field_info(field_name)
+            if field_info:
+                jsonpath = field_info.get('jsonpath', 'N/A')
+                category = field_info.get('category', 'N/A')
+                tier = field_info.get('tier', 'N/A')
+                print(f"βœ… Field '{field_name}': {jsonpath} (category: {category}, tier: {tier})")
+            else:
+                print(f"❌ Field '{field_name}' not found in registry")
+
+        # Test registry validation
+        validation_results = manager.validate_registry_integrity()
+        if validation_results["valid"]:
+            print("βœ… Registry integrity validation passed")
+        else:
+            print("⚠️ Registry integrity validation issues found")
+            for error in validation_results["errors"]:
+                print(f"   Error: {error}")
+
+        print("πŸŽ‰ Consolidated field registry manager test completed successfully!")
+        return True
+
+    except Exception as e:
+        print(f"❌ Field registry manager test failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+if __name__ == "__main__":
+    # Test the consolidated manager when run directly
+    test_field_registry_manager()
+
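+# Running this module directly executes the self-test above, e.g.:
+#
+#   python src/aibom-generator/field_registry_manager.py
+#
+# validate_registry_setup() can likewise be called from other modules as a
+# quick boolean health check of the registry.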
src/aibom-generator/generator.py CHANGED
@@ -1,13 +1,30 @@
 import json
 import uuid
 import datetime
 from typing import Dict, Optional, Any, List
 
-
 from huggingface_hub import HfApi, ModelCard
 from urllib.parse import urlparse
 from .utils import calculate_completeness_score
 
 
 class AIBOMGenerator:
     def __init__(
@@ -16,7 +33,7 @@ class AIBOMGenerator:
         inference_model_url: Optional[str] = None,
         use_inference: bool = True,
         cache_dir: Optional[str] = None,
-        use_best_practices: bool = True,  # Added parameter for industry-neutral scoring
     ):
         self.hf_api = HfApi(token=hf_token)
         self.inference_model_url = inference_model_url
@@ -24,13 +41,48 @@ class AIBOMGenerator:
         self.cache_dir = cache_dir
         self.enhancement_report = None  # Store enhancement report as instance variable
         self.use_best_practices = use_best_practices  # Store best practices flag
 
     def generate_aibom(
         self,
         model_id: str,
         output_file: Optional[str] = None,
         include_inference: Optional[bool] = None,
-        use_best_practices: Optional[bool] = None,  # Added parameter for industry-neutral scoring
     ) -> Dict[str, Any]:
         try:
             model_id = self._normalise_model_id(model_id)
@@ -43,12 +95,59 @@ class AIBOMGenerator:
 
             # Store original metadata before any AI enhancement
             original_metadata = self._extract_structured_metadata(model_id, model_info, model_card)
 
             # Create initial AIBOM with original metadata
             original_aibom = self._create_aibom_structure(model_id, original_metadata)
 
             # Calculate initial score with industry-neutral approach if enabled
-            original_score = calculate_completeness_score(original_aibom, validate=True, use_best_practices=use_best_practices)
 
             # Final metadata starts with original metadata
             final_metadata = original_metadata.copy() if original_metadata else {}
@@ -74,12 +173,19 @@ class AIBOMGenerator:
             except Exception as e:
                 print(f"Error during AI enhancement: {e}")
                 # Continue with original metadata if enhancement fails
-
             # Create final AIBOM with potentially enhanced metadata
             aibom = self._create_aibom_structure(model_id, final_metadata)
 
-            # Calculate final score with industry-neutral approach if enabled
-            final_score = calculate_completeness_score(aibom, validate=True, use_best_practices=use_best_practices)
 
 
             if output_file:
@@ -98,8 +204,8 @@ class AIBOMGenerator:
             # Return only the AIBOM to maintain compatibility with existing code
             return aibom
         except Exception as e:
-            print(f"Error generating AIBOM: {e}")
-            # Return a minimal valid AIBOM structure in case of error
             return self._create_minimal_aibom(model_id)
 
     def _create_minimal_aibom(self, model_id: str) -> Dict[str, Any]:
@@ -156,7 +262,7 @@ class AIBOMGenerator:
             print(f"Error fetching model info for {model_id}: {e}")
             return {}
 
-    # ---- new helper ---------------------------------------------------------
    @staticmethod
     def _normalise_model_id(raw_id: str) -> str:
         """
@@ -171,7 +277,7 @@ class AIBOMGenerator:
                 return "/".join(parts[:2])
             return path
         return raw_id
-    # -------------------------------------------------------------------------
 
     def _fetch_model_card(self, model_id: str) -> Optional[ModelCard]:
         try:
@@ -185,6 +291,12 @@ class AIBOMGenerator:
         model_id: str,
         metadata: Dict[str, Any],
     ) -> Dict[str, Any]:
         # Extract owner and model name from model_id
         parts = model_id.split("/")
         group = parts[0] if len(parts) > 1 else ""
@@ -192,6 +304,9 @@ class AIBOMGenerator:
 
         # Get version from metadata or use default
         version = metadata.get("commit", "1.0")
 
         aibom = {
             "bomFormat": "CycloneDX",
@@ -206,7 +321,10 @@ class AIBOMGenerator:
                     "dependsOn": [f"pkg:huggingface/{model_id.replace('/', '/')}@{version}"]
                 }
             ]
-        }
 
         # ALWAYS add root-level external references
         aibom["externalReferences"] = [{
@@ -220,6 +338,7 @@ class AIBOMGenerator:
                 "url": metadata["commit_url"]
             })
 
         return aibom
 
     def _extract_structured_metadata(
@@ -228,6 +347,48 @@ class AIBOMGenerator:
         model_info: Dict[str, Any],
         model_card: Optional[ModelCard],
     ) -> Dict[str, Any]:
         metadata = {}
 
         if model_info:
@@ -248,7 +409,7 @@ class AIBOMGenerator:
                     "downloads": getattr(model_info, "downloads", 0),
                     "last_modified": getattr(model_info, "lastModified", None),
                     "commit": getattr(model_info, "sha", None)[:7] if getattr(model_info, "sha", None) else None,
-                    "commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if getattr(model_info, "sha", None) else None,
                 })
             except Exception as e:
                 print(f"Error extracting model info metadata: {e}")
@@ -290,6 +451,7 @@ class AIBOMGenerator:
         print(f"DEBUG: Adding suppliedBy = {metadata.get('suppliedBy')}")
 
         return {k: v for k, v in metadata.items() if v is not None}
 
 
     def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
@@ -301,6 +463,9 @@ class AIBOMGenerator:
 
 
     def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
         timestamp = datetime.datetime.utcnow().isoformat() + "Z"
 
         # Get version from metadata or use default
@@ -358,24 +523,43 @@ class AIBOMGenerator:
 
         # ALWAYS add critical fields for scoring
         critical_fields = {
-            "primaryPurpose": metadata.get("primaryPurpose", metadata.get("ai:task", "text-generation")),
-            "suppliedBy": metadata.get("suppliedBy", metadata.get("author", "unknown")),
-            "typeOfModel": metadata.get("ai:type", "transformer")
         }
-
-        # Add critical fields first
         for key, value in critical_fields.items():
-            if value and value != "unknown":
-                properties.append({"name": key, "value": str(value)})
 
-        # Add other metadata fields (excluding basic component fields)
-        excluded_fields = ["name", "author", "license", "description", "commit", "primaryPurpose", "suppliedBy", "typeOfModel"]
        for key, value in metadata.items():
-            if key not in excluded_fields and value is not None:
                if isinstance(value, (list, dict)):
-                    if not isinstance(value, str):
                        value = json.dumps(value)
                properties.append({"name": key, "value": str(value)})
 
         # Assemble metadata section
         metadata_section = {
@@ -388,6 +572,9 @@ class AIBOMGenerator:
         return metadata_section
 
     def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
         # Extract owner and model name from model_id
         parts = model_id.split("/")
         group = parts[0] if len(parts) > 1 else ""
@@ -412,7 +599,7 @@ class AIBOMGenerator:
             "purl": purl
         }
 
-        # ALWAYS add licenses (use default if not available)
         if metadata and "license" in metadata and metadata["license"]:
             component["licenses"] = [{
                 "license": {
@@ -420,14 +607,48 @@ class AIBOMGenerator:
                     "url": self._get_license_url(metadata["license"])
                 }
             }]
         else:
-            # Add default license structure for consistency
             component["licenses"] = [{
                 "license": {
-                    "id": "unknown",
                     "url": "https://spdx.org/licenses/"
                 }
             }]
         # Debug
         print(f"DEBUG: License in metadata: {'license' in metadata}")
         if "license" in metadata:
@@ -435,6 +656,21 @@ class AIBOMGenerator:
 
         # ALWAYS add description
         component["description"] = metadata.get("description", f"AI model {model_id}")
 
         # Add external references
         external_refs = [{
@@ -470,26 +706,70 @@ class AIBOMGenerator:
 
         return component
 
     def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
         model_card_section = {}
 
         # Add quantitative analysis section
         if "eval_results" in metadata:
             model_card_section["quantitativeAnalysis"] = {
-                "performanceMetrics": metadata["eval_results"],
                 "graphics": {}  # Empty graphics object as in the example
             }
         else:
             model_card_section["quantitativeAnalysis"] = {"graphics": {}}
 
-        # Add properties section
         properties = []
-        for key, value in metadata.items():
-            if key in ["author", "library_name", "license", "downloads", "likes", "tags", "created_at", "last_modified"]:
-                properties.append({"name": key, "value": str(value)})
 
-        if properties:
-            model_card_section["properties"] = properties
 
         # Create model parameters section
         model_parameters = {}
@@ -538,6 +818,25 @@ class AIBOMGenerator:
 
         # Add model parameters to model card section
         model_card_section["modelParameters"] = model_parameters
 
         # Add considerations section
         considerations = {}
@@ -578,4 +877,112 @@ class AIBOMGenerator:
                 logger.warning(f"Failed to fetch after {max_retries} attempts: {e}")
                 return None
             time.sleep(1 * (attempt + 1))  # Exponential backoff
-        return None
 import json
 import uuid
 import datetime
 from typing import Dict, Optional, Any, List
 
 from huggingface_hub import HfApi, ModelCard
+from huggingface_hub.repocard_data import EvalResult
 from urllib.parse import urlparse
 from .utils import calculate_completeness_score
 
+# Import registry-aware enhanced extraction if available
+try:
+    from .enhanced_extractor import EnhancedExtractor
+    from .field_registry_manager import get_field_registry_manager
+    ENHANCED_EXTRACTION_AVAILABLE = True
+    print("βœ… Registry-aware enhanced extraction module loaded successfully")
+except ImportError:
+    try:
+        from enhanced_extractor import EnhancedExtractor
+        from field_registry_manager import get_field_registry_manager
+        ENHANCED_EXTRACTION_AVAILABLE = True
+        print("βœ… Registry-aware enhanced extraction module loaded successfully (direct import)")
+    except ImportError:
+        ENHANCED_EXTRACTION_AVAILABLE = False
+        print("⚠️ Registry-aware enhanced extraction not available, using basic extraction")
+
 
 class AIBOMGenerator:
     def __init__(
         inference_model_url: Optional[str] = None,
         use_inference: bool = True,
         cache_dir: Optional[str] = None,
+        use_best_practices: bool = True,  # parameter for industry-neutral scoring
     ):
         self.hf_api = HfApi(token=hf_token)
         self.inference_model_url = inference_model_url
         self.cache_dir = cache_dir
         self.enhancement_report = None  # Store enhancement report as instance variable
         self.use_best_practices = use_best_practices  # Store best practices flag
+        self._setup_enhanced_logging()
+
+        self.extraction_results = {}  # Store extraction results for scoring
+
+        # Initialize registry manager for enhanced extraction
+        self.registry_manager = None
+        if ENHANCED_EXTRACTION_AVAILABLE:
+            try:
+                self.registry_manager = get_field_registry_manager()
+                print("βœ… Registry manager initialized for generator")
+            except Exception as e:
+                print(f"⚠️ Could not initialize registry manager: {e}")
+                self.registry_manager = None
+
+    def get_extraction_results(self):
+        """Return the enhanced extraction results from the last extraction"""
+        return getattr(self, 'extraction_results', {})
+
+    def _setup_enhanced_logging(self):
+        """Setup enhanced logging for extraction tracking"""
+        import logging
+
+        # Configure logging to show in HF Spaces
+        logging.basicConfig(
+            level=logging.INFO,
+            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+            force=True  # Override any existing configuration
+        )
+
+        # Ensure logger shows up
+        logger = logging.getLogger('enhanced_extractor')
+        logger.setLevel(logging.INFO)
+
+        print("πŸ”§ Enhanced logging configured for AI SBOM generation")
+
+
     def generate_aibom(
         self,
         model_id: str,
         output_file: Optional[str] = None,
         include_inference: Optional[bool] = None,
+        use_best_practices: Optional[bool] = None,  # parameter for industry-neutral scoring
     ) -> Dict[str, Any]:
         try:
             model_id = self._normalise_model_id(model_id)
 
             # Store original metadata before any AI enhancement
             original_metadata = self._extract_structured_metadata(model_id, model_info, model_card)
+            print(f"πŸ” ENHANCED EXTRACTION DEBUG: Returned {len(original_metadata)} fields:")
+            for key, value in original_metadata.items():
+                print(f"   {key}: {value}")
+            print(f"πŸ” EXTRACTION RESULTS: {len(self.extraction_results) if hasattr(self, 'extraction_results') and self.extraction_results else 0} extraction results available")
 
             # Create initial AIBOM with original metadata
             original_aibom = self._create_aibom_structure(model_id, original_metadata)
+
+            print(f"πŸ” AI SBOM CREATION DEBUG: Checking what made it into AIBOM:")
+            if 'components' in original_aibom and original_aibom['components']:
+                component = original_aibom['components'][0]
+                if 'properties' in component:
+                    print(f"   Found {len(component['properties'])} properties in AIBOM:")
+                    for prop in component['properties']:
+                        print(f"      {prop.get('name')}: {prop.get('value')}")
+                else:
+                    print("   No properties found in component")
+            else:
+                print("   No components found in AI SBOM")
+            print(f"πŸ” FIELD PRESERVATION VERIFICATION:")
+            print(f"   Enhanced extraction returned: {len(original_metadata)} fields")
+
+            # Count fields in final AIBOM
+            aibom_field_count = 0
+
+            # Count component properties
+            if 'components' in original_aibom and original_aibom['components']:
+                component = original_aibom['components'][0]
+                if 'properties' in component:
+                    aibom_field_count += len(component['properties'])
+
+                # Count model card properties
+                if 'modelCard' in component and 'properties' in component['modelCard']:
+                    aibom_field_count += len(component['modelCard']['properties'])
+
+            # Count metadata properties
+            if 'metadata' in original_aibom and 'properties' in original_aibom['metadata']:
+                aibom_field_count += len(original_aibom['metadata']['properties'])
+
+            print(f"   Final AIBOM contains: {aibom_field_count} fields")
+            preservation_rate = (aibom_field_count / len(original_metadata) * 100) if original_metadata else 0.0  # guard against empty metadata
+            print(f"   Field preservation rate: {preservation_rate:.1f}%")
+
+            if aibom_field_count >= len(original_metadata) * 0.9:  # 90% or better
+                print("βœ… EXCELLENT: Field preservation successful!")
+            elif aibom_field_count >= len(original_metadata) * 0.7:  # 70% or better
+                print("⚠️ GOOD: Most fields preserved, some optimization possible")
+            else:
+                print("❌ POOR: Significant field loss detected")
+
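+            # Illustrative arithmetic (not part of the original commit): if extraction
+            # returned 10 fields and 9 of them land in the AIBOM, the preservation
+            # rate is 9 / 10 * 100 = 90.0%, which clears the "EXCELLENT" threshold above.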
 
             # Calculate initial score with industry-neutral approach if enabled
+            original_score = calculate_completeness_score(original_aibom, validate=True, use_best_practices=use_best_practices, extraction_results=self.extraction_results)
+
 
             # Final metadata starts with original metadata
             final_metadata = original_metadata.copy() if original_metadata else {}
             except Exception as e:
                 print(f"Error during AI enhancement: {e}")
                 # Continue with original metadata if enhancement fails
+                print("🚨 FALLBACK: Using _create_minimal_aibom due to error!")
+                print(f"🚨 ERROR DETAILS: {str(e)}")
             # Create final AIBOM with potentially enhanced metadata
             aibom = self._create_aibom_structure(model_id, final_metadata)
 
+            # Calculate final score with enhanced extraction results
+            extraction_results = self.get_extraction_results()
+            final_score = calculate_completeness_score(
+                aibom,
+                validate=True,
+                use_best_practices=use_best_practices,
+                extraction_results=extraction_results  # Pass enhanced results
+            )
 
 
             if output_file:
             # Return only the AIBOM to maintain compatibility with existing code
             return aibom
         except Exception as e:
+            print(f"Error generating AI SBOM: {e}")
+            # Return a minimal valid AI SBOM structure in case of error
             return self._create_minimal_aibom(model_id)
 
     def _create_minimal_aibom(self, model_id: str) -> Dict[str, Any]:
             print(f"Error fetching model info for {model_id}: {e}")
             return {}
 
+
    @staticmethod
     def _normalise_model_id(raw_id: str) -> str:
         """
                 return "/".join(parts[:2])
             return path
         return raw_id
+
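+    # Illustrative examples (not part of the original commit) of the
+    # normalisation above: both a bare ID and a full Hub URL reduce to
+    # "owner/model":
+    #
+    #   _normalise_model_id("meta-llama/Llama-2-7b")                        -> "meta-llama/Llama-2-7b"
+    #   _normalise_model_id("https://huggingface.co/meta-llama/Llama-2-7b") -> "meta-llama/Llama-2-7b"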
281
 
282
  def _fetch_model_card(self, model_id: str) -> Optional[ModelCard]:
283
  try:
 
291
  model_id: str,
292
  metadata: Dict[str, Any],
293
  ) -> Dict[str, Any]:
294
+ # πŸ” CRASH DEBUG: troubleshoot where the process is crashing and falling back to minimal AIBOM
295
+ print(f"πŸ” CRASH_DEBUG: _create_aibom_structure called")
296
+ print(f"πŸ” CRASH_DEBUG: model_id = {model_id}")
297
+ print(f"πŸ” CRASH_DEBUG: metadata type = {type(metadata)}")
298
+ print(f"πŸ” CRASH_DEBUG: metadata keys = {list(metadata.keys()) if isinstance(metadata, dict) else 'NOT A DICT'}")
299
+
300
  # Extract owner and model name from model_id
301
  parts = model_id.split("/")
302
  group = parts[0] if len(parts) > 1 else ""
 
304
 
305
  # Get version from metadata or use default
306
  version = metadata.get("commit", "1.0")
307
+
308
+ # πŸ” CRASH DEBUG: Check metadata before creating sections
309
+ print(f"πŸ” CRASH_DEBUG: About to create metadata section")
310
 
311
  aibom = {
312
  "bomFormat": "CycloneDX",
 
321
  "dependsOn": [f"pkg:huggingface/{model_id.replace('/', '/')}@{version}"]
322
  }
323
  ]
324
+ }
325
+
326
+ # πŸ” CRASH DEBUG: Check if we got this far
327
+ print(f"πŸ” CRASH_DEBUG: Successfully created basic AIBOM structure")
328
 
329
  # ALWAYS add root-level external references
330
  aibom["externalReferences"] = [{
 
338
  "url": metadata["commit_url"]
339
  } )
340
 
341
+ print(f"πŸ” CRASH_DEBUG: _create_aibom_structure completed successfully")
342
  return aibom
343
 
344
  def _extract_structured_metadata(
 
347
  model_info: Dict[str, Any],
348
  model_card: Optional[ModelCard],
349
  ) -> Dict[str, Any]:
350
+
351
+ # Use registry-aware enhanced extraction if available
352
+ if ENHANCED_EXTRACTION_AVAILABLE:
353
+ try:
354
+ print(f"πŸš€ Using registry-aware enhanced extraction for: {model_id}")
355
+
356
+ # Create registry-aware enhanced extractor instance
357
+ extractor = EnhancedExtractor(self.hf_api, self.registry_manager)
358
+
359
+ # Get both metadata and extraction results
360
+ metadata = extractor.extract_metadata(model_id, model_info, model_card)
361
+
362
+ # Store extraction results for scoring
363
+ self.extraction_results = extractor.extraction_results
364
+
365
+ # Log extraction summary
366
+ if extractor.registry_fields:
367
+ registry_field_count = len(extractor.registry_fields)
368
+ extracted_count = len([k for k, v in metadata.items() if v is not None])
369
+ extraction_results_count = len(extractor.extraction_results)
370
+
371
+ print(f"βœ… Registry-driven extraction completed:")
372
+ print(f" πŸ“‹ Registry fields available: {registry_field_count}")
373
+ print(f" πŸ“Š Fields attempted: {extraction_results_count}")
374
+ print(f" βœ… Fields extracted: {extracted_count}")
375
+
376
+ # Log field coverage
377
+ if registry_field_count > 0:
378
+ coverage = (extracted_count / registry_field_count) * 100
379
+ print(f" πŸ“ˆ Registry field coverage: {coverage:.1f}%")
380
+ else:
381
+ extracted_count = len([k for k, v in metadata.items() if v is not None])
382
+ print(f"βœ… Legacy extraction completed: {extracted_count} fields extracted")
383
+
384
+ return metadata
385
+
386
+ except Exception as e:
387
+ print(f"❌ Registry-aware enhanced extraction failed: {e}")
388
+ print("πŸ”„ Falling back to original extraction method")
389
+ # Fall back to original extraction code here
390
+
391
+ # ORIGINAL EXTRACTION METHOD (as fallback)
392
  metadata = {}
393
 
394
  if model_info:
 
409
  "downloads": getattr(model_info, "downloads", 0),
410
  "last_modified": getattr(model_info, "lastModified", None),
411
  "commit": getattr(model_info, "sha", None)[:7] if getattr(model_info, "sha", None) else None,
412
+ "commit_url": f"https://huggingface.co/{model_id}/commit/{model_info.sha}" if getattr(model_info, "sha", None ) else None,
413
  })
414
  except Exception as e:
415
  print(f"Error extracting model info metadata: {e}")
 
451
  print(f"DEBUG: Adding suppliedBy = {metadata.get('suppliedBy')}")
452
 
453
  return {k: v for k, v in metadata.items() if v is not None}
454
+
455
 
456
 
457
  def _extract_unstructured_metadata(self, model_card: Optional[ModelCard], model_id: str) -> Dict[str, Any]:
 
463
 
464
 
465
  def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
466
+ print(f"πŸ” CRASH_DEBUG: _create_metadata_section called")
467
+ print(f"πŸ” CRASH_DEBUG: metadata type in metadata_section = {type(metadata)}")
468
+
469
  timestamp = datetime.datetime.utcnow().isoformat() + "Z"
470
 
471
  # Get version from metadata or use default
 
523
 
524
  # ALWAYS add critical fields for scoring
525
  critical_fields = {
526
+ "primaryPurpose": metadata.get("primaryPurpose", "text-generation"),
527
+ "suppliedBy": metadata.get("suppliedBy", "unknown"),
528
+ "typeOfModel": metadata.get("typeOfModel", "Transformer")
529
  }
 
 
530
  for key, value in critical_fields.items():
531
+ properties.append({"name": key, "value": str(value)})
 
532
 
533
+ # Add enhanced extraction fields to properties
534
+ # Organize fields by category for better AIBOM structure
535
+ component_fields = ["name", "author", "description", "commit"] # These go in component section
536
+ critical_fields = ["primaryPurpose", "suppliedBy", "typeOfModel"] # Always include these
537
+
538
+ # Add all other enhanced extraction fields (preserve everything!)
539
+ enhanced_fields = ["model_type", "tokenizer_class", "architectures", "library_name",
540
+ "pipeline_tag", "tags", "datasets", "base_model", "language",
541
+ "downloads", "last_modified", "commit_url", "ai:type", "ai:task",
542
+ "ai:framework", "eval_results"]
543
+
544
+ print(f"πŸ” CRASH_DEBUG: About to call .items() on metadata")
545
+ print(f"πŸ” CRASH_DEBUG: metadata type before .items() = {type(metadata)}")
546
+
547
  for key, value in metadata.items():
548
+ # Skip component fields (handled elsewhere) but include everything else
549
+ if key not in component_fields and value is not None:
550
+ # Handle different data types properly
551
  if isinstance(value, (list, dict)):
552
+ if isinstance(value, list) and len(value) > 0:
553
+ # Convert list to comma-separated string for better display
554
+ if all(isinstance(item, str) for item in value):
555
+ value = ", ".join(value)
556
+ else:
557
+ value = json.dumps(value)
558
+ elif isinstance(value, dict):
559
  value = json.dumps(value)
560
+
561
  properties.append({"name": key, "value": str(value)})
562
+ print(f"βœ… METADATA: Added {key} = {value} to properties")
563
 
564
  # Assemble metadata section
565
  metadata_section = {
 
572
  return metadata_section
573
 
574
  def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
575
+ print(f"πŸ” CRASH_DEBUG: _create_component_section called")
576
+ print(f"πŸ” CRASH_DEBUG: metadata type in component_section = {type(metadata)}")
577
+
578
  # Extract owner and model name from model_id
579
  parts = model_id.split("/")
580
  group = parts[0] if len(parts) > 1 else ""
 
599
  "purl": purl
600
  }
601
 
602
+ # Handle license
603
  if metadata and "license" in metadata and metadata["license"]:
604
  component["licenses"] = [{
605
  "license": {
 
607
  "url": self._get_license_url(metadata["license"])
608
  }
609
  }]
610
+ print(f"βœ… COMPONENT: Added license = {metadata['license']}")
611
  else:
 
612
  component["licenses"] = [{
613
  "license": {
614
+ "id": "NOASSERTION",
615
  "url": "https://spdx.org/licenses/"
616
  }
617
  }]
618
+ print(f"⚠️ COMPONENT: No license found, using NOASSERTION")
619
+
620
+ # ALWAYS add description
621
+ component["description"] = metadata.get("description", f"AI model {model_id}")
622
+
623
+ # Add enhanced technical properties to component
624
+ technical_properties = []
625
+
626
+ # Add model type information
627
+ if "model_type" in metadata:
628
+ technical_properties.append({"name": "model_type", "value": str(metadata["model_type"])})
629
+ print(f"βœ… COMPONENT: Added model_type = {metadata['model_type']}")
630
+
631
+ # Add tokenizer information
632
+ if "tokenizer_class" in metadata:
633
+ technical_properties.append({"name": "tokenizer_class", "value": str(metadata["tokenizer_class"])})
634
+ print(f"βœ… COMPONENT: Added tokenizer_class = {metadata['tokenizer_class']}")
635
+
636
+ # Add architecture information
637
+ if "architectures" in metadata:
638
+ arch_value = metadata["architectures"]
639
+ if isinstance(arch_value, list):
640
+ arch_value = ", ".join(arch_value)
641
+ technical_properties.append({"name": "architectures", "value": str(arch_value)})
642
+ print(f"βœ… COMPONENT: Added architectures = {arch_value}")
643
+
644
+ # Add library information
645
+ if "library_name" in metadata:
646
+ technical_properties.append({"name": "library_name", "value": str(metadata["library_name"])})
647
+ print(f"βœ… COMPONENT: Added library_name = {metadata['library_name']}")
648
+
649
+ # Add technical properties to component if any exist
650
+ if technical_properties:
651
+ component["properties"] = technical_properties
652
  # Debug
653
  print(f"DEBUG: License in metadata: {'license' in metadata}" )
654
  if "license" in metadata:
 
656
 
657
  # ALWAYS add description
658
  component["description"] = metadata.get("description", f"AI model {model_id}")
659
+ if metadata.get("license"):
660
+ component["licenses"] = [{
661
+ "license": {
662
+ "id": metadata["license"],
663
+ "url": self._get_license_url(metadata["license"])
664
+ }
665
+ }]
666
+ else:
667
+ component["licenses"] = [{
668
+ "license": {
669
+ "id": "unknown",
670
+ "url": "https://spdx.org/licenses/"
671
+ }
672
+ }]
673
+
674
 
675
  # Add external references
676
  external_refs = [{
 
706
 
707
  return component
708
 
709
+ def _eval_results_to_json(self, eval_results: List["EvalResult"]) -> List[Dict[str, str]]: # forward reference: EvalResult (huggingface_hub) is not imported above
710
+ res = []
711
+ for eval_result in eval_results:
712
+ if hasattr(eval_result, "metric_type") and hasattr(eval_result, "metric_value"):
713
+ res.append({"type": eval_result.metric_type, "value": str(eval_result.metric_value)})
714
+ return res
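# Minimal usage sketch, assuming EvalResult-style objects (anything exposing
# metric_type/metric_value works, e.g. a SimpleNamespace stand-in):
#   from types import SimpleNamespace
#   fake = [SimpleNamespace(metric_type="accuracy", metric_value=0.91)]
#   self._eval_results_to_json(fake)  # -> [{"type": "accuracy", "value": "0.91"}]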
715
+
716
+
717
  def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
718
+ print(f"πŸ” CRASH_DEBUG: _create_model_card_section called")
719
+ print(f"πŸ” CRASH_DEBUG: metadata type in model_card_section = {type(metadata)}")
720
+
721
  model_card_section = {}
722
 
723
  # Add quantitative analysis section
724
  if "eval_results" in metadata:
725
  model_card_section["quantitativeAnalysis"] = {
726
+ "performanceMetrics": self._eval_results_to_json(metadata["eval_results"]),
727
  "graphics": {} # Empty graphics object as in the example
728
  }
729
  else:
730
  model_card_section["quantitativeAnalysis"] = {"graphics": {}}
731
 
732
+ # Add properties section with enhanced extraction fields
733
  properties = []
734
 
735
+ # Component-level fields that shouldn't be duplicated in model card
736
+ component_level_fields = ["name", "author", "license", "description", "commit"]
737
+
738
+ # DEBUG: troubleshooting AIBOM generation
739
+ print(f"πŸ” DEBUG: About to iterate metadata.items()")
740
+ print(f"πŸ” DEBUG: metadata type = {type(metadata)}")
741
+ if isinstance(metadata, dict):
742
+ print(f"πŸ” DEBUG: metadata keys = {list(metadata.keys())}")
743
+ else:
744
+ print(f"πŸ” DEBUG: metadata value = {metadata}")
745
+ print(f"πŸ” DEBUG: This is the problem - metadata should be a dict!")
746
+
747
+ # Add all enhanced extraction fields to model card properties
748
+ try:
749
+ for key, value in metadata.items():
750
+ if key not in component_level_fields and value is not None:
751
+ # Handle different data types properly
752
+ if isinstance(value, (list, dict)):
753
+ if isinstance(value, list) and len(value) > 0:
754
+ # Convert list to readable format
755
+ if all(isinstance(item, str) for item in value):
756
+ value = ", ".join(value)
757
+ else:
758
+ value = json.dumps(value)
759
+ elif isinstance(value, dict):
760
+ value = json.dumps(value)
761
+
762
+ properties.append({"name": key, "value": str(value)})
763
+ print(f"βœ… MODEL_CARD: Added {key} = {value}")
764
+ except AttributeError as e:
765
+ print(f"❌ FOUND THE ERROR: {e}")
766
+ print(f"❌ metadata type: {type(metadata)}")
767
+ print(f"❌ metadata value: {metadata}")
768
+ raise e
769
+
770
+ # Always include properties section (even if empty for consistency)
771
+ model_card_section["properties"] = properties
772
+ print(f"βœ… MODEL_CARD: Added {len(properties)} properties to model card")
773
 
774
  # Create model parameters section
775
  model_parameters = {}
 
818
 
819
  # Add model parameters to model card section
820
  model_card_section["modelParameters"] = model_parameters
821
+ # Add enhanced technical parameters
822
+ if "model_type" in metadata or "tokenizer_class" in metadata or "architectures" in metadata:
823
+ technical_details = {}
824
+
825
+ if "model_type" in metadata:
826
+ technical_details["modelType"] = metadata["model_type"]
827
+
828
+ if "tokenizer_class" in metadata:
829
+ technical_details["tokenizerClass"] = metadata["tokenizer_class"]
830
+
831
+ if "architectures" in metadata:
832
+ technical_details["architectures"] = metadata["architectures"]
833
+
834
+ # Add to model parameters
835
+ model_parameters.update(technical_details)
836
+ print(f"βœ… MODEL_CARD: Added technical details: {list(technical_details.keys())}")
837
+
838
+ # Re-assign for clarity; model_parameters was already updated in place above
839
+ model_card_section["modelParameters"] = model_parameters
840
 
841
  # Add considerations section
842
  considerations = {}
 
877
  logger.warning(f"Failed to fetch after {max_retries} attempts: {e}")
878
  return None
879
  time.sleep(1 * (attempt + 1)) # Linear backoff: wait one extra second per retry
880
+ return None
881
+
882
+ def validate_registry_integration(self) -> Dict[str, Any]:
883
+ """
884
+ Validate that the registry integration is working correctly.
885
+ This method helps debug registry-related issues.
886
+ """
887
+ validation_results = {
888
+ 'registry_manager_available': bool(self.registry_manager),
889
+ 'enhanced_extraction_available': ENHANCED_EXTRACTION_AVAILABLE,
890
+ 'registry_fields_count': 0,
891
+ 'registry_fields_loaded': False,
892
+ 'validation_status': 'unknown'
893
+ }
894
+
895
+ try:
896
+ if self.registry_manager:
897
+ registry = self.registry_manager.registry
898
+ registry_fields = registry.get('fields', {})
899
+ validation_results['registry_fields_count'] = len(registry_fields)
900
+ validation_results['registry_fields_loaded'] = len(registry_fields) > 0
901
+
902
+ if len(registry_fields) > 0:
903
+ validation_results['validation_status'] = 'success'
904
+ print(f"βœ… Registry validation successful: {len(registry_fields)} fields loaded")
905
+
906
+ # Log sample fields
907
+ sample_fields = list(registry_fields.keys())[:5]
908
+ print(f"πŸ“‹ Sample registry fields: {', '.join(sample_fields)}")
909
+ else:
910
+ validation_results['validation_status'] = 'no_fields'
911
+ print("⚠️ Registry loaded but no fields found")
912
+ else:
913
+ validation_results['validation_status'] = 'no_registry_manager'
914
+ print("❌ Registry manager not available")
915
+
916
+ except Exception as e:
917
+ validation_results['validation_status'] = 'error'
918
+ validation_results['error'] = str(e)
919
+ print(f"❌ Registry validation failed: {e}")
920
+
921
+ return validation_results
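# Minimal usage sketch (illustrative output, assuming a healthy setup):
#   results = AIBOMGenerator().validate_registry_integration()
#   # -> {'registry_manager_available': True, 'registry_fields_loaded': True,
#   #     'validation_status': 'success', ...}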
922
+
923
+ def test_registry_integration():
924
+ """
925
+ Test function to validate registry integration is working correctly.
926
+ This function can be called to debug registry-related issues.
927
+ """
928
+ print("πŸ§ͺ Testing Registry Integration...")
929
+ print("=" * 50)
930
+
931
+ try:
932
+ # Test generator initialization
933
+ generator = AIBOMGenerator()
934
+
935
+ # Validate registry integration
936
+ validation_results = generator.validate_registry_integration()
937
+
938
+ print("πŸ“Š Validation Results:")
939
+ for key, value in validation_results.items():
940
+ print(f" {key}: {value}")
941
+
942
+ # Test with a sample model
943
+ test_model = "deepseek-ai/DeepSeek-R1"
944
+ print(f"\nπŸ” Testing extraction with model: {test_model}")
945
+
946
+ try:
947
+ # Test model info retrieval
948
+ model_info = generator.hf_api.model_info(test_model)
949
+ model_card = ModelCard.load(test_model)
950
+
951
+ # Test extraction
952
+ if ENHANCED_EXTRACTION_AVAILABLE and generator.registry_manager:
953
+ extractor = EnhancedExtractor(generator.hf_api, generator.registry_manager)
954
+ metadata = extractor.extract_metadata(test_model, model_info, model_card)
955
+
956
+ print(f"βœ… Test extraction successful: {len(metadata)} fields extracted")
957
+
958
+ # Show sample extracted fields
959
+ sample_fields = dict(list(metadata.items())[:5])
960
+ print("πŸ“‹ Sample extracted fields:")
961
+ for key, value in sample_fields.items():
962
+ print(f" {key}: {value}")
963
+
964
+ # Show extraction results summary
965
+ extraction_results = extractor.get_extraction_results()
966
+ confidence_counts = {}
967
+ for result in extraction_results.values():
968
+ conf = result.confidence.value
969
+ confidence_counts[conf] = confidence_counts.get(conf, 0) + 1
970
+
971
+ print("πŸ“ˆ Extraction confidence distribution:")
972
+ for conf, count in confidence_counts.items():
973
+ print(f" {conf}: {count} fields")
974
+
975
+ else:
976
+ print("⚠️ Registry-aware extraction not available for testing")
977
+
978
+ except Exception as e:
979
+ print(f"❌ Test extraction failed: {e}")
980
+
981
+ except Exception as e:
982
+ print(f"❌ Registry integration test failed: {e}")
983
+
984
+ print("=" * 50)
985
+ print("πŸ§ͺ Registry Integration Test Complete")
986
+
987
+ # Uncomment the next line to run the test automatically when this module is imported
988
+ # test_registry_integration()
src/aibom-generator/utils.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Utility functions for the AI SBOM Generator.
3
  """
4
 
5
  import json
@@ -9,6 +9,14 @@ import re
9
  import uuid
10
  from typing import Dict, List, Optional, Any, Union, Tuple
11
  from enum import Enum
12
 
13
  logger = logging.getLogger(__name__)
14
 
@@ -18,98 +26,123 @@ class ValidationSeverity(Enum):
18
  WARNING = "warning"
19
  INFO = "info"
20
 
21
- # Field classification based on documentation value (silently aligned with SPDX)
22
- FIELD_CLASSIFICATION = {
23
- # Critical fields (silently aligned with SPDX mandatory fields)
24
- "bomFormat": {"tier": "critical", "weight": 3, "category": "required_fields"},
25
- "specVersion": {"tier": "critical", "weight": 3, "category": "required_fields"},
26
- "serialNumber": {"tier": "critical", "weight": 3, "category": "required_fields"},
27
- "version": {"tier": "critical", "weight": 3, "category": "required_fields"},
28
- "name": {"tier": "critical", "weight": 4, "category": "component_basic"},
29
- "downloadLocation": {"tier": "critical", "weight": 4, "category": "external_references"},
30
- "primaryPurpose": {"tier": "critical", "weight": 3, "category": "metadata"},
31
- "suppliedBy": {"tier": "critical", "weight": 4, "category": "metadata"},
32
-
33
- # Important fields (aligned with key SPDX optional fields)
34
- "type": {"tier": "important", "weight": 2, "category": "component_basic"},
35
- "purl": {"tier": "important", "weight": 4, "category": "component_basic"},
36
- "description": {"tier": "important", "weight": 4, "category": "component_basic"},
37
- "licenses": {"tier": "important", "weight": 4, "category": "component_basic"},
38
- "energyConsumption": {"tier": "important", "weight": 3, "category": "component_model_card"},
39
- "hyperparameter": {"tier": "important", "weight": 3, "category": "component_model_card"},
40
- "limitation": {"tier": "important", "weight": 3, "category": "component_model_card"},
41
- "safetyRiskAssessment": {"tier": "important", "weight": 3, "category": "component_model_card"},
42
- "typeOfModel": {"tier": "important", "weight": 3, "category": "component_model_card"},
43
-
44
- # Supplementary fields (aligned with remaining SPDX optional fields)
45
- "modelExplainability": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
46
- "standardCompliance": {"tier": "supplementary", "weight": 2, "category": "metadata"},
47
- "domain": {"tier": "supplementary", "weight": 2, "category": "metadata"},
48
- "energyQuantity": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
49
- "energyUnit": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
50
- "informationAboutTraining": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
51
- "informationAboutApplication": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
52
- "metric": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
53
- "metricDecisionThreshold": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
54
- "modelDataPreprocessing": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
55
- "autonomyType": {"tier": "supplementary", "weight": 1, "category": "metadata"},
56
- "useSensitivePersonalInformation": {"tier": "supplementary", "weight": 2, "category": "component_model_card"}
57
- }
58
-
59
- # Completeness profiles (silently aligned with SPDX requirements)
60
- COMPLETENESS_PROFILES = {
61
- "basic": {
62
- "description": "Minimal fields required for identification",
63
- "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
64
- "minimum_score": 40
65
- },
66
- "standard": {
67
- "description": "Comprehensive fields for proper documentation",
68
- "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
69
- "downloadLocation", "primaryPurpose", "suppliedBy"],
70
- "minimum_score": 70
71
- },
72
- "advanced": {
73
- "description": "Extensive documentation for maximum transparency",
74
- "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
75
- "downloadLocation", "primaryPurpose", "suppliedBy",
76
- "type", "purl", "description", "licenses", "hyperparameter", "limitation",
77
- "energyConsumption", "safetyRiskAssessment", "typeOfModel"],
78
- "minimum_score": 85
 
79
  }
80
- }
81
-
82
- # Validation messages framed as best practices
83
- VALIDATION_MESSAGES = {
84
- "name": {
85
- "missing": "Missing critical field: name - essential for model identification",
86
- "recommendation": "Add a descriptive name for the model"
87
- },
88
- "downloadLocation": {
89
- "missing": "Missing critical field: downloadLocation - needed for artifact retrieval",
90
- "recommendation": "Add information about where the model can be downloaded"
91
- },
92
- "primaryPurpose": {
93
- "missing": "Missing critical field: primaryPurpose - important for understanding model intent",
94
- "recommendation": "Add information about the primary purpose of this model"
95
- },
96
- "suppliedBy": {
97
- "missing": "Missing critical field: suppliedBy - needed for provenance tracking",
98
- "recommendation": "Add information about who supplied this model"
99
- },
100
- "energyConsumption": {
101
- "missing": "Missing important field: energyConsumption - helpful for environmental impact assessment",
102
- "recommendation": "Consider documenting energy consumption metrics for better transparency"
103
- },
104
- "hyperparameter": {
105
- "missing": "Missing important field: hyperparameter - valuable for reproducibility",
106
- "recommendation": "Document key hyperparameters used in training"
107
- },
108
- "limitation": {
109
- "missing": "Missing important field: limitation - important for responsible use",
110
- "recommendation": "Document known limitations of the model to guide appropriate usage"
111
  }
112
- }
113
 
114
 
115
  def setup_logging(level=logging.INFO):
@@ -207,77 +240,53 @@ def check_field_in_aibom(aibom: Dict[str, Any], field: str) -> bool:
207
  Returns:
208
  True if the field is present, False otherwise
209
  """
210
- # Check in root level
211
  if field in aibom:
212
  return True
213
-
214
- # Check in metadata
215
  if "metadata" in aibom:
216
  metadata = aibom["metadata"]
217
  if field in metadata:
218
  return True
219
-
220
- # Check in metadata properties
221
  if "properties" in metadata:
222
  for prop in metadata["properties"]:
223
- if prop.get("name") == f"spdx:{field}" or prop.get("name") == field:
 
224
  return True
225
-
226
- # Check in components
227
  if "components" in aibom and aibom["components"]:
228
- component = aibom["components"][0] # Use first component
229
-
230
  if field in component:
231
  return True
232
-
233
- # Check in component properties
234
  if "properties" in component:
235
  for prop in component["properties"]:
236
- if prop.get("name") == f"spdx:{field}" or prop.get("name") == field:
 
237
  return True
238
-
239
- # Check in model card
240
  if "modelCard" in component:
241
  model_card = component["modelCard"]
242
-
243
  if field in model_card:
244
  return True
245
-
246
- # Check in model parameters
247
- if "modelParameters" in model_card:
248
- if field in model_card["modelParameters"]:
249
- return True
250
-
251
- # Check in model parameters properties
252
- if "properties" in model_card["modelParameters"]:
253
- for prop in model_card["modelParameters"]["properties"]:
254
- if prop.get("name") == f"spdx:{field}" or prop.get("name") == field:
255
- return True
256
-
257
- # Check in considerations
258
  if "considerations" in model_card:
259
- if field in model_card["considerations"]:
260
- return True
261
-
262
- # Check in specific consideration sections
263
- for section in ["technicalLimitations", "ethicalConsiderations", "environmentalConsiderations"]:
264
- if section in model_card["considerations"]:
265
- if field == "limitation" and section == "technicalLimitations":
266
- return True
267
- if field == "safetyRiskAssessment" and section == "ethicalConsiderations":
268
- return True
269
- if field == "energyConsumption" and section == "environmentalConsiderations":
270
  return True
271
-
272
- # Check in external references
273
  if field == "downloadLocation" and "externalReferences" in aibom:
274
  for ref in aibom["externalReferences"]:
275
- if ref.get("type") == "distribution":
276
  return True
277
-
278
  return False
279
 
280
 
 
281
  def determine_completeness_profile(aibom: Dict[str, Any], score: float) -> Dict[str, Any]:
282
  """
283
  Determine which completeness profile the AIBOM satisfies.
@@ -835,8 +844,113 @@ def get_validation_summary(report: Dict[str, Any]) -> str:
835
 
836
  return summary
837
 
838
 
839
- def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
 
840
  """
841
  Calculate completeness score using industry best practices with proper normalization and penalties.
842
 
@@ -875,8 +989,8 @@ def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
875
  # Count total fields in this category
876
  fields_by_category[category]["total"] += 1
877
 
878
- # Check if field is present (ensure boolean result)
879
- is_present = bool(check_field_in_aibom(aibom, field))
880
 
881
  if is_present:
882
  fields_by_category[category]["present"] += 1
@@ -898,6 +1012,19 @@ def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
898
  category_scores[category] = round(raw_score, 1)
899
  else:
900
  category_scores[category] = 0.0
901
 
902
  # Calculate subtotal (sum of rounded category scores)
903
  subtotal_score = sum(category_scores.values())
@@ -1033,7 +1160,7 @@ def calculate_industry_neutral_score(aibom: Dict[str, Any]) -> Dict[str, Any]:
1033
  return result
1034
 
1035
 
1036
- def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True, use_best_practices: bool = True) -> Dict[str, Any]:
1037
  """
1038
  Calculate completeness score for an AIBOM and optionally validate against AI requirements.
1039
  Enhanced with industry best practices scoring.
@@ -1046,9 +1173,16 @@ def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True, u
1046
  Returns:
1047
  Dictionary containing score and validation results
1048
  """
1049
  # If using best practices scoring, use the enhanced industry-neutral approach
1050
  if use_best_practices:
1051
- result = calculate_industry_neutral_score(aibom)
1052
 
1053
  # Add validation if requested
1054
  if validate:
@@ -1525,4 +1659,64 @@ def format_score_summary(score_result: Dict[str, Any]) -> str:
1525
  summary += f"\nCompleteness Profile: {profile['name']}\n"
1526
  summary += f"Description: {profile['description']}\n"
1527
 
1528
- return summary
1
  """
2
+ Score calculation and validation utilities for the AI SBOM Generator.
3
  """
4
 
5
  import json
 
9
  import uuid
10
  from typing import Dict, List, Optional, Any, Union, Tuple
11
  from enum import Enum
12
+ from .field_registry_manager import (
13
+ get_field_registry_manager,
14
+ generate_field_classification,
15
+ generate_completeness_profiles,
16
+ generate_validation_messages,
17
+ get_configurable_scoring_weights,
18
+ DynamicFieldDetector # Compatibility wrapper
19
+ )
20
 
21
  logger = logging.getLogger(__name__)
22
 
 
26
  WARNING = "warning"
27
  INFO = "info"
28
 
29
+ # Registry-driven field definitions
30
+ try:
31
+ REGISTRY_MANAGER = get_field_registry_manager()
32
+ FIELD_CLASSIFICATION = generate_field_classification()
33
+ COMPLETENESS_PROFILES = generate_completeness_profiles()
34
+ VALIDATION_MESSAGES = generate_validation_messages()
35
+ SCORING_WEIGHTS = get_configurable_scoring_weights()
36
+
37
+ print(f"βœ… Registry-driven configuration loaded: {len(FIELD_CLASSIFICATION)} fields")
38
+ REGISTRY_AVAILABLE = True
39
+
40
+ except Exception as e:
41
+ print(f"❌ Failed to load registry configuration: {e}")
42
+ print("πŸ”„ Falling back to hardcoded definitions...")
43
+ REGISTRY_AVAILABLE = False
44
+
45
+ # Hardcoded definitions as fallback
46
+ FIELD_CLASSIFICATION = {
47
+ # Critical fields (silently aligned with SPDX mandatory fields)
48
+ "bomFormat": {"tier": "critical", "weight": 3, "category": "required_fields"},
49
+ "specVersion": {"tier": "critical", "weight": 3, "category": "required_fields"},
50
+ "serialNumber": {"tier": "critical", "weight": 3, "category": "required_fields"},
51
+ "version": {"tier": "critical", "weight": 3, "category": "required_fields"},
52
+ "name": {"tier": "critical", "weight": 4, "category": "component_basic"},
53
+ "downloadLocation": {"tier": "critical", "weight": 4, "category": "external_references"},
54
+ "primaryPurpose": {"tier": "critical", "weight": 3, "category": "metadata"},
55
+ "suppliedBy": {"tier": "critical", "weight": 4, "category": "metadata"},
56
+
57
+ # Important fields (aligned with key SPDX optional fields)
58
+ "type": {"tier": "important", "weight": 2, "category": "component_basic"},
59
+ "purl": {"tier": "important", "weight": 4, "category": "component_basic"},
60
+ "description": {"tier": "important", "weight": 4, "category": "component_basic"},
61
+ "licenses": {"tier": "important", "weight": 4, "category": "component_basic"},
62
+ "energyConsumption": {"tier": "important", "weight": 3, "category": "component_model_card"},
63
+ "hyperparameter": {"tier": "important", "weight": 3, "category": "component_model_card"},
64
+ "limitation": {"tier": "important", "weight": 3, "category": "component_model_card"},
65
+ "safetyRiskAssessment": {"tier": "important", "weight": 3, "category": "component_model_card"},
66
+ "typeOfModel": {"tier": "important", "weight": 3, "category": "component_model_card"},
67
+
68
+ # Supplementary fields (aligned with remaining SPDX optional fields)
69
+ "modelExplainability": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
70
+ "standardCompliance": {"tier": "supplementary", "weight": 2, "category": "metadata"},
71
+ "domain": {"tier": "supplementary", "weight": 2, "category": "metadata"},
72
+ "energyQuantity": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
73
+ "energyUnit": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
74
+ "informationAboutTraining": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
75
+ "informationAboutApplication": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
76
+ "metric": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
77
+ "metricDecisionThreshold": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
78
+ "modelDataPreprocessing": {"tier": "supplementary", "weight": 2, "category": "component_model_card"},
79
+ "autonomyType": {"tier": "supplementary", "weight": 1, "category": "metadata"},
80
+ "useSensitivePersonalInformation": {"tier": "supplementary", "weight": 2, "category": "component_model_card"}
81
+ }
82
+
83
+ # Completeness profiles (silently aligned with SPDX requirements)
84
+ COMPLETENESS_PROFILES = {
85
+ "basic": {
86
+ "description": "Minimal fields required for identification",
87
+ "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name"],
88
+ "minimum_score": 40
89
+ },
90
+ "standard": {
91
+ "description": "Comprehensive fields for proper documentation",
92
+ "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
93
+ "downloadLocation", "primaryPurpose", "suppliedBy"],
94
+ "minimum_score": 70
95
+ },
96
+ "advanced": {
97
+ "description": "Extensive documentation for maximum transparency",
98
+ "required_fields": ["bomFormat", "specVersion", "serialNumber", "version", "name",
99
+ "downloadLocation", "primaryPurpose", "suppliedBy",
100
+ "type", "purl", "description", "licenses", "hyperparameter", "limitation",
101
+ "energyConsumption", "safetyRiskAssessment", "typeOfModel"],
102
+ "minimum_score": 85
103
+ }
104
+ }
105
+
106
+ # Validation messages framed as best practices
107
+ VALIDATION_MESSAGES = {
108
+ "name": {
109
+ "missing": "Missing critical field: name - essential for model identification",
110
+ "recommendation": "Add a descriptive name for the model"
111
+ },
112
+ "downloadLocation": {
113
+ "missing": "Missing critical field: downloadLocation - needed for artifact retrieval",
114
+ "recommendation": "Add information about where the model can be downloaded"
115
+ },
116
+ "primaryPurpose": {
117
+ "missing": "Missing critical field: primaryPurpose - important for understanding model intent",
118
+ "recommendation": "Add information about the primary purpose of this model"
119
+ },
120
+ "suppliedBy": {
121
+ "missing": "Missing critical field: suppliedBy - needed for provenance tracking",
122
+ "recommendation": "Add information about who supplied this model"
123
+ },
124
+ "energyConsumption": {
125
+ "missing": "Missing important field: energyConsumption - helpful for environmental impact assessment",
126
+ "recommendation": "Consider documenting energy consumption metrics for better transparency"
127
+ },
128
+ "hyperparameter": {
129
+ "missing": "Missing important field: hyperparameter - valuable for reproducibility",
130
+ "recommendation": "Document key hyperparameters used in training"
131
+ },
132
+ "limitation": {
133
+ "missing": "Missing important field: limitation - important for responsible use",
134
+ "recommendation": "Document known limitations of the model to guide appropriate usage"
135
+ }
136
  }
137
+
138
+ SCORING_WEIGHTS = {
139
+ "tier_weights": {"critical": 3, "important": 2, "supplementary": 1},
140
+ "category_weights": {
141
+ "required_fields": 20, "metadata": 20, "component_basic": 20,
142
+ "component_model_card": 30, "external_references": 10
143
+ },
144
+ "algorithm_config": {"type": "weighted_sum", "max_score": 100}
145
  }
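# A minimal sketch of the weighted-sum idea these weights support (an
# illustrative helper, not part of the scoring pipeline): each category
# contributes its present-weight fraction times its category weight.
def _example_category_score(present_weight: int, max_weight: int, category_weight: int) -> float:
    # e.g. _example_category_score(7, 10, 20) -> 14.0
    return (present_weight / max_weight) * category_weight if max_weight else 0.0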
 
146
 
147
 
148
  def setup_logging(level=logging.INFO):
 
240
  Returns:
241
  True if the field is present, False otherwise
242
  """
 
243
  if field in aibom:
244
  return True
 
 
245
  if "metadata" in aibom:
246
  metadata = aibom["metadata"]
247
  if field in metadata:
248
  return True
 
 
249
  if "properties" in metadata:
250
  for prop in metadata["properties"]:
251
+ prop_name = prop.get("name", "")
252
+ if prop_name in {field, f"spdx:{field}"}:
253
  return True
 
 
254
  if "components" in aibom and aibom["components"]:
255
+ component = aibom["components"][0]
 
256
  if field in component:
257
  return True
 
 
258
  if "properties" in component:
259
  for prop in component["properties"]:
260
+ prop_name = prop.get("name", "")
261
+ if prop_name in {field, f"spdx:{field}"}:
262
  return True
 
 
263
  if "modelCard" in component:
264
  model_card = component["modelCard"]
 
265
  if field in model_card:
266
  return True
267
+ if "modelParameters" in model_card and field in model_card["modelParameters"]:
268
+ return True
269
  if "considerations" in model_card:
270
+ considerations = model_card["considerations"]
271
+ field_mappings = {
272
+ "limitation": ["technicalLimitations", "limitations"],
273
+ "safetyRiskAssessment": ["ethicalConsiderations", "safetyRiskAssessment"],
274
+ "energyConsumption": ["environmentalConsiderations", "energyConsumption"]
275
+ }
276
+ if field in field_mappings:
277
+ for section in field_mappings[field]:
278
+ if section in considerations and considerations[section]:
 
 
279
  return True
280
+ if field in considerations:
281
+ return True
282
  if field == "downloadLocation" and "externalReferences" in aibom:
283
  for ref in aibom["externalReferences"]:
284
+ if ref.get("type") == "distribution" and ref.get("url"):
285
  return True
 
286
  return False
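# Minimal usage sketch of the rules above: a property named either
# "suppliedBy" or "spdx:suppliedBy" under metadata.properties counts as present.
#   example = {"metadata": {"properties": [{"name": "spdx:suppliedBy", "value": "Acme"}]}}
#   check_field_in_aibom(example, "suppliedBy")  # -> True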
287
 
288
 
289
+
290
  def determine_completeness_profile(aibom: Dict[str, Any], score: float) -> Dict[str, Any]:
291
  """
292
  Determine which completeness profile the AIBOM satisfies.
 
844
 
845
  return summary
846
 
847
+ def check_field_with_enhanced_results(aibom: Dict[str, Any], field: str, extraction_results: Optional[Dict[str, Any]] = None) -> bool:
848
+ """
849
+ Enhanced field detection using consolidated field registry manager.
850
+
851
+ Args:
852
+ aibom: The AIBOM to check
853
+ field: The field name to check (must match field registry)
854
+ extraction_results: Enhanced extraction results with confidence levels
855
+
856
+ Returns:
857
+ True if the field is present and should count toward score, False otherwise
858
+ """
859
+ try:
860
+ # Initialize dynamic field detector (cached)
861
+ if not hasattr(check_field_with_enhanced_results, '_detector'):
862
+ try:
863
+ if REGISTRY_AVAILABLE:
864
+ # Use the consolidated registry manager
865
+ registry_manager = get_field_registry_manager()
866
+ check_field_with_enhanced_results._detector = DynamicFieldDetector(registry_manager)
867
+ print(f"βœ… Dynamic field detector initialized with registry manager")
868
+ else:
869
+ import os # needed here: os and current_dir are otherwise only defined in the emergency fallback below
870
+ from field_registry_manager import FieldRegistryManager
871
+ registry_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "field_registry.json")
872
+ registry_manager = FieldRegistryManager(registry_path)
873
+ check_field_with_enhanced_results._detector = DynamicFieldDetector(registry_manager)
874
+ print(f"βœ… Dynamic field detector initialized with fallback registry manager")
875
+
876
+ except Exception as e:
877
+ print(f"❌ Failed to initialize dynamic field detector: {e}")
878
+ # Final fallback
879
+ import os
880
+ current_dir = os.path.dirname(os.path.abspath(__file__))
881
+ registry_path = os.path.join(current_dir, "field_registry.json")
882
+ try:
883
+ check_field_with_enhanced_results._detector = DynamicFieldDetector(registry_path)
884
+ print(f"πŸ”„ Dynamic field detector initialized with emergency fallback")
885
+ except Exception as final_error:
886
+ print(f"❌ Complete failure to initialize dynamic field detector: {final_error}")
887
+ check_field_with_enhanced_results._detector = None
888
+
889
+ detector = check_field_with_enhanced_results._detector
890
+
891
+ if detector is None:
892
+ print(f"⚠️ No detector available, using fallback for {field}")
893
+ return check_field_in_aibom(aibom, field)
894
+
895
+ # First, try dynamic detection from AIBOM structure using ENHANCED REGISTRY FORMAT
896
+ field_found_in_registry = False
897
+
898
+ # Use the enhanced registry structure (registry['fields'][field_name])
899
+ fields = detector.registry.get('fields', {})
900
+ if field in fields:
901
+ field_found_in_registry = True
902
+ field_config = fields[field]
903
+ field_path = field_config.get('jsonpath', '')
904
+
905
+ if field_path:
906
+ # Use dynamic detection
907
+ is_present, value = detector.detect_field_presence(aibom, field_path)
908
+
909
+ if is_present:
910
+ print(f"βœ… DYNAMIC: Found {field} = {value}")
911
+ return True
912
+ else:
913
+ print(f"❌ DYNAMIC: Missing {field} at {field_path}")
914
+ else:
915
+ print(f"⚠️ Field '{field}' has no jsonpath defined in registry")
916
+
917
+ # If field not in registry, log warning but continue
918
+ if not field_found_in_registry:
919
+ print(f"⚠️ WARNING: Field '{field}' not found in field registry")
920
+
921
+ # Second, check extraction results (existing logic)
922
+ if extraction_results and field in extraction_results:
923
+ extraction_result = extraction_results[field]
924
+
925
+ # Check if this field has actual extracted data (not just placeholder)
926
+ if hasattr(extraction_result, 'confidence'):
927
+ # Don't count fields with 'none' confidence (placeholders like NOASSERTION)
928
+ if extraction_result.confidence.value == 'none':
929
+ print(f"❌ EXTRACTION: {field} has 'none' confidence")
930
+ return False
931
+ # Count fields with medium or high confidence
932
+ is_confident = extraction_result.confidence.value in ['medium', 'high']
933
+ print(f"{'βœ…' if is_confident else '❌'} EXTRACTION: {field} confidence = {extraction_result.confidence.value}")
934
+ return is_confident
935
+ elif hasattr(extraction_result, 'value'):
936
+ # For simple extraction results, check if value is meaningful
937
+ value = extraction_result.value
938
+ if value in ['NOASSERTION', 'NOT_FOUND', None, '']:
939
+ print(f"❌ EXTRACTION: {field} has placeholder value: {value}")
940
+ return False
941
+ print(f"βœ… EXTRACTION: {field} = {value}")
942
+ return True
943
+
944
+ # Third, fallback to original AIBOM detection
945
+ print(f"πŸ”„ FALLBACK: Using original detection for {field}")
946
+ return check_field_in_aibom(aibom, field)
947
+
948
+ except Exception as e:
949
+ print(f"❌ Error in enhanced field detection for {field}: {e}")
950
+ return check_field_in_aibom(aibom, field)
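# Minimal usage sketch, with a hypothetical stand-in for an extraction result
# (any object exposing .confidence whose .value is "high" or "medium" counts):
#   from types import SimpleNamespace
#   fake = {"suppliedBy": SimpleNamespace(confidence=SimpleNamespace(value="high"))}
#   check_field_with_enhanced_results(aibom, "suppliedBy", fake)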
951
 
952
+
953
+ def calculate_industry_neutral_score(aibom: Dict[str, Any], extraction_results: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
954
  """
955
  Calculate completeness score using industry best practices with proper normalization and penalties.
956
 
 
989
  # Count total fields in this category
990
  fields_by_category[category]["total"] += 1
991
 
992
+ # Enhanced field detection using extraction results
993
+ is_present = check_field_with_enhanced_results(aibom, field, extraction_results)
994
 
995
  if is_present:
996
  fields_by_category[category]["present"] += 1
 
1012
  category_scores[category] = round(raw_score, 1)
1013
  else:
1014
  category_scores[category] = 0.0
1015
+
1016
+ # Log field extraction summary
1017
+ total_fields = sum(counts["total"] for counts in fields_by_category.values())
1018
+ total_present = sum(counts["present"] for counts in fields_by_category.values())
1019
+
1020
+ print(f"πŸ“Š SCORING SUMMARY:")
1021
+ print(f" Total fields evaluated: {total_fields}")
1022
+ print(f" Fields successfully extracted: {total_present}")
1023
+ print(f" Extraction success rate: {round((total_present/total_fields)*100, 1)}%")
1024
+ print(f" Category breakdown:")
1025
+ for category, counts in fields_by_category.items():
1026
+ percentage = round((counts["present"]/counts["total"])*100, 1) if counts["total"] > 0 else 0
1027
+ print(f" {category}: {counts['present']}/{counts['total']} ({percentage}%)")
1028
 
1029
  # Calculate subtotal (sum of rounded category scores)
1030
  subtotal_score = sum(category_scores.values())
 
1160
  return result
1161
 
1162
 
1163
+ def calculate_completeness_score(aibom: Dict[str, Any], validate: bool = True, use_best_practices: bool = True, extraction_results: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
1164
  """
1165
  Calculate completeness score for an AIBOM and optionally validate against AI requirements.
1166
  Enhanced with industry best practices scoring.
 
1173
  Returns:
1174
  Dictionary containing score and validation results
1175
  """
1176
+ print(f"πŸ” DEBUG: use_best_practices={use_best_practices}")
1177
+ print(f"πŸ” DEBUG: extraction_results is None: {extraction_results is None}")
1178
+ print(f"πŸ” DEBUG: extraction_results keys: {list(extraction_results.keys()) if extraction_results else 'None'}")
1179
+
1180
+ # If using best practices scoring, use the enhanced industry-neutral approach
1181
+ if use_best_practices:
1182
+ print("πŸ” DEBUG: Calling calculate_industry_neutral_score")
1183
+ result = calculate_industry_neutral_score(aibom, extraction_results)
1186
 
1187
  # Add validation if requested
1188
  if validate:
 
1659
  summary += f"\nCompleteness Profile: {profile['name']}\n"
1660
  summary += f"Description: {profile['description']}\n"
1661
 
1662
+ return summary
1663
+
1664
+ def test_consolidated_integration():
1665
+ """Test that consolidated field registry manager integration is working"""
1666
+ try:
1667
+ print("\nπŸ§ͺ Testing Consolidated Integration...")
1668
+
1669
+ # Test registry availability
1670
+ if REGISTRY_AVAILABLE:
1671
+ print("βœ… Consolidated registry manager available")
1672
+
1673
+ # Test registry manager
1674
+ manager = get_field_registry_manager()
1675
+ print(f"βœ… Registry manager initialized: {manager.registry_path}")
1676
+
1677
+ # Test field classification generation
1678
+ field_count = len(FIELD_CLASSIFICATION)
1679
+ print(f"βœ… FIELD_CLASSIFICATION loaded: {field_count} fields")
1680
+
1681
+ # Test completeness profiles
1682
+ profile_count = len(COMPLETENESS_PROFILES)
1683
+ print(f"βœ… COMPLETENESS_PROFILES loaded: {profile_count} profiles")
1684
+
1685
+ # Test validation messages
1686
+ message_count = len(VALIDATION_MESSAGES)
1687
+ print(f"βœ… VALIDATION_MESSAGES loaded: {message_count} messages")
1688
+
1689
+ # Test scoring weights
1690
+ tier_weights = SCORING_WEIGHTS.get("tier_weights", {})
1691
+ category_weights = SCORING_WEIGHTS.get("category_weights", {})
1692
+ print(f"βœ… SCORING_WEIGHTS loaded: {len(tier_weights)} tiers, {len(category_weights)} categories")
1693
+
1694
+ else:
1695
+ print("⚠️ Consolidated registry manager not available, using hardcoded definitions")
1696
+
1697
+ # Test dynamic field detector (DynamicFieldDetector)
1698
+ if hasattr(check_field_with_enhanced_results, '_detector') and check_field_with_enhanced_results._detector:
1699
+ print(f"βœ… Dynamic field detector ready")
1700
+ else:
1701
+ print(f"⚠️ Dynamic field detector not initialized")
1702
+
1703
+ # Test field lookup
1704
+ test_fields = ["bomFormat", "primaryPurpose", "energyConsumption"]
1705
+ for field in test_fields:
1706
+ if field in FIELD_CLASSIFICATION:
1707
+ field_info = FIELD_CLASSIFICATION[field]
1708
+ print(f"βœ… Field '{field}': tier={field_info['tier']}, category={field_info['category']}")
1709
+ else:
1710
+ print(f"❌ Field '{field}' not found in FIELD_CLASSIFICATION")
1711
+
1712
+ print("πŸŽ‰ Consolidated integration test completed!")
1713
+ return True
1714
+
1715
+ except Exception as e:
1716
+ print(f"❌ Consolidated integration test failed: {e}")
1717
+ import traceback
1718
+ traceback.print_exc()
1719
+ return False
1720
+
1721
+ # Uncomment this line to run the test automatically when utils.py is imported
1722
+ # test_consolidated_integration()