Spaces:

aetheris-ai
/

aibom-generator

Running

App Files

a1c00l committed on Apr 22

Commit

854b81e

verified ·

1 Parent(s): 25493c7

Update src/aibom_generator/generator.py

Browse files

Files changed (1) hide show

src/aibom_generator/generator.py +258 -198

src/aibom_generator/generator.py CHANGED Viewed

@@ -78,35 +78,12 @@ class AIBOMGenerator:
             # Calculate final score with industry-neutral approach if enabled
             final_score = calculate_completeness_score(aibom, validate=True, use_best_practices=use_best_practices)
-            # Add score and enhancement info to metadata properties
             if "metadata" in aibom and "properties" not in aibom["metadata"]:
                 aibom["metadata"]["properties"] = []
-            if "metadata" in aibom and "properties" in aibom["metadata"]:
-                # Add score information
-                aibom["metadata"]["properties"].append({"name": "aibom:quality-score", "value": str(final_score["total_score"])})
-                aibom["metadata"]["properties"].append({"name": "aibom:quality-breakdown", "value": json.dumps(final_score["section_scores"])})
-                aibom["metadata"]["properties"].append({"name": "aibom:max-scores", "value": json.dumps(final_score["max_scores"])})
-                # Add completeness profile information if available (from industry-neutral approach)
-                if use_best_practices and "completeness_profile" in final_score:
-                    aibom["metadata"]["properties"].append({
-                        "name": "aibom:completeness-profile",
-                        "value": final_score["completeness_profile"]["name"]
-                    })
-                    aibom["metadata"]["properties"].append({
-                        "name": "aibom:completeness-description",
-                        "value": final_score["completeness_profile"]["description"]
-                    })
-                # Add AI enhancement information
-                if ai_enhanced:
-                    aibom["metadata"]["properties"].append({"name": "aibom:ai-enhanced", "value": "true"})
-                    aibom["metadata"]["properties"].append({"name": "aibom:ai-model", "value": ai_model_name})
-                    aibom["metadata"]["properties"].append({"name": "aibom:original-score", "value": str(original_score["total_score"])})
-                    aibom["metadata"]["properties"].append({"name": "aibom:score-improvement",
-                                                          "value": str(round(final_score["total_score"] - original_score["total_score"], 2))})
             if output_file:
                 with open(output_file, 'w') as f:
@@ -137,27 +114,38 @@ class AIBOMGenerator:
             "version": 1,
             "metadata": {
                 "timestamp": datetime.datetime.utcnow().isoformat() + "Z",
-                "tools": [{
-                    "vendor": "Aetheris AI",
-                    "name": "aetheris-aibom-generator",
-                    "version": "0.1.0"
-                }],
                 "component": {
-                    "type": "machine-learning-model",
                     "name": model_id.split("/")[-1],
-                    "bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}"
-                },
-                "properties": [
-                    {"name": "aibom:error", "value": "Error generating complete AIBOM"}
-                ]
             },
             "components": [{
                 "type": "machine-learning-model",
-                "bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}",
                 "name": model_id.split("/")[-1],
-                "purl": f"pkg:huggingface/{model_id.replace('/', '/')}"
             }],
-            "dependencies": []
         }
     def get_enhancement_report(self):
@@ -183,6 +171,14 @@ class AIBOMGenerator:
         model_id: str,
         metadata: Dict[str, Any],
     ) -> Dict[str, Any]:
         aibom = {
             "bomFormat": "CycloneDX",
             "specVersion": "1.6",
@@ -192,8 +188,8 @@ class AIBOMGenerator:
             "components": [self._create_component_section(model_id, metadata)],
             "dependencies": [
                 {
-                    "ref": f"pkg:generic/{model_id.replace('/', '%2F')}",
-                    "dependsOn": ["pkg:pypi/torch@1.13.0"]
                 }
             ]
         }
@@ -325,8 +321,6 @@ class AIBOMGenerator:
                         limitations_section = card_text.split("## Limitations")[1].split("##")[0].strip()
                         if limitations_section:
                             enhanced_metadata["limitations"] = limitations_section
-                            # Map to industry-neutral field (silently aligned with SPDX)
-                            enhanced_metadata["limitation"] = limitations_section
                 # Extract ethical considerations if present
                 if "ethical_considerations" not in enhanced_metadata:
@@ -335,8 +329,6 @@ class AIBOMGenerator:
                             section = card_text.split(heading)[1].split("##")[0].strip()
                             if section:
                                 enhanced_metadata["ethical_considerations"] = section
-                                # Map to industry-neutral field (silently aligned with SPDX)
-                                enhanced_metadata["safetyRiskAssessment"] = section
                                 break
                 # Extract risks if present
@@ -346,23 +338,29 @@ class AIBOMGenerator:
                         if risks_section:
                             enhanced_metadata["risks"] = risks_section
-                # Extract energy consumption if present (for industry-neutral scoring)
-                if "energy" not in enhanced_metadata:
-                    for heading in ["## Energy", "## Energy Consumption", "## Environmental Impact"]:
-                        if heading in card_text:
-                            section = card_text.split(heading)[1].split("##")[0].strip()
-                            if section:
-                                enhanced_metadata["energyConsumption"] = section
-                                break
-                # Extract hyperparameters if present (for industry-neutral scoring)
-                if "hyperparameters" not in enhanced_metadata:
-                    for heading in ["## Hyperparameters", "## Training Hyperparameters", "## Model Hyperparameters"]:
-                        if heading in card_text:
-                            section = card_text.split(heading)[1].split("##")[0].strip()
-                            if section:
-                                enhanced_metadata["hyperparameter"] = section
-                                break
             except Exception as e:
                 print(f"Error extracting unstructured metadata: {e}")
@@ -370,181 +368,243 @@ class AIBOMGenerator:
     def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
         timestamp = datetime.datetime.utcnow().isoformat() + "Z"
-        tools = [{
-            "vendor": "Aetheris AI",
-            "name": "aetheris-aibom-generator",
-            "version": "0.1.0"
-        }]
         authors = []
-        if metadata and "author" in metadata and metadata["author"]:
-            # Use email instead of url to comply with CycloneDX schema
             authors.append({
-                "name": metadata["author"],
-                "email": f"{metadata['author']}@huggingface.co"
             })
         component = {
-            "type": "machine-learning-model",
-            "name": metadata.get("name", model_id.split("/")[-1]) if metadata else model_id.split("/")[-1],
-            "bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}"
         }
         properties = []
-        if metadata:
-            for key, value in metadata.items():
-                if key not in ["name", "author", "license"] and value is not None:
-                    try:
-                        if isinstance(value, (list, dict)):
-                            value = json.dumps(value)
-                        elif isinstance(value, datetime.datetime):
-                            value = value.isoformat() + "Z"
-                        properties.append({"name": key, "value": str(value)})
-                    except Exception as e:
-                        print(f"Error processing metadata property {key}: {e}")
         metadata_section = {
             "timestamp": timestamp,
             "tools": tools,
             "component": component
         }
-        if authors:
-            metadata_section["authors"] = authors
         if properties:
             metadata_section["properties"] = properties
         return metadata_section
     def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
         # Create PURL with version information if commit is available
         purl = f"pkg:huggingface/{model_id.replace('/', '/')}"
-        if metadata and "commit" in metadata:
             purl = f"{purl}@{metadata['commit']}"
         component = {
             "type": "machine-learning-model",
-            "name": metadata.get("name", model_id.split("/")[-1]) if metadata else model_id.split("/")[-1],
-            "bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}",
             "purl": purl
         }
-        # Add description if available
-        if metadata and "description" in metadata and metadata["description"]:
-            component["description"] = metadata["description"]
-        # Add license if available
-        if metadata and "license" in metadata and metadata["license"]:
-            license_id = metadata["license"]
             component["licenses"] = [{
                 "license": {
-                    "id": license_id
                 }
             }]
-        # Add model card if available
-        model_card = {}
-        # Add model parameters
         model_parameters = {}
-        if metadata:
-            for key in ["ai:type", "ai:task", "ai:framework", "base_model", "library_name"]:
-                if key in metadata and metadata[key]:
-                    if "properties" not in model_parameters:
-                        model_parameters["properties"] = []
-                    model_parameters["properties"].append({
-                        "name": key,
-                        "value": metadata[key]
-                    })
-            # Add datasets if available
-            if "datasets" in metadata and metadata["datasets"]:
-                model_parameters["datasets"] = []
-                try:
-                    if isinstance(metadata["datasets"], list):
-                        for dataset in metadata["datasets"]:
-                            model_parameters["datasets"].append({
-                                "name": dataset
-                            })
-                    elif isinstance(metadata["datasets"], str):
-                        model_parameters["datasets"].append({
-                            "name": metadata["datasets"]
-                        })
-                except Exception as e:
-                    print(f"Error processing datasets: {e}")
-        if model_parameters:
-            model_card["modelParameters"] = model_parameters
-        # Add quantitative analysis if available
-        if metadata and "eval_results" in metadata and metadata["eval_results"]:
-            try:
-                quantitative_analysis = {
-                    "performanceMetrics": []
-                }
-                eval_results = metadata["eval_results"]
-                if isinstance(eval_results, dict):
-                    for metric, value in eval_results.items():
-                        quantitative_analysis["performanceMetrics"].append({
-                            "type": metric,
-                            "value": str(value)
                         })
-                elif isinstance(eval_results, list):
-                    for result in eval_results:
-                        if isinstance(result, dict) and "metric" in result and "value" in result:
-                            quantitative_analysis["performanceMetrics"].append({
-                                "type": result["metric"],
-                                "value": str(result["value"])
-                            })
-                if quantitative_analysis["performanceMetrics"]:
-                    model_card["quantitativeAnalysis"] = quantitative_analysis
-            except Exception as e:
-                print(f"Error processing evaluation results: {e}")
-        # Add considerations if available
         considerations = {}
-        if metadata:
-            # Technical limitations
-            if "limitations" in metadata and metadata["limitations"]:
-                considerations["technicalLimitations"] = metadata["limitations"]
-            # Ethical considerations
-            if "ethical_considerations" in metadata and metadata["ethical_considerations"]:
-                considerations["ethicalConsiderations"] = metadata["ethical_considerations"]
-            # Risks
-            if "risks" in metadata and metadata["risks"]:
-                considerations["risks"] = metadata["risks"]
-            # Environmental considerations (for industry-neutral scoring)
-            if "energyConsumption" in metadata and metadata["energyConsumption"]:
-                considerations["environmentalConsiderations"] = metadata["energyConsumption"]
         if considerations:
-            model_card["considerations"] = considerations
-        if model_card:
-            component["modelCard"] = model_card
-        # Add external references if available
-        external_references = []
-        # Add model card URL
-        external_references.append({
-            "type": "documentation",
-            "url": f"https://huggingface.co/{model_id}"
-        })
-        # Add commit URL if available
-        if metadata and "commit_url" in metadata and metadata["commit_url"]:
-            external_references.append({
-                "type": "vcs",
-                "url": metadata["commit_url"]
-            })
-        if external_references:
-            component["externalReferences"] = external_references
-        return component

             # Calculate final score with industry-neutral approach if enabled
             final_score = calculate_completeness_score(aibom, validate=True, use_best_practices=use_best_practices)
+            # Ensure metadata.properties exists
             if "metadata" in aibom and "properties" not in aibom["metadata"]:
                 aibom["metadata"]["properties"] = []
+            # Note: Quality score information is no longer added to the AIBOM metadata
+            # This was removed as requested by the user
             if output_file:
                 with open(output_file, 'w') as f:
             "version": 1,
             "metadata": {
                 "timestamp": datetime.datetime.utcnow().isoformat() + "Z",
+                "tools": {
+                    "components": [{
+                        "bom-ref": "pkg:generic/@cybeats/aetheris-aibom-generator@0.1.0",
+                        "type": "application",
+                        "name": "aetheris-aibom-generator",
+                        "version": "0.1.0",
+                        "manufacturer": {
+                            "name": "Aetheris AI"
+                        }
+                    }]
+                },
                 "component": {
+                    "bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}@1.0",
+                    "type": "application",
                     "name": model_id.split("/")[-1],
+                    "description": f"AI model {model_id}",
+                    "version": "1.0",
+                    "purl": f"pkg:generic/{model_id.replace('/', '%2F')}@1.0",
+                    "copyright": "NOASSERTION"
+                }
             },
             "components": [{
+                "bom-ref": f"pkg:huggingface/{model_id.replace('/', '/')}@1.0",
                 "type": "machine-learning-model",
                 "name": model_id.split("/")[-1],
+                "version": "1.0",
+                "purl": f"pkg:huggingface/{model_id.replace('/', '/')}@1.0"
             }],
+            "dependencies": [{
+                "ref": f"pkg:generic/{model_id.replace('/', '%2F')}@1.0",
+                "dependsOn": [f"pkg:huggingface/{model_id.replace('/', '/')}@1.0"]
+            }]
         }
     def get_enhancement_report(self):
         model_id: str,
         metadata: Dict[str, Any],
     ) -> Dict[str, Any]:
+        # Extract owner and model name from model_id
+        parts = model_id.split("/")
+        group = parts[0] if len(parts) > 1 else ""
+        name = parts[1] if len(parts) > 1 else parts[0]
+        # Get version from metadata or use default
+        version = metadata.get("commit", "1.0")
         aibom = {
             "bomFormat": "CycloneDX",
             "specVersion": "1.6",
             "components": [self._create_component_section(model_id, metadata)],
             "dependencies": [
                 {
+                    "ref": f"pkg:generic/{model_id.replace('/', '%2F')}@{version}",
+                    "dependsOn": [f"pkg:huggingface/{model_id.replace('/', '/')}@{version}"]
                 }
             ]
         }
                         limitations_section = card_text.split("## Limitations")[1].split("##")[0].strip()
                         if limitations_section:
                             enhanced_metadata["limitations"] = limitations_section
                 # Extract ethical considerations if present
                 if "ethical_considerations" not in enhanced_metadata:
                             section = card_text.split(heading)[1].split("##")[0].strip()
                             if section:
                                 enhanced_metadata["ethical_considerations"] = section
                                 break
                 # Extract risks if present
                         if risks_section:
                             enhanced_metadata["risks"] = risks_section
+                # Extract datasets if present
+                if "datasets" not in enhanced_metadata:
+                    datasets = []
+                    if "## Dataset" in card_text or "## Datasets" in card_text:
+                        dataset_section = ""
+                        if "## Dataset" in card_text:
+                            dataset_section = card_text.split("## Dataset")[1].split("##")[0].strip()
+                        elif "## Datasets" in card_text:
+                            dataset_section = card_text.split("## Datasets")[1].split("##")[0].strip()
+                        if dataset_section:
+                            # Simple parsing to extract dataset names
+                            lines = dataset_section.split("\n")
+                            for line in lines:
+                                if line.strip() and not line.startswith("#"):
+                                    datasets.append({
+                                        "type": "dataset",
+                                        "name": line.strip().split()[0] if line.strip().split() else "Unknown",
+                                        "description": line.strip()
+                                    })
+                    if datasets:
+                        enhanced_metadata["datasets"] = datasets
             except Exception as e:
                 print(f"Error extracting unstructured metadata: {e}")
     def _create_metadata_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
         timestamp = datetime.datetime.utcnow().isoformat() + "Z"
+        # Get version from metadata or use default
+        version = metadata.get("commit", "1.0")
+        # Create tools section with components array
+        tools = {
+            "components": [{
+                "bom-ref": "pkg:generic/@cybeats/aetheris-aibom-generator@0.1.0",
+                "type": "application",
+                "name": "aetheris-aibom-generator",
+                "version": "0.1.0",
+                "manufacturer": {
+                    "name": "Aetheris AI"
+                }
+            }]
+        }
+        # Create authors array
         authors = []
+        if "author" in metadata and metadata["author"]:
             authors.append({
+                "name": metadata["author"]
             })
+        # Create component section for metadata
         component = {
+            "bom-ref": f"pkg:generic/{model_id.replace('/', '%2F')}@{version}",
+            "type": "application",
+            "name": metadata.get("name", model_id.split("/")[-1]),
+            "description": metadata.get("description", f"AI model {model_id}"),
+            "version": version,
+            "purl": f"pkg:generic/{model_id.replace('/', '%2F')}@{version}"
         }
+        # Add authors to component if available
+        if authors:
+            component["authors"] = authors
+        # Add publisher and supplier if author is available
+        if "author" in metadata and metadata["author"]:
+            component["publisher"] = metadata["author"]
+            component["supplier"] = {
+                "name": metadata["author"]
+            }
+            component["manufacturer"] = {
+                "name": metadata["author"]
+            }
+        # Add copyright
+        component["copyright"] = "NOASSERTION"
+        # Create properties array for additional metadata
         properties = []
+        for key, value in metadata.items():
+            if key not in ["name", "author", "license", "description", "commit"] and value is not None:
+                if isinstance(value, (list, dict)):
+                    if not isinstance(value, str):
+                        value = json.dumps(value)
+                properties.append({"name": key, "value": str(value)})
+        # Assemble metadata section
         metadata_section = {
             "timestamp": timestamp,
             "tools": tools,
             "component": component
         }
         if properties:
             metadata_section["properties"] = properties
         return metadata_section
     def _create_component_section(self, model_id: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
+        # Extract owner and model name from model_id
+        parts = model_id.split("/")
+        group = parts[0] if len(parts) > 1 else ""
+        name = parts[1] if len(parts) > 1 else parts[0]
+        # Get version from metadata or use default
+        version = metadata.get("commit", "1.0")
         # Create PURL with version information if commit is available
         purl = f"pkg:huggingface/{model_id.replace('/', '/')}"
+        if "commit" in metadata:
             purl = f"{purl}@{metadata['commit']}"
+        else:
+            purl = f"{purl}@{version}"
         component = {
+            "bom-ref": f"pkg:huggingface/{model_id.replace('/', '/')}@{version}",
             "type": "machine-learning-model",
+            "group": group,
+            "name": name,
+            "version": version,
             "purl": purl
         }
+        # Add licenses if available
+        if "license" in metadata:
             component["licenses"] = [{
                 "license": {
+                    "id": metadata["license"],
+                    "url": self._get_license_url(metadata["license"])
                 }
             }]
+        # Add description if available
+        if "description" in metadata:
+            component["description"] = metadata["description"]
+        # Add external references
+        external_refs = [{
+            "type": "website",
+            "url": f"https://huggingface.co/{model_id}"
+        }]
+        if "commit_url" in metadata:
+            external_refs.append({
+                "type": "vcs",
+                "url": metadata["commit_url"]
+            })
+        component["externalReferences"] = external_refs
+        # Add authors, publisher, supplier, manufacturer
+        if "author" in metadata and metadata["author"]:
+            component["authors"] = [{"name": metadata["author"]}]
+            component["publisher"] = metadata["author"]
+            component["supplier"] = {
+                "name": metadata["author"],
+                "url": [f"https://huggingface.co/{metadata['author']}"]
+            }
+            component["manufacturer"] = {
+                "name": metadata["author"],
+                "url": [f"https://huggingface.co/{metadata['author']}"]
+            }
+        # Add copyright
+        component["copyright"] = "NOASSERTION"
+        # Add model card section
+        component["modelCard"] = self._create_model_card_section(metadata)
+        return component
+    def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
+        model_card_section = {}
+        # Add quantitative analysis section
+        if "eval_results" in metadata:
+            model_card_section["quantitativeAnalysis"] = {
+                "performanceMetrics": metadata["eval_results"],
+                "graphics": {}  # Empty graphics object as in the example
+            }
+        else:
+            model_card_section["quantitativeAnalysis"] = {"graphics": {}}
+        # Add properties section
+        properties = []
+        for key, value in metadata.items():
+            if key in ["author", "library_name", "license", "downloads", "likes", "tags", "created_at", "last_modified"]:
+                properties.append({"name": key, "value": str(value)})
+        if properties:
+            model_card_section["properties"] = properties
+        # Create model parameters section
         model_parameters = {}
+        # Add outputs array
+        model_parameters["outputs"] = [{"format": "generated-text"}]
+        # Add task
+        model_parameters["task"] = metadata.get("pipeline_tag", "text-generation")
+        # Add architecture information
+        model_parameters["architectureFamily"] = "llama" if "llama" in metadata.get("name", "").lower() else "transformer"
+        model_parameters["modelArchitecture"] = f"{metadata.get('name', 'Unknown')}ForCausalLM"
+        # Add datasets array with proper structure
+        if "datasets" in metadata:
+            datasets = []
+            if isinstance(metadata["datasets"], list):
+                for dataset in metadata["datasets"]:
+                    if isinstance(dataset, str):
+                        datasets.append({
+                            "type": "dataset",
+                            "name": dataset,
+                            "description": f"Dataset used for training {metadata.get('name', 'the model')}"
                         })
+                    elif isinstance(dataset, dict) and "name" in dataset:
+                        # Ensure dataset has the required structure
+                        dataset_entry = {
+                            "type": dataset.get("type", "dataset"),
+                            "name": dataset["name"],
+                            "description": dataset.get("description", f"Dataset: {dataset['name']}")
+                        }
+                        datasets.append(dataset_entry)
+            elif isinstance(metadata["datasets"], str):
+                datasets.append({
+                    "type": "dataset",
+                    "name": metadata["datasets"],
+                    "description": f"Dataset used for training {metadata.get('name', 'the model')}"
+                })
+            if datasets:
+                model_parameters["datasets"] = datasets
+        # Add inputs array
+        model_parameters["inputs"] = [{"format": "text"}]
+        # Add model parameters to model card section
+        model_card_section["modelParameters"] = model_parameters
+        # Add considerations section
         considerations = {}
+        for k in ["limitations", "ethical_considerations", "bias", "risks"]:
+            if k in metadata:
+                considerations[k] = metadata[k]
         if considerations:
+            model_card_section["considerations"] = considerations
+        return model_card_section
+    def _get_license_url(self, license_id: str) -> str:
+        """Get the URL for a license based on its SPDX ID."""
+        license_urls = {
+            "Apache-2.0": "https://www.apache.org/licenses/LICENSE-2.0",
+            "MIT": "https://opensource.org/licenses/MIT",
+            "BSD-3-Clause": "https://opensource.org/licenses/BSD-3-Clause",
+            "GPL-3.0": "https://www.gnu.org/licenses/gpl-3.0.en.html",
+            "CC-BY-4.0": "https://creativecommons.org/licenses/by/4.0/",
+            "CC-BY-SA-4.0": "https://creativecommons.org/licenses/by-sa/4.0/",
+            "CC-BY-NC-4.0": "https://creativecommons.org/licenses/by-nc/4.0/",
+            "CC-BY-ND-4.0": "https://creativecommons.org/licenses/by-nd/4.0/",
+            "CC-BY-NC-SA-4.0": "https://creativecommons.org/licenses/by-nc-sa/4.0/",
+            "CC-BY-NC-ND-4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/",
+            "LGPL-3.0": "https://www.gnu.org/licenses/lgpl-3.0.en.html",
+            "MPL-2.0": "https://www.mozilla.org/en-US/MPL/2.0/",
+        }
+        return license_urls.get(license_id, "https://spdx.org/licenses/")