Spaces:

danieldux
/

isco_hierarchical_accuracy

Sleeping

App Files Files Community

danieldux committed on Mar 12, 2024

Commit

087d986

1 Parent(s): 03c8589

Refactor ISCO_Hierarchical_Accuracy class to use weighted hierarchy dictionary

Browse files

Files changed (1) hide show

isco_hierarchical_accuracy.py +42 -25

isco_hierarchical_accuracy.py CHANGED Viewed

@@ -114,15 +114,14 @@ class ISCO_Hierarchical_Accuracy(evaluate.Metric):
     def create_hierarchy_dict(self, file: str) -> dict:
         """
-        Creates a dictionary where keys are nodes and values are sets of parent nodes representing the group level hierarchy of the ISCO-08 structure.
-        The function assumes that the input CSV file has a column named 'unit' with the 4-digit ISCO-08 codes.
-        A csv file with the ISCO-08 structure can be downloaded from the International Labour Organization (ILO) at [https://www.ilo.org/ilostat-files/ISCO/newdocs-08-2021/ISCO-08/ISCO-08 EN.csv](https://www.ilo.org/ilostat-files/ISCO/newdocs-08-2021/ISCO-08/ISCO-08%20EN.csv)
         Args:
         - file: A string representing the path to the CSV file containing the 4-digit ISCO-08 codes. It can be a local path or a web URL.
         Returns:
-        - A dictionary where keys are ISCO-08 unit codes and values are sets of their parent codes.
         """
         try:
@@ -146,7 +145,12 @@ class ISCO_Hierarchical_Accuracy(evaluate.Metric):
             minor_code = unit_code[0:3]
             sub_major_code = unit_code[0:2]
             major_code = unit_code[0]
-            isco_hierarchy[unit_code] = {minor_code, major_code, sub_major_code}
         return isco_hierarchy
@@ -192,40 +196,53 @@ class ISCO_Hierarchical_Accuracy(evaluate.Metric):
         self,
         reference_codes: List[str],
         predicted_codes: List[str],
-        hierarchy: Dict[str, Set[str]],
     ) -> Tuple[float, float]:
         """
         Calculates the hierarchical precision and recall given the reference codes, predicted codes, and hierarchy definition.
         Args:
-            real_codes (List[str]): The list of reference codes.
             predicted_codes (List[str]): The list of predicted codes.
             hierarchy (Dict[str, Set[str]]): The hierarchy definition where keys are nodes and values are sets of parent nodes.
         Returns:
             Tuple[float, float]: A tuple containing the hierarchical precision and recall floating point values.
         """
-        # Extend the sets of real and predicted codes with their ancestors
-        extended_real = set()
-        for code in reference_codes:
-            extended_real.add(code)
-            extended_real.update(hierarchy.get(code, set()))
-        extended_predicted = set()
-        for code in predicted_codes:
-            extended_predicted.add(code)
-            extended_predicted.update(hierarchy.get(code, set()))
-        # Calculate the intersection
-        correct_predictions = extended_real.intersection(extended_predicted)
-        # Calculate hierarchical precision and recall
-        hP = (
-            len(correct_predictions) / len(extended_predicted)
-            if extended_predicted
-            else 0
-        )
-        hR = len(correct_predictions) / len(extended_real) if extended_real else 0
         return hP, hR

     def create_hierarchy_dict(self, file: str) -> dict:
         """
+        Creates a dictionary where keys are nodes and values are dictionaries of their parent nodes with distance as weights,
+        representing the group level hierarchy of the ISCO-08 structure.
         Args:
         - file: A string representing the path to the CSV file containing the 4-digit ISCO-08 codes. It can be a local path or a web URL.
         Returns:
+        - A dictionary where keys are ISCO-08 unit codes and values are dictionaries of their parent codes with distances.
         """
         try:
             minor_code = unit_code[0:3]
             sub_major_code = unit_code[0:2]
             major_code = unit_code[0]
+            # Assign weights, higher for closer ancestors
+            weights = {minor_code: 0.75, sub_major_code: 0.5, major_code: 0.25}
+            # Store ancestors with their weights
+            isco_hierarchy[unit_code] = weights
         return isco_hierarchy
         self,
         reference_codes: List[str],
         predicted_codes: List[str],
+        hierarchy: Dict[str, Dict[str, float]],
     ) -> Tuple[float, float]:
         """
         Calculates the hierarchical precision and recall given the reference codes, predicted codes, and hierarchy definition.
         Args:
+            reference_codes (List[str]): The list of reference codes.
             predicted_codes (List[str]): The list of predicted codes.
             hierarchy (Dict[str, Dict[str, float]]): The hierarchy definition where keys are nodes and values are dictionaries mapping parent nodes to their weights.
         Returns:
             Tuple[float, float]: A tuple containing the hierarchical precision and recall floating point values.
         """
+        extended_real = {}
+        # Extend the sets of reference codes with their ancestors
+        for code in reference_codes:
+            weight = 1.0  # Full weight for exact match
+            extended_real[code] = weight
+            for ancestor, ancestor_weight in hierarchy.get(code, {}).items():
+                extended_real[ancestor] = max(
+                    extended_real.get(ancestor, 0), ancestor_weight
+                )
+        extended_predicted = {}
+        # Extend the sets of predicted codes with their ancestors
+        for code in predicted_codes:
+            weight = 1.0
+            extended_predicted[code] = weight
+            for ancestor, ancestor_weight in hierarchy.get(code, {}).items():
+                extended_predicted[ancestor] = max(
+                    extended_predicted.get(ancestor, 0), ancestor_weight
+                )
+        # Calculate weighted correct predictions
+        correct_weights = 0
+        for code, weight in extended_predicted.items():
+            if code in extended_real:
+                correct_weights += min(weight, extended_real[code])
+        total_predicted_weights = sum(extended_predicted.values())
+        total_real_weights = sum(extended_real.values())
+        # Calculate hierarchical precision and recall using weighted sums
+        hP = correct_weights / total_predicted_weights if total_predicted_weights else 0
+        hR = correct_weights / total_real_weights if total_real_weights else 0
         return hP, hR