Spaces:

unitxt
/

metric

Running

App Files Files Community

Elron committed on Sep 30, 2024

Commit

d389578

verified ·

1 Parent(s): 7cdc7d0

Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

README.md +78 -2
artifact.py +7 -0
card.py +5 -3
collections.py +13 -2
metrics.py +55 -2
operators.py +1 -1
settings_utils.py +1 -1
templates.py +3 -3
version.py +1 -1

README.md CHANGED Viewed

@@ -57,10 +57,86 @@ Then launch the ui by running:
 unitxt-explore
 ```
 # 🦄 Contributors
 Please install Unitxt from source by:
-```
 git clone [email protected]:IBM/unitxt.git
 cd unitxt
 pip install -e ".[dev]"
@@ -71,7 +147,7 @@ pre-commit install
 If you use Unitxt in your research, please cite our paper:
-```
 @inproceedings{bandel-etal-2024-unitxt,
     title = "Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative {AI}",
     author = "Bandel, Elron  and

 unitxt-explore
 ```
+# 🦄 Example
+This is a simple example of running an end-to-end evaluation over user data in self-contained Python code.
+See more examples in the `examples` subdirectory.
+```python
+from unitxt import get_logger
+from unitxt.api import evaluate, load_dataset
+from unitxt.blocks import Task, TaskCard
+from unitxt.inference import HFPipelineBasedInferenceEngine
+from unitxt.loaders import LoadFromDictionary
+from unitxt.templates import InputOutputTemplate, TemplatesDict
+from unitxt.text_utils import print_dict
+# NOTE(review): `TemplatesDict` and `logger` are not used anywhere below.
+logger = get_logger()
+# Set up question-answer pairs in a dictionary, under a single "test" split.
+data = {
+    "test": [
+        {"question": "What is the capital of Texas?", "answer": "Austin"},
+        {"question": "What is the color of the sky?", "answer": "Blue"},
+    ]
+}
+card = TaskCard(
+    # Load the data from the dictionary. Data can also be loaded from HF, CSV files, COS and other sources using different loaders.
+    loader=LoadFromDictionary(data=data),
+    # Define the QA task: its input fields, its reference fields, the prediction type and the metrics.
+    task=Task(
+        input_fields={"question": str},
+        reference_fields={"answer": str},
+        prediction_type=str,
+        metrics=["metrics.accuracy"],
+    ),
+)
+# Create a simple template that formats the input and the expected output.
+# Add lowercase normalization as a post-processor on the model prediction.
+template = InputOutputTemplate(
+    instruction="Answer the following question.",
+    input_format="{question}",
+    output_format="{answer}",
+    postprocessors=["processors.lower_case"],
+)
+# Verbalize the dataset using the template, then pick the "test" split.
+dataset = load_dataset(card=card, template=template)
+test_dataset = dataset["test"]
+# Run inference with google/flan-t5-base through the HF pipeline engine.
+# This step can be replaced with any prediction code,
+# including the built-in WMLInferenceEngine and OpenAiInferenceEngine.
+model_name = "google/flan-t5-base"
+inference_model = HFPipelineBasedInferenceEngine(
+    model_name=model_name, max_new_tokens=32
+)
+predictions = inference_model.infer(test_dataset)
+evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)
+# Print the selected fields of every evaluated instance.
+for instance in evaluated_dataset:
+    print_dict(
+        instance,
+        keys_to_print=[
+            "source", # input to the model
+            "prediction", # model prediction
+            "processed_prediction", # model prediction after post processing
+            "references", # reference answer
+            "score", # scores (per instance and global)
+        ],
+    )
+```
 # 🦄 Contributors
 Please install Unitxt from source by:
+```bash
 git clone [email protected]:IBM/unitxt.git
 cd unitxt
 pip install -e ".[dev]"
 If you use Unitxt in your research, please cite our paper:
+```bib
 @inproceedings{bandel-etal-2024-unitxt,
     title = "Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative {AI}",
     author = "Bandel, Elron  and

artifact.py CHANGED Viewed

@@ -295,6 +295,13 @@ class Artifact(Dataclass):
             **self.process_data_before_dump(self._init_dict),
         }
     def process_data_before_dump(self, data):
         return data

             **self.process_data_before_dump(self._init_dict),
         }
+    def __deepcopy__(self, memo):
+        """Deep-copy hook that rebuilds the artifact from its dictionary form.
+
+        If ``memo`` already holds an entry for ``id(self)`` that entry is
+        returned; otherwise a new object is built with
+        ``Artifact.from_dict(self.to_dict())``, stored in ``memo`` under
+        ``id(self)`` and returned.
+        """
+        key = id(self)
+        if key not in memo:
+            memo[key] = Artifact.from_dict(self.to_dict())
+        return memo[key]
     def process_data_before_dump(self, data):
         return data

card.py CHANGED Viewed

@@ -1,12 +1,12 @@
-from typing import List
 from .artifact import Artifact
-from .collections import Collection
 from .dataclass import OptionalField
 from .loaders import Loader
 from .operator import StreamingOperator
 from .splitters import RandomSampler, Sampler
 from .task import Task
 class TaskCard(Artifact):
@@ -25,5 +25,7 @@ class TaskCard(Artifact):
     loader: Loader
     preprocess_steps: List[StreamingOperator] = None
     task: Task
-    templates: Collection = None
     sampler: Sampler = OptionalField(default_factory=RandomSampler)

+from typing import Dict, List, Union
 from .artifact import Artifact
 from .dataclass import OptionalField
 from .loaders import Loader
 from .operator import StreamingOperator
 from .splitters import RandomSampler, Sampler
 from .task import Task
+from .templates import Template, TemplatesDict, TemplatesList
 class TaskCard(Artifact):
     loader: Loader
     preprocess_steps: List[StreamingOperator] = None
     task: Task
+    templates: Union[
+        TemplatesDict, TemplatesList, Dict[str, Template], List[Template]
+    ] = None
     sampler: Sampler = OptionalField(default_factory=RandomSampler)

collections.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import random
 import typing
 from dataclasses import field
-from typing import Dict, List
 from .artifact import Artifact
 from .dataclass import AbstractField
@@ -11,12 +12,16 @@ from .random_utils import new_random_generator
 class Collection(Artifact):
     items: typing.Collection = AbstractField()
-    def __getitem__(self, key):
         try:
             return self.items[key]
         except LookupError as e:
             raise LookupError(f"Cannot find item {key!r} in {self!r}") from e
 class ListCollection(Collection):
     items: List[Artifact] = field(default_factory=list)
@@ -33,10 +38,16 @@ class ListCollection(Collection):
     def __add__(self, other):
         return ListCollection(self.items.__add__(other.items))
 class DictCollection(Collection):
     items: Dict[str, Artifact] = field(default_factory=dict)
 class ItemPicker(Artifact):
     item: object = None

 import random
 import typing
+from abc import abstractmethod
 from dataclasses import field
+from typing import Any, Dict, Hashable, List
 from .artifact import Artifact
 from .dataclass import AbstractField
 class Collection(Artifact):
+    """Artifact that holds a container in ``items`` and exposes lookup by key."""
     items: typing.Collection = AbstractField()
+    def __getitem__(self, key: Hashable) -> Any:
+        """Return ``self.items[key]``.
+
+        Raises:
+            LookupError: if the underlying container raises a ``LookupError``;
+                it is re-raised with a message naming the key and this
+                collection, chained to the original error.
+        """
         try:
             return self.items[key]
         except LookupError as e:
             raise LookupError(f"Cannot find item {key!r} in {self!r}") from e
+    @abstractmethod
+    def keys(self) -> List[Hashable]:
+        """Return the collection's keys as a list; subclasses must implement this.
+
+        NOTE(review): presumably these are the keys accepted by
+        ``__getitem__`` - confirm against the concrete subclasses.
+        """
+        pass
 class ListCollection(Collection):
     items: List[Artifact] = field(default_factory=list)
     def __add__(self, other):
         return ListCollection(self.items.__add__(other.items))
+    def keys(self) -> List[int]:
+        """Return the positional indices ``0 .. len(self) - 1`` as a list."""
+        count = len(self)
+        return [*range(count)]
 class DictCollection(Collection):
+    """Collection whose ``items`` container is a dictionary."""
     items: Dict[str, Artifact] = field(default_factory=dict)
+    def keys(self) -> List[Hashable]:
+        """Return the keys of ``items`` as a list, in the dictionary's own order."""
+        # Iterating a dict yields its keys, so this equals list(self.items.keys()).
+        return list(self.items)
 class ItemPicker(Artifact):
     item: object = None

metrics.py CHANGED Viewed

@@ -1310,6 +1310,59 @@ class Accuracy(InstanceMetric):
         return result
 class JaccardIndex(InstanceMetric):
     reduction_map = {"mean": ["jaccard_index"]}
     main_score = "jaccard_index"
@@ -4741,7 +4794,7 @@ class F1Strings(InstanceMetric):
     main_score = "f1_strings"
     reduction_map = {"mean": ["f1_strings"]}
     prediction_type = str
-    single_reference_per_prediction = True
     _requirements_list = {
         "spacy": "Please pip install spacy",
     }
@@ -4764,7 +4817,7 @@ class F1Strings(InstanceMetric):
         prediction: str,
         task_data: List[Dict],
     ) -> dict:
-        doc_ref = self.nlp(references[0])
         set_ref = Counter([token.text.lower() for token in doc_ref])
         doc_pred = self.nlp(prediction)
         set_pred = Counter([token.text.lower() for token in doc_pred])

         return result
+class ANLS(InstanceMetric):
+    """Normalized Levenshtein similarity between a prediction and its references.
+
+    For every reference the edit distance to the prediction is divided by the
+    length of the longer string; the score is ``1 - min(normalized distance)``
+    and is reported as 0.0 when it falls below ``threshold``.
+    """
+    main_score = "anls"
+    reduction_map = {"mean": ["anls"]}
+    prediction_type = Any  # string representation is compared
+    def compute(
+        self,
+        references: List[Any],
+        prediction: Any,
+        task_data: List[Dict],
+        threshold=1.0,
+    ) -> dict:
+        """Score one prediction against its references.
+
+        Args:
+            references: Acceptable answers; each is compared through ``str()``.
+            prediction: The model answer; compared through ``str()``.
+            task_data: Not used by this metric.
+            threshold: Scores below this value are reported as 0.0.
+                NOTE(review): with the default of 1.0 every non-exact match is
+                zeroed; the published ANLS definition uses 0.5 - confirm
+                which value is intended before relying on partial credit.
+
+        Returns:
+            A dict with ``"score"`` and ``"anls"`` set to the similarity and
+            ``"score_name"`` set to ``"anls"``.
+        """
+        # prediction_type is Any, so compare string representations rather
+        # than assuming the inputs already are str (str.strip would fail).
+        raw_prediction = str(prediction)
+        # Normalization of the prediction does not depend on the reference.
+        det_answer = " ".join(raw_prediction.strip().lower().split())
+        values = []
+        for answer in references:
+            raw_answer = str(answer)
+            gt_answer = " ".join(raw_answer.strip().lower().split())
+            dist = self.levenshtein_distance(gt_answer, det_answer)
+            # The distance uses the normalized strings, while the divisor uses
+            # the raw ones (upper-cased); this is kept exactly as it was.
+            length = max(len(raw_answer.upper()), len(raw_prediction.upper()))
+            values.append(0.0 if length == 0 else float(dist) / float(length))
+        # Without references nothing can match; min() on an empty list would raise.
+        question_result = (1.0 - min(values)) if values else 0.0
+        if question_result < threshold:
+            question_result = 0.0
+        return {
+            "score": question_result,
+            self.main_score: question_result,
+            "score_name": self.main_score,
+        }
+    @staticmethod
+    def levenshtein_distance(s1, s2):
+        """Return the edit distance (insert, delete, substitute) between two strings."""
+        # Keep s1 as the shorter string so each row has len(s1) + 1 entries.
+        if len(s1) > len(s2):
+            s1, s2 = s2, s1
+        distances = list(range(len(s1) + 1))
+        for i2, c2 in enumerate(s2):
+            # First entry of the new row: distance from the empty prefix of s1.
+            distances_ = [i2 + 1]
+            for i1, c1 in enumerate(s1):
+                if c1 == c2:
+                    distances_.append(distances[i1])
+                else:
+                    distances_.append(
+                        1 + min(distances[i1], distances[i1 + 1], distances_[-1])
+                    )
+            distances = distances_
+        return distances[-1]
 class JaccardIndex(InstanceMetric):
     reduction_map = {"mean": ["jaccard_index"]}
     main_score = "jaccard_index"
     main_score = "f1_strings"
     reduction_map = {"mean": ["f1_strings"]}
     prediction_type = str
+    single_reference_per_prediction = False
     _requirements_list = {
         "spacy": "Please pip install spacy",
     }
         prediction: str,
         task_data: List[Dict],
     ) -> dict:
+        doc_ref = self.nlp(" ".join(references))
         set_ref = Counter([token.text.lower() for token in doc_ref])
         doc_pred = self.nlp(prediction)
         set_pred = Counter([token.text.lower() for token in doc_pred])

operators.py CHANGED Viewed

@@ -1022,7 +1022,7 @@ class ArtifactFetcherMixin:
         if artifact_identifier not in cls.cache:
             artifact, artifactory = fetch_artifact(artifact_identifier)
             cls.cache[artifact_identifier] = artifact
-        return cls.cache[artifact_identifier]
 class ApplyOperatorsField(InstanceOperator):

         if artifact_identifier not in cls.cache:
             artifact, artifactory = fetch_artifact(artifact_identifier)
             cls.cache[artifact_identifier] = artifact
+        return copy.deepcopy(cls.cache[artifact_identifier])
 class ApplyOperatorsField(InstanceOperator):

settings_utils.py CHANGED Viewed

@@ -180,7 +180,7 @@ if Constants.is_uninitilized():
     constants.instance_stream = "__INSTANCE_STREAM__"
-def get_settings():
     return Settings()

     constants.instance_stream = "__INSTANCE_STREAM__"
+def get_settings() -> Settings:
+    """Return the object produced by calling ``Settings()``."""
     return Settings()

templates.py CHANGED Viewed

@@ -4,7 +4,7 @@ from random import random
 from typing import Any, Dict, List, Optional, Tuple, Union
 from .artifact import Artifact
-from .collections import ListCollection
 from .dataclass import NonPositionalField
 from .dict_utils import dict_set
 from .error_utils import Documentation, UnitxtError
@@ -866,7 +866,7 @@ class TemplatesList(ListCollection):
             assert isinstance(template, Template)
-class TemplatesDict(Dict):
     def verify(self):
-        for _key, template in self.items():
             assert isinstance(template, Template)

 from typing import Any, Dict, List, Optional, Tuple, Union
 from .artifact import Artifact
+from .collections import DictCollection, ListCollection
 from .dataclass import NonPositionalField
 from .dict_utils import dict_set
 from .error_utils import Documentation, UnitxtError
             assert isinstance(template, Template)
+class TemplatesDict(DictCollection):
+    """Dictionary collection whose values are expected to be ``Template`` instances."""
     def verify(self):
+        """Assert that every value in ``items`` is a ``Template``."""
+        for template in self.items.values():
             assert isinstance(template, Template)

version.py CHANGED Viewed

	@@ -1 +1 @@
1	- version = "1.13.0"


1	+ version = "1.13.1"