Upload folder using huggingface_hub
Browse files- README.md +78 -2
- artifact.py +7 -0
- card.py +5 -3
- collections.py +13 -2
- metrics.py +55 -2
- operators.py +1 -1
- settings_utils.py +1 -1
- templates.py +3 -3
- version.py +1 -1
README.md
CHANGED
|
@@ -57,10 +57,86 @@ Then launch the ui by running:
|
|
| 57 |
unitxt-explore
|
| 58 |
```
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
# 🦄 Contributors
|
| 61 |
|
| 62 |
Please install Unitxt from source by:
|
| 63 |
-
```
|
| 64 |
git clone git@github.com:IBM/unitxt.git
|
| 65 |
cd unitxt
|
| 66 |
pip install -e ".[dev]"
|
|
@@ -71,7 +147,7 @@ pre-commit install
|
|
| 71 |
|
| 72 |
If you use Unitxt in your research, please cite our paper:
|
| 73 |
|
| 74 |
-
```
|
| 75 |
@inproceedings{bandel-etal-2024-unitxt,
|
| 76 |
title = "Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative {AI}",
|
| 77 |
author = "Bandel, Elron and
|
|
|
|
| 57 |
unitxt-explore
|
| 58 |
```
|
| 59 |
|
| 60 |
+
# 🦄 Example
|
| 61 |
+
|
| 62 |
+
This is a simple example of running an end-to-end evaluation in self-contained Python code over user data.
|
| 63 |
+
|
| 64 |
+
See more examples in the `examples` subdirectory.
|
| 65 |
+
|
| 66 |
+
```python
|
| 67 |
+
from unitxt import get_logger
|
| 68 |
+
from unitxt.api import evaluate, load_dataset
|
| 69 |
+
from unitxt.blocks import Task, TaskCard
|
| 70 |
+
from unitxt.inference import HFPipelineBasedInferenceEngine
|
| 71 |
+
from unitxt.loaders import LoadFromDictionary
|
| 72 |
+
from unitxt.templates import InputOutputTemplate, TemplatesDict
|
| 73 |
+
from unitxt.text_utils import print_dict
|
| 74 |
+
|
| 75 |
+
logger = get_logger()
|
| 76 |
+
|
| 77 |
+
# Set up question answer pairs in a dictionary
|
| 78 |
+
data = {
|
| 79 |
+
"test": [
|
| 80 |
+
{"question": "What is the capital of Texas?", "answer": "Austin"},
|
| 81 |
+
{"question": "What is the color of the sky?", "answer": "Blue"},
|
| 82 |
+
]
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
card = TaskCard(
|
| 86 |
+
# Load the data from the dictionary. Data can be also loaded from HF, CSV files, COS and other sources using different loaders.
|
| 87 |
+
loader=LoadFromDictionary(data=data),
|
| 88 |
+
# Define the QA task input and output and metrics.
|
| 89 |
+
task=Task(
|
| 90 |
+
input_fields={"question": str},
|
| 91 |
+
reference_fields={"answer": str},
|
| 92 |
+
prediction_type=str,
|
| 93 |
+
metrics=["metrics.accuracy"],
|
| 94 |
+
),
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
# Create a simple template that formats the input.
|
| 98 |
+
# Add lowercase normalization as a post processor on the model prediction.
|
| 99 |
+
|
| 100 |
+
template = InputOutputTemplate(
|
| 101 |
+
instruction="Answer the following question.",
|
| 102 |
+
input_format="{question}",
|
| 103 |
+
output_format="{answer}",
|
| 104 |
+
postprocessors=["processors.lower_case"],
|
| 105 |
+
)
|
| 106 |
+
# Verbalize the dataset using the template
|
| 107 |
+
dataset = load_dataset(card=card, template=template)
|
| 108 |
+
test_dataset = dataset["test"]
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# Infer using flan t5 base using HF API
|
| 112 |
+
# can be replaced with any prediction code,
|
| 113 |
+
# including the built in WMLInferenceEngine and OpenAiInferenceEngine.
|
| 114 |
+
model_name = "google/flan-t5-base"
|
| 115 |
+
inference_model = HFPipelineBasedInferenceEngine(
|
| 116 |
+
model_name=model_name, max_new_tokens=32
|
| 117 |
+
)
|
| 118 |
+
predictions = inference_model.infer(test_dataset)
|
| 119 |
+
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)
|
| 120 |
+
|
| 121 |
+
# Print results
|
| 122 |
+
for instance in evaluated_dataset:
|
| 123 |
+
print_dict(
|
| 124 |
+
instance,
|
| 125 |
+
keys_to_print=[
|
| 126 |
+
"source", # input to the model
|
| 127 |
+
"prediction", # model prediction
|
| 128 |
+
"processed_prediction", # model prediction after post processing
|
| 129 |
+
"references", # reference answer
|
| 130 |
+
"score", # scores (per instance and global)
|
| 131 |
+
],
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
# 🦄 Contributors
|
| 137 |
|
| 138 |
Please install Unitxt from source by:
|
| 139 |
+
```bash
|
| 140 |
git clone git@github.com:IBM/unitxt.git
|
| 141 |
cd unitxt
|
| 142 |
pip install -e ".[dev]"
|
|
|
|
| 147 |
|
| 148 |
If you use Unitxt in your research, please cite our paper:
|
| 149 |
|
| 150 |
+
```bib
|
| 151 |
@inproceedings{bandel-etal-2024-unitxt,
|
| 152 |
title = "Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative {AI}",
|
| 153 |
author = "Bandel, Elron and
|
artifact.py
CHANGED
|
@@ -295,6 +295,13 @@ class Artifact(Dataclass):
|
|
| 295 |
**self.process_data_before_dump(self._init_dict),
|
| 296 |
}
|
| 297 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
def process_data_before_dump(self, data):
|
| 299 |
return data
|
| 300 |
|
|
|
|
| 295 |
**self.process_data_before_dump(self._init_dict),
|
| 296 |
}
|
| 297 |
|
| 298 |
+
def __deepcopy__(self, memo):
|
| 299 |
+
if id(self) in memo:
|
| 300 |
+
return memo[id(self)]
|
| 301 |
+
new_obj = Artifact.from_dict(self.to_dict())
|
| 302 |
+
memo[id(self)] = new_obj
|
| 303 |
+
return new_obj
|
| 304 |
+
|
| 305 |
def process_data_before_dump(self, data):
|
| 306 |
return data
|
| 307 |
|
card.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
|
| 3 |
from .artifact import Artifact
|
| 4 |
-
from .collections import Collection
|
| 5 |
from .dataclass import OptionalField
|
| 6 |
from .loaders import Loader
|
| 7 |
from .operator import StreamingOperator
|
| 8 |
from .splitters import RandomSampler, Sampler
|
| 9 |
from .task import Task
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
class TaskCard(Artifact):
|
|
@@ -25,5 +25,7 @@ class TaskCard(Artifact):
|
|
| 25 |
loader: Loader
|
| 26 |
preprocess_steps: List[StreamingOperator] = None
|
| 27 |
task: Task
|
| 28 |
-
templates:
|
|
|
|
|
|
|
| 29 |
sampler: Sampler = OptionalField(default_factory=RandomSampler)
|
|
|
|
| 1 |
+
from typing import Dict, List, Union
|
| 2 |
|
| 3 |
from .artifact import Artifact
|
|
|
|
| 4 |
from .dataclass import OptionalField
|
| 5 |
from .loaders import Loader
|
| 6 |
from .operator import StreamingOperator
|
| 7 |
from .splitters import RandomSampler, Sampler
|
| 8 |
from .task import Task
|
| 9 |
+
from .templates import Template, TemplatesDict, TemplatesList
|
| 10 |
|
| 11 |
|
| 12 |
class TaskCard(Artifact):
|
|
|
|
| 25 |
loader: Loader
|
| 26 |
preprocess_steps: List[StreamingOperator] = None
|
| 27 |
task: Task
|
| 28 |
+
templates: Union[
|
| 29 |
+
TemplatesDict, TemplatesList, Dict[str, Template], List[Template]
|
| 30 |
+
] = None
|
| 31 |
sampler: Sampler = OptionalField(default_factory=RandomSampler)
|
collections.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
import random
|
| 2 |
import typing
|
|
|
|
| 3 |
from dataclasses import field
|
| 4 |
-
from typing import Dict, List
|
| 5 |
|
| 6 |
from .artifact import Artifact
|
| 7 |
from .dataclass import AbstractField
|
|
@@ -11,12 +12,16 @@ from .random_utils import new_random_generator
|
|
| 11 |
class Collection(Artifact):
|
| 12 |
items: typing.Collection = AbstractField()
|
| 13 |
|
| 14 |
-
def __getitem__(self, key):
|
| 15 |
try:
|
| 16 |
return self.items[key]
|
| 17 |
except LookupError as e:
|
| 18 |
raise LookupError(f"Cannot find item {key!r} in {self!r}") from e
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
class ListCollection(Collection):
|
| 22 |
items: List[Artifact] = field(default_factory=list)
|
|
@@ -33,10 +38,16 @@ class ListCollection(Collection):
|
|
| 33 |
def __add__(self, other):
|
| 34 |
return ListCollection(self.items.__add__(other.items))
|
| 35 |
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
class DictCollection(Collection):
|
| 38 |
items: Dict[str, Artifact] = field(default_factory=dict)
|
| 39 |
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
class ItemPicker(Artifact):
|
| 42 |
item: object = None
|
|
|
|
| 1 |
import random
|
| 2 |
import typing
|
| 3 |
+
from abc import abstractmethod
|
| 4 |
from dataclasses import field
|
| 5 |
+
from typing import Any, Dict, Hashable, List
|
| 6 |
|
| 7 |
from .artifact import Artifact
|
| 8 |
from .dataclass import AbstractField
|
|
|
|
| 12 |
class Collection(Artifact):
|
| 13 |
items: typing.Collection = AbstractField()
|
| 14 |
|
| 15 |
+
def __getitem__(self, key: Hashable) -> Any:
|
| 16 |
try:
|
| 17 |
return self.items[key]
|
| 18 |
except LookupError as e:
|
| 19 |
raise LookupError(f"Cannot find item {key!r} in {self!r}") from e
|
| 20 |
|
| 21 |
+
@abstractmethod
|
| 22 |
+
def keys(self) -> List[Hashable]:
|
| 23 |
+
pass
|
| 24 |
+
|
| 25 |
|
| 26 |
class ListCollection(Collection):
|
| 27 |
items: List[Artifact] = field(default_factory=list)
|
|
|
|
| 38 |
def __add__(self, other):
|
| 39 |
return ListCollection(self.items.__add__(other.items))
|
| 40 |
|
| 41 |
+
def keys(self) -> List[int]:
|
| 42 |
+
return list(range(len(self)))
|
| 43 |
+
|
| 44 |
|
| 45 |
class DictCollection(Collection):
|
| 46 |
items: Dict[str, Artifact] = field(default_factory=dict)
|
| 47 |
|
| 48 |
+
def keys(self) -> List[Hashable]:
|
| 49 |
+
return list(self.items.keys())
|
| 50 |
+
|
| 51 |
|
| 52 |
class ItemPicker(Artifact):
|
| 53 |
item: object = None
|
metrics.py
CHANGED
|
@@ -1310,6 +1310,59 @@ class Accuracy(InstanceMetric):
|
|
| 1310 |
return result
|
| 1311 |
|
| 1312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1313 |
class JaccardIndex(InstanceMetric):
|
| 1314 |
reduction_map = {"mean": ["jaccard_index"]}
|
| 1315 |
main_score = "jaccard_index"
|
|
@@ -4741,7 +4794,7 @@ class F1Strings(InstanceMetric):
|
|
| 4741 |
main_score = "f1_strings"
|
| 4742 |
reduction_map = {"mean": ["f1_strings"]}
|
| 4743 |
prediction_type = str
|
| 4744 |
-
single_reference_per_prediction =
|
| 4745 |
_requirements_list = {
|
| 4746 |
"spacy": "Please pip install spacy",
|
| 4747 |
}
|
|
@@ -4764,7 +4817,7 @@ class F1Strings(InstanceMetric):
|
|
| 4764 |
prediction: str,
|
| 4765 |
task_data: List[Dict],
|
| 4766 |
) -> dict:
|
| 4767 |
-
doc_ref = self.nlp(references
|
| 4768 |
set_ref = Counter([token.text.lower() for token in doc_ref])
|
| 4769 |
doc_pred = self.nlp(prediction)
|
| 4770 |
set_pred = Counter([token.text.lower() for token in doc_pred])
|
|
|
|
| 1310 |
return result
|
| 1311 |
|
| 1312 |
|
| 1313 |
+
class ANLS(InstanceMetric):
|
| 1314 |
+
main_score = "anls"
|
| 1315 |
+
reduction_map = {"mean": ["anls"]}
|
| 1316 |
+
prediction_type = Any # string representation is compared
|
| 1317 |
+
|
| 1318 |
+
def compute(
|
| 1319 |
+
self,
|
| 1320 |
+
references: List[Any],
|
| 1321 |
+
prediction: Any,
|
| 1322 |
+
task_data: List[Dict],
|
| 1323 |
+
threshold=1.0,
|
| 1324 |
+
) -> dict:
|
| 1325 |
+
"""ANLS image-text accuracy metric."""
|
| 1326 |
+
values = []
|
| 1327 |
+
for answer in references:
|
| 1328 |
+
# preprocess both the answers - gt and prediction
|
| 1329 |
+
gt_answer = " ".join(answer.strip().lower().split())
|
| 1330 |
+
det_answer = " ".join(prediction.strip().lower().split())
|
| 1331 |
+
|
| 1332 |
+
# dist = levenshtein_distance(answer.lower(), detObject['answer'].lower())
|
| 1333 |
+
dist = self.levenshtein_distance(gt_answer, det_answer)
|
| 1334 |
+
length = max(len(answer.upper()), len(prediction.upper()))
|
| 1335 |
+
values.append(0.0 if length == 0 else float(dist) / float(length))
|
| 1336 |
+
|
| 1337 |
+
question_result = 1.0 - min(values)
|
| 1338 |
+
|
| 1339 |
+
if question_result < threshold:
|
| 1340 |
+
question_result = 0.0
|
| 1341 |
+
result = {}
|
| 1342 |
+
result["score"] = question_result
|
| 1343 |
+
result[self.main_score] = question_result
|
| 1344 |
+
result["score_name"] = self.main_score
|
| 1345 |
+
return result
|
| 1346 |
+
|
| 1347 |
+
@staticmethod
|
| 1348 |
+
def levenshtein_distance(s1, s2):
|
| 1349 |
+
if len(s1) > len(s2):
|
| 1350 |
+
s1, s2 = s2, s1
|
| 1351 |
+
|
| 1352 |
+
distances = range(len(s1) + 1)
|
| 1353 |
+
for i2, c2 in enumerate(s2):
|
| 1354 |
+
distances_ = [i2 + 1]
|
| 1355 |
+
for i1, c1 in enumerate(s1):
|
| 1356 |
+
if c1 == c2:
|
| 1357 |
+
distances_.append(distances[i1])
|
| 1358 |
+
else:
|
| 1359 |
+
distances_.append(
|
| 1360 |
+
1 + min((distances[i1], distances[i1 + 1], distances_[-1]))
|
| 1361 |
+
)
|
| 1362 |
+
distances = distances_
|
| 1363 |
+
return distances[-1]
|
| 1364 |
+
|
| 1365 |
+
|
| 1366 |
class JaccardIndex(InstanceMetric):
|
| 1367 |
reduction_map = {"mean": ["jaccard_index"]}
|
| 1368 |
main_score = "jaccard_index"
|
|
|
|
| 4794 |
main_score = "f1_strings"
|
| 4795 |
reduction_map = {"mean": ["f1_strings"]}
|
| 4796 |
prediction_type = str
|
| 4797 |
+
single_reference_per_prediction = False
|
| 4798 |
_requirements_list = {
|
| 4799 |
"spacy": "Please pip install spacy",
|
| 4800 |
}
|
|
|
|
| 4817 |
prediction: str,
|
| 4818 |
task_data: List[Dict],
|
| 4819 |
) -> dict:
|
| 4820 |
+
doc_ref = self.nlp(" ".join(references))
|
| 4821 |
set_ref = Counter([token.text.lower() for token in doc_ref])
|
| 4822 |
doc_pred = self.nlp(prediction)
|
| 4823 |
set_pred = Counter([token.text.lower() for token in doc_pred])
|
operators.py
CHANGED
|
@@ -1022,7 +1022,7 @@ class ArtifactFetcherMixin:
|
|
| 1022 |
if artifact_identifier not in cls.cache:
|
| 1023 |
artifact, artifactory = fetch_artifact(artifact_identifier)
|
| 1024 |
cls.cache[artifact_identifier] = artifact
|
| 1025 |
-
return cls.cache[artifact_identifier]
|
| 1026 |
|
| 1027 |
|
| 1028 |
class ApplyOperatorsField(InstanceOperator):
|
|
|
|
| 1022 |
if artifact_identifier not in cls.cache:
|
| 1023 |
artifact, artifactory = fetch_artifact(artifact_identifier)
|
| 1024 |
cls.cache[artifact_identifier] = artifact
|
| 1025 |
+
return copy.deepcopy(cls.cache[artifact_identifier])
|
| 1026 |
|
| 1027 |
|
| 1028 |
class ApplyOperatorsField(InstanceOperator):
|
settings_utils.py
CHANGED
|
@@ -180,7 +180,7 @@ if Constants.is_uninitilized():
|
|
| 180 |
constants.instance_stream = "__INSTANCE_STREAM__"
|
| 181 |
|
| 182 |
|
| 183 |
-
def get_settings():
|
| 184 |
return Settings()
|
| 185 |
|
| 186 |
|
|
|
|
| 180 |
constants.instance_stream = "__INSTANCE_STREAM__"
|
| 181 |
|
| 182 |
|
| 183 |
+
def get_settings() -> Settings:
|
| 184 |
return Settings()
|
| 185 |
|
| 186 |
|
templates.py
CHANGED
|
@@ -4,7 +4,7 @@ from random import random
|
|
| 4 |
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 5 |
|
| 6 |
from .artifact import Artifact
|
| 7 |
-
from .collections import ListCollection
|
| 8 |
from .dataclass import NonPositionalField
|
| 9 |
from .dict_utils import dict_set
|
| 10 |
from .error_utils import Documentation, UnitxtError
|
|
@@ -866,7 +866,7 @@ class TemplatesList(ListCollection):
|
|
| 866 |
assert isinstance(template, Template)
|
| 867 |
|
| 868 |
|
| 869 |
-
class TemplatesDict(
|
| 870 |
def verify(self):
|
| 871 |
-
for
|
| 872 |
assert isinstance(template, Template)
|
|
|
|
| 4 |
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 5 |
|
| 6 |
from .artifact import Artifact
|
| 7 |
+
from .collections import DictCollection, ListCollection
|
| 8 |
from .dataclass import NonPositionalField
|
| 9 |
from .dict_utils import dict_set
|
| 10 |
from .error_utils import Documentation, UnitxtError
|
|
|
|
| 866 |
assert isinstance(template, Template)
|
| 867 |
|
| 868 |
|
| 869 |
+
class TemplatesDict(DictCollection):
|
| 870 |
def verify(self):
|
| 871 |
+
for template in self.items.values():
|
| 872 |
assert isinstance(template, Template)
|
version.py
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
version = "1.13.
|
|
|
|
| 1 |
+
version = "1.13.1"
|