File size: 12,145 Bytes
d08fbc6 ff375eb 59be457 4868000 f6ebc4f 9d5b4c0 d08fbc6 d443ad5 0a1b314 fe70438 d08fbc6 a350a45 f6ebc4f a350a45 f6ebc4f a350a45 f6ebc4f a350a45 f6ebc4f a350a45 4868000 d08fbc6 4868000 f6ebc4f fe70438 0a1b314 92d8b2d 59be457 f6ebc4f 59be457 f6ebc4f 59be457 058c80a f6ebc4f 058c80a 59be457 92d8b2d f6ebc4f 92d8b2d f6ebc4f d08fbc6 4868000 f6ebc4f a8f310f 058c80a 4868000 f6ebc4f 9d5b4c0 f6ebc4f 9d5b4c0 f6ebc4f a8f310f d08fbc6 f6ebc4f 9d5b4c0 f6ebc4f 9d5b4c0 f6ebc4f 9d5b4c0 59be457 f6ebc4f 59be457 9d5b4c0 59be457 f6ebc4f 59be457 f6ebc4f 59be457 9d5b4c0 59be457 9d5b4c0 59be457 f6ebc4f 59be457 a8f310f f6ebc4f a8f310f 058c80a f6ebc4f fe70438 ff375eb d443ad5 fe70438 d443ad5 ff375eb 59be457 f6ebc4f ff375eb d443ad5 59be457 d443ad5 59be457 058c80a 9d5b4c0 058c80a 9d5b4c0 058c80a f6ebc4f 058c80a f6ebc4f 058c80a f6ebc4f 058c80a f6ebc4f 058c80a f6ebc4f 058c80a f6ebc4f 058c80a a8f310f 058c80a f6ebc4f 0a1b314 8d5bd0c d08fbc6 f6ebc4f 4868000 0a1b314 d08fbc6 4868000 8d5bd0c d08fbc6 8d5bd0c f6ebc4f b462f85 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 |
import warnings
from functools import lru_cache
from typing import Any, Dict, List, Optional, Union
from .deprecation_utils import deprecation
from .error_utils import Documentation, UnitxtError, UnitxtWarning
from .logging_utils import get_logger
from .metrics import MetricsList
from .operator import InstanceOperator
from .operators import ArtifactFetcherMixin
from .settings_utils import get_constants
from .type_utils import (
Type,
get_args,
get_origin,
is_type_dict,
isoftype,
parse_type_dict,
parse_type_string,
to_type_dict,
to_type_string,
verify_required_schema,
)
# Shared constants object from settings_utils; `constants.inference_stream` is
# read by Task.process to detect the inference stream.
constants = get_constants()
# Module-level logger obtained from logging_utils.
logger = get_logger()
@deprecation(
    version="2.0.0",
    msg="use python type instead of type strings (e.g Dict[str] instead of 'Dict[str]')",
)
def parse_string_types_instead_of_actual_objects(obj):
    """Parse a string-based type specification.

    A dict argument is handed to ``parse_type_dict``; any other argument is
    handed to ``parse_type_string``. The result of the chosen parser is
    returned unchanged.
    """
    parser = parse_type_dict if isinstance(obj, dict) else parse_type_string
    return parser(obj)
class Task(InstanceOperator, ArtifactFetcherMixin):
    """Task packs the different instance fields into dictionaries by their roles in the task.

    Attributes:
        input_fields (Union[Dict[str, str], List[str]]):
            Dictionary with string names of instance input fields and types of respective values.
            In case a list is passed, each type will be assumed to be Any.
        reference_fields (Union[Dict[str, str], List[str]]):
            Dictionary with string names of instance output fields and types of respective values.
            In case a list is passed, each type will be assumed to be Any.
        inputs:
            Deprecated alias of 'input_fields'. Cannot be set together with 'input_fields'.
        outputs:
            Deprecated alias of 'reference_fields'. Cannot be set together with 'reference_fields'.
        metrics (List[str]): List of names of metrics to be used in the task.
        prediction_type (Optional[str]):
            Need to be consistent with all used metrics. Defaults to None, which means that it will
            be set to Any.
        augmentable_inputs (List[str]):
            Names of input fields; each one must be a key of 'input_fields'.
        defaults (Optional[Dict[str, Any]]):
            An optional dictionary with default values for chosen input/output keys. Needs to be
            consistent with names and types provided in 'input_fields' and/or 'reference_fields' arguments.
            Will not overwrite values if already provided in a given instance.

    The output instance contains the following fields:
        "input_fields" whose value is a sub-dictionary of the input instance, consisting of all the fields listed in Arg 'input_fields'.
        "reference_fields" -- for the fields listed in Arg "reference_fields" (omitted for the inference stream).
        "metrics" -- to contain the value of Arg 'metrics'
        "data_classification_policy" -- copied from the input instance (empty list if absent).
        "media" -- copied from the input instance (empty dict if absent).
    """

    input_fields: Optional[Union[Dict[str, Type], Dict[str, str], List[str]]] = None
    reference_fields: Optional[Union[Dict[str, Type], Dict[str, str], List[str]]] = None
    # Deprecated aliases of 'input_fields' / 'reference_fields'; resolved in prepare().
    inputs: Optional[Union[Dict[str, Type], Dict[str, str], List[str]]] = None
    outputs: Optional[Union[Dict[str, Type], Dict[str, str], List[str]]] = None
    metrics: List[str]
    prediction_type: Optional[Union[Type, str]] = None
    augmentable_inputs: List[str] = []
    defaults: Optional[Dict[str, Any]] = None

    def prepare(self):
        """Resolve deprecated aliases and convert string type specs to types.

        Raises:
            UnitxtError: if a field and its deprecated alias are both set.
        """
        super().prepare()
        if self.input_fields is not None and self.inputs is not None:
            raise UnitxtError(
                "Conflicting attributes: 'input_fields' cannot be set simultaneously with 'inputs'. Use only 'input_fields'",
                Documentation.ADDING_TASK,
            )
        if self.reference_fields is not None and self.outputs is not None:
            raise UnitxtError(
                "Conflicting attributes: 'reference_fields' cannot be set simultaneously with 'outputs'. Use only 'reference_fields'",
                Documentation.ADDING_TASK,
            )

        # Fall back to the deprecated alias only when the new name is unset.
        self.input_fields = (
            self.input_fields if self.input_fields is not None else self.inputs
        )
        self.reference_fields = (
            self.reference_fields if self.reference_fields is not None else self.outputs
        )

        # Type specs given as strings are parsed through the deprecated helper.
        if isoftype(self.input_fields, Dict[str, str]):
            self.input_fields = parse_string_types_instead_of_actual_objects(
                self.input_fields
            )
        if isoftype(self.reference_fields, Dict[str, str]):
            self.reference_fields = parse_string_types_instead_of_actual_objects(
                self.reference_fields
            )
        if isinstance(self.prediction_type, str):
            self.prediction_type = parse_string_types_instead_of_actual_objects(
                self.prediction_type
            )

    def verify(self):
        """Validate the task definition and normalise untyped fields.

        Warns about deprecated aliases, requires both field collections to be
        set, replaces field collections that are not type dictionaries with
        ``{name: Any}`` mappings, defaults ``prediction_type`` to ``Any``, and
        then checks metrics, augmentable inputs and defaults.

        Raises:
            UnitxtError: if 'input_fields' or 'reference_fields' is not set, or
                if a metric's prediction type is incompatible with the task's.
            AssertionError: if an augmentable input or a default is invalid.
        """
        if hasattr(self, "inputs") and self.inputs is not None:
            depr_message = (
                "The 'inputs' field is deprecated. Please use 'input_fields' instead."
            )
            warnings.warn(depr_message, DeprecationWarning, stacklevel=2)

        if hasattr(self, "outputs") and self.outputs is not None:
            depr_message = "The 'outputs' field is deprecated. Please use 'reference_fields' instead."
            warnings.warn(depr_message, DeprecationWarning, stacklevel=2)

        if self.input_fields is None:
            raise UnitxtError(
                "Missing attribute in task: 'input_fields' not set.",
                Documentation.ADDING_TASK,
            )
        if self.reference_fields is None:
            raise UnitxtError(
                "Missing attribute in task: 'reference_fields' not set.",
                Documentation.ADDING_TASK,
            )

        for io_type in ("input_fields", "reference_fields"):
            data = getattr(self, io_type)
            if isinstance(data, list) or not is_type_dict(data):
                # UnitxtWarning is called for its effect; its result is not used.
                UnitxtWarning(
                    f"'{io_type}' field of Task should be a dictionary of field names and their types. "
                    f"For example, {{'text': str, 'classes': List[str]}}. Instead only '{data}' was "
                    f"passed. All types will be assumed to be 'Any'. In future version of unitxt this "
                    f"will raise an exception.",
                    Documentation.ADDING_TASK,
                )
                # Untyped field names are all given the type Any.
                setattr(self, io_type, {key: Any for key in data})

        if not self.prediction_type:
            UnitxtWarning(
                "'prediction_type' was not set in Task. It is used to check the output of "
                "template post processors is compatible with the expected input of the metrics. "
                "Setting `prediction_type` to 'Any' (no checking is done). In future version "
                "of unitxt this will raise an exception.",
                Documentation.ADDING_TASK,
            )
            self.prediction_type = Any

        self.check_metrics_type()

        for augmentable_input in self.augmentable_inputs:
            assert (
                augmentable_input in self.input_fields
            ), f"augmentable_input {augmentable_input} is not part of {self.input_fields}"

        self.verify_defaults()

    @classmethod
    def process_data_after_load(cls, data):
        """Parse serialized type specs in ``data`` and return ``data``.

        Dict-valued field collections are passed through ``parse_type_dict`` and
        'prediction_type', when present, through ``parse_type_string``. ``data``
        is modified in place.
        """
        possible_dicts = ["inputs", "input_fields", "outputs", "reference_fields"]
        for dict_name in possible_dicts:
            if dict_name in data and isinstance(data[dict_name], dict):
                data[dict_name] = parse_type_dict(data[dict_name])
        if "prediction_type" in data:
            data["prediction_type"] = parse_type_string(data["prediction_type"])
        return data

    def process_data_before_dump(self, data):
        """Convert type objects in ``data`` to their serializable form and return ``data``.

        Field collections that are dicts but not already ``Dict[str, str]`` are
        passed through ``to_type_dict``; a non-string 'prediction_type' is passed
        through ``to_type_string``. ``data`` is modified in place.
        """
        possible_dicts = ["inputs", "input_fields", "outputs", "reference_fields"]
        for dict_name in possible_dicts:
            if dict_name in data and isinstance(data[dict_name], dict):
                if not isoftype(data[dict_name], Dict[str, str]):
                    data[dict_name] = to_type_dict(data[dict_name])
        if "prediction_type" in data:
            if not isinstance(data["prediction_type"], str):
                data["prediction_type"] = to_type_string(data["prediction_type"])
        return data

    @classmethod
    @lru_cache(maxsize=None)
    def get_metrics_artifacts(cls, metric_id: str):
        """Fetch the metric artifact(s) for ``metric_id`` as a list.

        A ``MetricsList`` is expanded to its ``items``; any other artifact is
        wrapped in a single-element list. Results are memoised without bound
        per ``(cls, metric_id)`` pair.
        """
        metric = cls.get_artifact(metric_id)
        if isinstance(metric, MetricsList):
            return metric.items
        return [metric]

    def check_metrics_type(self) -> None:
        """Check that every metric accepts the task's prediction type.

        A metric is compatible when the two types are equal, when either one is
        ``Any``, or when the metric's type is a ``Union`` that lists the task's
        type among its arguments.

        Raises:
            UnitxtError: on the first incompatible metric.
        """
        prediction_type = self.prediction_type
        for metric_id in self.metrics:
            metric_artifacts_list = Task.get_metrics_artifacts(metric_id)
            for metric_artifact in metric_artifacts_list:
                metric_prediction_type = metric_artifact.prediction_type
                if (
                    prediction_type == metric_prediction_type
                    or prediction_type == Any
                    or metric_prediction_type == Any
                    or (
                        get_origin(metric_prediction_type) is Union
                        and prediction_type in get_args(metric_prediction_type)
                    )
                ):
                    continue

                raise UnitxtError(
                    f"The task's prediction type ({prediction_type}) and '{metric_id}' "
                    f"metric's prediction type ({metric_prediction_type}) are different.",
                    Documentation.ADDING_TASK,
                )

    def verify_defaults(self):
        """Validate the 'defaults' mapping against the declared fields.

        Does nothing when 'defaults' is empty or unset.

        Raises:
            UnitxtError: if 'defaults' is not a dictionary.
            AssertionError: if a key is not a string, does not name a field of
                'input_fields' or 'reference_fields', or its value does not
                match the declared type of that field.
        """
        if self.defaults:
            if not isinstance(self.defaults, dict):
                raise UnitxtError(
                    f"If specified, the 'defaults' must be a dictionary, "
                    f"however, '{self.defaults}' was provided instead, "
                    f"which is of type '{to_type_string(type(self.defaults))}'.",
                    Documentation.ADDING_TASK,
                )

            for default_name, default_value in self.defaults.items():
                assert isinstance(default_name, str), (
                    f"If specified, all keys of the 'defaults' must be strings, "
                    f"however, the key '{default_name}' is of type '{to_type_string(type(default_name))}'."
                )

                # Look the name up among the inputs first, then the references.
                val_type = self.input_fields.get(
                    default_name
                ) or self.reference_fields.get(default_name)

                assert val_type, (
                    f"If specified, all keys of the 'defaults' must refer to a chosen "
                    f"key in either 'input_fields' or 'reference_fields'. However, the name '{default_name}' "
                    f"was provided which does not match any of the keys."
                )

                assert isoftype(default_value, val_type), (
                    f"The value of '{default_name}' from the 'defaults' must be of "
                    f"type '{to_type_string(val_type)}', however, it is of type '{to_type_string(type(default_value))}'."
                )

    def set_default_values(self, instance: Dict[str, Any]) -> Dict[str, Any]:
        """Return ``instance`` with missing keys filled in from 'defaults'.

        Values already present in ``instance`` take precedence. When 'defaults'
        is empty or unset the very same ``instance`` object is returned.
        """
        if self.defaults:
            instance = {**self.defaults, **instance}
        return instance

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Pack ``instance`` into the task's output dictionary.

        Defaults are applied first, then the input fields are checked with
        ``verify_required_schema`` and collected. For the inference stream
        (``constants.inference_stream``) the reference fields are neither
        checked nor included in the result.
        """
        instance = self.set_default_values(instance)

        verify_required_schema(self.input_fields, instance)
        input_fields = {key: instance[key] for key in self.input_fields.keys()}
        data_classification_policy = instance.get("data_classification_policy", [])

        result = {
            "input_fields": input_fields,
            "metrics": self.metrics,
            "data_classification_policy": data_classification_policy,
            "media": instance.get("media", {}),
        }

        if stream_name == constants.inference_stream:
            return result

        verify_required_schema(self.reference_fields, instance)
        result["reference_fields"] = {
            key: instance[key] for key in self.reference_fields.keys()
        }

        return result
@deprecation(version="2.0.0", alternative=Task)
class FormTask(Task):
    """Deprecated subclass of Task that adds nothing of its own."""
|