Spaces:

unitxt
/

metric

Running

App Files Files Community

Elron committed on Mar 13, 2024

Commit

129744e

verified ·

1 Parent(s): 3c38cbc

Upload templates.py with huggingface_hub

Browse files

Files changed (1) hide show

templates.py +92 -75

templates.py CHANGED Viewed

@@ -9,6 +9,15 @@ from .random_utils import new_random_generator
 from .type_utils import isoftype
 class Template(StreamInstanceOperator):
     """The role of template is to take the fields of every instance and verbalize it.
@@ -26,8 +35,18 @@ class Template(StreamInstanceOperator):
     postprocessors: List[str] = NonPositionalField(
         default_factory=lambda: ["processors.to_string_stripped"]
     )
-    instruction: str = NonPositionalField(default_factory=lambda: "")
-    target_prefix: str = NonPositionalField(default_factory=lambda: "")
     def process(
         self, instance: Dict[str, Any], stream_name: Optional[str] = None
@@ -43,7 +62,12 @@ class Template(StreamInstanceOperator):
         inputs = instance.get("inputs")
         outputs = instance.get("outputs")
-        source, instruction = self.inputs_to_source(inputs)
         target, references = self.outputs_to_target_and_references(outputs)
         return {
@@ -52,13 +76,17 @@ class Template(StreamInstanceOperator):
             "target": target,
             "references": references,
             "instruction": instruction,
-            "target_prefix": self.target_prefix.format(**inputs),
         }
     @abstractmethod
     def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]:
         pass
     @abstractmethod
     def outputs_to_target_and_references(
         self, outputs: Dict[str, object]
@@ -68,6 +96,24 @@ class Template(StreamInstanceOperator):
     def get_postprocessors(self) -> List[str]:
         return self.postprocessors
 class InputOutputTemplate(Template):
     """Generate field 'source' from fields designated as input, and fields 'target' and 'references' from fields designated as output, of the processed instance.
@@ -78,30 +124,15 @@ class InputOutputTemplate(Template):
     input_format: str = None
     output_format: str = None
-    def process_template(self, template: str, data: Dict[str, object]) -> str:
-        data = {k: ", ".join(v) if isinstance(v, list) else v for k, v in data.items()}
-        return template.format(**data)
     def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]:
-        formatted = []
-        for formatting in [self.input_format, self.instruction]:
-            try:
-                formatted.append(self.process_template(formatting, inputs))
-            except KeyError as e:
-                raise KeyError(
-                    f"Available inputs are {list(inputs.keys())} but input format requires a different ones: '{formatting}'"
-                ) from e
-        return tuple(formatted)
     def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str:
-        try:
-            target = self.process_template(self.output_format, outputs)
-        except KeyError as e:
-            raise KeyError(
-                f"Available outputs are {outputs.keys()} but output format requires a different one: {self.output_format}"
-            ) from e
         references = [target]
         return target, references
@@ -110,19 +141,13 @@ class InputOutputReferenceTemplate(InputOutputTemplate):
     reference: str
     def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str:
-        output_fields = {}
-        for name, val in [
-            ("target", self.output_format),
-            ("reference", self.reference),
-        ]:
-            try:
-                result = self.process_template(val, outputs)
-                output_fields[name] = result
-            except KeyError as e:
-                raise KeyError(
-                    f"Available outputs are {outputs.keys()} but {name} requires a different one: {val}"
-                ) from e
-        return output_fields["target"], [output_fields["reference"]]
 class MultipleChoiceTemplate(Template):
@@ -135,7 +160,6 @@ class MultipleChoiceTemplate(Template):
     choices_seperator: str = ", "
     source_choice_format: str = "{choice_numeral}. {choice_text}"
     target_choice_format: str = "{choice_numeral}"
-    add_numerals_as_field: str = None
     enumerator: str = "capitals"
     def prepare(self):
@@ -170,7 +194,7 @@ class MultipleChoiceTemplate(Template):
                 "XX",
             ]
-    def get_choices(self, data: Dict[str, object], choice_format: str) -> str:
         choices = data[self.choices_field]
         enumrated_choices = []
         for i, choice in enumerate(choices):
@@ -182,22 +206,28 @@ class MultipleChoiceTemplate(Template):
             )
         return enumrated_choices
-    def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]:
-        choices = self.get_choices(inputs, self.source_choice_format)
-        inputs = {
-            "numerals": ",".join(self.get_choices(inputs, "{choice_numeral}")),
             **inputs,
             self.choices_field: self.choices_seperator.join(choices),
         }
-        formatted = []
-        for formatting in [self.input_format, self.instruction]:
-            try:
-                formatted.append(formatting.format(**inputs))
-            except KeyError as e:
-                raise KeyError(
-                    f"Available inputs are {inputs.keys()} but input format requires a different one: {formatting}"
-                ) from e
-        return tuple(formatted)
     def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str:
         target = outputs[self.target_field]
@@ -210,7 +240,7 @@ class MultipleChoiceTemplate(Template):
                     f"MultipleChoiceTemplate could not locate textual target '{target}' in choices list: {outputs[self.choices_field]}"
                 ) from e
-        choices = self.get_choices(outputs, self.target_choice_format)
         try:
             target = choices[target]
@@ -226,7 +256,7 @@ class MultipleChoiceTemplate(Template):
     ) -> Dict[str, Any]:
         result = super().process(instance, stream_name)
         if "options" not in result["outputs"]:
-            result["outputs"]["options"] = self.get_choices(
                 instance["outputs"], self.target_choice_format
             )
         return result
@@ -259,18 +289,9 @@ class YesNoTemplate(Template):
     no_answer: str = "No"
     def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]:
-        data = {
-            k: ", ".join(v) if isinstance(v, list) else v for k, v in inputs.items()
-        }
-        formatted = []
-        for formatting in [self.input_format, self.instruction]:
-            try:
-                formatted.append(formatting.format(**data))
-            except KeyError as e:
-                raise RuntimeError(
-                    f"Available inputs are {list(inputs.keys())} but input format requires a different one: {formatting}"
-                ) from e
-        return tuple(formatted)
     def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str:
         try:
@@ -316,26 +337,22 @@ class KeyValTemplate(Template):
     use_keys_for_outputs: bool = False
     def process_dict(
-        self, dic: Dict[str, object], key_val_sep, pairs_sep, use_keys
     ) -> str:
-        dic = {
-            k: ", ".join([str(vi) for vi in v]) if isinstance(v, list) else v
-            for k, v in dic.items()
-        }
         pairs = []
-        for key, val in dic.items():
             key_val = [key, str(val)] if use_keys else [str(val)]
             pairs.append(key_val_sep.join(key_val))
         return pairs_sep.join(pairs)
     def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]:
-        ret = self.process_dict(
             inputs,
             key_val_sep=self.key_val_seperator,
             pairs_sep=self.pairs_seperator,
             use_keys=self.use_keys_for_inputs,
         )
-        return (ret, ret)
     def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str:
         target = self.process_dict(

 from .type_utils import isoftype
+class TemplateFormatKeyError(KeyError):
+    """KeyError whose message reports that a template format string needs keys other than those available.
+
+    The message lists the keys of ``data``, the class name of ``template``, the
+    name of the offending format (``format_name``) and the format string itself.
+    """
+    def __init__(self, template, data, data_type, format_str, format_name):
+        # template: object whose class name is shown in the message.
+        # data: mapping whose keys are listed as the available fields.
+        # data_type: label for the kind of data; an "s" is appended in the message.
+        # format_str: the format string, quoted at the end of the message.
+        # format_name: name printed after the class name to identify the format.
+        # str.join requires every key to be a string.
+        keys = ", ".join(data.keys())
+        super().__init__(
+            f"Available {data_type}s are [{keys}] "
+            f"but {template.__class__.__name__}.{format_name} format requires a different ones: '{format_str}'"
+        )
 class Template(StreamInstanceOperator):
     """The role of template is to take the fields of every instance and verbalize it.
     postprocessors: List[str] = NonPositionalField(
         default_factory=lambda: ["processors.to_string_stripped"]
     )
+    instruction: str = NonPositionalField(default="")
+    target_prefix: str = NonPositionalField(default="")
+    title_fields: List[str] = NonPositionalField(default_factory=list)
+    def inputs_to_instruction_and_target_prefix(self, inputs):
+        """Format ``self.instruction`` and ``self.target_prefix`` against ``inputs``.
+
+        Both strings are rendered via ``apply_formatting`` with ``serialize=True``
+        and the data type label "input".
+
+        Returns:
+            A ``(instruction, target_prefix)`` pair of formatted values.
+        """
+        instruction = self.apply_formatting(
+            inputs, "input", self.instruction, "instruction", serialize=True
+        )
+        target_prefix = self.apply_formatting(
+            inputs, "input", self.target_prefix, "target_prefix", serialize=True
+        )
+        return instruction, target_prefix
     def process(
         self, instance: Dict[str, Any], stream_name: Optional[str] = None
         inputs = instance.get("inputs")
         outputs = instance.get("outputs")
+        self.set_titles(inputs)
+        source = self.inputs_to_source(inputs)
+        instruction, target_prefix = self.inputs_to_instruction_and_target_prefix(
+            inputs
+        )
         target, references = self.outputs_to_target_and_references(outputs)
         return {
             "target": target,
             "references": references,
             "instruction": instruction,
+            "target_prefix": target_prefix,
         }
     @abstractmethod
     def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]:
+        """Abstract hook: subclasses produce the source value from the ``inputs`` dict.
+
+        NOTE(review): ``process`` assigns the result to a single ``source``
+        variable, so the ``Tuple[str, str]`` annotation looks stale -- confirm.
+        """
         pass
+    def set_titles(self, data):
+        """Replace, in place, each field named in ``self.title_fields`` with its ``.title()`` form.
+
+        ``data`` itself is mutated; nothing is returned. A field listed in
+        ``title_fields`` but absent from ``data`` raises ``KeyError``, and the
+        stored value must provide a ``title()`` method (e.g. ``str``).
+        """
+        for field in self.title_fields:
+            data[field] = data[field].title()
     @abstractmethod
     def outputs_to_target_and_references(
         self, outputs: Dict[str, object]
     def get_postprocessors(self) -> List[str]:
+        """Return the ``postprocessors`` attribute of this template."""
         return self.postprocessors
+    def serialize_data(self, data):
+        """Return a new dict in which every list value is flattened to one string.
+
+        List values become their items, each passed through ``str``, joined by
+        ", ". Values that are not ``list`` instances are kept unchanged. The
+        input dict is not modified.
+        """
+        return {
+            k: ", ".join(str(t) for t in v) if isinstance(v, list) else v
+            for k, v in data.items()
+        }
+    def apply_formatting(
+        self, data, data_type, format_str, format_name, serialize=False
+    ) -> str:
+        """Render ``format_str`` with the entries of ``data`` as keyword arguments.
+
+        Args:
+            data: mapping passed to ``str.format`` as ``**data``.
+            data_type: label forwarded to the error message (callers in this file pass "input" or "output").
+            format_str: the format string to render.
+            format_name: name of the format, forwarded to the error message.
+            serialize: when true, ``data`` is first passed through ``serialize_data``.
+
+        Raises:
+            TemplateFormatKeyError: if ``str.format`` raises ``KeyError``; the original error is chained as the cause.
+        """
+        if serialize:
+            data = self.serialize_data(data)
+        try:
+            return format_str.format(**data)
+        except KeyError as e:
+            # ``data`` here is the serialized dict when serialize=True; only its keys are reported.
+            raise TemplateFormatKeyError(
+                self, data, data_type, format_str, format_name
+            ) from e
 class InputOutputTemplate(Template):
     """Generate field 'source' from fields designated as input, and fields 'target' and 'references' from fields designated as output, of the processed instance.
     input_format: str = None
     output_format: str = None
     def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]:
+        # Renders self.input_format against the inputs, with serialize=True.
+        # NOTE(review): apply_formatting is annotated ``-> str``, so the
+        # ``Tuple[str, str]`` annotation above looks stale -- confirm.
+        return self.apply_formatting(
+            inputs, "input", self.input_format, "input_format", serialize=True
+        )
     def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str:
+        # Renders self.output_format against the outputs; the formatted target
+        # is also used as the only reference.
+        # NOTE(review): the method returns a (target, references) pair, so the
+        # ``-> str`` annotation looks inaccurate -- confirm.
+        target = self.apply_formatting(
+            outputs, "output", self.output_format, "output_format", serialize=True
+        )
         references = [target]
         return target, references
     reference: str
     def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str:
+        # The target is rendered from self.output_format and the single
+        # reference from self.reference, both against the same outputs dict.
+        # NOTE(review): returns a (target, [reference]) pair, so the ``-> str``
+        # annotation looks inaccurate -- confirm.
+        target = self.apply_formatting(
+            outputs, "output", self.output_format, "output_format", serialize=True
+        )
+        reference = self.apply_formatting(
+            outputs, "output", self.reference, "reference", serialize=True
+        )
+        return target, [reference]
 class MultipleChoiceTemplate(Template):
     choices_seperator: str = ", "
     source_choice_format: str = "{choice_numeral}. {choice_text}"
     target_choice_format: str = "{choice_numeral}"
     enumerator: str = "capitals"
     def prepare(self):
                 "XX",
             ]
+    def inputs_to_choices(self, data: Dict[str, object], choice_format: str) -> str:
         choices = data[self.choices_field]
         enumrated_choices = []
         for i, choice in enumerate(choices):
             )
         return enumrated_choices
+    def inputs_to_numerals(self, inputs: Dict[str, object]) -> Tuple[str, str]:
+        # Delegates to inputs_to_choices with the fixed format "{choice_numeral}".
+        # NOTE(review): the visible part of inputs_to_choices builds and returns
+        # a list, so the ``Tuple[str, str]`` annotation looks stale -- confirm.
+        return self.inputs_to_choices(inputs, "{choice_numeral}")
+    def prepare_multiple_choice_inputs(
+        self, inputs: Dict[str, object]
+    ) -> Dict[str, object]:
+        """Build the dict used to format multiple-choice inputs; ``inputs`` is not modified.
+
+        The result contains a "numerals" entry, every entry of ``inputs``, and
+        the ``self.choices_field`` entry replaced by the formatted choices
+        joined with ``self.choices_seperator``.
+        """
+        choices = self.inputs_to_choices(inputs, self.source_choice_format)
+        return {
+            # Listed first, so a "numerals" key already in ``inputs`` overrides it.
+            "numerals": self.inputs_to_numerals(inputs),
             **inputs,
             self.choices_field: self.choices_seperator.join(choices),
         }
+    def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]:
+        # Formats self.input_format against the dict built by
+        # prepare_multiple_choice_inputs rather than the raw inputs.
+        # NOTE(review): apply_formatting is annotated ``-> str``, so the
+        # ``Tuple[str, str]`` annotation looks stale -- confirm.
+        inputs = self.prepare_multiple_choice_inputs(inputs)
+        return self.apply_formatting(
+            inputs, "input", self.input_format, "input_format", serialize=True
+        )
+    def inputs_to_instruction_and_target_prefix(self, inputs):
+        # Same as the parent implementation, but run on the dict built by
+        # prepare_multiple_choice_inputs instead of the raw inputs.
+        inputs = self.prepare_multiple_choice_inputs(inputs)
+        return super().inputs_to_instruction_and_target_prefix(inputs)
     def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str:
         target = outputs[self.target_field]
                     f"MultipleChoiceTemplate could not locate textual target '{target}' in choices list: {outputs[self.choices_field]}"
                 ) from e
+        choices = self.inputs_to_choices(outputs, self.target_choice_format)
         try:
             target = choices[target]
     ) -> Dict[str, Any]:
         result = super().process(instance, stream_name)
         if "options" not in result["outputs"]:
+            result["outputs"]["options"] = self.inputs_to_choices(
                 instance["outputs"], self.target_choice_format
             )
         return result
     no_answer: str = "No"
     def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]:
+        # Renders self.input_format against the inputs, with serialize=True.
+        # NOTE(review): apply_formatting is annotated ``-> str``, so the
+        # ``Tuple[str, str]`` annotation above looks stale -- confirm.
+        return self.apply_formatting(
+            inputs, "input", self.input_format, "input_format", serialize=True
+        )
     def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str:
         try:
     use_keys_for_outputs: bool = False
     def process_dict(
+        self, data: Dict[str, object], key_val_sep, pairs_sep, use_keys
     ) -> str:
+        # Renders the dict as one string: each entry becomes
+        # key + key_val_sep + str(value) when use_keys is true, or just
+        # str(value) otherwise, and the entries are joined by pairs_sep.
+        # List values are flattened by serialize_data first.
+        data = self.serialize_data(data)
         pairs = []
+        for key, val in data.items():
+            # With use_keys, the key itself is joined, so it must be a string.
             key_val = [key, str(val)] if use_keys else [str(val)]
             pairs.append(key_val_sep.join(key_val))
         return pairs_sep.join(pairs)
     def inputs_to_source(self, inputs: Dict[str, object]) -> Tuple[str, str]:
+        # Renders the inputs through process_dict using the input-side settings.
+        # NOTE(review): process_dict is annotated ``-> str``, so the
+        # ``Tuple[str, str]`` annotation above looks stale -- confirm.
+        return self.process_dict(
             inputs,
             key_val_sep=self.key_val_seperator,
             pairs_sep=self.pairs_seperator,
             use_keys=self.use_keys_for_inputs,
         )
     def outputs_to_target_and_references(self, outputs: Dict[str, object]) -> str:
         target = self.process_dict(