Upload formats.py with huggingface_hub
Browse files- formats.py +101 -72
    	
        formats.py
    CHANGED
    
    | @@ -1,82 +1,111 @@ | |
| 1 | 
            -
            from  | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 2 |  | 
|  | |
|  | |
| 3 |  | 
| 4 | 
            -
             | 
|  | |
| 5 | 
             
                pass
         | 
| 6 |  | 
| 7 |  | 
| 8 | 
            -
            class  | 
| 9 | 
            -
                 | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
                 | 
| 14 | 
            -
                 | 
| 15 | 
            -
                 | 
| 16 | 
            -
                 | 
| 17 | 
            -
             | 
| 18 | 
            -
                 | 
| 19 | 
            -
                 | 
| 20 | 
            -
                 | 
| 21 | 
            -
             | 
| 22 | 
            -
                 | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
                     | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 30 | 
             
                    )
         | 
| 31 | 
            -
             | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
                         | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 40 | 
             
                    )
         | 
| 41 |  | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
                     | 
| 46 | 
            -
             | 
| 47 | 
            -
                    instruction = ""
         | 
| 48 | 
             
                    if "instruction" in instance:
         | 
| 49 | 
            -
                         | 
|  | |
|  | |
|  | |
|  | |
| 50 | 
             
                        assert (
         | 
| 51 | 
            -
                             | 
| 52 | 
            -
                        ), f" | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
                         | 
| 56 | 
            -
             | 
| 57 | 
            -
                     | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
                         | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
                         | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
                            + self.demo_separator
         | 
| 70 | 
            -
                        )
         | 
| 71 | 
            -
             | 
| 72 | 
            -
                        if self.size_limiter is not None:
         | 
| 73 | 
            -
                            if not self.size_limiter.check(
         | 
| 74 | 
            -
                                source + demo_str + query_str + instance["target"]
         | 
| 75 | 
            -
                            ):
         | 
| 76 | 
            -
                                continue
         | 
| 77 | 
            -
             | 
| 78 | 
            -
                        source += demo_str
         | 
| 79 | 
            -
             | 
| 80 | 
            -
                    source += query_str
         | 
| 81 | 
            -
                    source += self.suffix
         | 
| 82 | 
            -
                    return source
         | 
|  | |
| 1 | 
            +
            from typing import (
         | 
| 2 | 
            +
                Any,
         | 
| 3 | 
            +
                Dict,
         | 
| 4 | 
            +
                List,
         | 
| 5 | 
            +
                Optional,
         | 
| 6 | 
            +
            )
         | 
| 7 |  | 
| 8 | 
            +
            from .operator import StreamInstanceOperator
         | 
| 9 | 
            +
            from .type_utils import isoftype
         | 
| 10 |  | 
| 11 | 
            +
             | 
| 12 | 
            +
            class Format(StreamInstanceOperator):
                """Common base class for formats; adds nothing on top of StreamInstanceOperator."""
         | 
| 14 |  | 
| 15 |  | 
| 16 | 
            +
            class SystemFormat(Format):
                r"""Generate the whole input to the model from constant format strings and instance fields.

                SystemFormat reads the following fields of the input instance:
                1. "source" (required, must not be None): a string verbalizing the original values in the
                   instance (as read from the source dataset), in the context of the underlying task.
                2. "instruction" (optional): when present it must not be None; when absent an empty string
                   is used in its place.
                3. The field named by arg 'demos_field' (optional): a list of dicts, each representing a single
                   demo and providing the keys referenced by 'demo_format' ("source" and "target" by default).
                   When absent, no demos are included.

                SystemFormat formats the above fields into a single string to be inputted to the model. This string
                overwrites field "source" of the instance. Formatting is driven by two args: 'demo_format' and
                'model_input_format'. SystemFormat also pops field "instruction" and the field containing the demos
                out of the input instance (when they are present).

                Args:
                    demos_field (str): the name of the field that contains the demos, being a list of dicts, each with "source" and "target" keys
                    demo_format (str): formatting string for a single demo, combining fields "source" and "target"
                    model_input_format (str): overall product format, combining instruction and source (as read from fields
                        "instruction" and "source" of the input instance), together with demos (as formatted into one string)

                Example:
                    when input instance:
                    {
                        "source": "1+1",
                        "target": "2",
                        "instruction": "Solve the math exercises.",
                        "demos": [{"source": "1+2", "target": "3"}, {"source": "4-2", "target": "2"}]
                    }
                    is processed by
                    system_format = SystemFormat(
                        demos_field="demos",
                        demo_format="Input: {source}\nOutput: {target}\n\n",
                        model_input_format="Instruction: {instruction}\n\n{demos}Input: {source}\nOutput: ",
                    )
                    the resulting instance is:
                    {
                        "target": "2",
                        "source": "Instruction: Solve the math exercises.\n\nInput: 1+2\nOutput: 3\n\nInput: 4-2\nOutput: 2\n\nInput: 1+1\nOutput: ",
                    }
                """

                demos_field: str = "demos"
                demo_format: str = (
                    "{source}\n{target}\n\n"  #  example: "User: {source}\nAgent: {target}\n\n"
                )
                model_input_format: str = "{instruction}{demos}{source}\n"

                @staticmethod
                def _retrieve_field_and_assert_not_none(instance, field_name) -> str:
                    """Return instance[field_name], or "" when the field name is None or not in the instance.

                    Raises:
                        AssertionError: if the field is present but its value is None.
                    """
                    # A missing (or unnamed) field is not an error: it simply contributes nothing.
                    if field_name is None or field_name not in instance:
                        return ""
                    field_value = instance[field_name]
                    assert (
                        field_value is not None
                    ), f"Value in field '{field_name}' should not be none. Received instance: {instance}"
                    return field_value

                def process(
                    self, instance: Dict[str, Any], stream_name: Optional[str] = None
                ) -> Dict[str, Any]:
                    """Format the instance in place and return it.

                    Combines the instruction, the formatted demos and the original "source" according to
                    'model_input_format', stores the result in instance["source"], and removes the
                    "instruction" and demos fields from the instance. 'stream_name' is not used here.

                    Raises:
                        AssertionError: if "source" is missing or None, if "instruction" is present but None,
                            or if the demos field is present but fails the list-of-dicts check done by isoftype.
                    """
                    assert (
                        "source" in instance
                    ), f"field 'source' is expected to be in the input instance. Received instance: {instance}"
                    source = self._retrieve_field_and_assert_not_none(
                        instance=instance, field_name="source"
                    )

                    instruction = self._retrieve_field_and_assert_not_none(
                        instance=instance, field_name="instruction"
                    )
                    # The instruction is consumed by the formatting; drop it if it was there.
                    instance.pop("instruction", None)

                    demo_instances = []
                    if self.demos_field is not None and self.demos_field in instance:
                        demos = instance[self.demos_field]
                        # Validate before popping so the assertion message still shows the demos field.
                        assert (
                            demos is not None and isoftype(demos, List[Dict[str, Any]])
                        ), f"A list of dict-s is expected in field '{self.demos_field}'. Received instance: {instance}"
                        demo_instances = instance.pop(self.demos_field)

                    # Each demo dict supplies the placeholders of demo_format; concatenate in order.
                    demos_string = "".join(
                        self.demo_format.format(**demo_instance)
                        for demo_instance in demo_instances
                    )

                    instance["source"] = self.model_input_format.format(
                        instruction=instruction,
                        demos=demos_string,
                        source=source,
                    )
                    return instance
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 

