Spaces:

unitxt
/

metric

Running

File size: 11,443 Bytes

147cebb
719f1b0
b462f85
147cebb
49dbe26
 
 
 
147cebb
 
719f1b0
7e6fc99
 
 
 
 
 
 
147cebb
96fd200
147cebb
96fd200
147cebb
96fd200
 
147cebb
719f1b0
147cebb
 
 
719f1b0
7e6fc99
 
 
 
 
 
 
147cebb
 
 
 
719f1b0
7e6fc99
 
 
 
 
 
 
147cebb
 
 
 
719f1b0
7e6fc99
 
 
 
 
 
 
147cebb
96fd200
147cebb
 
96fd200
147cebb
719f1b0
147cebb
 
058c80a
49dbe26
7e6fc99
 
49dbe26
7e6fc99
 
 
058c80a
7e6fc99
49dbe26
147cebb
 
49dbe26
147cebb
058c80a
 
 
 
 
 
 
 
147cebb
49dbe26
 
147cebb
 
0a1b314
 
 
147cebb
 
49dbe26
147cebb
 
49dbe26
147cebb
49dbe26
147cebb
49dbe26
147cebb
49dbe26
 
 
fe70438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d443ad5
 
fe70438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
058c80a
 
 
 
49dbe26
058c80a
147cebb
 
fe70438
 
 
 
 
 
 
147cebb
719f1b0
7e6fc99
 
 
 
 
 
 
147cebb
 
 
 
 
 
96fd200
47ba072
 
 
 
 
b462f85
 
100c2eb
 
b462f85
 
 
 
 
 
 
 
100c2eb
 
 
 
b462f85
058c80a
100c2eb
b462f85
 
 
 
 
 
 
 
 
 
 
100c2eb
 
 
058c80a
100c2eb
058c80a
 
 
100c2eb
 
 
 
058c80a
 
 
 
100c2eb
058c80a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100c2eb
 
058c80a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b462f85

import re
import shutil
from typing import List, Tuple

from .logging_utils import get_logger

logger = get_logger()


def split_words(s):
    """Splits a string into words based on PascalCase, camelCase, snake_case, kebab-case, and numbers attached to strings.

    Args:
        s (str): The string to be split.

    Returns:
        list: The list of words obtained after splitting the string.
    """
    # Split PascalCase or camelCase
    s = re.sub(r"([A-Z][a-z]+)", r" \1", re.sub(r"([A-Z]+)", r" \1", s)).strip()
    # Split snake_case or kebab-case
    s = re.sub(r"[_-]", " ", s)
    # Split numbers attached to strings
    s = re.sub(r"([a-zA-Z])(\d)", r"\1 \2", s)
    s = re.sub(r"(\d)([a-zA-Z])", r"\1 \2", s)
    # Split the string into words based on spaces
    return s.split()


def is_camel_case(s):
    """Checks if a string is in camelCase.

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string is in camelCase, False otherwise.
    """
    return re.match(r"^[A-Z]+([a-z0-9]*[A-Z]*[a-z0-9]*)*$", s) is not None


def is_snake_case(s):
    """Checks if a string is in snake_case.

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string is in snake_case, False otherwise.
    """
    return re.match(r"^[a-z0-9]+(_[a-z0-9]+)*$", s) is not None


def camel_to_snake_case(s):
    """Converts a string from camelCase to snake_case.

    Args:
        s (str): The string to be converted.

    Returns:
        str: The string converted to snake_case.
    """
    # Add an underscore before every uppercase letter that is followed by a lowercase letter or digit and not preceded by an underscore, a hyphen or an uppercase letter
    s = re.sub(r"(?<=[^A-Z_-])([A-Z])", r"_\1", s)

    # Ensure there's an underscore before any uppercase letter that's followed by a lowercase letter or digit and comes after a sequence of uppercase letters
    s = re.sub(r"([A-Z]+)([A-Z][a-z0-9])", r"\1_\2", s)

    return s.lower()


def construct_dict_str(d, indent=0, indent_delta=4, max_chars=None, keys=None):
    """Constructs a formatted string of a dictionary.

    Args:
        d (dict): The dictionary to be formatted.
        indent (int, optional): The current level of indentation. Defaults to 0.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 4.
        max_chars (int, optional): The maximum number of characters for each line. Defaults to terminal width - 10.
        keys (List[Str], optional): the list of fields to print
    """
    max_chars = max_chars or shutil.get_terminal_size()[0] - 10
    indent_str = " " * indent
    indent_delta_str = " " * indent_delta
    res = ""

    if keys is None:
        keys = d.keys()
    for key in keys:
        if key not in d.keys():
            raise ValueError(
                f"Dictionary does not contain field {key} specified in 'keys' argument. The available keys are {d.keys()}"
            )
        value = d[key]
        if isinstance(value, dict):
            res += f"{indent_str}{key}:\n"
            res += construct_dict_str(value, indent + indent_delta, max_chars=max_chars)
        else:
            str_value = str(value)
            str_value = re.sub(r"\w+=None, ", "", str_value)
            str_value = re.sub(r"\w+={}, ", "", str_value)
            str_value = re.sub(r"\w+=\[\], ", "", str_value)
            line_width = max_chars - indent
            lines = str_value.split("\n")
            res += f"{indent_str}{key} ({type(value).__name__}):\n"
            for line in lines:
                if len(line) + len(indent_str) + indent_delta > line_width:
                    res += f"{indent_str}{indent_delta_str}{line[:line_width]}\n"
                    for i in range(line_width, len(line), line_width):
                        res += f"{indent_str}{indent_delta_str}{line[i:i+line_width]}\n"
                else:
                    res += f"{indent_str}{indent_delta_str}{line}\n"
                key = ""  # Empty the key for lines after the first one
    return res


def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:
    """Constructs the lines of a dictionary formatted as yaml.

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.
    """

    def is_simple(val) -> bool:
        # if can show in same line as dictionary's key
        return not isinstance(val, (dict, list)) or (len(val) == 0)

    indent_delta_str = " " * indent_delta
    ticked_indent_delta_str = indent_delta_str[:-2] + "- "
    assert (
        indent_delta >= 2
    ), f"Needs at least 2 position indentations, for the case of list elements, that are to be preceded each by ' -'. Got indent_delta={indent_delta}."
    res = []  # conputed hereunder as a list of lines, that are indented only at the end

    if isinstance(d, dict):
        if len(d) == 0:
            return ["{}"]
        for key, val in d.items():
            printable_key = f'"{key}"' if (" " in key) or (key == "") else key
            res.append(printable_key + ": ")
            yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
            assert len(yaml_for_val) > 0
            if is_simple(val):
                assert len(yaml_for_val) == 1
                res[-1] += yaml_for_val[0]
            else:
                for line in yaml_for_val:
                    res.append(indent_delta_str + line)
        return res

    if isinstance(d, list):
        if len(d) == 0:
            return ["[]"]
        for val in d:
            yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
            assert len(yaml_for_val) > 0
            res.append(ticked_indent_delta_str + yaml_for_val[0])
            for line in yaml_for_val[1:]:
                res.append(indent_delta_str + line)
        return res

    # d1 = re.sub(r"(\n+)", r'"\1"', str(d))
    d1 = str(d).replace("\n", "\\n").replace('"', '\\"')
    if "\\n" in d1:
        d1 = f'"{d1}"'
    return [d1]


def print_dict(
    d, indent=0, indent_delta=4, max_chars=None, keys_to_print=None, log_level="info"
):
    dict_str = construct_dict_str(d, indent, indent_delta, max_chars, keys_to_print)
    dict_str = "\n" + dict_str
    getattr(logger, log_level)(dict_str)


def print_dict_as_yaml(d: dict, indent_delta=2) -> str:
    yaml_lines = construct_dict_as_yaml_lines(d)
    # yaml_lines = [re.sub(r"(\n+)", r'"\1"', line) for line in yaml_lines]
    # yaml_lines = [line.replace("\n", "\\n") for line in yaml_lines]
    return "\n".join(yaml_lines)


def nested_tuple_to_string(nested_tuple: tuple) -> str:
    """Converts a nested tuple to a string, with elements separated by underscores.

    Args:
        nested_tuple (tuple): The nested tuple to be converted.

    Returns:
        str: The string representation of the nested tuple.
    """
    result = []
    for item in nested_tuple:
        if isinstance(item, tuple):
            result.append(nested_tuple_to_string(item))
        else:
            result.append(str(item))
    return "_".join(result)


def is_made_of_sub_strings(string, sub_strings):
    pattern = "^(" + "|".join(map(re.escape, sub_strings)) + ")+$"
    return bool(re.match(pattern, string))


# Giveמ all the lines of a card preparer file, e.g. all the lines of prepare/cards/cohere_for_ai.py,
# and an object name, e.g. TaskCard(,
# return the ordinal number of the line that starts that object, in our example: the
# line number of the following line (notice that the line where TaskCard is imported
# is not supposed to return):
#         card = TaskCard(
# and the line number of the line that ends the object, in our case the line that include
# the matching close:
#         )
# This util depends on ruff to ensure this setting of the card file: that a close of one
# tag and the open of the next tag, do not sit in same line, when both tags being
# major level within TaskCard.
# It also prepares for the case that  __description__ tag does not contain balanced
# parentheses, since it is often cut in the middle, (with  "... see more at")
# flake8: noqa: B007
# flake8: noqa: C901
def lines_defining_obj_in_card(
    all_lines: List[str], obj_name: str, start_search_at_line: int = 0
) -> Tuple[int, int]:
    for starting_line in range(start_search_at_line, len(all_lines)):
        line = all_lines[starting_line]
        if obj_name in line:
            break
    if obj_name not in line:
        # obj_name found no where in the input lines
        return (-1, -1)
    num_of_opens = 0
    num_of_closes = 0
    ending_line = starting_line - 1
    while ending_line < len(all_lines):
        ending_line += 1

        if "__description__" in all_lines[ending_line]:
            # can not trust parentheses inside description, because this is mainly truncated
            # free text.
            # We do trust the indentation enforced by ruff, and the way we build __description__:
            # a line consisting of only __description__=(
            # followed by one or more lines of text, can not trust opens and closes
            # in them, followed by a line consisting of only:  ),
            # where the ) is indented with the beginning of __description__
            # We also prepare for the case that, when not entered by us, __description__=
            # is not followed by a ( and the whole description does not end with a single ) in its line.
            # We build on ruff making the line following the description start with same indentation
            # or 4 less (i.e., the following line is the closing of the card).
            tag_indentation = all_lines[ending_line].index("__description__")
            starts_with_parent = "__description__=(" in all_lines[ending_line]
            if starts_with_parent:
                last_line_to_start_with = (" " * tag_indentation) + r"\)"
            else:
                # actually, the line that follows the description
                last_line_to_start_with1 = (" " * tag_indentation) + "[^ ]"
                last_line_to_start_with2 = (" " * (tag_indentation - 4)) + "[^ ]"
                last_line_to_start_with = (
                    "("
                    + last_line_to_start_with1
                    + "|"
                    + last_line_to_start_with2
                    + ")"
                )
            ending_line += 1
            while not re.search("^" + last_line_to_start_with, all_lines[ending_line]):
                ending_line += 1
            if "__description__" in obj_name:
                return (
                    starting_line,
                    ending_line if starts_with_parent else ending_line - 1,
                )

            if starts_with_parent:
                ending_line += 1

            # we conrinue in card, having passed the description, ending line points
            # to the line that follows description

        num_of_opens += len(re.findall(r"[({[]", all_lines[ending_line]))
        num_of_closes += len(re.findall(r"[)}\]]", all_lines[ending_line]))
        if num_of_closes == num_of_opens:
            break

    if num_of_closes != num_of_opens:
        raise ValueError(
            "input lines were exhausted before the matching close is found"
        )

    return (starting_line, ending_line)