File size: 15,997 Bytes
147cebb
719f1b0
b462f85
147cebb
88c61d3
 
49dbe26
 
 
 
147cebb
 
719f1b0
7e6fc99
 
 
 
 
 
 
147cebb
96fd200
147cebb
96fd200
147cebb
96fd200
 
147cebb
719f1b0
147cebb
 
 
719f1b0
7e6fc99
 
 
 
 
 
 
147cebb
 
 
 
719f1b0
7e6fc99
 
 
 
 
 
 
147cebb
 
 
 
719f1b0
7e6fc99
 
 
 
 
 
 
147cebb
96fd200
147cebb
 
96fd200
147cebb
719f1b0
147cebb
 
88c61d3
 
 
 
 
 
 
 
 
 
7e6fc99
 
88c61d3
7e6fc99
88c61d3
 
 
 
 
7e6fc99
49dbe26
147cebb
49dbe26
147cebb
88c61d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
058c80a
88c61d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147cebb
88c61d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49dbe26
 
 
fe70438
 
 
 
 
 
 
 
 
 
 
 
35fffae
fe70438
 
 
 
 
d443ad5
 
fe70438
 
99f75f9
fe70438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed33057
fe70438
 
 
99f75f9
35fffae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99f75f9
35fffae
 
 
 
 
 
 
 
99f75f9
35fffae
 
99f75f9
 
35fffae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99f75f9
35fffae
 
 
fe70438
058c80a
 
 
88c61d3
49dbe26
058c80a
147cebb
 
fe70438
35fffae
fe70438
 
 
 
99f75f9
35fffae
 
99f75f9
35fffae
fe70438
99f75f9
147cebb
719f1b0
7e6fc99
 
 
 
 
 
 
147cebb
 
 
 
 
 
96fd200
47ba072
 
 
 
 
b462f85
 
100c2eb
 
b462f85
 
 
 
 
 
 
 
100c2eb
 
 
 
b462f85
058c80a
100c2eb
b462f85
 
 
 
 
 
 
 
 
 
 
100c2eb
 
 
058c80a
100c2eb
058c80a
 
 
100c2eb
 
 
 
058c80a
 
 
 
100c2eb
058c80a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100c2eb
 
058c80a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b462f85
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
import re
import shutil
from typing import List, Tuple

import pandas as pd

from .logging_utils import get_logger

logger = get_logger()


def split_words(s):
    """Splits a string into words based on PascalCase, camelCase, snake_case, kebab-case, and numbers attached to strings.

    Args:
        s (str): The string to be split.

    Returns:
        list: The list of words obtained after splitting the string.
    """
    # Split PascalCase or camelCase
    s = re.sub(r"([A-Z][a-z]+)", r" \1", re.sub(r"([A-Z]+)", r" \1", s)).strip()
    # Split snake_case or kebab-case
    s = re.sub(r"[_-]", " ", s)
    # Split numbers attached to strings
    s = re.sub(r"([a-zA-Z])(\d)", r"\1 \2", s)
    s = re.sub(r"(\d)([a-zA-Z])", r"\1 \2", s)
    # Split the string into words based on spaces
    return s.split()


def is_camel_case(s):
    """Checks if a string is in camelCase.

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string is in camelCase, False otherwise.
    """
    return re.match(r"^[A-Z]+([a-z0-9]*[A-Z]*[a-z0-9]*)*$", s) is not None


def is_snake_case(s):
    """Checks if a string is in snake_case.

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string is in snake_case, False otherwise.
    """
    return re.match(r"^[a-z0-9]+(_[a-z0-9]+)*$", s) is not None


def camel_to_snake_case(s):
    """Converts a string from camelCase to snake_case.

    Args:
        s (str): The string to be converted.

    Returns:
        str: The string converted to snake_case.
    """
    # Add an underscore before every uppercase letter that is followed by a lowercase letter or digit and not preceded by an underscore, a hyphen or an uppercase letter
    s = re.sub(r"(?<=[^A-Z_-])([A-Z])", r"_\1", s)

    # Ensure there's an underscore before any uppercase letter that's followed by a lowercase letter or digit and comes after a sequence of uppercase letters
    s = re.sub(r"([A-Z]+)([A-Z][a-z0-9])", r"\1_\2", s)

    return s.lower()


def to_pretty_string(
    value,
    indent=0,
    indent_delta=4,
    max_chars=None,
    keys=None,
    item_label=None,
    float_format=None,
):
    """Constructs a formatted string representation of various data structures (dicts, lists, tuples, and DataFrames).

    Args:
        value: The Python data structure to be formatted.
        indent (int, optional): The current level of indentation. Defaults to 0.
        indent_delta (int, optional): Amount of spaces to add per indentation level. Defaults to 4.
        max_chars (int, optional): Max characters per line before wrapping. Defaults to terminal width - 10.
        keys (List[str], optional): For dicts, optionally specify keys and order.
        item_label (str, optional): Internal parameter for labeling items.
        float_format (str, optional): Format string for float values (e.g., ".2f"). Defaults to None.
    """
    max_chars = max_chars or shutil.get_terminal_size()[0] - 10
    indent_str = " " * indent
    res = ""

    if isinstance(value, dict):
        keys_to_print = keys if keys is not None else list(value.keys())

        for k in keys_to_print:
            if k not in value:
                raise ValueError(
                    f"Dictionary does not contain field '{k}' specified in 'keys' argument. "
                    f"The available keys are {list(value.keys())}"
                )

        for k in keys_to_print:
            v = value[k]
            item_header = f"{k} ({type(v).__name__})"
            res += f"{indent_str}{item_header}:\n"
            res += to_pretty_string(
                v,
                indent=indent + indent_delta,
                indent_delta=indent_delta,
                max_chars=max_chars,
                float_format=float_format,
            )

    elif isinstance(value, (list, tuple)):
        for i, v in enumerate(value):
            label = f"[{i}]" if isinstance(value, list) else f"({i})"
            item_header = f"{label} ({type(v).__name__})"
            res += f"{indent_str}{item_header}:\n"
            res += to_pretty_string(
                v,
                indent=indent + indent_delta,
                indent_delta=indent_delta,
                max_chars=max_chars,
                float_format=float_format,
            )

    elif isinstance(value, pd.DataFrame):
        line_width = max_chars - indent
        options = [
            "display.max_rows",
            None,
            "display.max_columns",
            None,
            "display.max_colwidth",
            None,
            "display.width",
            line_width,
            # 'display.colheader_justify', 'left'
        ]
        if float_format is not None:
            options.extend(
                ["display.float_format", ("{:," + float_format + "}").format]
            )
        with pd.option_context(*options):
            df_str = repr(value)

        lines = df_str.split("\n")
        for line in lines:
            if len(line) + len(indent_str) > line_width:
                start = 0
                while start < len(line):
                    wrap_chunk = line[start : start + line_width].rstrip()
                    res += f"{indent_str}{wrap_chunk}\n"
                    start += line_width
            else:
                res += f"{indent_str}{line.rstrip()}\n"

    else:
        # Handle scalar values, including floats
        if isinstance(value, float) and float_format:
            formatted_value = f"{value:{float_format}}"
        else:
            formatted_value = str(value)

        # Wrap lines according to max_chars
        line_width = max_chars - indent
        lines = formatted_value.split("\n")
        for line in lines:
            if len(line) + len(indent_str) > line_width:
                start = 0
                while start < len(line):
                    wrap_chunk = line[start : start + line_width].rstrip()
                    res += f"{indent_str}{wrap_chunk}\n"
                    start += line_width
            else:
                res += f"{indent_str}{line.rstrip()}\n"

    return res


def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:
    """Constructs the lines of a dictionary formatted as yaml.

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.
    """
    indent_delta_str = " " * indent_delta
    ticked_indent_delta_str = indent_delta_str[:-2] + "- "
    assert (
        indent_delta >= 2
    ), f"Needs at least 2 position indentations, for the case of list elements, that are to be preceded each by ' -'. Got indent_delta={indent_delta}."
    res = []  # computed hereunder as a list of lines, that are indented only at the end

    if isinstance(d, dict):
        if len(d) == 0:
            return ["{}"]
        for key, val in d.items():
            printable_key = f'"{key}"' if (" " in key) or (key == "") else key
            res.append(printable_key + ": ")
            yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
            assert len(yaml_for_val) > 0
            if len(yaml_for_val) == 1:
                res[-1] += yaml_for_val[0]
            else:
                for line in yaml_for_val:
                    res.append(indent_delta_str + line)
        return res

    if isinstance(d, list):
        if len(d) == 0:
            return ["[]"]
        for val in d:
            yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
            assert len(yaml_for_val) > 0
            res.append(ticked_indent_delta_str + yaml_for_val[0])
            for line in yaml_for_val[1:]:
                res.append(indent_delta_str + line)
        return res

    # d1 = re.sub(r"(\n+)", r'"\1"', str(d))
    d1 = str(d).replace("\n", "\\n").replace('"', '\\"')
    if "\\n" in d1 or d1 == "":
        d1 = f'"{d1}"'
    return [d1]


def construct_dict_as_python_lines(d, indent_delta=4) -> List[str]:
    """Constructs the lines of a dictionary formatted as a piece of python code.

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.
    """
    indent_delta_str = " " * indent_delta
    res = []  # computed hereunder as a list of lines, that are indented only at the end

    if isinstance(d, dict):
        istype = False
        if len(d) == 0:
            return ["{}"]
        if "__type__" in d:
            istype = True
            res = ["__type__" + d["__type__"] + "("]
            if len(d) == 1:
                res[0] += ")"
                return res
        else:
            res = ["{"]
        for key, val in d.items():
            if key == "__type__":
                continue
            printable_key = f'"{key}"' if not istype else key
            res.append(printable_key + ("=" if istype else ": "))
            py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)
            assert len(py_for_val) > 0
            if len(py_for_val) == 1:
                res[-1] += py_for_val[0] + ","
            else:
                res[-1] += py_for_val[0]
                if py_for_val[0].startswith("{") or py_for_val[0].startswith("["):
                    for line in py_for_val[1:-1]:
                        res.append(indent_delta_str + line)
                else:
                    # val is type, its inner lines are already indented
                    res.extend(py_for_val[1:-1])
                res.append(py_for_val[-1] + ",")
        res.append(")" if istype else "}")
        if istype:
            for i in range(1, len(res) - 1):
                res[i] = indent_delta_str + res[i]
        return res

    if isinstance(d, list):
        if len(d) == 0:
            return ["[]"]
        res = ["["]
        for val in d:
            py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)
            assert len(py_for_val) > 0
            for line in py_for_val[:-1]:
                res.append(line)
            res.append(py_for_val[-1] + ",")
        res.append("]")
        return res

    # d1 = re.sub(r"(\n+)", r'"\1"', str(d))
    if isinstance(d, str):
        return [f'"{d}"']
    if d is None or isinstance(d, (int, float, bool)):
        return [f"{d}"]
    raise RuntimeError(f"unrecognized value to print as python: {d}")


def print_dict(
    d, indent=0, indent_delta=4, max_chars=None, keys_to_print=None, log_level="info"
):
    dict_str = to_pretty_string(d, indent, indent_delta, max_chars, keys_to_print)
    dict_str = "\n" + dict_str
    getattr(logger, log_level)(dict_str)


def print_dict_as_yaml(d: dict, indent_delta=2) -> str:
    yaml_lines = construct_dict_as_yaml_lines(d, indent_delta=indent_delta)
    # yaml_lines = [re.sub(r"(\n+)", r'"\1"', line) for line in yaml_lines]
    # yaml_lines = [line.replace("\n", "\\n") for line in yaml_lines]
    return "\n".join(yaml_lines)


def print_dict_as_python(d: dict, indent_delta=4) -> str:
    py_lines = construct_dict_as_python_lines(d, indent_delta=indent_delta)
    assert len(py_lines) > 0
    return "\n".join(py_lines)


def nested_tuple_to_string(nested_tuple: tuple) -> str:
    """Converts a nested tuple to a string, with elements separated by underscores.

    Args:
        nested_tuple (tuple): The nested tuple to be converted.

    Returns:
        str: The string representation of the nested tuple.
    """
    result = []
    for item in nested_tuple:
        if isinstance(item, tuple):
            result.append(nested_tuple_to_string(item))
        else:
            result.append(str(item))
    return "_".join(result)


def is_made_of_sub_strings(string, sub_strings):
    pattern = "^(" + "|".join(map(re.escape, sub_strings)) + ")+$"
    return bool(re.match(pattern, string))


# Giveמ all the lines of a card preparer file, e.g. all the lines of prepare/cards/cohere_for_ai.py,
# and an object name, e.g. TaskCard(,
# return the ordinal number of the line that starts that object, in our example: the
# line number of the following line (notice that the line where TaskCard is imported
# is not supposed to return):
#         card = TaskCard(
# and the line number of the line that ends the object, in our case the line that include
# the matching close:
#         )
# This util depends on ruff to ensure this setting of the card file: that a close of one
# tag and the open of the next tag, do not sit in same line, when both tags being
# major level within TaskCard.
# It also prepares for the case that  __description__ tag does not contain balanced
# parentheses, since it is often cut in the middle, (with  "... see more at")
# flake8: noqa: B007
# flake8: noqa: C901
def lines_defining_obj_in_card(
    all_lines: List[str], obj_name: str, start_search_at_line: int = 0
) -> Tuple[int, int]:
    for starting_line in range(start_search_at_line, len(all_lines)):
        line = all_lines[starting_line]
        if obj_name in line:
            break
    if obj_name not in line:
        # obj_name found no where in the input lines
        return (-1, -1)
    num_of_opens = 0
    num_of_closes = 0
    ending_line = starting_line - 1
    while ending_line < len(all_lines):
        ending_line += 1

        if "__description__" in all_lines[ending_line]:
            # can not trust parentheses inside description, because this is mainly truncated
            # free text.
            # We do trust the indentation enforced by ruff, and the way we build __description__:
            # a line consisting of only __description__=(
            # followed by one or more lines of text, can not trust opens and closes
            # in them, followed by a line consisting of only:  ),
            # where the ) is indented with the beginning of __description__
            # We also prepare for the case that, when not entered by us, __description__=
            # is not followed by a ( and the whole description does not end with a single ) in its line.
            # We build on ruff making the line following the description start with same indentation
            # or 4 less (i.e., the following line is the closing of the card).
            tag_indentation = all_lines[ending_line].index("__description__")
            starts_with_parent = "__description__=(" in all_lines[ending_line]
            if starts_with_parent:
                last_line_to_start_with = (" " * tag_indentation) + r"\)"
            else:
                # actually, the line that follows the description
                last_line_to_start_with1 = (" " * tag_indentation) + "[^ ]"
                last_line_to_start_with2 = (" " * (tag_indentation - 4)) + "[^ ]"
                last_line_to_start_with = (
                    "("
                    + last_line_to_start_with1
                    + "|"
                    + last_line_to_start_with2
                    + ")"
                )
            ending_line += 1
            while not re.search("^" + last_line_to_start_with, all_lines[ending_line]):
                ending_line += 1
            if "__description__" in obj_name:
                return (
                    starting_line,
                    ending_line if starts_with_parent else ending_line - 1,
                )

            if starts_with_parent:
                ending_line += 1

            # we conrinue in card, having passed the description, ending line points
            # to the line that follows description

        num_of_opens += len(re.findall(r"[({[]", all_lines[ending_line]))
        num_of_closes += len(re.findall(r"[)}\]]", all_lines[ending_line]))
        if num_of_closes == num_of_opens:
            break

    if num_of_closes != num_of_opens:
        raise ValueError(
            "input lines were exhausted before the matching close is found"
        )

    return (starting_line, ending_line)