File size: 15,997 Bytes
147cebb 719f1b0 b462f85 147cebb 88c61d3 49dbe26 147cebb 719f1b0 7e6fc99 147cebb 96fd200 147cebb 96fd200 147cebb 96fd200 147cebb 719f1b0 147cebb 719f1b0 7e6fc99 147cebb 719f1b0 7e6fc99 147cebb 719f1b0 7e6fc99 147cebb 96fd200 147cebb 96fd200 147cebb 719f1b0 147cebb 88c61d3 7e6fc99 88c61d3 7e6fc99 88c61d3 7e6fc99 49dbe26 147cebb 49dbe26 147cebb 88c61d3 058c80a 88c61d3 147cebb 88c61d3 49dbe26 fe70438 35fffae fe70438 d443ad5 fe70438 99f75f9 fe70438 ed33057 fe70438 99f75f9 35fffae 99f75f9 35fffae 99f75f9 35fffae 99f75f9 35fffae 99f75f9 35fffae fe70438 058c80a 88c61d3 49dbe26 058c80a 147cebb fe70438 35fffae fe70438 99f75f9 35fffae 99f75f9 35fffae fe70438 99f75f9 147cebb 719f1b0 7e6fc99 147cebb 96fd200 47ba072 b462f85 100c2eb b462f85 100c2eb b462f85 058c80a 100c2eb b462f85 100c2eb 058c80a 100c2eb 058c80a 100c2eb 058c80a 100c2eb 058c80a 100c2eb 058c80a b462f85 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 |
import re
import shutil
from typing import List, Tuple
import pandas as pd
from .logging_utils import get_logger
logger = get_logger()
def split_words(s):
    """Break an identifier-like string into its component words.

    Word boundaries are recognised at PascalCase / camelCase humps, at
    underscores and hyphens, and between letters and digits.

    Args:
        s (str): The string to be split.

    Returns:
        list: The words found, in their order of appearance.
    """
    # First put a space in front of every run of capitals, then in front of
    # every capital that opens a lowercase word, so that an acronym followed
    # by a word ("HTTPResponse") ends up as two words ("HTTP Response").
    spaced_capitals = re.sub(r"([A-Z]+)", r" \1", s)
    text = re.sub(r"([A-Z][a-z]+)", r" \1", spaced_capitals).strip()

    # Remaining separators, applied in order: snake/kebab delimiters, then
    # letter->digit and digit->letter transitions.
    substitutions = (
        (r"[_-]", " "),
        (r"([a-zA-Z])(\d)", r"\1 \2"),
        (r"(\d)([a-zA-Z])", r"\1 \2"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Whitespace-splitting also swallows the repeated spaces introduced above.
    return text.split()
def is_camel_case(s):
    """Checks if a string is a capitalized camel-case (PascalCase-style) identifier.

    The string must start with an uppercase ASCII letter and may be followed
    only by ASCII letters and digits (no underscores, hyphens or spaces).

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string has the form described above, False otherwise.
    """
    # This pattern accepts exactly the same strings as the previous
    # r"^[A-Z]+([a-z0-9]*[A-Z]*[a-z0-9]*)*$", but without nested quantifiers,
    # which made rejection of inputs such as "Aaaaa..._" take exponential time.
    return re.match(r"^[A-Z][A-Za-z0-9]*$", s) is not None
def is_snake_case(s):
    """Tell whether a string is written in snake_case.

    A snake_case string consists of one or more groups of lowercase ASCII
    letters and digits, joined by single underscores.

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string is in snake_case, False otherwise.
    """
    snake_pattern = r"^[a-z0-9]+(_[a-z0-9]+)*$"
    return bool(re.match(snake_pattern, s))
def camel_to_snake_case(s):
    """Rewrite a camelCase / PascalCase string as snake_case.

    Args:
        s (str): The string to be converted.

    Returns:
        str: The lower-cased string with underscores inserted at word boundaries.
    """
    # Boundary 1: an uppercase letter whose predecessor is neither uppercase,
    # an underscore nor a hyphen starts a new word ("camelCase" -> "camel_Case").
    after_humps = re.sub(r"(?<=[^A-Z_-])([A-Z])", r"_\1", s)

    # Boundary 2: inside a run of capitals, the last capital belongs to the
    # following word when a lowercase letter or digit comes right after it
    # ("HTTPResponse" -> "HTTP_Response").
    after_acronyms = re.sub(r"([A-Z]+)([A-Z][a-z0-9])", r"\1_\2", after_humps)

    return after_acronyms.lower()
def _indent_and_wrap(text, indent_str, line_width):
    """Indent every line of ``text`` and hard-wrap the lines that are too long.

    ``line_width`` must be at least 1. A line is wrapped when its length plus
    the indentation exceeds ``line_width``; it is then cut into consecutive
    chunks of ``line_width`` characters. Trailing whitespace of every emitted
    line is removed and each emitted line ends with a newline.
    """
    out = []
    for line in text.split("\n"):
        if len(line) + len(indent_str) > line_width:
            out.extend(
                f"{indent_str}{line[start : start + line_width].rstrip()}\n"
                for start in range(0, len(line), line_width)
            )
        else:
            out.append(f"{indent_str}{line.rstrip()}\n")
    return "".join(out)


def to_pretty_string(
    value,
    indent=0,
    indent_delta=4,
    max_chars=None,
    keys=None,
    item_label=None,
    float_format=None,
):
    """Constructs a formatted string representation of various data structures (dicts, lists, tuples, and DataFrames).

    Dicts, lists and tuples are rendered one item per header line
    (``<key or index> (<type name>):``) followed by the recursively rendered
    item, indented by ``indent_delta`` more spaces. DataFrames are rendered via
    their ``repr`` and every other value via ``str`` (or ``float_format`` for
    floats); those leaf renderings are hard-wrapped to fit ``max_chars``.

    Args:
        value: The Python data structure to be formatted.
        indent (int, optional): The current level of indentation. Defaults to 0.
        indent_delta (int, optional): Amount of spaces to add per indentation level. Defaults to 4.
        max_chars (int, optional): Max characters per line before wrapping. Defaults to terminal width - 10.
        keys (List[str], optional): For dicts, optionally specify keys and order. Applies to the
            top-level dict only; it is not passed on to nested values.
        item_label (str, optional): Internal parameter for labeling items. Currently unused.
        float_format (str, optional): Format string for float values (e.g., ".2f"). Defaults to None.

    Returns:
        str: The formatted text; every line is terminated by a newline.

    Raises:
        ValueError: If ``value`` is a dict and ``keys`` names a key it does not contain.
    """
    max_chars = max_chars or shutil.get_terminal_size()[0] - 10
    indent_str = " " * indent

    if isinstance(value, (dict, list, tuple)):
        if isinstance(value, dict):
            keys_to_print = keys if keys is not None else list(value.keys())
            # Validate all requested keys up front so nothing is rendered on failure.
            for k in keys_to_print:
                if k not in value:
                    raise ValueError(
                        f"Dictionary does not contain field '{k}' specified in 'keys' argument. "
                        f"The available keys are {list(value.keys())}"
                    )
            labelled_items = [(f"{k}", value[k]) for k in keys_to_print]
        elif isinstance(value, list):
            labelled_items = [(f"[{i}]", v) for i, v in enumerate(value)]
        else:
            labelled_items = [(f"({i})", v) for i, v in enumerate(value)]

        parts = []
        for label, item in labelled_items:
            parts.append(f"{indent_str}{label} ({type(item).__name__}):\n")
            parts.append(
                to_pretty_string(
                    item,
                    indent=indent + indent_delta,
                    indent_delta=indent_delta,
                    max_chars=max_chars,
                    float_format=float_format,
                )
            )
        return "".join(parts)

    # Deep nesting (or a small max_chars) can leave no room at all; clamp to one
    # character per line, since a non-positive width would make wrapping loop forever.
    line_width = max(max_chars - indent, 1)

    if isinstance(value, pd.DataFrame):
        options = [
            "display.max_rows",
            None,
            "display.max_columns",
            None,
            "display.max_colwidth",
            None,
            "display.width",
            line_width,
        ]
        if float_format is not None:
            options.extend(
                ["display.float_format", ("{:," + float_format + "}").format]
            )
        with pd.option_context(*options):
            rendered = repr(value)
    elif isinstance(value, float) and float_format:
        rendered = f"{value:{float_format}}"
    else:
        rendered = str(value)

    return _indent_and_wrap(rendered, indent_str, line_width)
def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:
    """Constructs the lines of a dictionary formatted as yaml.

    Dicts and lists are rendered recursively. A nested value that renders to a
    single line is placed on the same line as its key; otherwise its lines
    follow the key, indented by ``indent_delta``. List elements are introduced
    by ``- `` occupying the last two positions of the indentation. Any other
    value is rendered via ``str``, with newlines and double quotes
    backslash-escaped, and wrapped in double quotes when it contained a
    newline or is empty.

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.

    Returns:
        List[str]: The yaml lines; never empty.
    """
    assert (
        indent_delta >= 2
    ), f"Needs at least 2 position indentations, for the case of list elements, that are to be preceded each by ' -'. Got indent_delta={indent_delta}."
    indent_delta_str = " " * indent_delta
    ticked_indent_delta_str = indent_delta_str[:-2] + "- "

    if isinstance(d, dict):
        if len(d) == 0:
            return ["{}"]
        res = []
        for key, val in d.items():
            # Keys need not be strings (e.g. ints); stringify before inspecting
            # them, so that the blank check below cannot raise TypeError.
            key_text = str(key)
            needs_quotes = (" " in key_text) or (key_text == "")
            printable_key = f'"{key_text}"' if needs_quotes else key_text
            yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
            if len(yaml_for_val) == 1:
                res.append(printable_key + ": " + yaml_for_val[0])
            else:
                res.append(printable_key + ": ")
                res.extend(indent_delta_str + line for line in yaml_for_val)
        return res

    if isinstance(d, list):
        if len(d) == 0:
            return ["[]"]
        res = []
        for val in d:
            yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
            # Only the first line of an element carries the "- " marker.
            res.append(ticked_indent_delta_str + yaml_for_val[0])
            res.extend(indent_delta_str + line for line in yaml_for_val[1:])
        return res

    d1 = str(d).replace("\n", "\\n").replace('"', '\\"')
    if "\\n" in d1 or d1 == "":
        d1 = f'"{d1}"'
    return [d1]
def construct_dict_as_python_lines(d, indent_delta=4) -> List[str]:
    """Render a nested structure as lines resembling a piece of python code.

    A dict holding a ``"__type__"`` entry is rendered call-like: an opening
    line ``__type__<type value>(``, then one ``key=value,`` entry per remaining
    item, then ``)``. Any other dict is rendered as ``{`` ... ``}`` with
    ``"key": value,`` entries, and a list as ``[`` ... ``]`` with one
    ``value,`` entry per element. Strings are wrapped in double quotes as-is
    (no escaping); ``None``, ints, floats and bools are rendered via ``str``.

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 4.

    Returns:
        List[str]: The rendered lines; never empty.

    Raises:
        RuntimeError: If a value of any other type is encountered.
    """
    pad = " " * indent_delta

    if isinstance(d, dict):
        if not d:
            return ["{}"]

        as_call = "__type__" in d
        if as_call:
            lines = ["__type__" + d["__type__"] + "("]
            if len(d) == 1:
                # Nothing but the type itself: a one-line, argument-less call.
                lines[0] += ")"
                return lines
        else:
            lines = ["{"]

        for name, item in d.items():
            if name == "__type__":
                continue
            prefix = (name + "=") if as_call else (f'"{name}"' + ": ")
            rendered = construct_dict_as_python_lines(item, indent_delta=indent_delta)
            assert len(rendered) > 0
            opening, closing = rendered[0], rendered[-1]

            if len(rendered) == 1:
                lines.append(prefix + opening + ",")
                continue

            lines.append(prefix + opening)
            inner = rendered[1:-1]
            if opening.startswith(("{", "[")):
                lines.extend(pad + inner_line for inner_line in inner)
            else:
                # The value is call-like; its inner lines arrive already indented.
                lines.extend(inner)
            lines.append(closing + ",")

        lines.append(")" if as_call else "}")
        if as_call:
            # Only call-like renderings indent their own body (everything
            # between the opening and the closing line).
            lines[1:-1] = [pad + body_line for body_line in lines[1:-1]]
        return lines

    if isinstance(d, list):
        if not d:
            return ["[]"]
        lines = ["["]
        for item in d:
            rendered = construct_dict_as_python_lines(item, indent_delta=indent_delta)
            assert len(rendered) > 0
            lines.extend(rendered[:-1])
            lines.append(rendered[-1] + ",")
        lines.append("]")
        return lines

    if isinstance(d, str):
        return [f'"{d}"']
    if d is None or isinstance(d, (int, float, bool)):
        return [f"{d}"]
    raise RuntimeError(f"unrecognized value to print as python: {d}")
def print_dict(
    d, indent=0, indent_delta=4, max_chars=None, keys_to_print=None, log_level="info"
):
    """Log a pretty-printed rendering of ``d`` through the module logger.

    Args:
        d: The structure to render (passed to ``to_pretty_string``).
        indent (int, optional): Initial indentation. Defaults to 0.
        indent_delta (int, optional): Spaces added per nesting level. Defaults to 4.
        max_chars (int, optional): Max characters per line, forwarded to ``to_pretty_string``.
        keys_to_print (optional): Forwarded as the ``keys`` argument of ``to_pretty_string``.
        log_level (str, optional): Name of the logger method to call. Defaults to "info".
    """
    rendered = to_pretty_string(
        d,
        indent=indent,
        indent_delta=indent_delta,
        max_chars=max_chars,
        keys=keys_to_print,
    )
    # A leading newline makes the rendering start on its own line in the log.
    emit = getattr(logger, log_level)
    emit("\n" + rendered)
def print_dict_as_yaml(d: dict, indent_delta=2) -> str:
    """Return ``d`` formatted as yaml text (nothing is actually printed).

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): Spaces added per nesting level. Defaults to 2.

    Returns:
        str: The lines produced by ``construct_dict_as_yaml_lines``, joined by newlines.
    """
    return "\n".join(construct_dict_as_yaml_lines(d, indent_delta=indent_delta))
def print_dict_as_python(d: dict, indent_delta=4) -> str:
    """Return ``d`` formatted as python-like text (nothing is actually printed).

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): Spaces added per nesting level. Defaults to 4.

    Returns:
        str: The lines produced by ``construct_dict_as_python_lines``, joined by newlines.
    """
    rendered_lines = construct_dict_as_python_lines(d, indent_delta=indent_delta)
    assert len(rendered_lines) > 0
    text = "\n".join(rendered_lines)
    return text
def nested_tuple_to_string(nested_tuple: tuple) -> str:
    """Flatten a (possibly nested) tuple into one underscore-separated string.

    Args:
        nested_tuple (tuple): The nested tuple to be converted.

    Returns:
        str: The ``str`` of every non-tuple element, in depth-first order,
            joined by underscores.
    """
    return "_".join(
        nested_tuple_to_string(element) if isinstance(element, tuple) else str(element)
        for element in nested_tuple
    )
def is_made_of_sub_strings(string, sub_strings):
    """Checks whether a string is a concatenation of one or more of the given sub-strings.

    Sub-strings are matched literally (they are regex-escaped) and may be
    repeated and combined in any order.

    Args:
        string (str): The string to be checked.
        sub_strings (Iterable[str]): The allowed building blocks.

    Returns:
        bool: True if the whole of ``string`` consists of the sub-strings only.
    """
    alternatives = "|".join(map(re.escape, sub_strings))
    # fullmatch, rather than match with "$": "$" also matches just before a
    # trailing newline, which would wrongly accept e.g. "ab\n" for ["ab"].
    return re.fullmatch("(?:" + alternatives + ")+", string) is not None
# Given all the lines of a card preparer file, e.g. all the lines of prepare/cards/cohere_for_ai.py,
# and an object name, e.g. TaskCard(,
# return the ordinal number of the line that starts that object, in our example: the
# line number of the following line (notice that the line where TaskCard is imported
# is not supposed to return):
# card = TaskCard(
# and the line number of the line that ends the object, in our case the line that include
# the matching close:
# )
# This util depends on ruff to ensure this setting of the card file: that a close of one
# tag and the open of the next tag, do not sit in same line, when both tags being
# major level within TaskCard.
# It also prepares for the case that __description__ tag does not contain balanced
# parentheses, since it is often cut in the middle, (with "... see more at")
# flake8: noqa: B007
# flake8: noqa: C901
def lines_defining_obj_in_card(
    all_lines: List[str], obj_name: str, start_search_at_line: int = 0
) -> Tuple[int, int]:
    """Locate the lines that define an object inside a card preparer file.

    The object starts at the first line, at or after ``start_search_at_line``,
    that contains ``obj_name``. It ends at the first line at which the running
    counts of opening (``(``, ``{``, ``[``) and closing (``)``, ``}``, ``]``)
    brackets, accumulated from the starting line on, are equal.

    Brackets inside a ``__description__`` tag are not counted, since that tag
    holds free text whose brackets may be unbalanced. The extent of the
    description is derived from indentation instead:

    * ``__description__=(`` ends at the next line that starts with ``)`` at
      the indentation of the tag;
    * otherwise it ends just before the next line whose first non-blank
      character sits at the tag's indentation, or 4 columns less.

    Args:
        all_lines (List[str]): The lines of the file.
        obj_name (str): Text identifying the opening line of the object, e.g. ``TaskCard(``.
        start_search_at_line (int, optional): Index of the first line to search. Defaults to 0.

    Returns:
        Tuple[int, int]: Indices of the first and last line of the object, or
            ``(-1, -1)`` if ``obj_name`` occurs in none of the searched lines.

    Raises:
        ValueError: If the lines are exhausted before the object (or a
            description inside it) is closed.
    """
    exhausted_message = "input lines were exhausted before the matching close is found"
    num_lines = len(all_lines)

    starting_line = next(
        (
            i
            for i in range(start_search_at_line, num_lines)
            if obj_name in all_lines[i]
        ),
        None,
    )
    if starting_line is None:
        # obj_name found nowhere in the searched lines (this also covers empty input)
        return (-1, -1)

    num_of_opens = 0
    num_of_closes = 0
    ending_line = starting_line
    while ending_line < num_lines:
        current = all_lines[ending_line]
        if "__description__" in current:
            tag_indentation = current.index("__description__")
            starts_with_parent = "__description__=(" in current
            if starts_with_parent:
                # the line closing the description: ")" aligned with the tag
                terminator = "^" + (" " * tag_indentation) + r"\)"
            else:
                # actually, the line that follows the description
                same_level = (" " * tag_indentation) + "[^ ]"
                outer_level = (" " * (tag_indentation - 4)) + "[^ ]"
                terminator = "^(" + same_level + "|" + outer_level + ")"

            ending_line += 1
            while ending_line < num_lines and not re.search(
                terminator, all_lines[ending_line]
            ):
                ending_line += 1
            if ending_line >= num_lines:
                raise ValueError(exhausted_message)

            if "__description__" in obj_name:
                # The description itself was asked for: without the wrapping
                # parentheses, its last line is the one before the terminator.
                return (
                    starting_line,
                    ending_line if starts_with_parent else ending_line - 1,
                )

            if starts_with_parent:
                # step over the ")," line that closes the description
                ending_line += 1
                if ending_line >= num_lines:
                    raise ValueError(exhausted_message)
            # we continue in the card, having passed the description; ending_line
            # points to the line that follows the description

        num_of_opens += len(re.findall(r"[({[]", all_lines[ending_line]))
        num_of_closes += len(re.findall(r"[)}\]]", all_lines[ending_line]))
        if num_of_closes == num_of_opens:
            return (starting_line, ending_line)
        ending_line += 1

    raise ValueError(exhausted_message)
|