| import os | |
| import re | |
| import tiktoken | |
def singleton(cls, *args, **kw):
    """Decorator factory that caches one instance of *cls* per process.

    The cache key combines the class with the current PID, so a forked
    worker lazily builds its own instance instead of sharing the
    parent's.  Construction arguments are captured at decoration time.
    """
    cache = {}

    def _singleton():
        # One entry per (class, pid) pair; instance is built on first call.
        cache_key = f"{cls}{os.getpid()}"
        if cache_key not in cache:
            cache[cache_key] = cls(*args, **kw)
        return cache[cache_key]

    return _singleton
def rmSpace(txt):
    """Collapse runs of spaces that touch a symbol character.

    A run of spaces is deleted when the character on either side is
    neither a letter, a digit, ``.`` nor ``,`` (case-insensitive), e.g.
    ``"a ! b"`` becomes ``"a!b"`` while ``"a b"`` is left alone.
    """
    # Spaces preceded by a symbol: "<sym>   x" -> "<sym>x"
    sym_before = r"([^a-z0-9.,]) +([^ ])"
    # Spaces followed by a symbol: "x   <sym>" -> "x<sym>"
    sym_after = r"([^ ]) +([^a-z0-9.,])"
    collapsed = re.sub(sym_before, r"\1\2", txt, flags=re.IGNORECASE)
    return re.sub(sym_after, r"\1\2", collapsed, flags=re.IGNORECASE)
def findMaxDt(fnm):
    """Return the lexicographically largest timestamp line in *fnm*.

    The file is expected to hold one ``YYYY-MM-DD HH:MM:SS`` string per
    line; lines equal to ``"nan"`` are skipped.  Plain string comparison
    is used, which orders correctly for this zero-padded format.

    Returns the epoch string ``"1970-01-01 00:00:00"`` when the file is
    missing, unreadable, or contains no usable line (best-effort).
    """
    m = "1970-01-01 00:00:00"
    try:
        with open(fnm, "r") as f:
            # Idiomatic buffered iteration instead of a readline/while loop.
            for line in f:
                line = line.strip("\n")
                if line == 'nan':
                    continue
                if line > m:
                    m = line
    except (OSError, UnicodeDecodeError):
        # Best-effort contract: an unreadable file yields the epoch default.
        # (Narrowed from a bare `except Exception: pass` that hid all bugs.)
        pass
    return m
def findMaxTm(fnm):
    """Return the largest integer found in *fnm* (one value per line).

    Lines equal to ``"nan"`` are skipped, as are lines that do not parse
    as an integer.  Returns 0 when the file is missing, unreadable, or
    holds no usable value (best-effort).
    """
    m = 0
    try:
        with open(fnm, "r") as f:
            # Idiomatic buffered iteration instead of a readline/while loop.
            for line in f:
                line = line.strip("\n")
                if line == 'nan':
                    continue
                try:
                    val = int(line)
                except ValueError:
                    # Bug fix: a single malformed line used to raise out of
                    # the loop and silently return a partial maximum; now we
                    # just skip it and keep scanning.
                    continue
                if val > m:
                    m = val
    except (OSError, UnicodeDecodeError):
        # Best-effort contract: an unreadable file yields the default 0.
        # (Narrowed from a bare `except Exception: pass` that hid all bugs.)
        pass
    return m
| encoder = tiktoken.encoding_for_model("gpt-3.5-turbo") | |
def num_tokens_from_string(string: str) -> int:
    """Return how many tokens *string* encodes to with the module-level tokenizer."""
    return len(encoder.encode(string))
def truncate(string: str, max_len: int) -> str:
    """Return *string* truncated to at most *max_len* tokens.

    The text is encoded with the module-level ``encoder``, the token
    sequence is sliced to ``max_len`` entries, then decoded back — so
    truncation happens on token boundaries, not characters.
    """
    # Fix: return annotation previously claimed ``int`` although the
    # decoded value is a ``str``.
    return encoder.decode(encoder.encode(string)[:max_len])