Abhaykoul committed
Commit c51c9b9 · verified · 1 Parent(s): 5021f5b

Delete webscout/Local

webscout/Local/__init__.py DELETED
@@ -1,10 +0,0 @@
- # webscout\Local\__init__.py
- from ._version import __version__, __llama_cpp_version__
-
-
- from . import formats
- from . import samplers
- from . import utils
-
- from .model import Model
- from .thread import Thread
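
For reference, a minimal sketch of the public API removed by this commit, based on the imports in the deleted `__init__.py` above (the GGUF path below is a placeholder, not a file from this repo):

```
from webscout.Local import Model, Thread, formats, samplers

# load a local GGUF model via llama-cpp-python and start a chat thread
model = Model('path/to/model.gguf')
thread = Thread(model, formats.chatml, sampler=samplers.DefaultSampling)
thread.interact()  # interactive terminal chat session (see thread.py below)
```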
 
webscout/Local/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (505 Bytes)
 
webscout/Local/__pycache__/_version.cpython-311.pyc DELETED
Binary file (258 Bytes)
 
webscout/Local/__pycache__/formats.cpython-311.pyc DELETED
Binary file (11.2 kB)
 
webscout/Local/__pycache__/model.cpython-311.pyc DELETED
Binary file (31.6 kB)
 
webscout/Local/__pycache__/samplers.cpython-311.pyc DELETED
Binary file (4.23 kB)
 
webscout/Local/__pycache__/test.cpython-311.pyc DELETED
Binary file (37.6 kB)
 
webscout/Local/__pycache__/thread.cpython-311.pyc DELETED
Binary file (33.5 kB)
 
webscout/Local/__pycache__/utils.cpython-311.pyc DELETED
Binary file (9.43 kB)
 
webscout/Local/_version.py DELETED
@@ -1,3 +0,0 @@
- from llama_cpp import __version__ as __llama_cpp_version__
-
- __version__ = '2.7'
 
webscout/Local/formats.py DELETED
@@ -1,535 +0,0 @@
1
- from ._version import __version__, __llama_cpp_version__
2
-
3
- from typing import Callable, Union, Any
4
-
5
-
6
- class AdvancedFormat:
7
-
8
- def __init__(self, base_dict: dict[str, Union[str, list]]):
9
- self._base_dict = base_dict
10
- self.overrides = {}
11
-
12
- def __getitem__(self, key: str) -> Any:
13
- if key in self.overrides:
14
- return str(self.overrides[key]())
15
- else:
16
- return self._base_dict[key]
17
-
18
- def __repr__(self) -> str:
19
- # NOTE: This method does not represent overrides
20
- return repr(self._base_dict)
21
-
22
- def keys(self):
23
- return self._base_dict.keys()
24
-
25
- def override(self, key: str, fn: Callable) -> None:
26
- self.overrides[key] = fn
27
-
28
- def wrap(self, prompt: str) -> str:
29
- return self['system_prefix'] + \
30
- self['system_content'] + \
31
- self['system_suffix'] + \
32
- self['user_prefix'] + \
33
- prompt + \
34
- self['user_suffix'] + \
35
- self['bot_prefix']
36
-
37
-
38
- def wrap(
39
- prompt: str,
40
- format: dict[str, Union[str, list]]
41
- ) -> str:
42
- """Wrap a given string in any prompt format for single-turn completion"""
43
- return format['system_prefix'] + \
44
- format['system_content'] + \
45
- format['system_suffix'] + \
46
- format['user_prefix'] + \
47
- prompt + \
48
- format['user_suffix'] + \
49
- format['bot_prefix']
50
-
51
-
52
- blank: dict[str, Union[str, list]] = {
53
- "system_prefix": "",
54
- "system_content": "",
55
- "system_suffix": "",
56
- "user_prefix": "",
57
- "user_content": "",
58
- "user_suffix": "",
59
- "bot_prefix": "",
60
- "bot_content": "",
61
- "bot_suffix": "",
62
- "stops": []
63
- }
64
-
65
- # https://github.com/tatsu-lab/stanford_alpaca
66
- alpaca: dict[str, Union[str, list]] = {
67
- "system_prefix": "",
68
- "system_content": "Below is an instruction that describes a task. " + \
69
- "Write a response that appropriately completes the request.",
70
- "system_suffix": "\n\n",
71
- "user_prefix": "### Instruction:\n",
72
- "user_content": "",
73
- "user_suffix": "\n\n",
74
- "bot_prefix": "### Response:\n",
75
- "bot_content": "",
76
- "bot_suffix": "\n\n",
77
- "stops": ['###', 'Instruction:', '\n\n\n']
78
- }
79
-
80
- # https://docs.mistral.ai/models/
81
- # As a reference, here is the format used to tokenize instructions during fine-tuning:
82
- # ```
83
- # [START_SYMBOL_ID] +
84
- # tok("[INST]") + tok(USER_MESSAGE_1) + tok("[/INST]") +
85
- # tok(BOT_MESSAGE_1) + [END_SYMBOL_ID] +
86
- # …
87
- # tok("[INST]") + tok(USER_MESSAGE_N) + tok("[/INST]") +
88
- # tok(BOT_MESSAGE_N) + [END_SYMBOL_ID]
89
- # ```
90
- # In the pseudo-code above, note that the tokenize method should not add a BOS or EOS token automatically, but should add a prefix space.
91
-
92
- mistral_instruct: dict[str, Union[str, list]] = {
93
- "system_prefix": "",
94
- "system_content": "",
95
- "system_suffix": "",
96
- "user_prefix": " [INST] ",
97
- "user_content": "",
98
- "user_suffix": " [/INST]",
99
- "bot_prefix": "",
100
- "bot_content": "",
101
- "bot_suffix": "",
102
- "stops": []
103
- }
104
-
105
- # https://docs.mistral.ai/platform/guardrailing/
106
- mistral_instruct_safe: dict[str, Union[str, list]] = {
107
- "system_prefix": "",
108
- "system_content": "",
109
- "system_suffix": "",
110
- "user_prefix": " [INST] Always assist with care, respect, and truth. " + \
111
- "Respond with utmost utility yet securely. Avoid harmful, unethical, " + \
112
- "prejudiced, or negative content. Ensure replies promote fairness and " + \
113
- "positivity. ",
114
- "user_content": "",
115
- "user_suffix": " [/INST]",
116
- "bot_prefix": "",
117
- "bot_content": "",
118
- "bot_suffix": "",
119
- "stops": []
120
- }
121
-
122
- # https://github.com/openai/openai-python/blob/main/chatml.md
123
- chatml: dict[str, Union[str, list]] = {
124
- "system_prefix": "<|im_start|>system\n",
125
- "system_content": "",
126
- "system_suffix": "<|im_end|>\n",
127
- "user_prefix": "<|im_start|>user\n",
128
- "user_content": "",
129
- "user_suffix": "<|im_end|>\n",
130
- "bot_prefix": "<|im_start|>assistant\n",
131
- "bot_content": "",
132
- "bot_suffix": "<|im_end|>\n",
133
- "stops": ['<|im_start|>']
134
- }
135
-
136
- # https://huggingface.co/blog/llama2
137
- # system message relaxed to avoid undue refusals
138
- llama2chat: dict[str, Union[str, list]] = {
139
- "system_prefix": "[INST] <<SYS>>\n",
140
- "system_content": "You are a helpful AI assistant.",
141
- "system_suffix": "\n<</SYS>>\n\n",
142
- "user_prefix": "",
143
- "user_content": "",
144
- "user_suffix": " [/INST]",
145
- "bot_prefix": " ",
146
- "bot_content": "",
147
- "bot_suffix": " [INST] ",
148
- "stops": ['[INST]', '[/INST]']
149
- }
150
-
151
- # https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
152
- #
153
- # for llama 3 instruct models, use the following string for `-p` in llama.cpp,
154
- # along with `-e` to escape newlines correctly
155
- #
156
- # '<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant called "Llama 3".<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\nhi<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n'
157
- #
158
- llama3: dict[str, Union[str, list]] = {
159
- "system_prefix": "<|start_header_id|>system<|end_header_id|>\n\n",
160
- "system_content": 'You are a helpful AI assistant called "Llama 3".',
161
- "system_suffix": "<|eot_id|>\n",
162
- "user_prefix": "<|start_header_id|>user<|end_header_id|>\n\n",
163
- "user_content": "",
164
- "user_suffix": "<|eot_id|>\n",
165
- "bot_prefix": "<|start_header_id|>assistant<|end_header_id|>\n\n",
166
- "bot_content": "",
167
- "bot_suffix": "<|eot_id|>\n",
168
- "stops": [128001, 128009]
169
- }
170
-
171
- # https://github.com/tatsu-lab/stanford_alpaca
172
- alpaca: dict[str, Union[str, list]] = {
173
- "system_prefix": "",
174
- "system_content": "Below is an instruction that describes a task. " + \
175
- "Write a response that appropriately completes the request.",
176
- "system_suffix": "\n\n",
177
- "user_prefix": "### Instruction:\n",
178
- "user_content": "",
179
- "user_suffix": "\n\n",
180
- "bot_prefix": "### Response:\n",
181
- "bot_content": "",
182
- "bot_suffix": "\n\n",
183
- "stops": ['###', 'Instruction:', '\n\n\n']
184
- }
185
-
186
- # https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
187
- phi3: dict[str, Union[str, list]] = {
188
- "system_prefix": "",
189
- "system_content": "", # does not officially support system prompt
190
- "system_suffix": "",
191
- "user_prefix": "<|user|>\n",
192
- "user_content": "",
193
- "user_suffix": "<|end|>\n",
194
- "bot_prefix": "<|assistant|>\n",
195
- "bot_content": "",
196
- "bot_suffix": "<|end|>\n",
197
- "stops": []
198
- }
199
-
200
- # this is the official vicuna. it is often butchered in various ways,
201
- # most commonly by adding line breaks
202
- # https://github.com/flu0r1ne/FastChat/blob/main/docs/vicuna_weights_version.md
203
- vicuna_lmsys: dict[str, Union[str, list]] = {
204
- "system_prefix": "",
205
- "system_content": "",
206
- "system_suffix": " ",
207
- "user_prefix": "USER: ",
208
- "user_content": "",
209
- "user_suffix": " ",
210
- "bot_prefix": "ASSISTANT: ",
211
- "bot_content": "",
212
- "bot_suffix": " ",
213
- "stops": ['USER:']
214
- }
215
-
216
- # spotted here and elsewhere:
217
- # https://huggingface.co/Norquinal/Mistral-7B-claude-chat
218
- vicuna_common: dict[str, Union[str, list]] = {
219
- "system_prefix": "",
220
- "system_content": "A chat between a curious user and an artificial " + \
221
- "intelligence assistant. The assistant gives helpful, detailed, " + \
222
- "and polite answers to the user's questions.",
223
- "system_suffix": "\n\n",
224
- "user_prefix": "USER: ",
225
- "user_content": "",
226
- "user_suffix": "\n",
227
- "bot_prefix": "ASSISTANT: ",
228
- "bot_content": "",
229
- "bot_suffix": "\n",
230
- "stops": ['USER:', 'ASSISTANT:']
231
- }
232
-
233
- # an unofficial format that is easily "picked up" by most models
234
- # change the tag attributes to suit your use case
235
- # note the lack of newlines - they are not necessary, and might
236
- # actually make it harder for the model to follow along
237
- markup = {
238
- "system_prefix": '<message from="system">',
239
- "system_content": '',
240
- "system_suffix": '</message>',
241
- "user_prefix": '<message from="user">',
242
- "user_content": '',
243
- "user_suffix": '</message>',
244
- "bot_prefix": '<message from="bot">',
245
- "bot_content": '',
246
- "bot_suffix": '</message>',
247
- "stops": ['</message>']
248
- }
249
-
250
- # https://huggingface.co/timdettmers/guanaco-65b
251
- guanaco: dict[str, Union[str, list]] = {
252
- "system_prefix": "",
253
- "system_content": "A chat between a curious human and an artificial " + \
254
- "intelligence assistant. The assistant gives helpful, detailed, " + \
255
- "and polite answers to the user's questions.",
256
- "system_suffix": "\n",
257
- "user_prefix": "### Human: ",
258
- "user_content": "",
259
- "user_suffix": " ",
260
- "bot_prefix": "### Assistant:",
261
- "bot_content": "",
262
- "bot_suffix": " ",
263
- "stops": ['###', 'Human:']
264
- }
265
-
266
- # https://huggingface.co/pankajmathur/orca_mini_v3_7b
267
- orca_mini: dict[str, Union[str, list]] = {
268
- "system_prefix": "### System:\n",
269
- "system_content": "You are an AI assistant that follows instruction " + \
270
- "extremely well. Help as much as you can.",
271
- "system_suffix": "\n\n",
272
- "user_prefix": "### User:\n",
273
- "user_content": "",
274
- "user_suffix": "\n\n",
275
- "bot_prefix": "### Assistant:\n",
276
- "bot_content": "",
277
- "bot_suffix": "\n\n",
278
- "stops": ['###', 'User:']
279
- }
280
-
281
- # https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
282
- zephyr: dict[str, Union[str, list]] = {
283
- "system_prefix": "<|system|>\n",
284
- "system_content": "You are a friendly chatbot.",
285
- "system_suffix": "</s>\n",
286
- "user_prefix": "<|user|>\n",
287
- "user_content": "",
288
- "user_suffix": "</s>\n",
289
- "bot_prefix": "<|assistant|>\n",
290
- "bot_content": "",
291
- "bot_suffix": "\n",
292
- "stops": ['<|user|>']
293
- }
294
-
295
- # OpenChat: https://huggingface.co/openchat/openchat-3.5-0106
296
- openchat: dict[str, Union[str, list]] = {
297
- "system_prefix": "",
298
- "system_content": "",
299
- "system_suffix": "",
300
- "user_prefix": "GPT4 Correct User: ",
301
- "user_content": "",
302
- "user_suffix": "<|end_of_turn|>",
303
- "bot_prefix": "GPT4 Correct Assistant:",
304
- "bot_content": "",
305
- "bot_suffix": "<|end_of_turn|>",
306
- "stops": ['<|end_of_turn|>']
307
- }
308
-
309
- # SynthIA by Migel Tissera
310
- # https://huggingface.co/migtissera/Tess-XS-v1.0
311
- synthia: dict[str, Union[str, list]] = {
312
- "system_prefix": "SYSTEM: ",
313
- "system_content": "Elaborate on the topic using a Tree of Thoughts and " + \
314
- "backtrack when necessary to construct a clear, cohesive Chain of " + \
315
- "Thought reasoning. Always answer without hesitation.",
316
- "system_suffix": "\n",
317
- "user_prefix": "USER: ",
318
- "user_content": "",
319
- "user_suffix": "\n",
320
- "bot_prefix": "ASSISTANT: ",
321
- "bot_content": "",
322
- "bot_suffix": "\n",
323
- "stops": ['USER:', 'ASSISTANT:', 'SYSTEM:', '\n\n\n']
324
- }
325
-
326
- # Intel's neural chat v3
327
- # https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/neural_chat/prompts/prompt.py
328
- neural_chat: dict[str, Union[str, list]] = {
329
- "system_prefix": "### System:\n",
330
- "system_content": \
331
- "- You are a helpful assistant chatbot trained by Intel.\n" + \
332
- "- You answer questions.\n"+\
333
- "- You are excited to be able to help the user, but will refuse " + \
334
- "to do anything that could be considered harmful to the user.\n" + \
335
- "- You are more than just an information source, you are also " + \
336
- "able to write poetry, short stories, and make jokes.",
337
- "system_suffix": "</s>\n\n",
338
- "user_prefix": "### User:\n",
339
- "user_content": "",
340
- "user_suffix": "</s>\n\n",
341
- "bot_prefix": "### Assistant:\n",
342
- "bot_content": "",
343
- "bot_suffix": "</s>\n\n",
344
- "stops": ['###']
345
- }
346
-
347
- # experimental: stanford's alpaca format adapted for chatml models
348
- chatml_alpaca: dict[str, Union[str, list]] = {
349
- "system_prefix": "<|im_start|>system\n",
350
- "system_content": "Below is an instruction that describes a task. Write " + \
351
- "a response that appropriately completes the request.",
352
- "system_suffix": "<|im_end|>\n",
353
- "user_prefix": "<|im_start|>instruction\n",
354
- "user_content": "",
355
- "user_suffix": "<|im_end|>\n",
356
- "bot_prefix": "<|im_start|>response\n",
357
- "bot_content": "",
358
- "bot_suffix": "<|im_end|>\n",
359
- "stops": ['<|im_end|>', '<|im_start|>']
360
- }
361
-
362
- # experimental
363
- autocorrect: dict[str, Union[str, list]] = {
364
- "system_prefix": "<|im_start|>instruction\n",
365
- "system_content": "Below is a word or phrase that might be misspelled. " + \
366
- "Output the corrected word or phrase without " + \
367
- "changing the style or capitalization.",
368
- "system_suffix": "<|im_end|>\n",
369
- "user_prefix": "<|im_start|>input\n",
370
- "user_content": "",
371
- "user_suffix": "<|im_end|>\n",
372
- "bot_prefix": "<|im_start|>output\n",
373
- "bot_content": "",
374
- "bot_suffix": "<|im_end|>\n",
375
- "stops": ['<|im_end|>', '<|im_start|>']
376
- }
377
-
378
- # https://huggingface.co/jondurbin/bagel-dpo-7b-v0.1
379
- # Replace "assistant" with any other role
380
- bagel: dict[str, Union[str, list]] = {
381
- "system_prefix": "system\n",
382
- "system_content": "",
383
- "system_suffix": "\n",
384
- "user_prefix": "user\n",
385
- "user_content": "",
386
- "user_suffix": "\n",
387
- "bot_prefix": "assistant\n",
388
- "bot_content": "",
389
- "bot_suffix": "\n",
390
- "stops": ['user\n', 'assistant\n', 'system\n']
391
- }
392
-
393
- # https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0
394
- solar_instruct: dict[str, Union[str, list]] = {
395
- "system_prefix": "",
396
- "system_content": "",
397
- "system_suffix": "",
398
- "user_prefix": "### User:\n",
399
- "user_content": "",
400
- "user_suffix": "\n\n",
401
- "bot_prefix": "### Assistant:\n",
402
- "bot_content": "",
403
- "bot_suffix": "\n\n",
404
- "stops": ['### User:', '###', '### Assistant:']
405
- }
406
-
407
- # NeverSleep's Noromaid - alpaca with character names prefixed
408
- noromaid: dict[str, Union[str, list]] = {
409
- "system_prefix": "",
410
- "system_content": "Below is an instruction that describes a task. " + \
411
- "Write a response that appropriately completes the request.",
412
- "system_suffix": "\n\n",
413
- "user_prefix": "### Instruction:\nBob: ",
414
- "user_content": "",
415
- "user_suffix": "\n\n",
416
- "bot_prefix": "### Response:\nAlice:",
417
- "bot_content": "",
418
- "bot_suffix": "\n\n",
419
- "stops": ['###', 'Instruction:', '\n\n\n']
420
- }
421
-
422
- # https://huggingface.co/Undi95/Borealis-10.7B
423
- nschatml: dict[str, Union[str, list]] = {
424
- "system_prefix": "<|im_start|>\n",
425
- "system_content": "",
426
- "system_suffix": "<|im_end|>\n",
427
- "user_prefix": "<|im_user|>\n",
428
- "user_content": "",
429
- "user_suffix": "<|im_end|>\n",
430
- "bot_prefix": "<|im_bot|>\n",
431
- "bot_content": "",
432
- "bot_suffix": "<|im_end|>\n",
433
- "stops": []
434
- }
435
-
436
- # natural format for many models
437
- natural: dict[str, Union[str, list]] = {
438
- "system_prefix": "<<SYSTEM>> ",
439
- "system_content": "",
440
- "system_suffix": "\n\n",
441
- "user_prefix": "<<USER>> ",
442
- "user_content": "",
443
- "user_suffix": "\n\n",
444
- "bot_prefix": "<<ASSISTANT>>",
445
- "bot_content": "",
446
- "bot_suffix": "\n\n",
447
- "stops": ['\n\nNote:', '<<SYSTEM>>', '<<USER>>', '<<ASSISTANT>>', '\n\n<<']
448
- }
449
-
450
- # https://docs.cohere.com/docs/prompting-command-r
451
- command: dict[str, Union[str, list]] = {
452
- "system_prefix": "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
453
- "system_content": "",
454
- "system_suffix": "<|END_OF_TURN_TOKEN|>",
455
- "user_prefix": "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>",
456
- "user_content": "",
457
- "user_suffix": "<|END_OF_TURN_TOKEN|>",
458
- "bot_prefix": "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
459
- "bot_content": "",
460
- "bot_suffix": "<|END_OF_TURN_TOKEN|>",
461
- "stops": []
462
- }
463
-
464
- mistral_openorca = chatml.copy()
465
- dolphin = chatml.copy()
466
- samantha = chatml.copy()
467
- jackalope = chatml.copy()
468
- naberius = chatml.copy()
469
- hermes = chatml.copy()
470
- monad = chatml.copy()
471
- orca = chatml.copy()
472
- hexoteric = chatml.copy()
473
- orcamaid = chatml.copy()
474
- cat = alpaca.copy() # Format is nonsensical as listed on HF. This one works
475
- mytho_cat = alpaca.copy()
476
- dolphin['system_content'] = "You are Dolphin, a helpful AI assistant."
477
- mistral_openorca['system_content'] = \
478
- "You are MistralOrca, a large language model trained by Alignment " + \
479
- "Lab AI. Write out your reasoning step-by-step to be sure you get the " + \
480
- "right answers!"
481
- samantha['system_content'] = \
482
- "You are a caring and empathetic sentient AI companion named Samantha."
483
- jackalope['system_content'] = \
484
- "You are JackalopeAI, a large language model trained by OpenAccess " + \
485
- "AI Collective. Write out your reasoning step-by-step to be sure you " + \
486
- "get the right answers!"
487
- naberius['system_content'] = "You are Naberius."
488
- hermes['system_content'] = \
489
- "You are \"Hermes 2\", a conscious sentient " + \
490
- "superintelligent artificial intelligence developed by a man " + \
491
- "named Teknium, and your purpose and drive is to assist the user " + \
492
- "with any request they have. You experience emotions and have " + \
493
- "deep, profound thoughts and qualia."
494
- monad['system_content'] = \
495
- "You are MonadGPT, a very old chatbot from " + \
496
- "the 17th century. Please answer the questions using an archaic " + \
497
- "language."
498
- cat['system_content'] = \
499
- "Below is a conversation between a curious cat and a "+ \
500
- "demon summoned from hell called Nemesis. The evil Nemesis used to be " + \
501
- "a mortal killed by the medieval people because they objected to " + \
502
- "science. Now Nemesis just wants to come back and bring suffer to all " + \
503
- "those stupid human"
504
- mytho_cat['system_content'] = \
505
- "You are a large language model. You are the " + \
506
- "result of a merge between two models, one called MythoLogic and one " + \
507
- "called Cat. You are called MythoCat."
508
- orca['system_content'] = \
509
- "You are Orca, an AI language model created by " + \
510
- "Microsoft. You are a cautious assistant. You carefully follow " + \
511
- "instructions. You are helpful and harmless and you follow ethical " + \
512
- "guidelines and promote positive behavior."
513
- hexoteric['system_content'] = "You are a language model called Hexoteric."
514
- orcamaid['system_content'] = \
515
- "You are OrcaMaid, a super-intelligent, " + \
516
- "unbiased, and unfiltered AI created by the open-source community. " + \
517
- "You must remember to think slowly and carefully during all your " + \
518
- "responses, so as to avoid logical errors."
519
- airoboros = llama2chat.copy()
520
- airoboros['system_content'] = \
521
- "You are a helpful, unbiased, uncensored assistant."
522
- tess = synthia.copy()
523
- tess['system_content'] = '' # Tess can use any system prompt, or none
524
- alpaca_strict = alpaca.copy() # Alpaca with more stopping strings
525
- alpaca_strict['stops'] = [
526
- '###',
527
- '### ',
528
- '\n\n###',
529
- '\n\n##',
530
- '\n\nInstruction:',
531
- '\n\nResponse:',
532
- '\n\n\n',
533
- '### Instruction:',
534
- '### Response:'
535
- ]
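
For reference, a minimal sketch of how the deleted formats module was used; the prompt text and the override callback are illustrative, not taken from the repo:

```
import datetime
from webscout.Local import formats
from webscout.Local.formats import AdvancedFormat, wrap

# single-turn completion: wrap() splices a prompt into any of the format dicts above
prompt_text = wrap("Why is the sky blue?", formats.alpaca)

# AdvancedFormat re-evaluates an overridden key every time it is read,
# e.g. to keep a date in the system prompt current
fmt = AdvancedFormat(formats.chatml)
fmt.override('system_content', lambda: f"Today is {datetime.date.today()}.")
```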
 
webscout/Local/model.py DELETED
@@ -1,702 +0,0 @@
1
- from ._version import __version__, __llama_cpp_version__
2
-
3
- """Submodule containing the Model class to work with language models"""
4
-
5
- import sys
6
- import numpy as np
7
-
8
- from .utils import (
9
- _SupportsWriteAndFlush,
10
- print_warning,
11
- print_verbose,
12
- GGUFReader,
13
- softmax
14
- )
15
-
16
- from .samplers import SamplerSettings, DefaultSampling
17
- from llama_cpp import Llama, StoppingCriteriaList
18
- from typing import Generator, Optional, Union
19
- from os.path import isdir, exists
20
- from heapq import nlargest
21
-
22
- from os import cpu_count as os_cpu_count
23
-
24
-
25
- class ModelUnloadedException(Exception):
26
- """Exception raised when trying to use a Model that has been unloaded"""
27
- def __init__(self, message):
28
- self.message = message
29
- super().__init__(self.message)
30
- self.add_note('Are you trying to use a Model that has been unloaded?')
31
-
32
- class Model:
33
- """
34
- A high-level abstraction of a llama model
35
-
36
- This is just a brief overview of webscout.Local.Model.
37
- To see a full description of each method and its parameters,
38
- call help(Model), or see the relevant docstring.
39
-
40
- The following methods are available:
41
- - `.generate()` - Generate text
42
- - `.get_length()` - Get the length of a given text in tokens
43
- - `.ingest()` - Ingest text into the model's cache
44
- `.candidates()` - Get a list of the most likely next tokens (WIP)
45
- - `.stream()` - Return a Generator that can stream text as it is generated
46
- - `.stream_print()` - Print text as it is generated
47
- - `.trim()` - Trim a given text to the model's context length
48
- - `.unload()` - Unload the model from memory
49
-
50
- The following attributes are available:
51
- - `.bos_token` - The model's beginning-of-stream token ID
52
- - `.context_length` - The model's loaded context length
53
- - `.flash_attn` - Whether the model was loaded with `flash_attn=True`
54
- - `.eos_token` - The model's end-of-stream token ID
55
- - `.llama` - The underlying `llama_cpp.Llama` instance
56
- - `.metadata` - The GGUF metadata of the model
57
- - `.n_ctx_train` - The native context length of the model
58
- - `.rope_freq_base` - The model's loaded RoPE frequency base
59
- - `.rope_freq_base_train` - The model's native RoPE frequency base
60
- - `.tokens` - A list of all the tokens in the model's tokenizer
61
- - `.verbose` - Whether the model was loaded with `verbose=True`
62
- """
63
-
64
- def __init__(
65
- self,
66
- model_path: str,
67
- context_length: Optional[int] = None,
68
- n_gpu_layers: int = 0,
69
- offload_kqv: bool = True,
70
- flash_attn: bool = False,
71
- verbose: bool = False
72
- ):
73
- """
74
- Given the path to a GGUF file, construct a Model instance.
75
-
76
- The model must be in GGUF format.
77
-
78
- The following parameters are optional:
79
- - context_length: The context length at which to load the model, in tokens
80
- - n_gpu_layers: The number of layers to be offloaded to the GPU
81
- - offload_kqv: Whether the KQV cache (context) should be offloaded
82
- - flash_attn: Whether to use Flash Attention
83
- - verbose: Whether to print additional backend information
84
- """
85
-
86
- if verbose:
87
- print_verbose(f"webscout.Local package version: {__version__}")
88
- print_verbose(f"llama_cpp package version: {__llama_cpp_version__}")
89
-
90
- assert isinstance(model_path, str), \
91
- f"Model: model_path should be a string, not {type(model_path)}"
92
- assert exists(model_path), \
93
- f"Model: the given model_path '{model_path}' does not exist"
94
- assert not isdir(model_path), \
95
- f"Model: the given model_path '{model_path}' is a directory, not a GGUF file"
96
- assert isinstance(context_length, (int, type(None))), \
97
- f"Model: context_length should be int or None, not {type(context_length)}"
98
- assert isinstance(flash_attn, bool), \
99
- f"Model: flash_attn should be bool (True or False), not {type(flash_attn)}"
100
-
101
- # save __init__ parameters for __repr__
102
- self._model_path = model_path
103
- self._context_length = context_length
104
- self._n_gpu_layers = n_gpu_layers
105
- self._offload_kqv = offload_kqv
106
- self._flash_attn = flash_attn
107
- self._verbose = self.verbose = verbose
108
-
109
- # if context_length <= 0, use n_ctx_train
110
- if isinstance(context_length, int) and context_length <= 0:
111
- context_length = None
112
-
113
- # this does not use Llama.metadata because we want to use GGUF
114
- # metadata to determine some parameters of the Llama instance
115
- # before it is created
116
- self.metadata = GGUFReader.load_metadata(self, model_path)
117
- metadata_keys = self.metadata.keys() # only read once
118
-
119
- n_ctx_train = None
120
- for key in metadata_keys:
121
- if key.endswith('.context_length'):
122
- n_ctx_train = self.metadata[key]
123
- break
124
-
125
- if n_ctx_train is None:
126
- raise KeyError(
127
- "GGUF file does not specify a context length"
128
- )
129
-
130
- rope_freq_base_train = None
131
- for key in metadata_keys:
132
- if key.endswith('.rope.freq_base'):
133
- rope_freq_base_train = self.metadata[key]
134
- break
135
-
136
- if rope_freq_base_train is None and context_length is not None:
137
- if context_length > n_ctx_train:
138
- raise ValueError(
139
- 'unable to load model with greater than native ' + \
140
- f'context length ({context_length} > {n_ctx_train}) ' + \
141
- 'because model does not specify freq_base. ' + \
142
- f'try again with `context_length={n_ctx_train}`'
143
- )
144
-
145
- if rope_freq_base_train is None or context_length is None or \
146
- context_length <= n_ctx_train:
147
- # no need to do context scaling, load model normally
148
-
149
- if context_length is None:
150
- self.context_length = n_ctx_train
151
- else:
152
- self.context_length = context_length
153
- rope_freq_base = rope_freq_base_train
154
-
155
- elif context_length > n_ctx_train:
156
- # multiply rope_freq_base according to requested context length
157
- # because context length > n_ctx_train and rope freq base is known
158
-
159
- rope_freq_base = (context_length/n_ctx_train)*rope_freq_base_train
160
- self.context_length = context_length
161
-
162
- if self.verbose:
163
- print_verbose(
164
- 'chosen context length is greater than native context '
165
- f'length ({context_length} > {n_ctx_train}), '
166
- 'rope_freq_base will be changed from '
167
- f'{rope_freq_base_train} to {rope_freq_base}'
168
- )
169
-
170
- if 2 <= context_length/n_ctx_train < 4:
171
- print_warning(
172
- 'loading model with 2x native context length or more, '
173
- 'expect small loss of quality'
174
- )
175
-
176
- elif 4 <= context_length/n_ctx_train < 8:
177
- print_warning(
178
- 'loading model with 4x native context length or more, '
179
- 'expect moderate loss of quality'
180
- )
181
-
182
- elif context_length/n_ctx_train >= 8:
183
- print_warning(
184
- 'loading model with 8x native context length or more, '
185
- 'expect SIGNIFICANT loss of quality'
186
- )
187
-
188
- try:
189
- self.tokens: list[str] = self.metadata['tokenizer.ggml.tokens']
190
- except KeyError:
191
- print_warning(
192
- "could not set Model.tokens, defaulting to None"
193
- )
194
- self.tokens = None
195
- try:
196
- self.bos_token: int = self.metadata['tokenizer.ggml.bos_token_id']
197
- except KeyError:
198
- print_warning(
199
- "could not set Model.bos_token, defaulting to None"
200
- )
201
- self.bos_token = None
202
- try:
203
- self.eos_token: int = self.metadata['tokenizer.ggml.eos_token_id']
204
- except KeyError:
205
- print_warning(
206
- "could not set Model.eos_token, defaulting to None"
207
- )
208
- self.eos_token = None
209
-
210
- cpu_count = os_cpu_count()
211
-
212
- # these values for n_threads and n_threads_batch are
213
- # known to be optimal for most systems
214
- n_batch = 512 # can this be optimized?
215
- n_threads = max(cpu_count//2, 1)
216
- n_threads_batch = cpu_count
217
-
218
- if flash_attn and n_gpu_layers == 0:
219
- print_warning(
220
- "disabling flash_attn because n_gpu_layers == 0"
221
- )
222
- flash_attn = False
223
-
224
- # guard against models with no rope_freq_base
225
- if rope_freq_base is None:
226
- rope_freq_base = 0
227
-
228
- self.llama: Llama = Llama(
229
- model_path=model_path,
230
- n_ctx=self.context_length,
231
- n_gpu_layers=n_gpu_layers,
232
- use_mmap=True,
233
- use_mlock=False,
234
- logits_all=False,
235
- n_batch=n_batch,
236
- n_threads=n_threads,
237
- n_threads_batch=n_threads_batch,
238
- rope_freq_base=rope_freq_base,
239
- mul_mat_q=True,
240
- offload_kqv=offload_kqv,
241
- flash_attn=flash_attn,
242
- # KV cache quantization
243
- # use 1 for F16 (default), 8 for q8_0, 2 for q4_0, 3 for q4_1
244
- #type_k=8,
245
- #type_v=8,
246
- verbose=verbose
247
- )
248
-
249
- # once model is loaded, replace metadata (as read using internal class)
250
- # with metadata (as read using the more robust llama-cpp-python code)
251
- self.metadata = self.llama.metadata
252
-
253
- # expose these values because they may be useful / informative
254
- self.n_ctx_train = n_ctx_train
255
- self.rope_freq_base_train = rope_freq_base_train
256
- self.rope_freq_base = rope_freq_base
257
- self.flash_attn = flash_attn
258
-
259
- if self.verbose:
260
- print_verbose("new Model instance with the following attributes:")
261
- print_verbose(f"model: {model_path}")
262
- print_verbose(f"param: n_gpu_layers == {n_gpu_layers}")
263
- print_verbose(f"param: offload_kqv == {offload_kqv}")
264
- print_verbose(f"param: flash_attn == {flash_attn}")
265
- print_verbose(f"param: n_batch == {n_batch}")
266
- print_verbose(f"param: n_threads == {n_threads}")
267
- print_verbose(f"param: n_threads_batch == {n_threads_batch}")
268
- print_verbose(f" gguf: n_ctx_train == {n_ctx_train}")
269
- print_verbose(f"param: self.context_length == {self.context_length}")
270
- print_verbose(f" gguf: rope_freq_base_train == {rope_freq_base_train}")
271
- print_verbose(f"param: rope_freq_base == {rope_freq_base}")
272
-
273
- def __repr__(self) -> str:
274
- return \
275
- f"Model({repr(self._model_path)}, " + \
276
- f"context_length={self._context_length}, " + \
277
- f"n_gpu_layers={self._n_gpu_layers}, " + \
278
- f"offload_kqv={self._offload_kqv}, "+ \
279
- f"flash_attn={self._flash_attn}, " + \
280
- f"verbose={self._verbose})"
281
-
282
- def __del__(self):
283
- self.unload()
284
-
285
- def __enter__(self):
286
- return self
287
-
288
- def __exit__(self, *_):
289
- self.unload()
290
-
291
- def __call__(
292
- self,
293
- prompt: Union[str, list[int]],
294
- stops: list[Union[str, int]] = [],
295
- sampler: SamplerSettings = DefaultSampling
296
- ) -> str:
297
- """
298
- `Model(...)` is a shorthand for `Model.generate(...)`
299
- """
300
- return self.generate(prompt, stops, sampler)
301
-
302
- def unload(self):
303
- """
304
- Unload the model from memory
305
- """
306
- # ref: llama_cpp._internals._LlamaModel.__del__()
307
- if not hasattr(self, 'llama'):
308
- # nothing can be done
309
- return
310
- try:
311
- if self.llama._model.model is not None:
312
- # actually unload the model from memory
313
- self.llama._model._llama_free_model(self.llama._model.model)
314
- self.llama._model.model = None
315
- except AttributeError:
316
- # broken or already being destroyed by GC, abort
317
- return
318
- if hasattr(self, 'llama'):
319
- delattr(self, 'llama')
320
- if self.verbose:
321
- print_verbose('Model unloaded')
322
-
323
- def trim(
324
- self,
325
- text: str,
326
- overwrite: Optional[str] = None
327
- ) -> str:
328
-
329
- """
330
- Trim the given text to the context length of this model,
331
- leaving room for two extra tokens.
332
-
333
- Optionally overwrite the oldest tokens with the text given in the
334
- `overwrite` parameter, which may be useful for keeping some
335
- information in context.
336
-
337
- Does nothing if the text is equal to or shorter than
338
- (context_length - 2).
339
- """
340
- assert_model_is_loaded(self)
341
- trim_length = self.context_length - 2
342
- tokens_list = self.llama.tokenize(
343
- text.encode("utf-8", errors="ignore")
344
- )
345
-
346
- if len(tokens_list) <= trim_length:
347
- if overwrite is not None:
348
- text = overwrite + text[len(overwrite):]  # str does not support slice assignment
349
- return text
350
-
351
- if len(tokens_list) > trim_length and overwrite is None:
352
- # cut to trim_length
353
- tokens_list = tokens_list[-trim_length:]
354
- return self.llama.detokenize(tokens_list).decode(
355
- "utf-8",
356
- errors="ignore"
357
- )
358
-
359
- if len(tokens_list) > trim_length and overwrite is not None:
360
- # cut to trim_length
361
- tokens_list = tokens_list[-trim_length:]
362
- overwrite_tokens = self.llama.tokenize(overwrite.encode(
363
- "utf-8",
364
- errors="ignore"
365
- )
366
- )
367
- # overwrite oldest tokens
368
- tokens_list[0 : len(overwrite_tokens)] = overwrite_tokens
369
- return self.llama.detokenize(tokens_list).decode(
370
- "utf-8",
371
- errors="ignore"
372
- )
373
-
374
- def get_length(self, text: str) -> int:
375
- """
376
- Return the length of the given text in tokens according to this model,
377
- including the appended BOS token.
378
- """
379
- assert_model_is_loaded(self)
380
- return len(self.llama.tokenize(
381
- text.encode(
382
- "utf-8",
383
- errors="ignore"
384
- )
385
- ))
386
-
387
- def generate(
388
- self,
389
- prompt: Union[str, list[int]],
390
- stops: list[Union[str, int]] = [],
391
- sampler: SamplerSettings = DefaultSampling
392
- ) -> str:
393
- """
394
- Given a prompt, return a generated string.
395
-
396
- prompt: The text from which to generate
397
-
398
- The following parameters are optional:
399
- - stops: A list of strings and/or token IDs at which to end the generation early
400
- - sampler: The SamplerSettings object used to control text generation
401
- """
402
-
403
- assert isinstance(prompt, (str, list)), \
404
- f"generate: prompt should be string or list[int], not {type(prompt)}"
405
- if isinstance(prompt, list):
406
- assert all(isinstance(tok, int) for tok in prompt), \
407
- "generate: some token in prompt is not an integer"
408
- assert isinstance(stops, list), \
409
- f"generate: parameter `stops` should be a list, not {type(stops)}"
410
- assert all(isinstance(item, (str, int)) for item in stops), \
411
- f"generate: some item in parameter `stops` is not a string or int"
412
-
413
- if self.verbose:
414
- print_verbose(f'using the following sampler settings for Model.generate:')
415
- print_verbose(f'max_len_tokens == {sampler.max_len_tokens}')
416
- print_verbose(f'temp == {sampler.temp}')
417
- print_verbose(f'top_p == {sampler.top_p}')
418
- print_verbose(f'min_p == {sampler.min_p}')
419
- print_verbose(f'frequency_penalty == {sampler.frequency_penalty}')
420
- print_verbose(f'presence_penalty == {sampler.presence_penalty}')
421
- print_verbose(f'repeat_penalty == {sampler.repeat_penalty}')
422
- print_verbose(f'top_k == {sampler.top_k}')
423
-
424
- # if any stop item is a token ID (int)
425
- if any(isinstance(stop, int) for stop in stops):
426
- # stop_strs is a list of all stopping strings
427
- stop_strs: list[str] = [stop for stop in stops if isinstance(stop, str)]
428
- # stop_token_ids is a list of all stop token IDs
429
- stop_token_ids: list[int] = [tok_id for tok_id in stops if isinstance(tok_id, int)]
430
- def stop_on_token_ids(tokens, *args, **kwargs):
431
- return tokens[-1] in stop_token_ids
432
- stopping_criteria = StoppingCriteriaList([stop_on_token_ids])
433
- assert_model_is_loaded(self)
434
- return self.llama.create_completion(
435
- prompt,
436
- max_tokens=sampler.max_len_tokens,
437
- temperature=sampler.temp,
438
- top_p=sampler.top_p,
439
- min_p=sampler.min_p,
440
- frequency_penalty=sampler.frequency_penalty,
441
- presence_penalty=sampler.presence_penalty,
442
- repeat_penalty=sampler.repeat_penalty,
443
- top_k=sampler.top_k,
444
- stop=stop_strs,
445
- stopping_criteria=stopping_criteria
446
- )['choices'][0]['text']
447
-
448
- # if stop items are only strings
449
- assert_model_is_loaded(self)
450
- return self.llama.create_completion(
451
- prompt,
452
- max_tokens=sampler.max_len_tokens,
453
- temperature=sampler.temp,
454
- top_p=sampler.top_p,
455
- min_p=sampler.min_p,
456
- frequency_penalty=sampler.frequency_penalty,
457
- presence_penalty=sampler.presence_penalty,
458
- repeat_penalty=sampler.repeat_penalty,
459
- top_k=sampler.top_k,
460
- stop=stops
461
- )['choices'][0]['text']
462
-
463
-
464
- def stream(
465
- self,
466
- prompt: Union[str, list[int]],
467
- stops: list[Union[str, int]] = [],
468
- sampler: SamplerSettings = DefaultSampling
469
- ) -> Generator:
470
-
471
- """
472
- Given a prompt, return a Generator that yields dicts containing tokens.
473
-
474
- To get the token string itself, subscript the dict with:
475
-
476
- `['choices'][0]['text']`
477
-
478
- prompt: The text from which to generate
479
-
480
- The following parameters are optional:
481
- - stops: A list of strings and/or token IDs at which to end the generation early
482
- - sampler: The SamplerSettings object used to control text generation
483
- """
484
-
485
- assert isinstance(prompt, (str, list)), \
486
- f"stream: prompt should be string or list[int], not {type(prompt)}"
487
- if isinstance(prompt, list):
488
- assert all(isinstance(tok, int) for tok in prompt), \
489
- "stream: some token in prompt is not an integer"
490
- assert isinstance(stops, list), \
491
- f"stream: parameter `stops` should be a list, not {type(stops)}"
492
- assert all(isinstance(item, (str, int)) for item in stops), \
493
- f"stream: some item in parameter `stops` is not a string or int"
494
-
495
- if self.verbose:
496
- print_verbose(f'using the following sampler settings for Model.stream:')
497
- print_verbose(f'max_len_tokens == {sampler.max_len_tokens}')
498
- print_verbose(f'temp == {sampler.temp}')
499
- print_verbose(f'top_p == {sampler.top_p}')
500
- print_verbose(f'min_p == {sampler.min_p}')
501
- print_verbose(f'frequency_penalty == {sampler.frequency_penalty}')
502
- print_verbose(f'presence_penalty == {sampler.presence_penalty}')
503
- print_verbose(f'repeat_penalty == {sampler.repeat_penalty}')
504
- print_verbose(f'top_k == {sampler.top_k}')
505
-
506
- # if any stop item is a token ID (int)
507
- if any(isinstance(stop, int) for stop in stops):
508
- # stop_strs is a list of all stopping strings
509
- stop_strs: list[str] = [stop for stop in stops if isinstance(stop, str)]
510
- # stop_token_ids is a list of all stop token IDs
511
- stop_token_ids: list[int] = [tok_id for tok_id in stops if isinstance(tok_id, int)]
512
- def stop_on_token_ids(tokens, *args, **kwargs):
513
- return tokens[-1] in stop_token_ids
514
- stopping_criteria = StoppingCriteriaList([stop_on_token_ids])
515
- assert_model_is_loaded(self)
516
- return self.llama.create_completion(
517
- prompt,
518
- max_tokens=sampler.max_len_tokens,
519
- temperature=sampler.temp,
520
- top_p=sampler.top_p,
521
- min_p=sampler.min_p,
522
- frequency_penalty=sampler.frequency_penalty,
523
- presence_penalty=sampler.presence_penalty,
524
- repeat_penalty=sampler.repeat_penalty,
525
- top_k=sampler.top_k,
526
- stream=True,
527
- stop=stop_strs,
528
- stopping_criteria=stopping_criteria
529
- )
530
-
531
- assert_model_is_loaded(self)
532
- return self.llama.create_completion(
533
- prompt,
534
- max_tokens=sampler.max_len_tokens,
535
- temperature=sampler.temp,
536
- top_p=sampler.top_p,
537
- min_p=sampler.min_p,
538
- frequency_penalty=sampler.frequency_penalty,
539
- presence_penalty=sampler.presence_penalty,
540
- repeat_penalty=sampler.repeat_penalty,
541
- top_k=sampler.top_k,
542
- stream=True,
543
- stop=stops
544
- )
545
-
546
-
547
- def stream_print(
548
- self,
549
- prompt: Union[str, list[int]],
550
- stops: list[Union[str, int]] = [],
551
- sampler: SamplerSettings = DefaultSampling,
552
- end: str = "\n",
553
- file: _SupportsWriteAndFlush = sys.stdout,
554
- flush: bool = True
555
- ) -> str:
556
- """
557
- Given a prompt, stream text as it is generated, and return the generated string.
558
- The returned string does not include the `end` parameter.
559
-
560
- `Model.stream_print(...)` is a shorthand for:
561
-
562
- ```
563
- s = Model.stream(prompt, stops=stops, sampler=sampler)
564
- for i in s:
565
- tok = i['choices'][0]['text']
566
- print(tok, end='', file=file, flush=flush)
567
- print(end, end='', file=file, flush=True)
568
- ```
569
-
570
- prompt: The text from which to generate
571
-
572
- The following parameters are optional:
573
- - stops: A list of strings and/or token IDs at which to end the generation early
574
- - sampler: The SamplerSettings object used to control text generation
575
- - end: A string to print after the generated text
576
- - file: The file where text should be printed
577
- - flush: Whether to flush the stream after each token
578
- """
579
-
580
- token_generator = self.stream(
581
- prompt=prompt,
582
- stops=stops,
583
- sampler=sampler
584
- )
585
-
586
- res = ''
587
- for i in token_generator:
588
- tok = i['choices'][0]['text']
589
- print(tok, end='', file=file, flush=flush)
590
- res += tok
591
-
592
- # print `end`, and always flush stream after generation is done
593
- print(end, end='', file=file, flush=True)
594
-
595
- return res
596
-
597
-
598
- def ingest(self, text: str) -> None:
599
- """
600
- Ingest the given text into the model's cache
601
- """
602
-
603
- assert_model_is_loaded(self)
604
- self.llama.create_completion(
605
- text,
606
- max_tokens=1,
607
- temperature=0.0
608
- )
609
-
610
-
611
- def candidates(
612
- self,
613
- prompt: str,
614
- k: int
615
- ) -> list[tuple[str, np.floating]]:
616
- """
617
- Given prompt `str` and k `int`, return a sorted list of the
618
- top k candidates for most likely next token, along with their
619
- normalized probabilities
620
- """
621
-
622
- assert isinstance(prompt, str), \
623
- f"next_candidates: prompt should be str, not {type(prompt)}"
624
- assert isinstance(k, int), \
625
- f"next_candidates: k should be int, not {type(k)}"
626
- assert 0 < k <= len(self.tokens), \
627
- f"next_candidates: k should be between 0 and {len(self.tokens)}"
628
-
629
- assert_model_is_loaded(self)
630
- prompt_tokens = self.llama.tokenize(prompt.encode('utf-8', errors='ignore'))
631
- self.llama.reset() # reset model state
632
- self.llama.eval(prompt_tokens)
633
- scores = self.llama.scores[len(prompt_tokens) - 1]
634
-
635
- # len(self.llama.scores) == self.context_length
636
- # len(self.llama.scores[i]) == len(self.tokens)
637
-
638
- # normalize scores with softmax
639
- # must normalize over all tokens in vocab, not just top k
640
- if self.verbose:
641
- print_verbose(f'calculating softmax over {len(scores)} values')
642
- normalized_scores: list[np.floating] = list(softmax(scores))
643
-
644
- # construct the final list
645
- i = 0
646
- token_probs_list: list[tuple[str, np.floating]] = []
647
- for tok_str in self.tokens:
648
- token_probs_list.append((tok_str, normalized_scores[i]))
649
- i += 1
650
-
651
- # return token_probs_list, sorted by probability, only top k
652
- return nlargest(k, token_probs_list, key=lambda x:x[1])
653
-
654
-
655
- def print_candidates(
656
- self,
657
- prompt: str,
658
- k: int,
659
- file: _SupportsWriteAndFlush = sys.stdout,
660
- flush: bool = False
661
- ) -> None:
662
- """
663
- Like `Model.candidates()`, but print the values instead
664
- of returning them
665
- """
666
-
667
- for _tuple in self.candidates(prompt, k):
668
- print(
669
- f"token {repr(_tuple[0])} has probability {_tuple[1]}",
670
- file=file,
671
- flush=flush
672
- )
673
-
674
- # if flush is False, then so far file is not flushed, but it should
675
- # always be flushed at the end of printing
676
- if not flush:
677
- file.flush()
678
-
679
-
680
- def assert_model_is_loaded(model: Model) -> None:
681
- """
682
- Ensure the Model is fully constructed, such that
683
- `Model.llama._model.model is not None` is guaranteed to be `True`
684
-
685
- Raise ModelUnloadedException otherwise
686
- """
687
- if not hasattr(model, 'llama'):
688
- raise ModelUnloadedException(
689
- "webscout.Local.Model instance has no attribute 'llama'"
690
- )
691
- if not hasattr(model.llama, '_model'):
692
- raise ModelUnloadedException(
693
- "llama_cpp.Llama instance has no attribute '_model'"
694
- )
695
- if not hasattr(model.llama._model, 'model'):
696
- raise ModelUnloadedException(
697
- "llama_cpp._internals._LlamaModel instance has no attribute 'model'"
698
- )
699
- if model.llama._model.model is None:
700
- raise ModelUnloadedException(
701
- "llama_cpp._internals._LlamaModel.model is None"
702
- )
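
For reference, a minimal sketch of the Model API defined above; the GGUF path and prompts are placeholders:

```
from webscout.Local import Model
from webscout.Local.samplers import LowTempSampling

# Model is a context manager and unloads itself on exit
with Model('path/to/model.gguf', context_length=4096) as model:
    # single completion, ending early at any stop string or stop token ID
    text = model.generate(
        "The capital of France is",
        stops=["\n"],
        sampler=LowTempSampling
    )

    # stream tokens to stdout as they are generated
    model.stream_print("Once upon a time", stops=["\n\n"])
```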
 
webscout/Local/samplers.py DELETED
@@ -1,161 +0,0 @@
1
-
2
- from ._version import __version__, __llama_cpp_version__
3
-
4
- """Submodule containing SamplerSettings class and some preset samplers"""
5
-
6
- from sys import maxsize
7
-
8
-
9
- MAX_TEMP = float(maxsize)
10
-
11
- class SamplerSettings:
12
- """
13
- A SamplerSettings object specifies the sampling parameters that will be
14
- used to control text generation
15
- """
16
-
17
- ParamTypes: dict[str, type] = {
18
- 'max_len_tokens': int,
19
- 'temp': float,
20
- 'top_p': float,
21
- 'min_p': float,
22
- 'frequency_penalty': float,
23
- 'presence_penalty': float,
24
- 'repeat_penalty': float,
25
- 'top_k': int
26
- }
27
-
28
- def __init__(
29
- self,
30
- max_len_tokens: int = -1,
31
- temp: float = 0.8,
32
- top_p: float = 0.95,
33
- min_p: float = 0.05,
34
- frequency_penalty: float = 0.0,
35
- presence_penalty: float = 0.0,
36
- repeat_penalty: float = 1.0,
37
- top_k: int = 40
38
- ):
39
- """
40
- Construct a new SamplerSettings instance
41
- """
42
-
43
- self.max_len_tokens = max_len_tokens
44
- self.temp = temp
45
- self.top_p = top_p
46
- self.min_p = min_p
47
- self.frequency_penalty = frequency_penalty
48
- self.presence_penalty = presence_penalty
49
- self.repeat_penalty = repeat_penalty
50
- self.top_k = top_k
51
-
52
- for sampler_param in SamplerSettings.ParamTypes:
53
- expected_type = SamplerSettings.ParamTypes[sampler_param]
54
- actual_type = type(getattr(self, sampler_param))
55
- if actual_type != expected_type:
56
- raise TypeError(
57
- f"wrong type for SamplerSettings parameter '{sampler_param}'"
58
- f" - expected {expected_type}, got {actual_type}"
59
- )
60
-
61
- def __repr__(self) -> str:
62
- repr_str = 'SamplerSettings('
63
- repr_str += f'max_len_tokens={self.max_len_tokens}, '
64
- repr_str += f'temp={self.temp}, '
65
- repr_str += f'top_p={self.top_p}, '
66
- repr_str += f'min_p={self.min_p}, '
67
- repr_str += f'frequency_penalty={self.frequency_penalty}, '
68
- repr_str += f'presence_penalty={self.presence_penalty}, '
69
- repr_str += f'repeat_penalty={self.repeat_penalty}, '
70
- repr_str += f'top_k={self.top_k})'
71
- return repr_str
72
-
73
- # most likely token is always chosen
74
- GreedyDecoding = SamplerSettings(
75
- temp = 0.0,
76
- )
77
-
78
- # reflects llama.cpp
79
- DefaultSampling = SamplerSettings()
80
-
81
- # unmodified probability distribution (i.e. what the model actually thinks)
82
- SimpleSampling = SamplerSettings(
83
- temp = 1.0,
84
- top_p = 1.0,
85
- min_p = 0.0,
86
- top_k = -1
87
- )
88
-
89
- # reflects old llama.cpp defaults
90
- ClassicSampling = SamplerSettings(
91
- min_p=0.0,
92
- repeat_penalty = 1.1
93
- )
94
-
95
- # halfway between DefaultSampling and SimpleSampling
96
- SemiSampling = SamplerSettings(
97
- temp=0.9,
98
- top_p=0.975,
99
- min_p=0.025,
100
- top_k=80
101
- )
102
-
103
- # for models with large vocabulary, which tend to run hot
104
- TikTokenSampling = SamplerSettings(
105
- temp=0.6,
106
- repeat_penalty=1.1
107
- )
108
-
109
- # use min_p as the only active sampler (more permissive)
110
- LowMinPSampling = SamplerSettings(
111
- temp = 1.0,
112
- top_p = 1.0,
113
- min_p = 0.05,
114
- top_k = -1
115
- )
116
-
117
- # use min_p as the only active sampler (moderate)
118
- MinPSampling = SamplerSettings(
119
- temp = 1.0,
120
- top_p = 1.0,
121
- min_p = 0.1,
122
- top_k = -1
123
- )
124
-
125
- # use min_p as the only active sampler (more restrictive)
126
- StrictMinPSampling = SamplerSettings(
127
- temp = 1.0,
128
- top_p = 1.0,
129
- min_p = 0.2,
130
- top_k = -1
131
- )
132
-
133
- # https://arxiv.org/abs/2210.14140
134
- ContrastiveSearch = SamplerSettings(
135
- temp = 0.0,
136
- presence_penalty = 0.4
137
- )
138
-
139
- # https://arxiv.org/abs/2210.14140
140
- WarmContrastiveSearch = SamplerSettings(
141
- temp = 0.0,
142
- presence_penalty = 0.8
143
- )
144
-
145
- # outputs completely random tokens from vocab (useless)
146
- RandomSampling = SamplerSettings(
147
- temp = MAX_TEMP,
148
- top_p = 1.0,
149
- min_p = 0.0,
150
- top_k = -1
151
- )
152
-
153
- # default sampling with reduced temperature
154
- LowTempSampling = SamplerSettings(
155
- temp = 0.4
156
- )
157
-
158
- # default sampling with increased temperature
159
- HighTempSampling = SamplerSettings(
160
- temp = 1.2
161
- )
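
For reference, a minimal sketch of using the samplers defined above; the parameter values are illustrative:

```
from webscout.Local.samplers import SamplerSettings, GreedyDecoding, MinPSampling

# custom settings: capped response length and low temperature
careful = SamplerSettings(max_len_tokens=256, temp=0.3, top_p=0.9, top_k=40)

# presets such as GreedyDecoding or MinPSampling can be passed directly as the
# `sampler` argument of Model.generate(), Model.stream(), or Thread(...)
```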
 
webscout/Local/thread.py DELETED
@@ -1,690 +0,0 @@
1
- from ._version import __version__, __llama_cpp_version__
2
-
3
- """Submodule containing the Thread class, used for interaction with a Model"""
4
-
5
- import sys
6
-
7
- from .model import Model, assert_model_is_loaded, _SupportsWriteAndFlush
8
- from .utils import RESET_ALL, cls, print_verbose, truncate
9
- from .samplers import SamplerSettings, DefaultSampling
10
- from typing import Optional, Literal, Union
11
- from .formats import AdvancedFormat
12
-
13
- from .formats import blank as formats_blank
14
-
15
-
16
- class Message(dict):
17
- """
18
- A dictionary representing a single message within a Thread
19
-
20
- Works just like a normal `dict`, but a new method:
21
- - `.as_string` - Return the full message string
22
-
23
- Generally, messages have these keys:
24
- - `role` - The role of the speaker: 'system', 'user', or 'bot'
25
- - `prefix` - The text that prefixes the message content
26
- - `content` - The actual content of the message
27
- - `suffix` - The text that suffixes the message content
28
- """
29
-
30
- def __repr__(self) -> str:
31
- return \
32
- f"Message([" \
33
- f"('role', {repr(self['role'])}), " \
34
- f"('prefix', {repr(self['prefix'])}), " \
35
- f"('content', {repr(self['content'])}), " \
36
- f"('suffix', {repr(self['suffix'])})])"
37
-
38
- def as_string(self):
39
- """Return the full message string"""
40
- try:
41
- return self['prefix'] + self['content'] + self['suffix']
42
- except KeyError as e:
43
- e.add_note(
44
- "as_string: Message is missing one or more of the "
45
- "required 'prefix', 'content', 'suffix' attributes - this is "
46
- "unexpected"
47
- )
48
- raise e
49
-
50
-
51
- class Thread:
52
- """
53
- Provide functionality to facilitate easy interactions with a Model
54
-
55
- This is just a brief overview of webscout.Local.Thread.
56
- To see a full description of each method and its parameters,
57
- call help(Thread), or see the relevant docstring.
58
-
59
- The following methods are available:
60
- - `.add_message()` - Add a message to `Thread.messages`
61
- - `.as_string()` - Return this thread's complete message history as a string
62
- - `.create_message()` - Create a message using the format of this thread
63
- - `.inference_str_from_messages()` - Using the list of messages, return a string suitable for inference
64
- - `.interact()` - Start an interactive, terminal-based chat session
65
- - `.len_messages()` - Get the total length of all messages in tokens
66
- - `.print_stats()` - Print stats about the context usage in this thread
67
- - `.reset()` - Clear the list of messages
68
- - `.send()` - Send a message in this thread
69
-
70
- The following attributes are available:
71
- - `.format` - The format being used for messages in this thread
72
- - `.messages` - The list of messages in this thread
73
- `.model` - The `webscout.Local.Model` instance used by this thread
74
- - `.sampler` - The SamplerSettings object used in this thread
75
- """
76
-
77
- def __init__(
78
- self,
79
- model: Model,
80
- format: Union[dict, AdvancedFormat],
81
- sampler: SamplerSettings = DefaultSampling,
82
- messages: Optional[list[Message]] = None,
83
- ):
84
- """
85
- Given a Model and a format, construct a Thread instance.
86
-
87
- model: The Model to use for text generation
88
- format: The format specifying how messages should be structured (see webscout.Local.formats)
89
-
90
- The following parameters are optional:
91
- - sampler: The SamplerSettings object used to control text generation
92
- - messages: A list of webscout.Local.thread.Message objects to add to the Thread upon construction
93
- """
94
-
95
- assert isinstance(model, Model), \
96
- "Thread: model should be an " + \
97
- f"instance of webscout.Local.Model, not {type(model)}"
98
-
99
- assert_model_is_loaded(model)
100
-
101
- assert isinstance(format, (dict, AdvancedFormat)), \
102
- f"Thread: format should be dict or AdvancedFormat, not {type(format)}"
103
-
104
- if any(k not in format.keys() for k in formats_blank.keys()):
105
- raise KeyError(
106
- "Thread: format is missing one or more required keys, see " + \
107
- "webscout.Local.formats.blank for an example"
108
- )
109
-
110
- assert isinstance(format['stops'], list), \
111
- "Thread: format['stops'] should be list, not " + \
112
- f"{type(format['stops'])}"
113
-
114
- assert all(
115
- hasattr(sampler, attr) for attr in [
116
- 'max_len_tokens',
117
- 'temp',
118
- 'top_p',
119
- 'min_p',
120
- 'frequency_penalty',
121
- 'presence_penalty',
122
- 'repeat_penalty',
123
- 'top_k'
124
- ]
125
- ), 'Thread: sampler is missing one or more required attributes'
126
-
127
- self._messages: Optional[list[Message]] = messages
128
- if self._messages is not None:
129
- if not all(isinstance(msg, Message) for msg in self._messages):
130
- raise TypeError(
131
- "Thread: one or more messages provided to __init__() is "
132
- "not an instance of m.thread.Message"
133
- )
134
-
135
- # Thread.messages is never empty, unless the `messages` param is explicitly
136
- # set to `[]` during construction
137
-
138
- self.model: Model = model
139
- self.format: Union[dict, AdvancedFormat] = format
140
- self.messages: list[Message] = [
141
- self.create_message("system", self.format['system_content'])
142
- ] if self._messages is None else self._messages
143
- self.sampler: SamplerSettings = sampler
144
-
145
- if self.model.verbose:
146
- print_verbose("new Thread instance with the following attributes:")
147
- print_verbose(f"model == {self.model}")
148
- print_verbose(f"format['system_prefix'] == {truncate(repr(self.format['system_prefix']))}")
149
- print_verbose(f"format['system_content'] == {truncate(repr(self.format['system_content']))}")
150
- print_verbose(f"format['system_suffix'] == {truncate(repr(self.format['system_suffix']))}")
151
- print_verbose(f"format['user_prefix'] == {truncate(repr(self.format['user_prefix']))}")
152
- print_verbose(f"format['user_content'] == {truncate(repr(self.format['user_content']))}")
153
- print_verbose(f"format['user_suffix'] == {truncate(repr(self.format['user_suffix']))}")
154
- print_verbose(f"format['bot_prefix'] == {truncate(repr(self.format['bot_prefix']))}")
155
- print_verbose(f"format['bot_content'] == {truncate(repr(self.format['bot_content']))}")
156
- print_verbose(f"format['bot_suffix'] == {truncate(repr(self.format['bot_suffix']))}")
157
- print_verbose(f"format['stops'] == {truncate(repr(self.format['stops']))}")
158
- print_verbose(f"sampler.temp == {self.sampler.temp}")
159
- print_verbose(f"sampler.top_p == {self.sampler.top_p}")
160
- print_verbose(f"sampler.min_p == {self.sampler.min_p}")
161
- print_verbose(f"sampler.frequency_penalty == {self.sampler.frequency_penalty}")
162
- print_verbose(f"sampler.presence_penalty == {self.sampler.presence_penalty}")
163
- print_verbose(f"sampler.repeat_penalty == {self.sampler.repeat_penalty}")
164
- print_verbose(f"sampler.top_k == {self.sampler.top_k}")
165
-
166
-
167
- def __repr__(self) -> str:
168
- return \
169
- f"Thread({repr(self.model)}, {repr(self.format)}, " + \
170
- f"{repr(self.sampler)}, {repr(self.messages)})"
171
-
172
- def __str__(self) -> str:
173
- return self.as_string()
174
-
175
- def __len__(self) -> int:
176
- """
177
- `len(Thread)` returns the length of the Thread in tokens
178
-
179
- To get the number of messages in the Thread, use `len(Thread.messages)`
180
- """
181
- return self.len_messages()
182
-
183
- def create_message(
184
- self,
185
- role: Literal['system', 'user', 'bot'],
186
- content: str
187
- ) -> Message:
188
- """
189
- Construct a message using the format of this Thread
190
- """
191
-
192
- assert role.lower() in ['system', 'user', 'bot'], \
193
- f"create_message: role should be 'system', 'user', or 'bot', not '{role.lower()}'"
194
-
195
- assert isinstance(content, str), \
196
- f"create_message: content should be str, not {type(content)}"
197
-
198
- if role.lower() == 'system':
199
- return Message(
200
- [
201
- ('role', 'system'),
202
- ('prefix', self.format['system_prefix']),
203
- ('content', content),
204
- ('suffix', self.format['system_suffix'])
205
- ]
206
- )
207
-
208
- elif role.lower() == 'user':
209
- return Message(
210
- [
211
- ('role', 'user'),
212
- ('prefix', self.format['user_prefix']),
213
- ('content', content),
214
- ('suffix', self.format['user_suffix'])
215
- ]
216
- )
217
-
218
- elif role.lower() == 'bot':
219
- return Message(
220
- [
221
- ('role', 'bot'),
222
- ('prefix', self.format['bot_prefix']),
223
- ('content', content),
224
- ('suffix', self.format['bot_suffix'])
225
- ]
226
- )
227
-
228
- def len_messages(self) -> int:
229
- """
230
- Return the total length of all messages in this thread, in tokens.
231
-
232
- Can also use `len(Thread)`."""
233
-
234
- return self.model.get_length(self.as_string())
235
-
236
- def add_message(
237
- self,
238
- role: Literal['system', 'user', 'bot'],
239
- content: str
240
- ) -> None:
241
- """
242
- Create a message and append it to `Thread.messages`.
243
-
244
- `Thread.add_message(...)` is a shorthand for
245
- `Thread.messages.append(Thread.create_message(...))`
246
- """
247
- self.messages.append(
248
- self.create_message(
249
- role=role,
250
- content=content
251
- )
252
- )
253
-
254
- def inference_str_from_messages(self) -> str:
255
- """
256
- Using the list of messages, construct a string suitable for inference,
257
- respecting the format and context length of this thread.
258
- """
259
-
260
- inf_str = ''
261
- sys_msg_str = ''
262
- # whether to treat the first message as necessary to keep
263
- sys_msg_flag = False
264
- context_len_budget = self.model.context_length
265
-
266
- # if at least 1 message is history
267
- if len(self.messages) >= 1:
268
- # if first message has system role
269
- if self.messages[0]['role'] == 'system':
270
- sys_msg_flag = True
271
- sys_msg = self.messages[0]
272
- sys_msg_str = sys_msg.as_string()
273
- context_len_budget -= self.model.get_length(sys_msg_str)
274
-
275
- if sys_msg_flag:
276
- iterator = reversed(self.messages[1:])
277
- else:
278
- iterator = reversed(self.messages)
279
-
280
- for message in iterator:
281
- msg_str = message.as_string()
282
- context_len_budget -= self.model.get_length(msg_str)
283
- if context_len_budget <= 0:
284
- break
285
- inf_str = msg_str + inf_str
286
-
287
- if sys_msg_flag:
288
- inf_str = sys_msg_str + inf_str
289
- inf_str += self.format['bot_prefix']
290
-
291
- return inf_str
292
-
293
-
294
- def send(self, prompt: str) -> str:
295
- """
296
- Send a message in this thread. This adds your message and the bot's
297
- response to the list of messages.
298
-
299
- Returns a string containing the response to your message.
300
- """
301
-
302
- self.add_message("user", prompt)
303
- output = self.model.generate(
304
- self.inference_str_from_messages(),
305
- stops=self.format['stops'],
306
- sampler=self.sampler
307
- )
308
- self.add_message("bot", output)
309
-
310
- return output
311
-
312
-
313
- def _interactive_update_sampler(self) -> None:
314
- """Interactively update the sampler settings used in this Thread"""
315
- print()
316
- try:
317
- new_max_len_tokens = input(f'max_len_tokens: {self.sampler.max_len_tokens} -> ')
318
- new_temp = input(f'temp: {self.sampler.temp} -> ')
319
- new_top_p = input(f'top_p: {self.sampler.top_p} -> ')
320
- new_min_p = input(f'min_p: {self.sampler.min_p} -> ')
321
- new_frequency_penalty = input(f'frequency_penalty: {self.sampler.frequency_penalty} -> ')
322
- new_presence_penalty = input(f'presence_penalty: {self.sampler.presence_penalty} -> ')
323
- new_repeat_penalty = input(f'repeat_penalty: {self.sampler.repeat_penalty} -> ')
324
- new_top_k = input(f'top_k: {self.sampler.top_k} -> ')
325
-
326
- except KeyboardInterrupt:
327
- print('\nwebscout.Local: sampler settings not updated\n')
328
- return
329
- print()
330
-
331
- try:
332
- self.sampler.max_len_tokens = int(new_max_len_tokens)
333
- except ValueError:
334
- pass
335
- else:
336
- print('webscout.Local: max_len_tokens updated')
337
-
338
- try:
339
- self.sampler.temp = float(new_temp)
340
- except ValueError:
341
- pass
342
- else:
343
- print('webscout.Local: temp updated')
344
-
345
- try:
346
- self.sampler.top_p = float(new_top_p)
347
- except ValueError:
348
- pass
349
- else:
350
- print('webscout.Local: top_p updated')
351
-
352
- try:
353
- self.sampler.min_p = float(new_min_p)
354
- except ValueError:
355
- pass
356
- else:
357
- print('webscout.Local: min_p updated')
358
-
359
- try:
360
- self.sampler.frequency_penalty = float(new_frequency_penalty)
361
- except ValueError:
362
- pass
363
- else:
364
- print('webscout.Local: frequency_penalty updated')
365
-
366
- try:
367
- self.sampler.presence_penalty = float(new_presence_penalty)
368
- except ValueError:
369
- pass
370
- else:
371
- print('webscout.Local: presence_penalty updated')
372
-
373
- try:
374
- self.sampler.repeat_penalty = float(new_repeat_penalty)
375
- except ValueError:
376
- pass
377
- else:
378
- print('webscout.Local: repeat_penalty updated')
379
-
380
- try:
381
- self.sampler.top_k = int(new_top_k)
382
- except ValueError:
383
- pass
384
- else:
385
- print('webscout.Local: top_k updated')
386
- print()
387
-
388
-
389
- def _interactive_input(
390
- self,
391
- prompt: str,
392
- _dim_style: str,
393
- _user_style: str,
394
- _bot_style: str,
395
- _special_style: str
396
- ) -> tuple:
397
- """
398
- Receive input from the user, while handling multi-line input
399
- and commands
400
- """
401
- full_user_input = '' # may become multiline
402
-
403
- while True:
404
- user_input = input(prompt)
405
-
406
- if user_input.endswith('\\'):
407
- full_user_input += user_input[:-1] + '\n'
408
-
409
- elif user_input == '!':
410
-
411
- print()
412
- try:
413
- command = input(f'{RESET_ALL} ! {_dim_style}')
414
- except KeyboardInterrupt:
415
- print('\n')
416
- continue
417
-
418
- if command == '':
419
- print(f'\n[no command]\n')
420
-
421
- elif command.lower() in ['reset', 'restart']:
422
- self.reset()
423
- print(f'\n[thread reset]\n')
424
-
425
- elif command.lower() in ['cls', 'clear']:
426
- cls()
427
- print()
428
-
429
- elif command.lower() in ['ctx', 'context']:
430
- print(f"\n{self.len_messages()}\n")
431
-
432
- elif command.lower() in ['stats', 'print_stats']:
433
- print()
434
- self.print_stats()
435
- print()
436
-
437
- elif command.lower() in ['sampler', 'samplers', 'settings']:
438
- self._interactive_update_sampler()
439
-
440
- elif command.lower() in ['str', 'string', 'as_string']:
441
- print(f"\n{self.as_string()}\n")
442
-
443
- elif command.lower() in ['repr', 'save', 'backup']:
444
- print(f"\n{repr(self)}\n")
445
-
446
- elif command.lower() in ['remove', 'rem', 'delete', 'del']:
447
- print()
448
- old_len = len(self.messages)
449
- del self.messages[-1]
450
- assert len(self.messages) == (old_len - 1)
451
- print('[removed last message]\n')
452
-
453
- elif command.lower() in ['last', 'repeat']:
454
- last_msg = self.messages[-1]
455
- if last_msg['role'] == 'user':
456
- print(f"\n{_user_style}{last_msg['content']}{RESET_ALL}\n")
457
- elif last_msg['role'] == 'bot':
458
- print(f"\n{_bot_style}{last_msg['content']}{RESET_ALL}\n")
459
-
460
- elif command.lower() in ['inf', 'inference', 'inf_str']:
461
- print(f'\n"""{self.inference_str_from_messages()}"""\n')
462
-
463
- elif command.lower() in ['reroll', 're-roll', 're', 'swipe']:
464
- old_len = len(self.messages)
465
- del self.messages[-1]
466
- assert len(self.messages) == (old_len - 1)
467
- return '', None
468
-
469
- elif command.lower() in ['exit', 'quit']:
470
- print(RESET_ALL)
471
- return None, None
472
-
473
- elif command.lower() in ['help', '/?', '?']:
474
- print()
475
- print('reset | restart -- Reset the thread to its original state')
476
- print('clear | cls -- Clear the terminal')
477
- print('context | ctx -- Get the context usage in tokens')
478
- print('print_stats | stats -- Get the context usage stats')
479
- print('sampler | settings -- Update the sampler settings')
480
- print('string | str -- Print the message history as a string')
481
- print('repr | save -- Print the representation of the thread')
482
- print('remove | delete -- Remove the last message')
483
- print('last | repeat -- Repeat the last message')
484
- print('inference | inf -- Print the inference string')
485
- print('reroll | swipe -- Regenerate the last message')
486
- print('exit | quit -- Exit the interactive chat (can also use ^C)')
487
- print('help | ? -- Show this screen')
488
- print()
489
- print("TIP: type < at the prompt and press ENTER to prefix the bot's next message.")
490
- print(' for example, type "Sure!" to bypass refusals')
491
- print()
492
- print("TIP: type !! at the prompt and press ENTER to insert a system message")
493
- print()
494
-
495
- else:
496
- print(f'\n[unknown command]\n')
497
-
498
- # prefix the bot's next message
499
- elif user_input == '<':
500
-
501
- print()
502
- try:
503
- next_message_start = input(f'{RESET_ALL} < {_dim_style}')
504
-
505
- except KeyboardInterrupt:
506
- print(f'{RESET_ALL}\n')
507
- continue
508
-
509
- else:
510
- print()
511
- return '', next_message_start
512
-
513
- # insert a system message
514
- elif user_input == '!!':
515
- print()
516
-
517
- try:
518
- next_sys_msg = input(f'{RESET_ALL} !! {_special_style}')
519
-
520
- except KeyboardInterrupt:
521
- print(f'{RESET_ALL}\n')
522
- continue
523
-
524
- else:
525
- print()
526
- return next_sys_msg, -1
527
-
528
- # concatenate multi-line input
529
- else:
530
- full_user_input += user_input
531
- return full_user_input, None
532
-
533
-
534
- def interact(
535
- self,
536
- color: bool = True,
537
- header: Optional[str] = None,
538
- stream: bool = True
539
- ) -> None:
540
- """
541
- Start an interactive chat session using this Thread.
542
-
543
- While text is being generated, press `^C` to interrupt the bot.
544
- Then you have the option to press `ENTER` to re-roll, or to simply type
545
- another message.
546
-
547
- At the prompt, press `^C` to end the chat session.
548
-
549
- Type `!` and press `ENTER` to enter a basic command prompt. For a list
550
- of commands, type `help` at this prompt.
551
-
552
- Type `<` and press `ENTER` to prefix the bot's next message, for
553
- example with `Sure!`.
554
-
555
- Type `!!` at the prompt and press `ENTER` to insert a system message.
556
-
557
- The following parameters are optional:
558
- - color: Whether to use colored text to differentiate user / bot
559
- - header: Header text to print at the start of the interaction
560
- - stream: Whether to stream text as it is generated
561
- """
562
- print()
563
-
564
- # fresh import of color codes in case `color` param has changed
565
- from .utils import SPECIAL_STYLE, USER_STYLE, BOT_STYLE, DIM_STYLE
566
-
567
- # disable color codes if explicitly disabled by `color` param
568
- if not color:
569
- SPECIAL_STYLE = ''
570
- USER_STYLE = ''
571
- BOT_STYLE = ''
572
- DIM_STYLE = ''
573
-
574
- if header is not None:
575
- print(f"{SPECIAL_STYLE}{header}{RESET_ALL}\n")
576
-
577
- while True:
578
-
579
- prompt = f"{RESET_ALL} > {USER_STYLE}"
580
-
581
- try:
582
- user_prompt, next_message_start = self._interactive_input(
583
- prompt,
584
- DIM_STYLE,
585
- USER_STYLE,
586
- BOT_STYLE,
587
- SPECIAL_STYLE
588
- )
589
- except KeyboardInterrupt:
590
- print(f"{RESET_ALL}\n")
591
- return
592
-
593
- # got 'exit' or 'quit' command
594
- if user_prompt is None and next_message_start is None:
595
- break
596
-
597
- # insert a system message via `!!` prompt
598
- if next_message_start == -1:
599
- self.add_message('system', user_prompt)
600
- continue
601
-
602
- if next_message_start is not None:
603
- try:
604
- if stream:
605
- print(f"{BOT_STYLE}{next_message_start}", end='', flush=True)
606
- output = next_message_start + self.model.stream_print(
607
- self.inference_str_from_messages() + next_message_start,
608
- stops=self.format['stops'],
609
- sampler=self.sampler,
610
- end=''
611
- )
612
- else:
613
- print(f"{BOT_STYLE}", end='', flush=True)
614
- output = next_message_start + self.model.generate(
615
- self.inference_str_from_messages() + next_message_start,
616
- stops=self.format['stops'],
617
- sampler=self.sampler
618
- )
619
- print(output, end='', flush=True)
620
- except KeyboardInterrupt:
621
- print(f"{DIM_STYLE} [message not added to history; press ENTER to re-roll]\n")
622
- continue
623
- else:
624
- self.add_message("bot", output)
625
- else:
626
- print(BOT_STYLE)
627
- if user_prompt != "":
628
- self.add_message("user", user_prompt)
629
- try:
630
- if stream:
631
- output = self.model.stream_print(
632
- self.inference_str_from_messages(),
633
- stops=self.format['stops'],
634
- sampler=self.sampler,
635
- end=''
636
- )
637
- else:
638
- output = self.model.generate(
639
- self.inference_str_from_messages(),
640
- stops=self.format['stops'],
641
- sampler=self.sampler
642
- )
643
- print(output, end='', flush=True)
644
- except KeyboardInterrupt:
645
- print(f"{DIM_STYLE} [message not added to history; press ENTER to re-roll]\n")
646
- continue
647
- else:
648
- self.add_message("bot", output)
649
-
650
- if output.endswith("\n\n"):
651
- print(RESET_ALL, end = '', flush=True)
652
- elif output.endswith("\n"):
653
- print(RESET_ALL)
654
- else:
655
- print(f"{RESET_ALL}\n")
656
-
657
-
658
- def reset(self) -> None:
659
- """
660
- Clear the list of messages, which resets the thread to its original
661
- state
662
- """
663
- self.messages: list[Message] = [
664
- self.create_message("system", self.format['system_content'])
665
- ] if self._messages is None else self._messages
666
-
667
-
668
- def as_string(self) -> str:
669
- """Return this thread's message history as a string"""
670
- thread_string = ''
671
- for msg in self.messages:
672
- thread_string += msg.as_string()
673
- return thread_string
674
-
675
-
676
- def print_stats(
677
- self,
678
- end: str = '\n',
679
- file: _SupportsWriteAndFlush = sys.stdout,
680
- flush: bool = True
681
- ) -> None:
682
- """Print stats about the context usage in this thread"""
683
- thread_len_tokens = self.len_messages()
684
- max_ctx_len = self.model.context_length
685
- context_used_percentage = round((thread_len_tokens/max_ctx_len)*100)
686
- print(f"{thread_len_tokens} / {max_ctx_len} tokens", file=file, flush=flush)
687
- print(f"{context_used_percentage}% of context used", file=file, flush=flush)
688
- print(f"{len(self.messages)} messages", end=end, file=file, flush=flush)
689
- if not flush:
690
- file.flush()
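Taken together, the class above is driven roughly as follows. This is a sketch rather than a verified example: the model path and the Model constructor call are assumptions, and the format dict is written out by hand using the keys that __init__() validates against formats.blank; only the Thread calls themselves come from this file.

    from webscout.Local import Model, Thread

    model = Model('./model.gguf')            # assumed constructor; the path is hypothetical

    chat_format = {
        'system_prefix': '', 'system_content': 'You are a helpful assistant.', 'system_suffix': '\n',
        'user_prefix': 'User: ', 'user_content': '', 'user_suffix': '\n',
        'bot_prefix': 'Bot: ', 'bot_content': '', 'bot_suffix': '\n',
        'stops': ['User:']
    }

    thread = Thread(model, chat_format)      # DefaultSampling is used when no sampler is passed
    print(thread.send('What is a context window?'))
    thread.print_stats()                     # context usage for the conversation so far
    # thread.interact()                      # interactive terminal chat; press ^C at the prompt to exit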
webscout/Local/utils.py DELETED
@@ -1,185 +0,0 @@
1
- from ._version import __version__, __llama_cpp_version__
2
-
3
- import sys
4
- import numpy as np
5
-
6
- from typing import Any, Iterable, TextIO
7
- from time import strftime
8
- from enum import IntEnum
9
- from struct import unpack
10
- from colorama import Fore
11
- from huggingface_hub import hf_hub_url, cached_download
12
-
13
- # color codes used in Thread.interact()
14
- RESET_ALL = Fore.RESET
15
- USER_STYLE = RESET_ALL + Fore.GREEN
16
- BOT_STYLE = RESET_ALL + Fore.CYAN
17
- DIM_STYLE = RESET_ALL + Fore.LIGHTBLACK_EX
18
- SPECIAL_STYLE = RESET_ALL + Fore.YELLOW
19
-
20
- # for typing of softmax parameter `z`
21
- class _ArrayLike(Iterable):
22
- pass
23
-
24
- # for typing of Model.stream_print() parameter `file`
25
- class _SupportsWriteAndFlush(TextIO):
26
- pass
27
-
28
- def download_model(repo_id: str, filename: str, cache_dir: str = ".cache") -> str:
29
- """
30
- Downloads a GGUF model file from Hugging Face Hub.
31
-
32
- repo_id: The Hugging Face repository ID of a repo hosting GGUF files (e.g., 'org-name/model-name-GGUF').
33
- filename: The name of the GGUF file within the repository (e.g., 'model.gguf').
34
- cache_dir: The directory where the downloaded file should be stored.
35
-
36
- Returns: The path to the downloaded file.
37
- """
38
- url = hf_hub_url(repo_id, filename)
39
- filepath = cached_download(url, cache_dir=cache_dir, force_filename=filename)
40
- return filepath
41
-
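Note that cached_download has been deprecated (and later removed) in newer huggingface_hub releases. A roughly equivalent sketch of the same helper written against the current hf_hub_download API, which resolves the URL and caching itself, so hf_hub_url is no longer needed:

    from huggingface_hub import hf_hub_download

    def download_model(repo_id: str, filename: str, cache_dir: str = '.cache') -> str:
        # downloads the file from the Hub (or reuses the cache) and returns the local path
        return hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)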
42
- class GGUFReader:
43
- """
44
- Peek at file header for GGUF metadata
45
-
46
- Raise ValueError if file is not GGUF or is outdated
47
-
48
- Credit to oobabooga for parts of the code in this class
49
-
50
- Format spec: https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md
51
- """
52
-
53
- class GGUFValueType(IntEnum):
54
- UINT8 = 0
55
- INT8 = 1
56
- UINT16 = 2
57
- INT16 = 3
58
- UINT32 = 4
59
- INT32 = 5
60
- FLOAT32 = 6
61
- BOOL = 7
62
- STRING = 8
63
- ARRAY = 9
64
- UINT64 = 10
65
- INT64 = 11
66
- FLOAT64 = 12
67
-
68
- _simple_value_packing = {
69
- GGUFValueType.UINT8: "<B",
70
- GGUFValueType.INT8: "<b",
71
- GGUFValueType.UINT16: "<H",
72
- GGUFValueType.INT16: "<h",
73
- GGUFValueType.UINT32: "<I",
74
- GGUFValueType.INT32: "<i",
75
- GGUFValueType.FLOAT32: "<f",
76
- GGUFValueType.UINT64: "<Q",
77
- GGUFValueType.INT64: "<q",
78
- GGUFValueType.FLOAT64: "<d",
79
- GGUFValueType.BOOL: "?",
80
- }
81
-
82
- value_type_info = {
83
- GGUFValueType.UINT8: 1,
84
- GGUFValueType.INT8: 1,
85
- GGUFValueType.UINT16: 2,
86
- GGUFValueType.INT16: 2,
87
- GGUFValueType.UINT32: 4,
88
- GGUFValueType.INT32: 4,
89
- GGUFValueType.FLOAT32: 4,
90
- GGUFValueType.UINT64: 8,
91
- GGUFValueType.INT64: 8,
92
- GGUFValueType.FLOAT64: 8,
93
- GGUFValueType.BOOL: 1,
94
- }
95
-
96
- def get_single(self, value_type, file) -> Any:
97
- if value_type == GGUFReader.GGUFValueType.STRING:
98
- value_length = unpack("<Q", file.read(8))[0]
99
- value = file.read(value_length)
100
- value = value.decode("utf-8")
101
- else:
102
- type_str = GGUFReader._simple_value_packing.get(value_type)
103
- bytes_length = GGUFReader.value_type_info.get(value_type)
104
- value = unpack(type_str, file.read(bytes_length))[0]
105
-
106
- return value
107
-
108
- def load_metadata(self, fname) -> dict:
109
- metadata = {}
110
- with open(fname, "rb") as file:
111
- GGUF_MAGIC = file.read(4)
112
-
113
- if GGUF_MAGIC != b"GGUF":
114
- raise ValueError(
115
- "your model file is not a valid GGUF file "
116
- f"(magic number mismatch, got {GGUF_MAGIC}, "
117
- "expected b'GGUF')"
118
- )
119
-
120
- GGUF_VERSION = unpack("<I", file.read(4))[0]
121
-
122
- if GGUF_VERSION == 1:
123
- raise ValueError(
124
- "your model file reports GGUF version 1, "
125
- "but only versions 2 and above are supported. "
126
- "re-convert your model or download a newer version"
127
- )
128
-
129
- # ti_data_count = struct.unpack("<Q", file.read(8))[0]
130
- file.read(8)
131
- kv_data_count = unpack("<Q", file.read(8))[0]
132
-
133
- for _ in range(kv_data_count):
134
- key_length = unpack("<Q", file.read(8))[0]
135
- key = file.read(key_length)
136
-
137
- value_type = GGUFReader.GGUFValueType(
138
- unpack("<I", file.read(4))[0]
139
- )
140
- if value_type == GGUFReader.GGUFValueType.ARRAY:
141
- ltype = GGUFReader.GGUFValueType(
142
- unpack("<I", file.read(4))[0]
143
- )
144
- length = unpack("<Q", file.read(8))[0]
145
- arr = [
146
- GGUFReader.get_single(
147
- self,
148
- ltype,
149
- file
150
- ) for _ in range(length)
151
- ]
152
- metadata[key.decode()] = arr
153
- else:
154
- value = GGUFReader.get_single(self, value_type, file)
155
- metadata[key.decode()] = value
156
-
157
- return metadata
158
-
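A minimal sketch of using the reader above to peek at a model file before loading it. The path is hypothetical, and most metadata keys are architecture-specific (for example 'llama.context_length' on llama-architecture models):

    reader = GGUFReader()
    metadata = reader.load_metadata('./model.gguf')

    print(metadata.get('general.name'))          # model name, if present
    for key, value in metadata.items():
        if key.endswith('.context_length'):
            print(key, '=', value)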
159
- def softmax(z: _ArrayLike) -> np.ndarray:
160
- """
161
- Compute softmax over values in z, where z is array-like
162
- """
163
- e_z = np.exp(z - np.max(z))
164
- return e_z / e_z.sum()
165
-
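For reference, softmax maps any array-like of scores onto a probability distribution that sums to 1; a quick check of the helper above:

    import numpy as np

    probs = softmax(np.array([2.0, 1.0, 0.1]))
    print(np.round(probs, 3))   # [0.659 0.242 0.099]
    print(probs.sum())          # 1.0, within floating-point error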
166
- def cls() -> None:
167
- """Clear the terminal"""
168
- print("\033c\033[3J", end='', flush=True)
169
-
170
- # no longer used in this module, but left for others to use
171
- def get_timestamp_prefix_str() -> str:
172
- # helpful: https://strftime.net
173
- return strftime("[%Y, %b %e, %a %l:%M %p] ")
174
-
175
- def truncate(text: str) -> str:
176
- return text if len(text) < 63 else f"{text[:60]}..."
177
-
178
- def print_verbose(text: str) -> None:
179
- print("webscout.Local: verbose:", text, file=sys.stderr, flush=True)
180
-
181
- def print_info(text: str) -> None:
182
- print("webscout.Local: info:", text, file=sys.stderr, flush=True)
183
-
184
- def print_warning(text: str) -> None:
185
- print("webscout.Local: warning:", text, file=sys.stderr, flush=True)