import json |
|
import logging |
|
import os |
|
import re |
|
import uuid |
|
import warnings |
|
from collections.abc import Generator |
|
from copy import deepcopy |
|
from dataclasses import asdict, dataclass |
|
from enum import Enum |
|
from threading import Thread |
|
from typing import TYPE_CHECKING, Any |
|
|
|
from .monitoring import TokenUsage |
|
from .tools import Tool |
|
from .utils import _is_package_available, encode_image_base64, make_image_url, parse_json_blob |
|
|
|
|
|
if TYPE_CHECKING: |
|
from transformers import StoppingCriteriaList |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
STRUCTURED_GENERATION_PROVIDERS = ["cerebras", "fireworks-ai"] |
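
# Response format used by CodeAgent for structured generation: constrains the model to emit a JSON
# object with a free-form "thought" string and a Python "code" string.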
|
CODEAGENT_RESPONSE_FORMAT = { |
|
"type": "json_schema", |
|
"json_schema": { |
|
"schema": { |
|
"additionalProperties": False, |
|
"properties": { |
|
"thought": { |
|
"description": "A free form text description of the thought process.", |
|
"title": "Thought", |
|
"type": "string", |
|
}, |
|
"code": { |
|
"description": "Valid Python code snippet implementing the thought.", |
|
"title": "Code", |
|
"type": "string", |
|
}, |
|
}, |
|
"required": ["thought", "code"], |
|
"title": "ThoughtAndCodeAnswer", |
|
"type": "object", |
|
}, |
|
"name": "ThoughtAndCodeAnswer", |
|
"strict": True, |
|
}, |
|
} |
|
|
|
|
|
def get_dict_from_nested_dataclasses(obj, ignore_key=None): |
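"""Convert a dataclass instance (possibly containing nested dataclasses) into a plain dict, dropping `ignore_key` entries at the top level."""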
|
def convert(obj): |
|
if hasattr(obj, "__dataclass_fields__"): |
|
return {k: convert(v) for k, v in asdict(obj).items() if k != ignore_key} |
|
return obj |
|
|
|
return convert(obj) |
|
|
|
|
|
@dataclass |
|
class ChatMessageToolCallFunction: |
|
arguments: Any |
|
name: str |
|
description: str | None = None |
|
|
|
|
|
@dataclass |
|
class ChatMessageToolCall: |
|
function: ChatMessageToolCallFunction |
|
id: str |
|
type: str |
|
|
|
def __str__(self) -> str: |
|
return f"Call: {self.id}: Calling {str(self.function.name)} with arguments: {str(self.function.arguments)}" |
|
|
|
|
|
class MessageRole(str, Enum): |
|
USER = "user" |
|
ASSISTANT = "assistant" |
|
SYSTEM = "system" |
|
TOOL_CALL = "tool-call" |
|
TOOL_RESPONSE = "tool-response" |
|
|
|
@classmethod |
|
def roles(cls): |
|
return [r.value for r in cls] |
|
|
|
|
|
@dataclass |
|
class ChatMessage: |
|
role: MessageRole |
|
content: str | list[dict[str, Any]] | None = None |
|
tool_calls: list[ChatMessageToolCall] | None = None |
|
raw: Any | None = None |
|
token_usage: TokenUsage | None = None |
|
|
|
def model_dump_json(self): |
|
return json.dumps(get_dict_from_nested_dataclasses(self, ignore_key="raw")) |
|
|
|
@classmethod |
|
def from_dict(cls, data: dict, raw: Any | None = None, token_usage: TokenUsage | None = None) -> "ChatMessage": |
|
if data.get("tool_calls"): |
|
tool_calls = [ |
|
ChatMessageToolCall( |
|
function=ChatMessageToolCallFunction(**tc["function"]), id=tc["id"], type=tc["type"] |
|
) |
|
for tc in data["tool_calls"] |
|
] |
|
data["tool_calls"] = tool_calls |
|
return cls( |
|
role=data["role"], |
|
content=data.get("content"), |
|
tool_calls=data.get("tool_calls"), |
|
raw=raw, |
|
token_usage=token_usage, |
|
) |
|
|
|
def dict(self): |
|
return get_dict_from_nested_dataclasses(self) |
|
|
|
def render_as_markdown(self) -> str: |
|
rendered = str(self.content) if self.content is not None else ""
|
if self.tool_calls: |
|
rendered += "\n".join( |
|
[ |
|
json.dumps({"tool": tool.function.name, "arguments": tool.function.arguments}) |
|
for tool in self.tool_calls |
|
] |
|
) |
|
return rendered |
|
|
|
|
|
def parse_json_if_needed(arguments: str | dict) -> str | dict: |
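"""Return `arguments` unchanged if it is already a dict; otherwise try to JSON-decode the string, falling back to the raw string on failure."""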
|
if isinstance(arguments, dict): |
|
return arguments |
|
else: |
|
try: |
|
return json.loads(arguments) |
|
except Exception: |
|
return arguments |
|
|
|
|
|
@dataclass |
|
class ChatMessageToolCallStreamDelta: |
|
"""Represents a streaming delta for tool calls during generation.""" |
|
|
|
index: int | None = None |
|
id: str | None = None |
|
type: str | None = None |
|
function: ChatMessageToolCallFunction | None = None |
|
|
|
|
|
@dataclass |
|
class ChatMessageStreamDelta: |
|
content: str | None = None |
|
tool_calls: list[ChatMessageToolCallStreamDelta] | None = None |
|
token_usage: TokenUsage | None = None |
|
|
|
|
|
def agglomerate_stream_deltas( |
|
stream_deltas: list[ChatMessageStreamDelta], role: MessageRole = MessageRole.ASSISTANT |
|
) -> ChatMessage: |
|
""" |
|
Agglomerate a list of stream deltas into a single ChatMessage.
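
Example (illustrative; assumes content-only deltas from a stream):

```python

>>> deltas = [ChatMessageStreamDelta(content="Hel"), ChatMessageStreamDelta(content="lo")]

>>> agglomerate_stream_deltas(deltas).content

'Hello'

```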
|
""" |
|
accumulated_tool_calls: dict[int, ChatMessageToolCallStreamDelta] = {} |
|
accumulated_content = "" |
|
total_input_tokens = 0 |
|
total_output_tokens = 0 |
|
for stream_delta in stream_deltas: |
|
if stream_delta.token_usage: |
|
total_input_tokens += stream_delta.token_usage.input_tokens |
|
total_output_tokens += stream_delta.token_usage.output_tokens |
|
if stream_delta.content: |
|
accumulated_content += stream_delta.content |
|
if stream_delta.tool_calls: |
|
for tool_call_delta in stream_delta.tool_calls: |
|
|
|
if tool_call_delta.index is not None: |
|
if tool_call_delta.index not in accumulated_tool_calls: |
|
accumulated_tool_calls[tool_call_delta.index] = ChatMessageToolCallStreamDelta( |
|
id=tool_call_delta.id, |
|
type=tool_call_delta.type, |
|
function=ChatMessageToolCallFunction(name="", arguments=""), |
|
) |
|
|
|
tool_call = accumulated_tool_calls[tool_call_delta.index] |
|
if tool_call_delta.id: |
|
tool_call.id = tool_call_delta.id |
|
if tool_call_delta.type: |
|
tool_call.type = tool_call_delta.type |
|
if tool_call_delta.function: |
|
if tool_call_delta.function.name:
|
tool_call.function.name = tool_call_delta.function.name |
|
if tool_call_delta.function.arguments: |
|
tool_call.function.arguments += tool_call_delta.function.arguments |
|
else: |
|
raise ValueError(f"Tool call index is not provided in tool delta: {tool_call_delta}") |
|
|
|
return ChatMessage( |
|
role=role, |
|
content=accumulated_content, |
|
tool_calls=[ |
|
ChatMessageToolCall( |
|
function=ChatMessageToolCallFunction( |
|
name=tool_call_stream_delta.function.name, |
|
arguments=tool_call_stream_delta.function.arguments, |
|
), |
|
id=tool_call_stream_delta.id or "", |
|
type="function", |
|
) |
|
for tool_call_stream_delta in accumulated_tool_calls.values() |
|
if tool_call_stream_delta.function |
|
], |
|
token_usage=TokenUsage( |
|
input_tokens=total_input_tokens, |
|
output_tokens=total_output_tokens, |
|
), |
|
) |
|
|
|
|
|
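# Default role remapping for APIs without dedicated tool-call/tool-response roles.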
tool_role_conversions = { |
|
MessageRole.TOOL_CALL: MessageRole.ASSISTANT, |
|
MessageRole.TOOL_RESPONSE: MessageRole.USER, |
|
} |
|
|
|
|
|
def get_tool_json_schema(tool: Tool) -> dict: |
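"""Build an OpenAI-style function-calling JSON schema from a `Tool`: inputs typed "any" are coerced to "string", and non-nullable inputs are listed as required."""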
|
properties = deepcopy(tool.inputs) |
|
required = [] |
|
for key, value in properties.items(): |
|
if value["type"] == "any": |
|
value["type"] = "string" |
|
if not ("nullable" in value and value["nullable"]): |
|
required.append(key) |
|
return { |
|
"type": "function", |
|
"function": { |
|
"name": tool.name, |
|
"description": tool.description, |
|
"parameters": { |
|
"type": "object", |
|
"properties": properties, |
|
"required": required, |
|
}, |
|
}, |
|
} |
|
|
|
|
|
def remove_stop_sequences(content: str, stop_sequences: list[str]) -> str: |
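"""Strip each stop sequence from the end of `content` when the text ends with it."""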
|
for stop_seq in stop_sequences: |
|
if content[-len(stop_seq) :] == stop_seq: |
|
content = content[: -len(stop_seq)] |
|
return content |
|
|
|
|
|
def get_clean_message_list( |
|
message_list: list[ChatMessage], |
|
role_conversions: dict[MessageRole, MessageRole] | dict[str, str] = {}, |
|
convert_images_to_image_urls: bool = False, |
|
flatten_messages_as_text: bool = False, |
|
) -> list[dict[str, Any]]: |
|
""" |
|
Creates a list of messages to give as input to the LLM. These messages are dictionaries compatible with the transformers chat template format.
|
Subsequent messages with the same role will be concatenated to a single message. |
|
|
|
Args: |
|
message_list (`list[ChatMessage]`): List of chat messages.
|
role_conversions (`dict[MessageRole, MessageRole]`, *optional*): Mapping to convert roles.
|
convert_images_to_image_urls (`bool`, default `False`): Whether to convert images to image URLs. |
|
flatten_messages_as_text (`bool`, default `False`): Whether to flatten messages as text. |
|
""" |
|
output_message_list: list[dict[str, Any]] = [] |
|
message_list = deepcopy(message_list) |
|
for message in message_list: |
|
role = message.role |
|
if role not in MessageRole.roles(): |
|
raise ValueError(f"Incorrect role {role}, only {MessageRole.roles()} are supported for now.") |
|
|
|
if role in role_conversions: |
|
message.role = role_conversions[role] |
|
|
|
if isinstance(message.content, list): |
|
for element in message.content: |
|
assert isinstance(element, dict), "Error: this element should be a dict: " + str(element)
|
if element["type"] == "image": |
|
assert not flatten_messages_as_text, f"Cannot use images with {flatten_messages_as_text=}" |
|
if convert_images_to_image_urls: |
|
element.update( |
|
{ |
|
"type": "image_url", |
|
"image_url": {"url": make_image_url(encode_image_base64(element.pop("image")))}, |
|
} |
|
) |
|
else: |
|
element["image"] = encode_image_base64(element["image"]) |
|
|
|
if len(output_message_list) > 0 and message.role == output_message_list[-1]["role"]: |
|
assert isinstance(message.content, list), "Error: wrong content: " + str(message.content)
|
if flatten_messages_as_text: |
|
output_message_list[-1]["content"] += "\n" + message.content[0]["text"] |
|
else: |
|
for el in message.content: |
|
if el["type"] == "text" and output_message_list[-1]["content"][-1]["type"] == "text": |
|
|
|
output_message_list[-1]["content"][-1]["text"] += "\n" + el["text"] |
|
else: |
|
output_message_list[-1]["content"].append(el) |
|
else: |
|
if flatten_messages_as_text: |
|
content = message.content[0]["text"] |
|
else: |
|
content = message.content |
|
output_message_list.append( |
|
{ |
|
"role": message.role, |
|
"content": content, |
|
} |
|
) |
|
return output_message_list |
|
|
|
|
|
def get_tool_call_from_text(text: str, tool_name_key: str, tool_arguments_key: str) -> ChatMessageToolCall: |
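"""Parse a tool call from model-generated text: extract the first JSON blob and read the tool name and arguments from the given keys."""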
|
tool_call_dictionary, _ = parse_json_blob(text) |
|
try: |
|
tool_name = tool_call_dictionary[tool_name_key] |
|
except Exception as e: |
|
raise ValueError( |
|
f"Key {tool_name_key!r} not found in the generated tool call. Got keys: {list(tool_call_dictionary.keys())} instead"
|
) from e |
|
tool_arguments = tool_call_dictionary.get(tool_arguments_key, None) |
|
if isinstance(tool_arguments, str): |
|
tool_arguments = parse_json_if_needed(tool_arguments) |
|
return ChatMessageToolCall( |
|
id=str(uuid.uuid4()), |
|
type="function", |
|
function=ChatMessageToolCallFunction(name=tool_name, arguments=tool_arguments), |
|
) |
|
|
|
|
|
def supports_stop_parameter(model_id: str) -> bool: |
|
""" |
|
Check if the model supports the `stop` parameter. |
|
|
|
Not supported with reasoning models openai/o3 and openai/o4-mini (and their versioned variants). |
|
|
|
Args: |
|
model_id (`str`): Model identifier (e.g. "openai/o3", "o4-mini-2025-04-16") |
|
|
|
Returns: |
|
bool: True if the model supports the stop parameter, False otherwise |
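
Example (illustrative):

```python

>>> supports_stop_parameter("openai/o3")

False

>>> supports_stop_parameter("o4-mini-2025-04-16")

False

>>> supports_stop_parameter("openai/gpt-4o")

True

```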
|
""" |
|
model_name = model_id.split("/")[-1] |
|
|
|
pattern = r"^(o3[-\d]*|o4-mini[-\d]*)$" |
|
return not re.match(pattern, model_name) |
|
|
|
|
|
class Model: |
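"""Base class for chat models: prepares messages and tool schemas, tracks token usage, and delegates text generation to subclasses via `generate`."""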
|
def __init__( |
|
self, |
|
flatten_messages_as_text: bool = False, |
|
tool_name_key: str = "name", |
|
tool_arguments_key: str = "arguments", |
|
model_id: str | None = None, |
|
**kwargs, |
|
): |
|
self.flatten_messages_as_text = flatten_messages_as_text |
|
self.tool_name_key = tool_name_key |
|
self.tool_arguments_key = tool_arguments_key |
|
self.kwargs = kwargs |
|
self._last_input_token_count: int | None = None |
|
self._last_output_token_count: int | None = None |
|
self.model_id: str | None = model_id |
|
|
|
@property |
|
def last_input_token_count(self) -> int | None: |
|
warnings.warn( |
|
"Attribute last_input_token_count is deprecated and will be removed in version 1.20. " |
|
"Please use TokenUsage.input_tokens instead.", |
|
FutureWarning, |
|
) |
|
return self._last_input_token_count |
|
|
|
@property |
|
def last_output_token_count(self) -> int | None: |
|
warnings.warn( |
|
"Attribute last_output_token_count is deprecated and will be removed in version 1.20. " |
|
"Please use TokenUsage.output_tokens instead.", |
|
FutureWarning, |
|
) |
|
return self._last_output_token_count |
|
|
|
def _prepare_completion_kwargs( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
response_format: dict[str, str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
custom_role_conversions: dict[str, str] | None = None, |
|
convert_images_to_image_urls: bool = False, |
|
tool_choice: str | dict | None = "required", |
|
**kwargs, |
|
) -> dict[str, Any]: |
|
""" |
|
Prepare parameters required for model invocation, handling parameter priorities. |
|
|
|
Parameter priority from high to low: |
|
1. Explicitly passed kwargs |
|
2. Specific parameters (stop_sequences, response_format, etc.) |
|
3. Default values in self.kwargs |
|
""" |
|
|
|
flatten_messages_as_text = kwargs.pop("flatten_messages_as_text", self.flatten_messages_as_text) |
|
messages_as_dicts = get_clean_message_list( |
|
messages, |
|
role_conversions=custom_role_conversions or tool_role_conversions, |
|
convert_images_to_image_urls=convert_images_to_image_urls, |
|
flatten_messages_as_text=flatten_messages_as_text, |
|
) |
|
|
|
completion_kwargs = { |
|
**self.kwargs, |
|
"messages": messages_as_dicts, |
|
} |
|
|
|
|
|
if stop_sequences is not None: |
|
|
|
if supports_stop_parameter(self.model_id or ""): |
|
completion_kwargs["stop"] = stop_sequences |
|
if response_format is not None: |
|
completion_kwargs["response_format"] = response_format |
|
|
|
|
|
if tools_to_call_from: |
|
tools_config = { |
|
"tools": [get_tool_json_schema(tool) for tool in tools_to_call_from], |
|
} |
|
if tool_choice is not None: |
|
tools_config["tool_choice"] = tool_choice |
|
completion_kwargs.update(tools_config) |
|
|
|
|
|
completion_kwargs.update(kwargs) |
|
|
|
return completion_kwargs |
|
|
|
def generate( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
response_format: dict[str, str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
**kwargs, |
|
) -> ChatMessage: |
|
"""Process the input messages and return the model's response. |
|
|
|
Parameters: |
|
messages (`list[dict[str, str | list[dict]]] | list[ChatMessage]`): |
|
A list of message dictionaries to be processed. Each dictionary should have the structure `{"role": "user/system", "content": "message content"}`. |
|
stop_sequences (`List[str]`, *optional*): |
|
A list of strings that will stop the generation if encountered in the model's output. |
|
response_format (`dict[str, str]`, *optional*): |
|
The response format to use in the model's response. |
|
tools_to_call_from (`List[Tool]`, *optional*): |
|
A list of tools that the model can use to generate responses. |
|
**kwargs: |
|
Additional keyword arguments to be passed to the underlying model. |
|
|
|
Returns: |
|
`ChatMessage`: A chat message object containing the model's response. |
|
""" |
|
raise NotImplementedError("This method must be implemented in child classes") |
|
|
|
def __call__(self, *args, **kwargs): |
|
return self.generate(*args, **kwargs) |
|
|
|
def parse_tool_calls(self, message: ChatMessage) -> ChatMessage: |
|
"""Sometimes APIs do not return the tool call as a specific object, so we need to parse it.""" |
|
message.role = MessageRole.ASSISTANT |
|
if not message.tool_calls: |
|
assert message.content is not None, "Message contains no content and no tool calls" |
|
message.tool_calls = [ |
|
get_tool_call_from_text(message.content, self.tool_name_key, self.tool_arguments_key) |
|
] |
|
assert len(message.tool_calls) > 0, "No tool call was found in the model output" |
|
for tool_call in message.tool_calls: |
|
tool_call.function.arguments = parse_json_if_needed(tool_call.function.arguments) |
|
return message |
|
|
|
def to_dict(self) -> dict: |
|
""" |
|
Converts the model into a JSON-compatible dictionary. |
|
""" |
|
model_dictionary = { |
|
**self.kwargs, |
|
"model_id": self.model_id, |
|
} |
|
for attribute in [ |
|
"custom_role_conversion", |
|
"temperature", |
|
"max_tokens", |
|
"provider", |
|
"timeout", |
|
"api_base", |
|
"torch_dtype", |
|
"device_map", |
|
"organization", |
|
"project", |
|
"azure_endpoint", |
|
]: |
|
if hasattr(self, attribute): |
|
model_dictionary[attribute] = getattr(self, attribute) |
|
|
|
dangerous_attributes = ["token", "api_key"] |
|
for attribute_name in dangerous_attributes: |
|
if hasattr(self, attribute_name): |
|
logger.warning(

f"For security reasons, we do not export the `{attribute_name}` attribute of your model. Please export it manually."

)
|
return model_dictionary |
|
|
|
@classmethod |
|
def from_dict(cls, model_dictionary: dict[str, Any]) -> "Model": |
|
return cls(**model_dictionary)
|
|
|
|
|
class VLLMModel(Model): |
|
"""Model to use [vLLM](https://docs.vllm.ai/) for fast LLM inference and serving. |
|
|
|
Parameters: |
|
model_id (`str`): |
|
The Hugging Face model ID to be used for inference. |
|
This can be a path or model identifier from the Hugging Face model hub. |
|
model_kwargs (`dict[str, Any]`, *optional*): |
|
Additional keyword arguments to pass to the vLLM model (like revision, max_model_len, etc.). |
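
Example (usage sketch; the model ID is illustrative and requires the `vllm` extra plus suitable hardware):

```python

>>> model = VLLMModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")

>>> messages = [ChatMessage(role=MessageRole.USER, content=[{"type": "text", "text": "Explain quantum mechanics in simple terms."}])]

>>> response = model(messages, stop_sequences=["END"])

>>> print(response.content)

```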
|
""" |
|
|
|
def __init__( |
|
self, |
|
model_id, |
|
model_kwargs: dict[str, Any] | None = None, |
|
**kwargs, |
|
): |
|
if not _is_package_available("vllm"): |
|
raise ModuleNotFoundError("Please install 'vllm' extra to use VLLMModel: `pip install 'smolagents[vllm]'`") |
|
|
|
from vllm import LLM |
|
from vllm.transformers_utils.tokenizer import get_tokenizer |
|
|
|
self.model_kwargs = model_kwargs or {} |
|
super().__init__(**kwargs) |
|
self.model_id = model_id |
|
self.model = LLM(model=model_id, **self.model_kwargs) |
|
assert self.model is not None |
|
self.tokenizer = get_tokenizer(model_id) |
|
self._is_vlm = False |
|
|
|
def cleanup(self): |
|
import gc |
|
|
|
import torch |
|
from vllm.distributed.parallel_state import ( |
|
destroy_distributed_environment, |
|
destroy_model_parallel, |
|
) |
|
|
|
destroy_model_parallel() |
|
if self.model is not None: |
|
|
|
del self.model.llm_engine.model_executor.driver_worker |
|
gc.collect() |
|
destroy_distributed_environment() |
|
torch.cuda.empty_cache() |
|
|
|
def generate( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
response_format: dict[str, str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
**kwargs, |
|
) -> ChatMessage: |
|
from vllm import SamplingParams |
|
|
|
completion_kwargs = self._prepare_completion_kwargs( |
|
messages=messages, |
|
flatten_messages_as_text=(not self._is_vlm), |
|
stop_sequences=stop_sequences, |
|
tools_to_call_from=tools_to_call_from, |
|
**kwargs, |
|
) |
|
|
|
guided_options_request = {"guided_json": response_format["json_schema"]["schema"]} if response_format else None |
|
|
|
messages = completion_kwargs.pop("messages") |
|
prepared_stop_sequences = completion_kwargs.pop("stop", []) |
|
tools = completion_kwargs.pop("tools", None) |
|
completion_kwargs.pop("tool_choice", None) |
|
|
|
prompt = self.tokenizer.apply_chat_template( |
|
messages, |
|
tools=tools, |
|
add_generation_prompt=True, |
|
tokenize=False, |
|
) |
|
|
|
sampling_params = SamplingParams( |
|
n=kwargs.get("n", 1), |
|
temperature=kwargs.get("temperature", 0.0), |
|
max_tokens=kwargs.get("max_tokens", 2048), |
|
stop=prepared_stop_sequences, |
|
) |
|
|
|
out = self.model.generate( |
|
prompt, |
|
sampling_params=sampling_params, |
|
guided_options_request=guided_options_request, |
|
) |
|
|
|
output_text = out[0].outputs[0].text |
|
self._last_input_token_count = len(out[0].prompt_token_ids) |
|
self._last_output_token_count = len(out[0].outputs[0].token_ids) |
|
return ChatMessage( |
|
role=MessageRole.ASSISTANT, |
|
content=output_text, |
|
raw={"out": output_text, "completion_kwargs": completion_kwargs}, |
|
token_usage=TokenUsage( |
|
input_tokens=len(out[0].prompt_token_ids), |
|
output_tokens=len(out[0].outputs[0].token_ids), |
|
), |
|
) |
|
|
|
|
|
class MLXModel(Model): |
|
"""A class to interact with models loaded using MLX on Apple silicon. |
|
|
|
> [!TIP] |
|
> You must have `mlx-lm` installed on your machine. Please run `pip install smolagents[mlx-lm]` if it isn't already installed.
|
|
|
Parameters: |
|
model_id (str): |
|
The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. |
|
tool_name_key (str): |
|
The key, which can usually be found in the model's chat template, for retrieving a tool name. |
|
tool_arguments_key (str): |
|
The key, which can usually be found in the model's chat template, for retrieving tool arguments. |
|
trust_remote_code (bool, default `False`): |
|
Some models on the Hub require running remote code: for this model, you would have to set this flag to True. |
|
load_kwargs (dict[str, Any], *optional*): |
|
Additional keyword arguments to pass to the `mlx.lm.load` method when loading the model and tokenizer. |
|
apply_chat_template_kwargs (dict, *optional*): |
|
Additional keyword arguments to pass to the `apply_chat_template` method of the tokenizer. |
|
kwargs (dict, *optional*): |
|
Any additional keyword arguments that you want to use in model.generate(), for instance `max_tokens`. |
|
|
|
Example: |
|
```python |
|
>>> engine = MLXModel( |
|
... model_id="mlx-community/Qwen2.5-Coder-32B-Instruct-4bit", |
|
... max_tokens=10000, |
|
... ) |
|
>>> messages = [ |
|
... { |
|
... "role": "user", |
|
... "content": "Explain quantum mechanics in simple terms." |
|
... } |
|
... ] |
|
>>> response = engine(messages, stop_sequences=["END"]) |
|
>>> print(response) |
|
"Quantum mechanics is the branch of physics that studies..." |
|
``` |
|
""" |
|
|
|
def __init__( |
|
self, |
|
model_id: str, |
|
trust_remote_code: bool = False, |
|
load_kwargs: dict[str, Any] | None = None, |
|
apply_chat_template_kwargs: dict[str, Any] | None = None, |
|
**kwargs, |
|
): |
|
if not _is_package_available("mlx_lm"): |
|
raise ModuleNotFoundError( |
|
"Please install 'mlx-lm' extra to use 'MLXModel': `pip install 'smolagents[mlx-lm]'`" |
|
) |
|
import mlx_lm |
|
|
|
self.load_kwargs = load_kwargs or {} |
|
self.load_kwargs.setdefault("tokenizer_config", {}).setdefault("trust_remote_code", trust_remote_code) |
|
self.apply_chat_template_kwargs = apply_chat_template_kwargs or {} |
|
self.apply_chat_template_kwargs.setdefault("add_generation_prompt", True) |
|
|
|
super().__init__(model_id=model_id, flatten_messages_as_text=True, **kwargs) |
|
|
|
self.model, self.tokenizer = mlx_lm.load(self.model_id, **self.load_kwargs) |
|
self.stream_generate = mlx_lm.stream_generate |
|
self.is_vlm = False |
|
|
|
def generate( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
response_format: dict[str, str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
**kwargs, |
|
) -> ChatMessage: |
|
if response_format is not None: |
|
raise ValueError("MLX does not support structured outputs.") |
|
completion_kwargs = self._prepare_completion_kwargs( |
|
messages=messages, |
|
stop_sequences=stop_sequences, |
|
tools_to_call_from=tools_to_call_from, |
|
**kwargs, |
|
) |
|
messages = completion_kwargs.pop("messages") |
|
stops = completion_kwargs.pop("stop", []) |
|
tools = completion_kwargs.pop("tools", None) |
|
completion_kwargs.pop("tool_choice", None) |
|
|
|
prompt_ids = self.tokenizer.apply_chat_template(messages, tools=tools, **self.apply_chat_template_kwargs) |
|
|
|
output_tokens = 0 |
|
text = "" |
|
for response in self.stream_generate(self.model, self.tokenizer, prompt=prompt_ids, **completion_kwargs): |
|
output_tokens += 1 |
|
text += response.text |
|
if any((stop_index := text.rfind(stop)) != -1 for stop in stops): |
|
text = text[:stop_index] |
|
break |
|
|
|
self._last_input_token_count = len(prompt_ids) |
|
self._last_output_token_count = output_tokens |
|
return ChatMessage( |
|
role=MessageRole.ASSISTANT, |
|
content=text, |
|
raw={"out": text, "completion_kwargs": completion_kwargs}, |
|
token_usage=TokenUsage( |
|
input_tokens=len(prompt_ids), |
|
output_tokens=output_tokens, |
|
), |
|
) |
|
|
|
|
|
class TransformersModel(Model): |
|
"""A class that uses Hugging Face's Transformers library for language model interaction. |
|
|
|
This model allows you to load and use Hugging Face's models locally using the Transformers library. It supports features like stop sequences and grammar customization. |
|
|
|
> [!TIP] |
|
> You must have `transformers` and `torch` installed on your machine. Please run `pip install smolagents[transformers]` if they aren't already installed.
|
|
|
Parameters: |
|
model_id (`str`): |
|
The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. |
|
For example, `"Qwen/Qwen2.5-Coder-32B-Instruct"`. |
|
device_map (`str`, *optional*): |
|
The device_map to initialize your model with. |
|
torch_dtype (`str`, *optional*): |
|
The torch_dtype to initialize your model with. |
|
trust_remote_code (bool, default `False`): |
|
Some models on the Hub require running remote code: for this model, you would have to set this flag to True. |
|
**kwargs:

Additional keyword arguments to pass to `model.generate()`, for instance `max_new_tokens` or `device`.
|
Raises: |
|
ValueError: |
|
If the model or tokenizer cannot be loaded.
|
|
|
Example: |
|
```python |
|
>>> engine = TransformersModel( |
|
... model_id="Qwen/Qwen2.5-Coder-32B-Instruct", |
|
... device="cuda", |
|
... max_new_tokens=5000, |
|
... ) |
|
>>> messages = [{"role": "user", "content": "Explain quantum mechanics in simple terms."}] |
|
>>> response = engine(messages, stop_sequences=["END"]) |
|
>>> print(response) |
|
"Quantum mechanics is the branch of physics that studies..." |
|
``` |
|
""" |
|
|
|
def __init__( |
|
self, |
|
model_id: str | None = None, |
|
device_map: str | None = None, |
|
torch_dtype: str | None = None, |
|
trust_remote_code: bool = False, |
|
**kwargs, |
|
): |
|
try: |
|
import torch |
|
from transformers import ( |
|
AutoModelForCausalLM, |
|
AutoModelForImageTextToText, |
|
AutoProcessor, |
|
AutoTokenizer, |
|
TextIteratorStreamer, |
|
) |
|
except ModuleNotFoundError: |
|
raise ModuleNotFoundError( |
|
"Please install 'transformers' extra to use 'TransformersModel': `pip install 'smolagents[transformers]'`" |
|
) |
|
|
|
if not model_id: |
|
warnings.warn( |
|
"The 'model_id' parameter will be required in version 2.0.0. " |
|
"Please update your code to pass this parameter to avoid future errors. " |
|
"For now, it defaults to 'HuggingFaceTB/SmolLM2-1.7B-Instruct'.", |
|
FutureWarning, |
|
) |
|
model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct" |
|
|
|
default_max_tokens = 4096 |
|
max_new_tokens = kwargs.get("max_new_tokens") or kwargs.get("max_tokens") |
|
if not max_new_tokens: |
|
kwargs["max_new_tokens"] = default_max_tokens |
|
logger.warning( |
|
f"`max_new_tokens` not provided, using this default value for `max_new_tokens`: {default_max_tokens}" |
|
) |
|
|
|
if device_map is None: |
|
device_map = "cuda" if torch.cuda.is_available() else "cpu" |
|
logger.info(f"Using device: {device_map}") |
|
self._is_vlm = False |
|
try: |
|
self.model = AutoModelForImageTextToText.from_pretrained( |
|
model_id, |
|
device_map=device_map, |
|
torch_dtype=torch_dtype, |
|
trust_remote_code=trust_remote_code, |
|
) |
|
self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code) |
|
self._is_vlm = True |
|
self.streamer = TextIteratorStreamer(self.processor.tokenizer, skip_prompt=True, skip_special_tokens=True) |
|
|
|
except ValueError as e: |
|
if "Unrecognized configuration class" in str(e): |
|
self.model = AutoModelForCausalLM.from_pretrained( |
|
model_id, |
|
device_map=device_map, |
|
torch_dtype=torch_dtype, |
|
trust_remote_code=trust_remote_code, |
|
) |
|
self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code) |
|
self.streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True) |
|
else: |
|
raise e |
|
except Exception as e: |
|
raise ValueError(f"Failed to load tokenizer and model for {model_id=}: {e}") from e |
|
super().__init__(flatten_messages_as_text=not self._is_vlm, model_id=model_id, **kwargs) |
|
|
|
def make_stopping_criteria(self, stop_sequences: list[str], tokenizer) -> "StoppingCriteriaList": |
|
from transformers import StoppingCriteria, StoppingCriteriaList |
|
|
|
class StopOnStrings(StoppingCriteria): |
|
def __init__(self, stop_strings: list[str], tokenizer): |
|
self.stop_strings = stop_strings |
|
self.tokenizer = tokenizer |
|
self.stream = "" |
|
|
|
def reset(self): |
|
self.stream = "" |
|
|
|
def __call__(self, input_ids, scores, **kwargs): |
|
generated = self.tokenizer.decode(input_ids[0][-1], skip_special_tokens=True) |
|
self.stream += generated |
|
if any(self.stream.endswith(stop_string) for stop_string in self.stop_strings):
|
return True |
|
return False |
|
|
|
return StoppingCriteriaList([StopOnStrings(stop_sequences, tokenizer)]) |
|
|
|
def _prepare_completion_args( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
**kwargs, |
|
) -> dict[str, Any]: |
|
completion_kwargs = self._prepare_completion_kwargs( |
|
messages=messages, |
|
stop_sequences=stop_sequences, |
|
**kwargs, |
|
) |
|
|
|
messages = completion_kwargs.pop("messages") |
|
stop_sequences = completion_kwargs.pop("stop", None) |
|
tools = completion_kwargs.pop("tools", None) |
|
|
|
max_new_tokens = ( |
|
kwargs.get("max_new_tokens") |
|
or kwargs.get("max_tokens") |
|
or self.kwargs.get("max_new_tokens") |
|
or self.kwargs.get("max_tokens") |
|
or 1024 |
|
) |
|
prompt_tensor = (self.processor if hasattr(self, "processor") else self.tokenizer).apply_chat_template( |
|
messages, |
|
tools=tools, |
|
return_tensors="pt", |
|
add_generation_prompt=True, |
|
tokenize=True, |
|
return_dict=True, |
|
) |
|
prompt_tensor = prompt_tensor.to(self.model.device) |
|
if hasattr(prompt_tensor, "input_ids"): |
|
prompt_tensor = prompt_tensor["input_ids"] |
|
|
|
model_tokenizer = self.processor.tokenizer if hasattr(self, "processor") else self.tokenizer |
|
stopping_criteria = ( |
|
self.make_stopping_criteria(stop_sequences, tokenizer=model_tokenizer) if stop_sequences else None |
|
) |
|
completion_kwargs["max_new_tokens"] = max_new_tokens |
|
return dict( |
|
inputs=prompt_tensor, |
|
use_cache=True, |
|
stopping_criteria=stopping_criteria, |
|
**completion_kwargs, |
|
) |
|
|
|
def generate( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
response_format: dict[str, str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
**kwargs, |
|
) -> ChatMessage: |
|
if response_format is not None: |
|
raise ValueError("Transformers does not support structured outputs, use VLLMModel for this.") |
|
generation_kwargs = self._prepare_completion_args( |
|
messages=messages, |
|
stop_sequences=stop_sequences, |
|
tools_to_call_from=tools_to_call_from, |
|
**kwargs, |
|
) |
|
count_prompt_tokens = generation_kwargs["inputs"].shape[1] |
|
out = self.model.generate( |
|
**generation_kwargs, |
|
) |
|
generated_tokens = out[0, count_prompt_tokens:] |
|
if hasattr(self, "processor"): |
|
output_text = self.processor.decode(generated_tokens, skip_special_tokens=True) |
|
else: |
|
output_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True) |
|
|
|
if stop_sequences is not None: |
|
output_text = remove_stop_sequences(output_text, stop_sequences) |
|
|
|
self._last_input_token_count = count_prompt_tokens |
|
self._last_output_token_count = len(generated_tokens) |
|
return ChatMessage( |
|
role=MessageRole.ASSISTANT, |
|
content=output_text, |
|
raw={ |
|
"out": output_text, |
|
"completion_kwargs": {key: value for key, value in generation_kwargs.items() if key != "inputs"}, |
|
}, |
|
token_usage=TokenUsage( |
|
input_tokens=count_prompt_tokens, |
|
output_tokens=len(generated_tokens), |
|
), |
|
) |
|
|
|
def generate_stream( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
response_format: dict[str, str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
**kwargs, |
|
) -> Generator[ChatMessageStreamDelta]: |
|
if response_format is not None: |
|
raise ValueError("Transformers does not support structured outputs, use VLLMModel for this.") |
|
generation_kwargs = self._prepare_completion_args( |
|
messages=messages, |
|
stop_sequences=stop_sequences, |
|
response_format=response_format, |
|
tools_to_call_from=tools_to_call_from, |
|
**kwargs, |
|
) |
|
count_prompt_tokens = generation_kwargs["inputs"].shape[1] |
|
|
|
thread = Thread(target=self.model.generate, kwargs={"streamer": self.streamer, **generation_kwargs}) |
|
thread.start() |
|
|
|
|
|
for new_text in self.streamer: |
|
self._last_input_token_count = count_prompt_tokens |
|
self._last_output_token_count = 1 |
|
yield ChatMessageStreamDelta( |
|
content=new_text, |
|
tool_calls=None, |
|
token_usage=TokenUsage(input_tokens=count_prompt_tokens, output_tokens=1), |
|
) |
|
thread.join() |
|
|
|
|
|
class ApiModel(Model): |
|
""" |
|
Base class for API-based language models. |
|
|
|
This class serves as a foundation for implementing models that interact with |
|
external APIs. It handles the common functionality for managing model IDs, |
|
custom role mappings, and API client connections. |
|
|
|
Parameters: |
|
model_id (`str`): |
|
The identifier for the model to be used with the API. |
|
custom_role_conversions (`dict[str, str]`, *optional*):

Mapping to convert between internal role names and API-specific role names. Defaults to None.

client (`Any`, *optional*):
|
Pre-configured API client instance. If not provided, a default client will be created. Defaults to None. |
|
**kwargs: Additional keyword arguments to pass to the parent class. |
|
""" |
|
|
|
def __init__( |
|
self, model_id: str, custom_role_conversions: dict[str, str] | None = None, client: Any | None = None, **kwargs |
|
): |
|
super().__init__(model_id=model_id, **kwargs) |
|
self.custom_role_conversions = custom_role_conversions or {} |
|
self.client = client or self.create_client() |
|
|
|
def create_client(self): |
|
"""Create the API client for the specific service.""" |
|
raise NotImplementedError("Subclasses must implement this method to create a client") |
|
|
|
|
|
class LiteLLMModel(ApiModel): |
|
"""Model to use [LiteLLM Python SDK](https://docs.litellm.ai/docs/#litellm-python-sdk) to access hundreds of LLMs. |
|
|
|
Parameters: |
|
model_id (`str`): |
|
The model identifier to use on the server (e.g. "gpt-3.5-turbo"). |
|
api_base (`str`, *optional*): |
|
The base URL of the provider API to call the model. |
|
api_key (`str`, *optional*): |
|
The API key to use for authentication. |
|
custom_role_conversions (`dict[str, str]`, *optional*): |
|
Custom role conversion mapping to convert message roles into others.
|
Useful for specific models that do not support specific message roles like "system". |
|
flatten_messages_as_text (`bool`, *optional*): Whether to flatten messages as text. |
|
Defaults to `True` for models that start with "ollama", "groq", "cerebras". |
|
**kwargs: |
|
Additional keyword arguments to pass to the LiteLLM `completion()` call.
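
Example (usage sketch; the model ID is illustrative and assumes a valid Anthropic API key):

```python

>>> model = LiteLLMModel(model_id="anthropic/claude-3-5-sonnet-20240620", api_key="...")

>>> messages = [ChatMessage(role=MessageRole.USER, content=[{"type": "text", "text": "Hello!"}])]

>>> response = model(messages)

>>> print(response.content)

```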
|
""" |
|
|
|
def __init__( |
|
self, |
|
model_id: str | None = None, |
|
api_base: str | None = None, |
|
api_key: str | None = None, |
|
custom_role_conversions: dict[str, str] | None = None, |
|
flatten_messages_as_text: bool | None = None, |
|
**kwargs, |
|
): |
|
if not model_id: |
|
warnings.warn( |
|
"The 'model_id' parameter will be required in version 2.0.0. " |
|
"Please update your code to pass this parameter to avoid future errors. " |
|
"For now, it defaults to 'anthropic/claude-3-5-sonnet-20240620'.", |
|
FutureWarning, |
|
) |
|
model_id = "anthropic/claude-3-5-sonnet-20240620" |
|
self.api_base = api_base |
|
self.api_key = api_key |
|
flatten_messages_as_text = ( |
|
flatten_messages_as_text |
|
if flatten_messages_as_text is not None |
|
else model_id.startswith(("ollama", "groq", "cerebras")) |
|
) |
|
super().__init__( |
|
model_id=model_id, |
|
custom_role_conversions=custom_role_conversions, |
|
flatten_messages_as_text=flatten_messages_as_text, |
|
**kwargs, |
|
) |
|
|
|
def create_client(self): |
|
"""Create the LiteLLM client.""" |
|
try: |
|
import litellm |
|
except ModuleNotFoundError as e: |
|
raise ModuleNotFoundError( |
|
"Please install 'litellm' extra to use LiteLLMModel: `pip install 'smolagents[litellm]'`" |
|
) from e |
|
|
|
return litellm |
|
|
|
def generate( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
response_format: dict[str, str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
**kwargs, |
|
) -> ChatMessage: |
|
completion_kwargs = self._prepare_completion_kwargs( |
|
messages=messages, |
|
stop_sequences=stop_sequences, |
|
response_format=response_format, |
|
tools_to_call_from=tools_to_call_from, |
|
model=self.model_id, |
|
api_base=self.api_base, |
|
api_key=self.api_key, |
|
convert_images_to_image_urls=True, |
|
custom_role_conversions=self.custom_role_conversions, |
|
**kwargs, |
|
) |
|
|
|
response = self.client.completion(**completion_kwargs) |
|
|
|
self._last_input_token_count = response.usage.prompt_tokens |
|
self._last_output_token_count = response.usage.completion_tokens |
|
return ChatMessage.from_dict( |
|
response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}), |
|
raw=response, |
|
token_usage=TokenUsage( |
|
input_tokens=response.usage.prompt_tokens, |
|
output_tokens=response.usage.completion_tokens, |
|
), |
|
) |
|
|
|
def generate_stream( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
response_format: dict[str, str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
**kwargs, |
|
) -> Generator[ChatMessageStreamDelta]: |
|
completion_kwargs = self._prepare_completion_kwargs( |
|
messages=messages, |
|
stop_sequences=stop_sequences, |
|
response_format=response_format, |
|
tools_to_call_from=tools_to_call_from, |
|
model=self.model_id, |
|
api_base=self.api_base, |
|
api_key=self.api_key, |
|
custom_role_conversions=self.custom_role_conversions, |
|
convert_images_to_image_urls=True, |
|
**kwargs, |
|
) |
|
for event in self.client.completion(**completion_kwargs, stream=True, stream_options={"include_usage": True}): |
|
if getattr(event, "usage", None): |
|
self._last_input_token_count = event.usage.prompt_tokens |
|
self._last_output_token_count = event.usage.completion_tokens |
|
yield ChatMessageStreamDelta( |
|
content="", |
|
token_usage=TokenUsage( |
|
input_tokens=event.usage.prompt_tokens, |
|
output_tokens=event.usage.completion_tokens, |
|
), |
|
) |
|
if event.choices: |
|
choice = event.choices[0] |
|
if choice.delta: |
|
yield ChatMessageStreamDelta( |
|
content=choice.delta.content, |
|
tool_calls=[ |
|
ChatMessageToolCallStreamDelta( |
|
index=delta.index, |
|
id=delta.id, |
|
type=delta.type, |
|
function=delta.function, |
|
) |
|
for delta in choice.delta.tool_calls |
|
] |
|
if choice.delta.tool_calls |
|
else None, |
|
) |
|
else: |
|
if not getattr(choice, "finish_reason", None): |
|
raise ValueError(f"No content or tool calls in event: {event}") |
|
|
|
|
|
class LiteLLMRouterModel(LiteLLMModel):

"""Router-based client for interacting with the [LiteLLM Python SDK Router](https://docs.litellm.ai/docs/routing).
|
|
|
This class provides a high-level interface for distributing requests among multiple language models using |
|
the LiteLLM SDK's routing capabilities. It is responsible for initializing and configuring the router client, |
|
applying custom role conversions, and managing message formatting to ensure seamless integration with various LLMs. |
|
|
|
Parameters: |
|
model_id (`str`): |
|
Identifier for the model group to use from the model list (e.g., "model-group-1"). |
|
model_list (`list[dict[str, Any]]`): |
|
Model configurations to be used for routing. |
|
Each configuration should include the model group name and any necessary parameters. |
|
For more details, refer to the [LiteLLM Routing](https://docs.litellm.ai/docs/routing#quick-start) documentation. |
|
client_kwargs (`dict[str, Any]`, *optional*): |
|
Additional configuration parameters for the Router client. For more details, see the |
|
[LiteLLM Routing Configurations](https://docs.litellm.ai/docs/routing). |
|
custom_role_conversions (`dict[str, str]`, *optional*): |
|
Custom role conversion mapping to convert message roles into others.
|
Useful for specific models that do not support specific message roles like "system". |
|
flatten_messages_as_text (`bool`, *optional*): Whether to flatten messages as text. |
|
Defaults to `True` for models that start with "ollama", "groq", "cerebras". |
|
**kwargs: |
|
Additional keyword arguments to pass to the LiteLLM Router completion method. |
|
|
|
Example: |
|
```python |
|
>>> import os |
|
>>> from smolagents import CodeAgent, WebSearchTool, LiteLLMRouterModel |
|
>>> os.environ["OPENAI_API_KEY"] = "" |
|
>>> os.environ["AWS_ACCESS_KEY_ID"] = "" |
|
>>> os.environ["AWS_SECRET_ACCESS_KEY"] = "" |
|
>>> os.environ["AWS_REGION"] = "" |
|
>>> llm_loadbalancer_model_list = [ |
|
... { |
|
... "model_name": "model-group-1", |
|
... "litellm_params": { |
|
... "model": "gpt-4o-mini", |
|
... "api_key": os.getenv("OPENAI_API_KEY"), |
|
... }, |
|
... }, |
|
... { |
|
... "model_name": "model-group-1", |
|
... "litellm_params": { |
|
... "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", |
|
... "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), |
|
... "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), |
|
... "aws_region_name": os.getenv("AWS_REGION"), |
|
... }, |
|
... }, |
|
... ]
|
>>> model = LiteLLMRouterModel( |
|
... model_id="model-group-1", |
|
... model_list=llm_loadbalancer_model_list, |
|
... client_kwargs={ |
|
... "routing_strategy":"simple-shuffle" |
|
... } |
|
... )
|
>>> agent = CodeAgent(tools=[WebSearchTool()], model=model) |
|
>>> agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") |
|
``` |
|
""" |
|
|
|
def __init__( |
|
self, |
|
model_id: str, |
|
model_list: list[dict[str, Any]], |
|
client_kwargs: dict[str, Any] | None = None, |
|
custom_role_conversions: dict[str, str] | None = None, |
|
flatten_messages_as_text: bool | None = None, |
|
**kwargs, |
|
): |
|
self.client_kwargs = { |
|
"model_list": model_list, |
|
**(client_kwargs or {}), |
|
} |
|
super().__init__( |
|
model_id=model_id, |
|
custom_role_conversions=custom_role_conversions, |
|
flatten_messages_as_text=flatten_messages_as_text, |
|
**kwargs, |
|
) |
|
|
|
def create_client(self): |
|
try: |
|
from litellm.router import Router |
|
except ModuleNotFoundError as e: |
|
raise ModuleNotFoundError( |
|
"Please install 'litellm' extra to use LiteLLMRouterModel: `pip install 'smolagents[litellm]'`" |
|
) from e |
|
return Router(**self.client_kwargs) |
|
|
|
|
|
class InferenceClientModel(ApiModel): |
|
"""A class to interact with Hugging Face's Inference Providers for language model interaction. |
|
|
|
This model allows you to communicate with Hugging Face's models using Inference Providers. It can be used in both serverless mode, with a dedicated endpoint, or even with a local URL, supporting features like stop sequences and grammar customization. |
|
|
|
Providers include Cerebras, Cohere, Fal, Fireworks, HF-Inference, Hyperbolic, Nebius, Novita, Replicate, SambaNova, Together, and more. |
|
|
|
Parameters: |
|
model_id (`str`, *optional*, default `"Qwen/Qwen2.5-Coder-32B-Instruct"`): |
|
The Hugging Face model ID to be used for inference. |
|
This can be a model identifier from the Hugging Face model hub or a URL to a deployed Inference Endpoint. |
|
Currently, it defaults to `"Qwen/Qwen2.5-Coder-32B-Instruct"`, but this may change in the future. |
|
provider (`str`, *optional*): |
|
Name of the provider to use for inference. A list of supported providers can be found in the [Inference Providers documentation](https://huggingface.co/docs/inference-providers/index#partners). |
|
Defaults to "auto" i.e. the first of the providers available for the model, sorted by the user's order [here](https://hf.co/settings/inference-providers). |
|
If `base_url` is passed, then `provider` is not used. |
|
token (`str`, *optional*): |
|
Token used by the Hugging Face API for authentication. This token needs to be authorized to 'Make calls to the serverless Inference Providers'.

If the model is gated (like Llama-3 models), the token also needs 'Read access to contents of all public gated repos you can access'.

If not provided, the class will try the 'HF_TOKEN' environment variable, and otherwise fall back to the token stored in the Hugging Face CLI configuration.
|
timeout (`int`, *optional*, defaults to 120): |
|
Timeout for the API request, in seconds. |
|
client_kwargs (`dict[str, Any]`, *optional*): |
|
Additional keyword arguments to pass to the Hugging Face InferenceClient. |
|
custom_role_conversions (`dict[str, str]`, *optional*): |
|
Custom role conversion mapping to convert message roles into others.
|
Useful for specific models that do not support specific message roles like "system". |
|
api_key (`str`, *optional*): |
|
Token to use for authentication. This is a duplicated argument from `token` to make [`InferenceClientModel`] |
|
follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None. |
|
bill_to (`str`, *optional*): |
|
The billing account to use for the requests. By default the requests are billed on the user's account. Requests can only be billed to |
|
an organization the user is a member of, and which has subscribed to Enterprise Hub. |
|
base_url (`str`, *optional*):
|
Base URL to run inference. This is a duplicated argument from `model` to make [`InferenceClientModel`] |
|
follow the same pattern as `openai.OpenAI` client. Cannot be used if `model` is set. Defaults to None. |
|
**kwargs: |
|
Additional keyword arguments forwarded to the parent class and used as default generation parameters (e.g. `max_tokens`).
|
|
|
Raises: |
|
ValueError: |
|
If both `token` and `api_key` are provided.
|
|
|
Example: |
|
```python |
|
>>> engine = InferenceClientModel( |
|
... model_id="Qwen/Qwen2.5-Coder-32B-Instruct", |
|
... provider="nebius", |
|
... token="your_hf_token_here", |
|
... max_tokens=5000, |
|
... ) |
|
>>> messages = [{"role": "user", "content": "Explain quantum mechanics in simple terms."}] |
|
>>> response = engine(messages, stop_sequences=["END"]) |
|
>>> print(response) |
|
"Quantum mechanics is the branch of physics that studies..." |
|
``` |
|
""" |
|
|
|
def __init__( |
|
self, |
|
model_id: str = "Qwen/Qwen2.5-Coder-32B-Instruct", |
|
provider: str | None = None, |
|
token: str | None = None, |
|
timeout: int = 120, |
|
client_kwargs: dict[str, Any] | None = None, |
|
custom_role_conversions: dict[str, str] | None = None, |
|
api_key: str | None = None, |
|
bill_to: str | None = None, |
|
base_url: str | None = None, |
|
**kwargs, |
|
): |
|
if token is not None and api_key is not None: |
|
raise ValueError( |
|
"Received both `token` and `api_key` arguments. Please provide only one of them." |
|
" `api_key` is an alias for `token` to make the API compatible with OpenAI's client." |
|
" It has the exact same behavior as `token`." |
|
) |
|
token = token if token is not None else api_key |
|
if token is None: |
|
token = os.getenv("HF_TOKEN") |
|
self.client_kwargs = { |
|
**(client_kwargs or {}), |
|
"model": model_id, |
|
"provider": provider, |
|
"token": token, |
|
"timeout": timeout, |
|
"bill_to": bill_to, |
|
"base_url": base_url, |
|
} |
|
super().__init__(model_id=model_id, custom_role_conversions=custom_role_conversions, **kwargs) |
|
|
|
def create_client(self): |
|
"""Create the Hugging Face client.""" |
|
from huggingface_hub import InferenceClient |
|
|
|
return InferenceClient(**self.client_kwargs) |
|
|
|
def generate( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
response_format: dict[str, str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
**kwargs, |
|
) -> ChatMessage: |
|
if response_format is not None and self.client_kwargs["provider"] not in STRUCTURED_GENERATION_PROVIDERS: |
|
raise ValueError(

"InferenceClientModel only supports structured outputs with these providers: "

+ ", ".join(STRUCTURED_GENERATION_PROVIDERS)

)
|
completion_kwargs = self._prepare_completion_kwargs( |
|
messages=messages, |
|
stop_sequences=stop_sequences, |
|
tools_to_call_from=tools_to_call_from,

response_format=response_format,

convert_images_to_image_urls=True,
|
custom_role_conversions=self.custom_role_conversions, |
|
**kwargs, |
|
) |
|
response = self.client.chat_completion(**completion_kwargs) |
|
|
|
self._last_input_token_count = response.usage.prompt_tokens |
|
self._last_output_token_count = response.usage.completion_tokens |
|
return ChatMessage.from_dict( |
|
asdict(response.choices[0].message), |
|
raw=response, |
|
token_usage=TokenUsage( |
|
input_tokens=response.usage.prompt_tokens, |
|
output_tokens=response.usage.completion_tokens, |
|
), |
|
) |
|
|
|
def generate_stream( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
response_format: dict[str, str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
**kwargs, |
|
) -> Generator[ChatMessageStreamDelta]: |
|
completion_kwargs = self._prepare_completion_kwargs( |
|
messages=messages, |
|
stop_sequences=stop_sequences, |
|
response_format=response_format, |
|
tools_to_call_from=tools_to_call_from, |
|
model=self.model_id, |
|
custom_role_conversions=self.custom_role_conversions, |
|
convert_images_to_image_urls=True, |
|
**kwargs, |
|
) |
|
for event in self.client.chat.completions.create( |
|
**completion_kwargs, stream=True, stream_options={"include_usage": True} |
|
): |
|
if getattr(event, "usage", None): |
|
self._last_input_token_count = event.usage.prompt_tokens |
|
self._last_output_token_count = event.usage.completion_tokens |
|
yield ChatMessageStreamDelta( |
|
content="", |
|
token_usage=TokenUsage( |
|
input_tokens=event.usage.prompt_tokens, |
|
output_tokens=event.usage.completion_tokens, |
|
), |
|
) |
|
if event.choices: |
|
choice = event.choices[0] |
|
if choice.delta: |
|
yield ChatMessageStreamDelta( |
|
content=choice.delta.content, |
|
tool_calls=[ |
|
ChatMessageToolCallStreamDelta( |
|
index=delta.index, |
|
id=delta.id, |
|
type=delta.type, |
|
function=delta.function, |
|
) |
|
for delta in choice.delta.tool_calls |
|
] |
|
if choice.delta.tool_calls |
|
else None, |
|
) |
|
else: |
|
if not getattr(choice, "finish_reason", None): |
|
raise ValueError(f"No content or tool calls in event: {event}") |
|
|
|
|
|
class OpenAIServerModel(ApiModel): |
|
"""This model connects to an OpenAI-compatible API server. |
|
|
|
Parameters: |
|
model_id (`str`): |
|
The model identifier to use on the server (e.g. "gpt-3.5-turbo"). |
|
api_base (`str`, *optional*): |
|
The base URL of the OpenAI-compatible API server. |
|
api_key (`str`, *optional*): |
|
The API key to use for authentication. |
|
organization (`str`, *optional*): |
|
The organization to use for the API request. |
|
project (`str`, *optional*): |
|
The project to use for the API request. |
|
client_kwargs (`dict[str, Any]`, *optional*): |
|
Additional keyword arguments to pass to the OpenAI client (like organization, project, max_retries etc.). |
|
custom_role_conversions (`dict[str, str]`, *optional*): |
|
Custom role conversion mapping to convert message roles into others.
|
Useful for specific models that do not support specific message roles like "system". |
|
flatten_messages_as_text (`bool`, default `False`): |
|
Whether to flatten messages as text. |
|
**kwargs: |
|
Additional keyword arguments to pass to the OpenAI API. |
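
Example (usage sketch; works with any OpenAI-compatible server, model ID and key are illustrative):

```python

>>> model = OpenAIServerModel(model_id="gpt-4o-mini", api_key="...")

>>> messages = [ChatMessage(role=MessageRole.USER, content=[{"type": "text", "text": "Hello!"}])]

>>> response = model(messages)

```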
|
""" |
|
|
|
def __init__( |
|
self, |
|
model_id: str, |
|
api_base: str | None = None, |
|
api_key: str | None = None, |
|
organization: str | None = None, |
|
project: str | None = None, |
|
client_kwargs: dict[str, Any] | None = None, |
|
custom_role_conversions: dict[str, str] | None = None, |
|
flatten_messages_as_text: bool = False, |
|
**kwargs, |
|
): |
|
self.client_kwargs = { |
|
**(client_kwargs or {}), |
|
"api_key": api_key, |
|
"base_url": api_base, |
|
"organization": organization, |
|
"project": project, |
|
} |
|
super().__init__( |
|
model_id=model_id, |
|
custom_role_conversions=custom_role_conversions, |
|
flatten_messages_as_text=flatten_messages_as_text, |
|
**kwargs, |
|
) |
|
|
|
def create_client(self): |
|
try: |
|
import openai |
|
except ModuleNotFoundError as e: |
|
raise ModuleNotFoundError( |
|
"Please install 'openai' extra to use OpenAIServerModel: `pip install 'smolagents[openai]'`" |
|
) from e |
|
|
|
return openai.OpenAI(**self.client_kwargs) |
|
|
|
def generate_stream( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
response_format: dict[str, str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
**kwargs, |
|
) -> Generator[ChatMessageStreamDelta]: |
|
completion_kwargs = self._prepare_completion_kwargs( |
|
messages=messages, |
|
stop_sequences=stop_sequences, |
|
response_format=response_format, |
|
tools_to_call_from=tools_to_call_from, |
|
model=self.model_id, |
|
custom_role_conversions=self.custom_role_conversions, |
|
convert_images_to_image_urls=True, |
|
**kwargs, |
|
) |
|
for event in self.client.chat.completions.create( |
|
**completion_kwargs, stream=True, stream_options={"include_usage": True} |
|
): |
|
if event.usage: |
|
self._last_input_token_count = event.usage.prompt_tokens |
|
self._last_output_token_count = event.usage.completion_tokens |
|
yield ChatMessageStreamDelta( |
|
content="", |
|
token_usage=TokenUsage( |
|
input_tokens=event.usage.prompt_tokens, |
|
output_tokens=event.usage.completion_tokens, |
|
), |
|
) |
|
if event.choices: |
|
choice = event.choices[0] |
|
if choice.delta: |
|
yield ChatMessageStreamDelta( |
|
content=choice.delta.content, |
|
tool_calls=[ |
|
ChatMessageToolCallStreamDelta( |
|
index=delta.index, |
|
id=delta.id, |
|
type=delta.type, |
|
function=delta.function, |
|
) |
|
for delta in choice.delta.tool_calls |
|
] |
|
if choice.delta.tool_calls |
|
else None, |
|
) |
|
else: |
|
if not getattr(choice, "finish_reason", None): |
|
raise ValueError(f"No content or tool calls in event: {event}") |
|
|
|
def generate( |
|
self, |
|
messages: list[ChatMessage], |
|
stop_sequences: list[str] | None = None, |
|
response_format: dict[str, str] | None = None, |
|
tools_to_call_from: list[Tool] | None = None, |
|
**kwargs, |
|
) -> ChatMessage: |
|
completion_kwargs = self._prepare_completion_kwargs( |
|
messages=messages, |
|
stop_sequences=stop_sequences, |
|
response_format=response_format, |
|
tools_to_call_from=tools_to_call_from, |
|
model=self.model_id, |
|
custom_role_conversions=self.custom_role_conversions, |
|
convert_images_to_image_urls=True, |
|
**kwargs, |
|
) |
|
response = self.client.chat.completions.create(**completion_kwargs) |
|
|
|
|
|
self._last_input_token_count = getattr(response.usage, "prompt_tokens", 0) |
|
self._last_output_token_count = getattr(response.usage, "completion_tokens", 0) |
|
return ChatMessage.from_dict( |
|
response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}), |
|
raw=response, |
|
token_usage=TokenUsage( |
|
input_tokens=response.usage.prompt_tokens, |
|
output_tokens=response.usage.completion_tokens, |
|
), |
|
) |
|
|
|
|
|
OpenAIModel = OpenAIServerModel |
|
|
|
|
|
class AzureOpenAIServerModel(OpenAIServerModel):
    """This model connects to an Azure OpenAI deployment.

    Parameters:
        model_id (`str`):
            The model deployment name to use when connecting (e.g. "gpt-4o-mini").
        azure_endpoint (`str`, *optional*):
            The Azure endpoint, including the resource, e.g. `https://example-resource.azure.openai.com/`. If not provided, it will be inferred from the `AZURE_OPENAI_ENDPOINT` environment variable.
        api_key (`str`, *optional*):
            The API key to use for authentication. If not provided, it will be inferred from the `AZURE_OPENAI_API_KEY` environment variable.
        api_version (`str`, *optional*):
            The API version to use. If not provided, it will be inferred from the `OPENAI_API_VERSION` environment variable.
        client_kwargs (`dict[str, Any]`, *optional*):
            Additional keyword arguments to pass to the AzureOpenAI client (like organization, project, max_retries, etc.).
        custom_role_conversions (`dict[str, str]`, *optional*):
            Custom role conversion mapping to convert message roles into others.
            Useful for specific models that do not support certain message roles like "system".
        **kwargs:
            Additional keyword arguments to pass to the Azure OpenAI API.
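
    Example (illustrative; the deployment name, endpoint, and API version below are placeholders,
        and the API key is read from `AZURE_OPENAI_API_KEY` when not passed explicitly):
        >>> model = AzureOpenAIServerModel(
        ...     model_id="my-gpt-4o-mini-deployment",
        ...     azure_endpoint="https://example-resource.azure.openai.com/",
        ...     api_version="2024-06-01",
        ... )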
    """

    def __init__(
        self,
        model_id: str,
        azure_endpoint: str | None = None,
        api_key: str | None = None,
        api_version: str | None = None,
        client_kwargs: dict[str, Any] | None = None,
        custom_role_conversions: dict[str, str] | None = None,
        **kwargs,
    ):
        client_kwargs = client_kwargs or {}
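        # `api_version` and `azure_endpoint` may be None here; the AzureOpenAI client then falls back
        # to the OPENAI_API_VERSION and AZURE_OPENAI_ENDPOINT environment variables (see docstring above).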
        client_kwargs.update(
            {
                "api_version": api_version,
                "azure_endpoint": azure_endpoint,
            }
        )
        super().__init__(
            model_id=model_id,
            api_key=api_key,
            client_kwargs=client_kwargs,
            custom_role_conversions=custom_role_conversions,
            **kwargs,
        )

    def create_client(self):
        try:
            import openai
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError(
                "Please install 'openai' extra to use AzureOpenAIServerModel: `pip install 'smolagents[openai]'`"
            ) from e

        return openai.AzureOpenAI(**self.client_kwargs)


AzureOpenAIModel = AzureOpenAIServerModel


class AmazonBedrockServerModel(ApiModel):
    """
    A model class for interacting with Amazon Bedrock Server models through the Bedrock API.

    This class provides an interface to interact with various Bedrock language models,
    allowing for customized model inference, guardrail configuration, message handling,
    and other parameters allowed by the boto3 API.

    Parameters:
        model_id (`str`):
            The model identifier to use on Bedrock (e.g. "us.amazon.nova-pro-v1:0").
        client (`boto3.client`, *optional*):
            A custom boto3 client for AWS interactions. If not provided, a default client will be created.
        client_kwargs (`dict[str, Any]`, *optional*):
            Keyword arguments used to configure the boto3 client if it needs to be created internally.
            Examples include `region_name`, `config`, or `endpoint_url`.
        custom_role_conversions (`dict[str, str]`, *optional*):
            Custom role conversion mapping to convert message roles into others.
            Useful for specific models that do not support certain message roles like "system".
            Defaults to converting all roles to the "user" role to enable using all Bedrock models.
        flatten_messages_as_text (`bool`, default `False`):
            Whether to flatten messages as text.
        **kwargs:
            Additional keyword arguments passed directly to the underlying API calls.

    Example:
        Creating a model instance with default settings:
        >>> bedrock_model = AmazonBedrockServerModel(
        ...     model_id='us.amazon.nova-pro-v1:0'
        ... )

        Creating a model instance with a custom boto3 client:
        >>> import boto3
        >>> client = boto3.client('bedrock-runtime', region_name='us-west-2')
        >>> bedrock_model = AmazonBedrockServerModel(
        ...     model_id='us.amazon.nova-pro-v1:0',
        ...     client=client
        ... )

        Creating a model instance with client_kwargs for internal client creation:
        >>> bedrock_model = AmazonBedrockServerModel(
        ...     model_id='us.amazon.nova-pro-v1:0',
        ...     client_kwargs={'region_name': 'us-west-2', 'endpoint_url': 'https://custom-endpoint.com'}
        ... )

        Creating a model instance with inference and guardrail configurations:
        >>> additional_api_config = {
        ...     "inferenceConfig": {
        ...         "maxTokens": 3000
        ...     },
        ...     "guardrailConfig": {
        ...         "guardrailIdentifier": "identify1",
        ...         "guardrailVersion": 'v1'
        ...     },
        ... }
        >>> bedrock_model = AmazonBedrockServerModel(
        ...     model_id='anthropic.claude-3-haiku-20240307-v1:0',
        ...     **additional_api_config
        ... )
    """

    def __init__(
        self,
        model_id: str,
        client=None,
        client_kwargs: dict[str, Any] | None = None,
        custom_role_conversions: dict[str, str] | None = None,
        **kwargs,
    ):
        self.client_kwargs = client_kwargs or {}
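
        # Bedrock's Converse API only accepts "user" and "assistant" roles, and many Bedrock models
        # reject conversations that start with an assistant turn, so by default every role is mapped
        # to "user" unless the caller supplies its own conversion table.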
        custom_role_conversions = custom_role_conversions or {
            MessageRole.SYSTEM: MessageRole.USER,
            MessageRole.ASSISTANT: MessageRole.USER,
            MessageRole.TOOL_CALL: MessageRole.USER,
            MessageRole.TOOL_RESPONSE: MessageRole.USER,
        }

        super().__init__(
            model_id=model_id,
            custom_role_conversions=custom_role_conversions,
            flatten_messages_as_text=False,  # Bedrock expects a list of structured content blocks, not flattened text
            client=client,
            **kwargs,
        )

    def _prepare_completion_kwargs(
        self,
        messages: list[ChatMessage],
        stop_sequences: list[str] | None = None,
        response_format: dict[str, str] | None = None,
        tools_to_call_from: list[Tool] | None = None,
        custom_role_conversions: dict[str, str] | None = None,
        convert_images_to_image_urls: bool = False,
        tool_choice: str | dict[Any, Any] | None = None,
        **kwargs,
    ) -> dict:
        """
        Overrides the base method to handle Bedrock-specific configurations.

        This implementation adapts the completion keyword arguments to align with
        Bedrock's requirements, ensuring compatibility with its request format and
        constraints.
        """
        completion_kwargs = super()._prepare_completion_kwargs(
            messages=messages,
            stop_sequences=None,  # Bedrock handles stop sequences via inferenceConfig, so they are not forwarded here
            tools_to_call_from=tools_to_call_from,
            custom_role_conversions=custom_role_conversions,
            convert_images_to_image_urls=convert_images_to_image_urls,
            **kwargs,
        )
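
        # Not every Bedrock model supports `toolConfig`, and the agent already injects the available
        # tools into the prompt, so the key is dropped to avoid conflicts.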
        completion_kwargs.pop("toolConfig", None)
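
        # The Converse API does not accept a "type" key inside content blocks, so strip it
        # from every message before sending the request.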
        for message in completion_kwargs.get("messages", []):
            for content in message.get("content", []):
                if "type" in content:
                    del content["type"]

        return {
            "modelId": self.model_id,
            **completion_kwargs,
        }

    def create_client(self):
        try:
            import boto3
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError(
                "Please install 'bedrock' extra to use AmazonBedrockServerModel: `pip install 'smolagents[bedrock]'`"
            ) from e

        return boto3.client("bedrock-runtime", **self.client_kwargs)

    def generate(
        self,
        messages: list[ChatMessage],
        stop_sequences: list[str] | None = None,
        response_format: dict[str, str] | None = None,
        tools_to_call_from: list[Tool] | None = None,
        **kwargs,
    ) -> ChatMessage:
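        """Call the Bedrock Converse API and return its first text block as a `ChatMessage`."""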
        if response_format is not None:
            raise ValueError("Amazon Bedrock does not support response_format")
        completion_kwargs: dict = self._prepare_completion_kwargs(
            messages=messages,
            tools_to_call_from=tools_to_call_from,
            custom_role_conversions=self.custom_role_conversions,
            convert_images_to_image_urls=True,
            **kwargs,
        )
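
        # Send the request through the Bedrock Converse API.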
        response = self.client.converse(**completion_kwargs)
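
        # Bedrock returns the assistant message as a list of content blocks; keep only the text of the
        # first block so the result uses the plain-string `content` form of `ChatMessage`.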
        response["output"]["message"]["content"] = response["output"]["message"]["content"][0]["text"]

        self._last_input_token_count = response["usage"]["inputTokens"]
        self._last_output_token_count = response["usage"]["outputTokens"]
        return ChatMessage.from_dict(
            response["output"]["message"],
            raw=response,
            token_usage=TokenUsage(
                input_tokens=response["usage"]["inputTokens"],
                output_tokens=response["usage"]["outputTokens"],
            ),
        )


AmazonBedrockModel = AmazonBedrockServerModel

__all__ = [
    "MessageRole",
    "tool_role_conversions",
    "get_clean_message_list",
    "Model",
    "MLXModel",
    "TransformersModel",
    "ApiModel",
    "InferenceClientModel",
    "LiteLLMModel",
    "LiteLLMRouterModel",
    "OpenAIServerModel",
    "OpenAIModel",
    "VLLMModel",
    "AzureOpenAIServerModel",
    "AzureOpenAIModel",
    "AmazonBedrockServerModel",
    "AmazonBedrockModel",
    "ChatMessage",
]