# Copyright 2024 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import json import logging import os import re import uuid import warnings from collections.abc import Generator from copy import deepcopy from dataclasses import asdict, dataclass from enum import Enum from threading import Thread from typing import TYPE_CHECKING, Any from .monitoring import TokenUsage from .tools import Tool from .utils import _is_package_available, encode_image_base64, make_image_url, parse_json_blob if TYPE_CHECKING: from transformers import StoppingCriteriaList logger = logging.getLogger(__name__) STRUCTURED_GENERATION_PROVIDERS = ["cerebras", "fireworks-ai"] CODEAGENT_RESPONSE_FORMAT = { "type": "json_schema", "json_schema": { "schema": { "additionalProperties": False, "properties": { "thought": { "description": "A free form text description of the thought process.", "title": "Thought", "type": "string", }, "code": { "description": "Valid Python code snippet implementing the thought.", "title": "Code", "type": "string", }, }, "required": ["thought", "code"], "title": "ThoughtAndCodeAnswer", "type": "object", }, "name": "ThoughtAndCodeAnswer", "strict": True, }, } def get_dict_from_nested_dataclasses(obj, ignore_key=None): def convert(obj): if hasattr(obj, "__dataclass_fields__"): return {k: convert(v) for k, v in asdict(obj).items() if k != ignore_key} return obj return convert(obj) @dataclass class ChatMessageToolCallFunction: arguments: Any name: str description: str | None = None @dataclass class ChatMessageToolCall: function: ChatMessageToolCallFunction id: str type: str def __str__(self) -> str: return f"Call: {self.id}: Calling {str(self.function.name)} with arguments: {str(self.function.arguments)}" class MessageRole(str, Enum): USER = "user" ASSISTANT = "assistant" SYSTEM = "system" TOOL_CALL = "tool-call" TOOL_RESPONSE = "tool-response" @classmethod def roles(cls): return [r.value for r in cls] @dataclass class ChatMessage: role: MessageRole content: str | list[dict[str, Any]] | None = None tool_calls: list[ChatMessageToolCall] | None = None raw: Any | None = None # Stores the raw output from the API token_usage: TokenUsage | None = None def model_dump_json(self): return json.dumps(get_dict_from_nested_dataclasses(self, ignore_key="raw")) @classmethod def from_dict(cls, data: dict, raw: Any | None = None, token_usage: TokenUsage | None = None) -> "ChatMessage": if data.get("tool_calls"): tool_calls = [ ChatMessageToolCall( function=ChatMessageToolCallFunction(**tc["function"]), id=tc["id"], type=tc["type"] ) for tc in data["tool_calls"] ] data["tool_calls"] = tool_calls return cls( role=data["role"], content=data.get("content"), tool_calls=data.get("tool_calls"), raw=raw, token_usage=token_usage, ) def dict(self): return get_dict_from_nested_dataclasses(self) def render_as_markdown(self) -> str: rendered = str(self.content) or "" if self.tool_calls: rendered += "\n".join( [ json.dumps({"tool": tool.function.name, "arguments": tool.function.arguments}) for tool in 
self.tool_calls
                ]
            )
        return rendered


def parse_json_if_needed(arguments: str | dict) -> str | dict:
    if isinstance(arguments, dict):
        return arguments
    else:
        try:
            return json.loads(arguments)
        except Exception:
            return arguments


@dataclass
class ChatMessageToolCallStreamDelta:
    """Represents a streaming delta for tool calls during generation."""

    index: int | None = None
    id: str | None = None
    type: str | None = None
    function: ChatMessageToolCallFunction | None = None


@dataclass
class ChatMessageStreamDelta:
    content: str | None = None
    tool_calls: list[ChatMessageToolCallStreamDelta] | None = None
    token_usage: TokenUsage | None = None


def agglomerate_stream_deltas(
    stream_deltas: list[ChatMessageStreamDelta], role: MessageRole = MessageRole.ASSISTANT
) -> ChatMessage:
    """
    Agglomerate a list of stream deltas into a single chat message.
    """
    accumulated_tool_calls: dict[int, ChatMessageToolCallStreamDelta] = {}
    accumulated_content = ""
    total_input_tokens = 0
    total_output_tokens = 0
    for stream_delta in stream_deltas:
        if stream_delta.token_usage:
            total_input_tokens += stream_delta.token_usage.input_tokens
            total_output_tokens += stream_delta.token_usage.output_tokens
        if stream_delta.content:
            accumulated_content += stream_delta.content
        if stream_delta.tool_calls:
            for tool_call_delta in stream_delta.tool_calls:  # Normally there should be only one call at a time
                # Create an entry in accumulated_tool_calls for this tool call index if needed
                if tool_call_delta.index is not None:
                    if tool_call_delta.index not in accumulated_tool_calls:
                        accumulated_tool_calls[tool_call_delta.index] = ChatMessageToolCallStreamDelta(
                            id=tool_call_delta.id,
                            type=tool_call_delta.type,
                            function=ChatMessageToolCallFunction(name="", arguments=""),
                        )
                    # Update the tool call at the specific index
                    tool_call = accumulated_tool_calls[tool_call_delta.index]
                    if tool_call_delta.id:
                        tool_call.id = tool_call_delta.id
                    if tool_call_delta.type:
                        tool_call.type = tool_call_delta.type
                    if tool_call_delta.function:
                        if tool_call_delta.function.name and len(tool_call_delta.function.name) > 0:
                            tool_call.function.name = tool_call_delta.function.name
                        if tool_call_delta.function.arguments:
                            tool_call.function.arguments += tool_call_delta.function.arguments
                else:
                    raise ValueError(f"Tool call index is not provided in tool delta: {tool_call_delta}")

    return ChatMessage(
        role=role,
        content=accumulated_content,
        tool_calls=[
            ChatMessageToolCall(
                function=ChatMessageToolCallFunction(
                    name=tool_call_stream_delta.function.name,
                    arguments=tool_call_stream_delta.function.arguments,
                ),
                id=tool_call_stream_delta.id or "",
                type="function",
            )
            for tool_call_stream_delta in accumulated_tool_calls.values()
            if tool_call_stream_delta.function
        ],
        token_usage=TokenUsage(
            input_tokens=total_input_tokens,
            output_tokens=total_output_tokens,
        ),
    )


tool_role_conversions = {
    MessageRole.TOOL_CALL: MessageRole.ASSISTANT,
    MessageRole.TOOL_RESPONSE: MessageRole.USER,
}


def get_tool_json_schema(tool: Tool) -> dict:
    properties = deepcopy(tool.inputs)
    required = []
    for key, value in properties.items():
        if value["type"] == "any":
            value["type"] = "string"
        if not ("nullable" in value and value["nullable"]):
            required.append(key)
    return {
        "type": "function",
        "function": {
            "name": tool.name,
            "description": tool.description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


def remove_stop_sequences(content: str, stop_sequences: list[str]) -> str:
    for stop_seq in stop_sequences:
        if content[-len(stop_seq) :] == stop_seq:
            content = content[: -len(stop_seq)]
    return content


def get_clean_message_list(
    message_list: list[ChatMessage],
    role_conversions: dict[MessageRole, MessageRole] | dict[str, str] = {},
    convert_images_to_image_urls: bool = False,
    flatten_messages_as_text: bool = False,
) -> list[dict[str, Any]]:
    """
    Creates a list of messages to give as input to the LLM. These messages are dictionaries compatible with the
    chat template format used by Transformers LLMs. Subsequent messages with the same role will be concatenated
    into a single message.

    Args:
        message_list (`list[ChatMessage]`): List of chat messages.
        role_conversions (`dict[MessageRole, MessageRole]`, *optional*): Mapping to convert roles.
        convert_images_to_image_urls (`bool`, default `False`): Whether to convert images to image URLs.
        flatten_messages_as_text (`bool`, default `False`): Whether to flatten messages as text.
    """
    output_message_list: list[dict[str, Any]] = []
    message_list = deepcopy(message_list)  # Avoid modifying the original list
    for message in message_list:
        role = message.role
        if role not in MessageRole.roles():
            raise ValueError(f"Incorrect role {role}, only {MessageRole.roles()} are supported for now.")
        if role in role_conversions:
            message.role = role_conversions[role]  # type: ignore
        # encode images if needed
        if isinstance(message.content, list):
            for element in message.content:
                assert isinstance(element, dict), "Error: this element should be a dict:" + str(element)
                if element["type"] == "image":
                    assert not flatten_messages_as_text, f"Cannot use images with {flatten_messages_as_text=}"
                    if convert_images_to_image_urls:
                        element.update(
                            {
                                "type": "image_url",
                                "image_url": {"url": make_image_url(encode_image_base64(element.pop("image")))},
                            }
                        )
                    else:
                        element["image"] = encode_image_base64(element["image"])

        if len(output_message_list) > 0 and message.role == output_message_list[-1]["role"]:
            assert isinstance(message.content, list), "Error: wrong content:" + str(message.content)
            if flatten_messages_as_text:
                output_message_list[-1]["content"] += "\n" + message.content[0]["text"]
            else:
                for el in message.content:
                    if el["type"] == "text" and output_message_list[-1]["content"][-1]["type"] == "text":
                        # Merge consecutive text messages rather than creating new ones
                        output_message_list[-1]["content"][-1]["text"] += "\n" + el["text"]
                    else:
                        output_message_list[-1]["content"].append(el)
        else:
            if flatten_messages_as_text:
                content = message.content[0]["text"]
            else:
                content = message.content
            output_message_list.append(
                {
                    "role": message.role,
                    "content": content,
                }
            )
    return output_message_list


def get_tool_call_from_text(text: str, tool_name_key: str, tool_arguments_key: str) -> ChatMessageToolCall:
    tool_call_dictionary, _ = parse_json_blob(text)
    try:
        tool_name = tool_call_dictionary[tool_name_key]
    except Exception as e:
        raise ValueError(
            f"Key {tool_name_key=} not found in the generated tool call. Got keys: {list(tool_call_dictionary.keys())} instead"
        ) from e
    tool_arguments = tool_call_dictionary.get(tool_arguments_key, None)
    if isinstance(tool_arguments, str):
        tool_arguments = parse_json_if_needed(tool_arguments)
    return ChatMessageToolCall(
        id=str(uuid.uuid4()),
        type="function",
        function=ChatMessageToolCallFunction(name=tool_name, arguments=tool_arguments),
    )


def supports_stop_parameter(model_id: str) -> bool:
    """
    Check if the model supports the `stop` parameter.

    Not supported with reasoning models openai/o3 and openai/o4-mini (and their versioned variants).

    Args:
        model_id (`str`): Model identifier (e.g.
"openai/o3", "o4-mini-2025-04-16") Returns: bool: True if the model supports the stop parameter, False otherwise """ model_name = model_id.split("/")[-1] # o3 and o4-mini (including versioned variants, o3-2025-04-16) don't support stop parameter pattern = r"^(o3[-\d]*|o4-mini[-\d]*)$" return not re.match(pattern, model_name) class Model: def __init__( self, flatten_messages_as_text: bool = False, tool_name_key: str = "name", tool_arguments_key: str = "arguments", model_id: str | None = None, **kwargs, ): self.flatten_messages_as_text = flatten_messages_as_text self.tool_name_key = tool_name_key self.tool_arguments_key = tool_arguments_key self.kwargs = kwargs self._last_input_token_count: int | None = None self._last_output_token_count: int | None = None self.model_id: str | None = model_id @property def last_input_token_count(self) -> int | None: warnings.warn( "Attribute last_input_token_count is deprecated and will be removed in version 1.20. " "Please use TokenUsage.input_tokens instead.", FutureWarning, ) return self._last_input_token_count @property def last_output_token_count(self) -> int | None: warnings.warn( "Attribute last_output_token_count is deprecated and will be removed in version 1.20. " "Please use TokenUsage.output_tokens instead.", FutureWarning, ) return self._last_output_token_count def _prepare_completion_kwargs( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, custom_role_conversions: dict[str, str] | None = None, convert_images_to_image_urls: bool = False, tool_choice: str | dict | None = "required", # Configurable tool_choice parameter **kwargs, ) -> dict[str, Any]: """ Prepare parameters required for model invocation, handling parameter priorities. Parameter priority from high to low: 1. Explicitly passed kwargs 2. Specific parameters (stop_sequences, response_format, etc.) 3. Default values in self.kwargs """ # Clean and standardize the message list flatten_messages_as_text = kwargs.pop("flatten_messages_as_text", self.flatten_messages_as_text) messages_as_dicts = get_clean_message_list( messages, role_conversions=custom_role_conversions or tool_role_conversions, convert_images_to_image_urls=convert_images_to_image_urls, flatten_messages_as_text=flatten_messages_as_text, ) # Use self.kwargs as the base configuration completion_kwargs = { **self.kwargs, "messages": messages_as_dicts, } # Handle specific parameters if stop_sequences is not None: # Some models do not support stop parameter if supports_stop_parameter(self.model_id or ""): completion_kwargs["stop"] = stop_sequences if response_format is not None: completion_kwargs["response_format"] = response_format # Handle tools parameter if tools_to_call_from: tools_config = { "tools": [get_tool_json_schema(tool) for tool in tools_to_call_from], } if tool_choice is not None: tools_config["tool_choice"] = tool_choice completion_kwargs.update(tools_config) # Finally, use the passed-in kwargs to override all settings completion_kwargs.update(kwargs) return completion_kwargs def generate( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> ChatMessage: """Process the input messages and return the model's response. Parameters: messages (`list[dict[str, str | list[dict]]] | list[ChatMessage]`): A list of message dictionaries to be processed. 
Each dictionary should have the structure `{"role": "user/system", "content": "message content"}`. stop_sequences (`List[str]`, *optional*): A list of strings that will stop the generation if encountered in the model's output. response_format (`dict[str, str]`, *optional*): The response format to use in the model's response. tools_to_call_from (`List[Tool]`, *optional*): A list of tools that the model can use to generate responses. **kwargs: Additional keyword arguments to be passed to the underlying model. Returns: `ChatMessage`: A chat message object containing the model's response. """ raise NotImplementedError("This method must be implemented in child classes") def __call__(self, *args, **kwargs): return self.generate(*args, **kwargs) def parse_tool_calls(self, message: ChatMessage) -> ChatMessage: """Sometimes APIs do not return the tool call as a specific object, so we need to parse it.""" message.role = MessageRole.ASSISTANT # Overwrite role if needed if not message.tool_calls: assert message.content is not None, "Message contains no content and no tool calls" message.tool_calls = [ get_tool_call_from_text(message.content, self.tool_name_key, self.tool_arguments_key) ] assert len(message.tool_calls) > 0, "No tool call was found in the model output" for tool_call in message.tool_calls: tool_call.function.arguments = parse_json_if_needed(tool_call.function.arguments) return message def to_dict(self) -> dict: """ Converts the model into a JSON-compatible dictionary. """ model_dictionary = { **self.kwargs, "model_id": self.model_id, } for attribute in [ "custom_role_conversion", "temperature", "max_tokens", "provider", "timeout", "api_base", "torch_dtype", "device_map", "organization", "project", "azure_endpoint", ]: if hasattr(self, attribute): model_dictionary[attribute] = getattr(self, attribute) dangerous_attributes = ["token", "api_key"] for attribute_name in dangerous_attributes: if hasattr(self, attribute_name): print( f"For security reasons, we do not export the `{attribute_name}` attribute of your model. Please export it manually." ) return model_dictionary @classmethod def from_dict(cls, model_dictionary: dict[str, Any]) -> "Model": return cls(**{k: v for k, v in model_dictionary.items()}) class VLLMModel(Model): """Model to use [vLLM](https://docs.vllm.ai/) for fast LLM inference and serving. Parameters: model_id (`str`): The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. model_kwargs (`dict[str, Any]`, *optional*): Additional keyword arguments to pass to the vLLM model (like revision, max_model_len, etc.). """ def __init__( self, model_id, model_kwargs: dict[str, Any] | None = None, **kwargs, ): if not _is_package_available("vllm"): raise ModuleNotFoundError("Please install 'vllm' extra to use VLLMModel: `pip install 'smolagents[vllm]'`") from vllm import LLM # type: ignore from vllm.transformers_utils.tokenizer import get_tokenizer # type: ignore self.model_kwargs = model_kwargs or {} super().__init__(**kwargs) self.model_id = model_id self.model = LLM(model=model_id, **self.model_kwargs) assert self.model is not None self.tokenizer = get_tokenizer(model_id) self._is_vlm = False # VLLMModel does not support vision models yet. 
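    # Usage sketch for VLLMModel (illustrative only: the model ID, stop sequence, and sampling
    # values below are placeholders, and running it requires a local GPU setup able to load the
    # chosen model):
    #
    #     model = VLLMModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")
    #     message = ChatMessage(role=MessageRole.USER, content=[{"type": "text", "text": "Hello!"}])
    #     response = model.generate([message], stop_sequences=["<end_code>"], max_tokens=256)
    #     print(response.content)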
def cleanup(self): import gc import torch from vllm.distributed.parallel_state import ( # type: ignore destroy_distributed_environment, destroy_model_parallel, ) destroy_model_parallel() if self.model is not None: # taken from https://github.com/vllm-project/vllm/issues/1908#issuecomment-2076870351 del self.model.llm_engine.model_executor.driver_worker gc.collect() destroy_distributed_environment() torch.cuda.empty_cache() def generate( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> ChatMessage: from vllm import SamplingParams # type: ignore completion_kwargs = self._prepare_completion_kwargs( messages=messages, flatten_messages_as_text=(not self._is_vlm), stop_sequences=stop_sequences, tools_to_call_from=tools_to_call_from, **kwargs, ) # Override the OpenAI schema for VLLM compatibility guided_options_request = {"guided_json": response_format["json_schema"]["schema"]} if response_format else None messages = completion_kwargs.pop("messages") prepared_stop_sequences = completion_kwargs.pop("stop", []) tools = completion_kwargs.pop("tools", None) completion_kwargs.pop("tool_choice", None) prompt = self.tokenizer.apply_chat_template( messages, tools=tools, add_generation_prompt=True, tokenize=False, ) sampling_params = SamplingParams( n=kwargs.get("n", 1), temperature=kwargs.get("temperature", 0.0), max_tokens=kwargs.get("max_tokens", 2048), stop=prepared_stop_sequences, ) out = self.model.generate( prompt, sampling_params=sampling_params, guided_options_request=guided_options_request, ) output_text = out[0].outputs[0].text self._last_input_token_count = len(out[0].prompt_token_ids) self._last_output_token_count = len(out[0].outputs[0].token_ids) return ChatMessage( role=MessageRole.ASSISTANT, content=output_text, raw={"out": output_text, "completion_kwargs": completion_kwargs}, token_usage=TokenUsage( input_tokens=len(out[0].prompt_token_ids), output_tokens=len(out[0].outputs[0].token_ids), ), ) class MLXModel(Model): """A class to interact with models loaded using MLX on Apple silicon. > [!TIP] > You must have `mlx-lm` installed on your machine. Please run `pip install smolagents[mlx-lm]` if it's not the case. Parameters: model_id (str): The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. tool_name_key (str): The key, which can usually be found in the model's chat template, for retrieving a tool name. tool_arguments_key (str): The key, which can usually be found in the model's chat template, for retrieving tool arguments. trust_remote_code (bool, default `False`): Some models on the Hub require running remote code: for this model, you would have to set this flag to True. load_kwargs (dict[str, Any], *optional*): Additional keyword arguments to pass to the `mlx.lm.load` method when loading the model and tokenizer. apply_chat_template_kwargs (dict, *optional*): Additional keyword arguments to pass to the `apply_chat_template` method of the tokenizer. kwargs (dict, *optional*): Any additional keyword arguments that you want to use in model.generate(), for instance `max_tokens`. Example: ```python >>> engine = MLXModel( ... model_id="mlx-community/Qwen2.5-Coder-32B-Instruct-4bit", ... max_tokens=10000, ... ) >>> messages = [ ... { ... "role": "user", ... "content": "Explain quantum mechanics in simple terms." ... } ... 
] >>> response = engine(messages, stop_sequences=["END"]) >>> print(response) "Quantum mechanics is the branch of physics that studies..." ``` """ def __init__( self, model_id: str, trust_remote_code: bool = False, load_kwargs: dict[str, Any] | None = None, apply_chat_template_kwargs: dict[str, Any] | None = None, **kwargs, ): if not _is_package_available("mlx_lm"): raise ModuleNotFoundError( "Please install 'mlx-lm' extra to use 'MLXModel': `pip install 'smolagents[mlx-lm]'`" ) import mlx_lm self.load_kwargs = load_kwargs or {} self.load_kwargs.setdefault("tokenizer_config", {}).setdefault("trust_remote_code", trust_remote_code) self.apply_chat_template_kwargs = apply_chat_template_kwargs or {} self.apply_chat_template_kwargs.setdefault("add_generation_prompt", True) # mlx-lm doesn't support vision models: flatten_messages_as_text=True super().__init__(model_id=model_id, flatten_messages_as_text=True, **kwargs) self.model, self.tokenizer = mlx_lm.load(self.model_id, **self.load_kwargs) self.stream_generate = mlx_lm.stream_generate self.is_vlm = False # mlx-lm doesn't support vision models def generate( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> ChatMessage: if response_format is not None: raise ValueError("MLX does not support structured outputs.") completion_kwargs = self._prepare_completion_kwargs( messages=messages, stop_sequences=stop_sequences, tools_to_call_from=tools_to_call_from, **kwargs, ) messages = completion_kwargs.pop("messages") stops = completion_kwargs.pop("stop", []) tools = completion_kwargs.pop("tools", None) completion_kwargs.pop("tool_choice", None) prompt_ids = self.tokenizer.apply_chat_template(messages, tools=tools, **self.apply_chat_template_kwargs) output_tokens = 0 text = "" for response in self.stream_generate(self.model, self.tokenizer, prompt=prompt_ids, **completion_kwargs): output_tokens += 1 text += response.text if any((stop_index := text.rfind(stop)) != -1 for stop in stops): text = text[:stop_index] break self._last_input_token_count = len(prompt_ids) self._last_output_token_count = output_tokens return ChatMessage( role=MessageRole.ASSISTANT, content=text, raw={"out": text, "completion_kwargs": completion_kwargs}, token_usage=TokenUsage( input_tokens=len(prompt_ids), output_tokens=output_tokens, ), ) class TransformersModel(Model): """A class that uses Hugging Face's Transformers library for language model interaction. This model allows you to load and use Hugging Face's models locally using the Transformers library. It supports features like stop sequences and grammar customization. > [!TIP] > You must have `transformers` and `torch` installed on your machine. Please run `pip install smolagents[transformers]` if it's not the case. Parameters: model_id (`str`): The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. For example, `"Qwen/Qwen2.5-Coder-32B-Instruct"`. device_map (`str`, *optional*): The device_map to initialize your model with. torch_dtype (`str`, *optional*): The torch_dtype to initialize your model with. trust_remote_code (bool, default `False`): Some models on the Hub require running remote code: for this model, you would have to set this flag to True. kwargs (dict, *optional*): Any additional keyword arguments that you want to use in model.generate(), for instance `max_new_tokens` or `device`. 
**kwargs: Additional keyword arguments to pass to `model.generate()`, for instance `max_new_tokens` or `device`. Raises: ValueError: If the model name is not provided. Example: ```python >>> engine = TransformersModel( ... model_id="Qwen/Qwen2.5-Coder-32B-Instruct", ... device="cuda", ... max_new_tokens=5000, ... ) >>> messages = [{"role": "user", "content": "Explain quantum mechanics in simple terms."}] >>> response = engine(messages, stop_sequences=["END"]) >>> print(response) "Quantum mechanics is the branch of physics that studies..." ``` """ def __init__( self, model_id: str | None = None, device_map: str | None = None, torch_dtype: str | None = None, trust_remote_code: bool = False, **kwargs, ): try: import torch from transformers import ( AutoModelForCausalLM, AutoModelForImageTextToText, AutoProcessor, AutoTokenizer, TextIteratorStreamer, ) except ModuleNotFoundError: raise ModuleNotFoundError( "Please install 'transformers' extra to use 'TransformersModel': `pip install 'smolagents[transformers]'`" ) if not model_id: warnings.warn( "The 'model_id' parameter will be required in version 2.0.0. " "Please update your code to pass this parameter to avoid future errors. " "For now, it defaults to 'HuggingFaceTB/SmolLM2-1.7B-Instruct'.", FutureWarning, ) model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct" default_max_tokens = 4096 max_new_tokens = kwargs.get("max_new_tokens") or kwargs.get("max_tokens") if not max_new_tokens: kwargs["max_new_tokens"] = default_max_tokens logger.warning( f"`max_new_tokens` not provided, using this default value for `max_new_tokens`: {default_max_tokens}" ) if device_map is None: device_map = "cuda" if torch.cuda.is_available() else "cpu" logger.info(f"Using device: {device_map}") self._is_vlm = False try: self.model = AutoModelForImageTextToText.from_pretrained( model_id, device_map=device_map, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, ) self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code) self._is_vlm = True self.streamer = TextIteratorStreamer(self.processor.tokenizer, skip_prompt=True, skip_special_tokens=True) # type: ignore except ValueError as e: if "Unrecognized configuration class" in str(e): self.model = AutoModelForCausalLM.from_pretrained( model_id, device_map=device_map, torch_dtype=torch_dtype, trust_remote_code=trust_remote_code, ) self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code) self.streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True) # type: ignore else: raise e except Exception as e: raise ValueError(f"Failed to load tokenizer and model for {model_id=}: {e}") from e super().__init__(flatten_messages_as_text=not self._is_vlm, model_id=model_id, **kwargs) def make_stopping_criteria(self, stop_sequences: list[str], tokenizer) -> "StoppingCriteriaList": from transformers import StoppingCriteria, StoppingCriteriaList class StopOnStrings(StoppingCriteria): def __init__(self, stop_strings: list[str], tokenizer): self.stop_strings = stop_strings self.tokenizer = tokenizer self.stream = "" def reset(self): self.stream = "" def __call__(self, input_ids, scores, **kwargs): generated = self.tokenizer.decode(input_ids[0][-1], skip_special_tokens=True) self.stream += generated if any([self.stream.endswith(stop_string) for stop_string in self.stop_strings]): return True return False return StoppingCriteriaList([StopOnStrings(stop_sequences, tokenizer)]) def _prepare_completion_args( self, messages: 
list[ChatMessage], stop_sequences: list[str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> dict[str, Any]: completion_kwargs = self._prepare_completion_kwargs( messages=messages, stop_sequences=stop_sequences, **kwargs, ) messages = completion_kwargs.pop("messages") stop_sequences = completion_kwargs.pop("stop", None) tools = completion_kwargs.pop("tools", None) max_new_tokens = ( kwargs.get("max_new_tokens") or kwargs.get("max_tokens") or self.kwargs.get("max_new_tokens") or self.kwargs.get("max_tokens") or 1024 ) prompt_tensor = (self.processor if hasattr(self, "processor") else self.tokenizer).apply_chat_template( messages, tools=tools, return_tensors="pt", add_generation_prompt=True, tokenize=True, return_dict=True, ) prompt_tensor = prompt_tensor.to(self.model.device) # type: ignore if hasattr(prompt_tensor, "input_ids"): prompt_tensor = prompt_tensor["input_ids"] model_tokenizer = self.processor.tokenizer if hasattr(self, "processor") else self.tokenizer stopping_criteria = ( self.make_stopping_criteria(stop_sequences, tokenizer=model_tokenizer) if stop_sequences else None ) completion_kwargs["max_new_tokens"] = max_new_tokens return dict( inputs=prompt_tensor, use_cache=True, stopping_criteria=stopping_criteria, **completion_kwargs, ) def generate( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> ChatMessage: if response_format is not None: raise ValueError("Transformers does not support structured outputs, use VLLMModel for this.") generation_kwargs = self._prepare_completion_args( messages=messages, stop_sequences=stop_sequences, tools_to_call_from=tools_to_call_from, **kwargs, ) count_prompt_tokens = generation_kwargs["inputs"].shape[1] # type: ignore out = self.model.generate( **generation_kwargs, ) generated_tokens = out[0, count_prompt_tokens:] if hasattr(self, "processor"): output_text = self.processor.decode(generated_tokens, skip_special_tokens=True) else: output_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True) if stop_sequences is not None: output_text = remove_stop_sequences(output_text, stop_sequences) self._last_input_token_count = count_prompt_tokens self._last_output_token_count = len(generated_tokens) return ChatMessage( role=MessageRole.ASSISTANT, content=output_text, raw={ "out": output_text, "completion_kwargs": {key: value for key, value in generation_kwargs.items() if key != "inputs"}, }, token_usage=TokenUsage( input_tokens=count_prompt_tokens, output_tokens=len(generated_tokens), ), ) def generate_stream( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> Generator[ChatMessageStreamDelta]: if response_format is not None: raise ValueError("Transformers does not support structured outputs, use VLLMModel for this.") generation_kwargs = self._prepare_completion_args( messages=messages, stop_sequences=stop_sequences, response_format=response_format, tools_to_call_from=tools_to_call_from, **kwargs, ) count_prompt_tokens = generation_kwargs["inputs"].shape[1] # type: ignore thread = Thread(target=self.model.generate, kwargs={"streamer": self.streamer, **generation_kwargs}) thread.start() # Generate with streaming for new_text in self.streamer: self._last_input_token_count = count_prompt_tokens self._last_output_token_count = 1 yield 
ChatMessageStreamDelta(
                content=new_text,
                tool_calls=None,
                token_usage=TokenUsage(input_tokens=count_prompt_tokens, output_tokens=1),
            )
        thread.join()


class ApiModel(Model):
    """
    Base class for API-based language models.

    This class serves as a foundation for implementing models that interact with external APIs. It handles the
    common functionality for managing model IDs, custom role mappings, and API client connections.

    Parameters:
        model_id (`str`):
            The identifier for the model to be used with the API.
        custom_role_conversions (`dict[str, str]`, *optional*):
            Mapping to convert between internal role names and API-specific role names. Defaults to None.
        client (`Any`, *optional*):
            Pre-configured API client instance. If not provided, a default client will be created. Defaults to None.
        **kwargs: Additional keyword arguments to pass to the parent class.
    """

    def __init__(
        self, model_id: str, custom_role_conversions: dict[str, str] | None = None, client: Any | None = None, **kwargs
    ):
        super().__init__(model_id=model_id, **kwargs)
        self.custom_role_conversions = custom_role_conversions or {}
        self.client = client or self.create_client()

    def create_client(self):
        """Create the API client for the specific service."""
        raise NotImplementedError("Subclasses must implement this method to create a client")


class LiteLLMModel(ApiModel):
    """Model to use [LiteLLM Python SDK](https://docs.litellm.ai/docs/#litellm-python-sdk) to access hundreds of LLMs.

    Parameters:
        model_id (`str`):
            The model identifier to use on the server (e.g. "gpt-3.5-turbo").
        api_base (`str`, *optional*):
            The base URL of the provider API to call the model.
        api_key (`str`, *optional*):
            The API key to use for authentication.
        custom_role_conversions (`dict[str, str]`, *optional*):
            Custom role conversion mapping to convert message roles into others.
            Useful for specific models that do not support specific message roles like "system".
        flatten_messages_as_text (`bool`, *optional*):
            Whether to flatten messages as text.
            Defaults to `True` for models that start with "ollama", "groq", "cerebras".
        **kwargs:
            Additional keyword arguments to pass to the LiteLLM `completion()` call.
    """

    def __init__(
        self,
        model_id: str | None = None,
        api_base: str | None = None,
        api_key: str | None = None,
        custom_role_conversions: dict[str, str] | None = None,
        flatten_messages_as_text: bool | None = None,
        **kwargs,
    ):
        if not model_id:
            warnings.warn(
                "The 'model_id' parameter will be required in version 2.0.0. "
                "Please update your code to pass this parameter to avoid future errors. 
" "For now, it defaults to 'anthropic/claude-3-5-sonnet-20240620'.", FutureWarning, ) model_id = "anthropic/claude-3-5-sonnet-20240620" self.api_base = api_base self.api_key = api_key flatten_messages_as_text = ( flatten_messages_as_text if flatten_messages_as_text is not None else model_id.startswith(("ollama", "groq", "cerebras")) ) super().__init__( model_id=model_id, custom_role_conversions=custom_role_conversions, flatten_messages_as_text=flatten_messages_as_text, **kwargs, ) def create_client(self): """Create the LiteLLM client.""" try: import litellm except ModuleNotFoundError as e: raise ModuleNotFoundError( "Please install 'litellm' extra to use LiteLLMModel: `pip install 'smolagents[litellm]'`" ) from e return litellm def generate( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> ChatMessage: completion_kwargs = self._prepare_completion_kwargs( messages=messages, stop_sequences=stop_sequences, response_format=response_format, tools_to_call_from=tools_to_call_from, model=self.model_id, api_base=self.api_base, api_key=self.api_key, convert_images_to_image_urls=True, custom_role_conversions=self.custom_role_conversions, **kwargs, ) response = self.client.completion(**completion_kwargs) self._last_input_token_count = response.usage.prompt_tokens self._last_output_token_count = response.usage.completion_tokens return ChatMessage.from_dict( response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}), raw=response, token_usage=TokenUsage( input_tokens=response.usage.prompt_tokens, output_tokens=response.usage.completion_tokens, ), ) def generate_stream( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> Generator[ChatMessageStreamDelta]: completion_kwargs = self._prepare_completion_kwargs( messages=messages, stop_sequences=stop_sequences, response_format=response_format, tools_to_call_from=tools_to_call_from, model=self.model_id, api_base=self.api_base, api_key=self.api_key, custom_role_conversions=self.custom_role_conversions, convert_images_to_image_urls=True, **kwargs, ) for event in self.client.completion(**completion_kwargs, stream=True, stream_options={"include_usage": True}): if getattr(event, "usage", None): self._last_input_token_count = event.usage.prompt_tokens self._last_output_token_count = event.usage.completion_tokens yield ChatMessageStreamDelta( content="", token_usage=TokenUsage( input_tokens=event.usage.prompt_tokens, output_tokens=event.usage.completion_tokens, ), ) if event.choices: choice = event.choices[0] if choice.delta: yield ChatMessageStreamDelta( content=choice.delta.content, tool_calls=[ ChatMessageToolCallStreamDelta( index=delta.index, id=delta.id, type=delta.type, function=delta.function, ) for delta in choice.delta.tool_calls ] if choice.delta.tool_calls else None, ) else: if not getattr(choice, "finish_reason", None): raise ValueError(f"No content or tool calls in event: {event}") class LiteLLMRouterModel(LiteLLMModel): """Router‑based client for interacting with the [LiteLLM Python SDK Router](https://docs.litellm.ai/docs/routing). This class provides a high-level interface for distributing requests among multiple language models using the LiteLLM SDK's routing capabilities. 
It is responsible for initializing and configuring the router client, applying custom role conversions, and managing message formatting to ensure seamless integration with various LLMs. Parameters: model_id (`str`): Identifier for the model group to use from the model list (e.g., "model-group-1"). model_list (`list[dict[str, Any]]`): Model configurations to be used for routing. Each configuration should include the model group name and any necessary parameters. For more details, refer to the [LiteLLM Routing](https://docs.litellm.ai/docs/routing#quick-start) documentation. client_kwargs (`dict[str, Any]`, *optional*): Additional configuration parameters for the Router client. For more details, see the [LiteLLM Routing Configurations](https://docs.litellm.ai/docs/routing). custom_role_conversions (`dict[str, str]`, *optional*): Custom role conversion mapping to convert message roles in others. Useful for specific models that do not support specific message roles like "system". flatten_messages_as_text (`bool`, *optional*): Whether to flatten messages as text. Defaults to `True` for models that start with "ollama", "groq", "cerebras". **kwargs: Additional keyword arguments to pass to the LiteLLM Router completion method. Example: ```python >>> import os >>> from smolagents import CodeAgent, WebSearchTool, LiteLLMRouterModel >>> os.environ["OPENAI_API_KEY"] = "" >>> os.environ["AWS_ACCESS_KEY_ID"] = "" >>> os.environ["AWS_SECRET_ACCESS_KEY"] = "" >>> os.environ["AWS_REGION"] = "" >>> llm_loadbalancer_model_list = [ ... { ... "model_name": "model-group-1", ... "litellm_params": { ... "model": "gpt-4o-mini", ... "api_key": os.getenv("OPENAI_API_KEY"), ... }, ... }, ... { ... "model_name": "model-group-1", ... "litellm_params": { ... "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", ... "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), ... "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), ... "aws_region_name": os.getenv("AWS_REGION"), ... }, ... }, >>> ] >>> model = LiteLLMRouterModel( ... model_id="model-group-1", ... model_list=llm_loadbalancer_model_list, ... client_kwargs={ ... "routing_strategy":"simple-shuffle" ... } >>> ) >>> agent = CodeAgent(tools=[WebSearchTool()], model=model) >>> agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") ``` """ def __init__( self, model_id: str, model_list: list[dict[str, Any]], client_kwargs: dict[str, Any] | None = None, custom_role_conversions: dict[str, str] | None = None, flatten_messages_as_text: bool | None = None, **kwargs, ): self.client_kwargs = { "model_list": model_list, **(client_kwargs or {}), } super().__init__( model_id=model_id, custom_role_conversions=custom_role_conversions, flatten_messages_as_text=flatten_messages_as_text, **kwargs, ) def create_client(self): try: from litellm.router import Router except ModuleNotFoundError as e: raise ModuleNotFoundError( "Please install 'litellm' extra to use LiteLLMRouterModel: `pip install 'smolagents[litellm]'`" ) from e return Router(**self.client_kwargs) class InferenceClientModel(ApiModel): """A class to interact with Hugging Face's Inference Providers for language model interaction. This model allows you to communicate with Hugging Face's models using Inference Providers. It can be used in both serverless mode, with a dedicated endpoint, or even with a local URL, supporting features like stop sequences and grammar customization. 
    Providers include Cerebras, Cohere, Fal, Fireworks, HF-Inference, Hyperbolic, Nebius, Novita, Replicate,
    SambaNova, Together, and more.

    Parameters:
        model_id (`str`, *optional*, default `"Qwen/Qwen2.5-Coder-32B-Instruct"`):
            The Hugging Face model ID to be used for inference.
            This can be a model identifier from the Hugging Face model hub or a URL to a deployed Inference Endpoint.
            Currently, it defaults to `"Qwen/Qwen2.5-Coder-32B-Instruct"`, but this may change in the future.
        provider (`str`, *optional*):
            Name of the provider to use for inference. A list of supported providers can be found in the
            [Inference Providers documentation](https://huggingface.co/docs/inference-providers/index#partners).
            Defaults to "auto", i.e. the first of the providers available for the model, sorted by the user's
            order [here](https://hf.co/settings/inference-providers).
            If `base_url` is passed, then `provider` is not used.
        token (`str`, *optional*):
            Token used by the Hugging Face API for authentication.
            This token needs to be authorized for 'Make calls to the serverless Inference Providers'.
            If the model is gated (like Llama-3 models), the token also needs
            'Read access to contents of all public gated repos you can access'.
            If not provided, the class will try to use the environment variable 'HF_TOKEN',
            else use the token stored in the Hugging Face CLI configuration.
        timeout (`int`, *optional*, defaults to 120):
            Timeout for the API request, in seconds.
        client_kwargs (`dict[str, Any]`, *optional*):
            Additional keyword arguments to pass to the Hugging Face InferenceClient.
        custom_role_conversions (`dict[str, str]`, *optional*):
            Custom role conversion mapping to convert message roles into others.
            Useful for specific models that do not support specific message roles like "system".
        api_key (`str`, *optional*):
            Token to use for authentication. This is a duplicated argument from `token` to make
            [`InferenceClientModel`] follow the same pattern as `openai.OpenAI` client.
            Cannot be used if `token` is set. Defaults to None.
        bill_to (`str`, *optional*):
            The billing account to use for the requests.
            By default the requests are billed on the user's account.
            Requests can only be billed to an organization the user is a member of,
            and which has subscribed to Enterprise Hub.
        base_url (`str`, *optional*):
            Base URL to run inference. This is a duplicated argument from `model_id` to make
            [`InferenceClientModel`] follow the same pattern as `openai.OpenAI` client.
            Cannot be used if `model_id` is set. Defaults to None.
        **kwargs:
            Additional keyword arguments to pass to the Hugging Face InferenceClient.

    Raises:
        ValueError:
            If the model name is not provided.

    Example:
    ```python
    >>> engine = InferenceClientModel(
    ...     model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    ...     provider="nebius",
    ...     token="your_hf_token_here",
    ...     max_tokens=5000,
    ... )
    >>> messages = [{"role": "user", "content": "Explain quantum mechanics in simple terms."}]
    >>> response = engine(messages, stop_sequences=["END"])
    >>> print(response)
    "Quantum mechanics is the branch of physics that studies..."
    ```
    """

    def __init__(
        self,
        model_id: str = "Qwen/Qwen2.5-Coder-32B-Instruct",
        provider: str | None = None,
        token: str | None = None,
        timeout: int = 120,
        client_kwargs: dict[str, Any] | None = None,
        custom_role_conversions: dict[str, str] | None = None,
        api_key: str | None = None,
        bill_to: str | None = None,
        base_url: str | None = None,
        **kwargs,
    ):
        if token is not None and api_key is not None:
            raise ValueError(
                "Received both `token` and `api_key` arguments. Please provide only one of them."
" `api_key` is an alias for `token` to make the API compatible with OpenAI's client." " It has the exact same behavior as `token`." ) token = token if token is not None else api_key if token is None: token = os.getenv("HF_TOKEN") self.client_kwargs = { **(client_kwargs or {}), "model": model_id, "provider": provider, "token": token, "timeout": timeout, "bill_to": bill_to, "base_url": base_url, } super().__init__(model_id=model_id, custom_role_conversions=custom_role_conversions, **kwargs) def create_client(self): """Create the Hugging Face client.""" from huggingface_hub import InferenceClient return InferenceClient(**self.client_kwargs) def generate( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> ChatMessage: if response_format is not None and self.client_kwargs["provider"] not in STRUCTURED_GENERATION_PROVIDERS: raise ValueError( "InferenceClientModel only supports structured outputs with these providers:" + ", ".join(STRUCTURED_GENERATION_PROVIDERS) ) completion_kwargs = self._prepare_completion_kwargs( messages=messages, stop_sequences=stop_sequences, tools_to_call_from=tools_to_call_from, # response_format=response_format, convert_images_to_image_urls=True, custom_role_conversions=self.custom_role_conversions, **kwargs, ) response = self.client.chat_completion(**completion_kwargs) self._last_input_token_count = response.usage.prompt_tokens self._last_output_token_count = response.usage.completion_tokens return ChatMessage.from_dict( asdict(response.choices[0].message), raw=response, token_usage=TokenUsage( input_tokens=response.usage.prompt_tokens, output_tokens=response.usage.completion_tokens, ), ) def generate_stream( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> Generator[ChatMessageStreamDelta]: completion_kwargs = self._prepare_completion_kwargs( messages=messages, stop_sequences=stop_sequences, response_format=response_format, tools_to_call_from=tools_to_call_from, model=self.model_id, custom_role_conversions=self.custom_role_conversions, convert_images_to_image_urls=True, **kwargs, ) for event in self.client.chat.completions.create( **completion_kwargs, stream=True, stream_options={"include_usage": True} ): if getattr(event, "usage", None): self._last_input_token_count = event.usage.prompt_tokens self._last_output_token_count = event.usage.completion_tokens yield ChatMessageStreamDelta( content="", token_usage=TokenUsage( input_tokens=event.usage.prompt_tokens, output_tokens=event.usage.completion_tokens, ), ) if event.choices: choice = event.choices[0] if choice.delta: yield ChatMessageStreamDelta( content=choice.delta.content, tool_calls=[ ChatMessageToolCallStreamDelta( index=delta.index, id=delta.id, type=delta.type, function=delta.function, ) for delta in choice.delta.tool_calls ] if choice.delta.tool_calls else None, ) else: if not getattr(choice, "finish_reason", None): raise ValueError(f"No content or tool calls in event: {event}") class OpenAIServerModel(ApiModel): """This model connects to an OpenAI-compatible API server. Parameters: model_id (`str`): The model identifier to use on the server (e.g. "gpt-3.5-turbo"). api_base (`str`, *optional*): The base URL of the OpenAI-compatible API server. api_key (`str`, *optional*): The API key to use for authentication. 
organization (`str`, *optional*): The organization to use for the API request. project (`str`, *optional*): The project to use for the API request. client_kwargs (`dict[str, Any]`, *optional*): Additional keyword arguments to pass to the OpenAI client (like organization, project, max_retries etc.). custom_role_conversions (`dict[str, str]`, *optional*): Custom role conversion mapping to convert message roles in others. Useful for specific models that do not support specific message roles like "system". flatten_messages_as_text (`bool`, default `False`): Whether to flatten messages as text. **kwargs: Additional keyword arguments to pass to the OpenAI API. """ def __init__( self, model_id: str, api_base: str | None = None, api_key: str | None = None, organization: str | None = None, project: str | None = None, client_kwargs: dict[str, Any] | None = None, custom_role_conversions: dict[str, str] | None = None, flatten_messages_as_text: bool = False, **kwargs, ): self.client_kwargs = { **(client_kwargs or {}), "api_key": api_key, "base_url": api_base, "organization": organization, "project": project, } super().__init__( model_id=model_id, custom_role_conversions=custom_role_conversions, flatten_messages_as_text=flatten_messages_as_text, **kwargs, ) def create_client(self): try: import openai except ModuleNotFoundError as e: raise ModuleNotFoundError( "Please install 'openai' extra to use OpenAIServerModel: `pip install 'smolagents[openai]'`" ) from e return openai.OpenAI(**self.client_kwargs) def generate_stream( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> Generator[ChatMessageStreamDelta]: completion_kwargs = self._prepare_completion_kwargs( messages=messages, stop_sequences=stop_sequences, response_format=response_format, tools_to_call_from=tools_to_call_from, model=self.model_id, custom_role_conversions=self.custom_role_conversions, convert_images_to_image_urls=True, **kwargs, ) for event in self.client.chat.completions.create( **completion_kwargs, stream=True, stream_options={"include_usage": True} ): if event.usage: self._last_input_token_count = event.usage.prompt_tokens self._last_output_token_count = event.usage.completion_tokens yield ChatMessageStreamDelta( content="", token_usage=TokenUsage( input_tokens=event.usage.prompt_tokens, output_tokens=event.usage.completion_tokens, ), ) if event.choices: choice = event.choices[0] if choice.delta: yield ChatMessageStreamDelta( content=choice.delta.content, tool_calls=[ ChatMessageToolCallStreamDelta( index=delta.index, id=delta.id, type=delta.type, function=delta.function, ) for delta in choice.delta.tool_calls ] if choice.delta.tool_calls else None, ) else: if not getattr(choice, "finish_reason", None): raise ValueError(f"No content or tool calls in event: {event}") def generate( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> ChatMessage: completion_kwargs = self._prepare_completion_kwargs( messages=messages, stop_sequences=stop_sequences, response_format=response_format, tools_to_call_from=tools_to_call_from, model=self.model_id, custom_role_conversions=self.custom_role_conversions, convert_images_to_image_urls=True, **kwargs, ) response = self.client.chat.completions.create(**completion_kwargs) # Reported that `response.usage` can be None in some cases when using 
OpenRouter: see GH-1401 self._last_input_token_count = getattr(response.usage, "prompt_tokens", 0) self._last_output_token_count = getattr(response.usage, "completion_tokens", 0) return ChatMessage.from_dict( response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}), raw=response, token_usage=TokenUsage( input_tokens=response.usage.prompt_tokens, output_tokens=response.usage.completion_tokens, ), ) OpenAIModel = OpenAIServerModel class AzureOpenAIServerModel(OpenAIServerModel): """This model connects to an Azure OpenAI deployment. Parameters: model_id (`str`): The model deployment name to use when connecting (e.g. "gpt-4o-mini"). azure_endpoint (`str`, *optional*): The Azure endpoint, including the resource, e.g. `https://example-resource.azure.openai.com/`. If not provided, it will be inferred from the `AZURE_OPENAI_ENDPOINT` environment variable. api_key (`str`, *optional*): The API key to use for authentication. If not provided, it will be inferred from the `AZURE_OPENAI_API_KEY` environment variable. api_version (`str`, *optional*): The API version to use. If not provided, it will be inferred from the `OPENAI_API_VERSION` environment variable. client_kwargs (`dict[str, Any]`, *optional*): Additional keyword arguments to pass to the AzureOpenAI client (like organization, project, max_retries etc.). custom_role_conversions (`dict[str, str]`, *optional*): Custom role conversion mapping to convert message roles in others. Useful for specific models that do not support specific message roles like "system". **kwargs: Additional keyword arguments to pass to the Azure OpenAI API. """ def __init__( self, model_id: str, azure_endpoint: str | None = None, api_key: str | None = None, api_version: str | None = None, client_kwargs: dict[str, Any] | None = None, custom_role_conversions: dict[str, str] | None = None, **kwargs, ): client_kwargs = client_kwargs or {} client_kwargs.update( { "api_version": api_version, "azure_endpoint": azure_endpoint, } ) super().__init__( model_id=model_id, api_key=api_key, client_kwargs=client_kwargs, custom_role_conversions=custom_role_conversions, **kwargs, ) def create_client(self): try: import openai except ModuleNotFoundError as e: raise ModuleNotFoundError( "Please install 'openai' extra to use AzureOpenAIServerModel: `pip install 'smolagents[openai]'`" ) from e return openai.AzureOpenAI(**self.client_kwargs) AzureOpenAIModel = AzureOpenAIServerModel class AmazonBedrockServerModel(ApiModel): """ A model class for interacting with Amazon Bedrock Server models through the Bedrock API. This class provides an interface to interact with various Bedrock language models, allowing for customized model inference, guardrail configuration, message handling, and other parameters allowed by boto3 API. Parameters: model_id (`str`): The model identifier to use on Bedrock (e.g. "us.amazon.nova-pro-v1:0"). client (`boto3.client`, *optional*): A custom boto3 client for AWS interactions. If not provided, a default client will be created. client_kwargs (dict[str, Any], *optional*): Keyword arguments used to configure the boto3 client if it needs to be created internally. Examples include `region_name`, `config`, or `endpoint_url`. custom_role_conversions (`dict[str, str]`, *optional*): Custom role conversion mapping to convert message roles in others. Useful for specific models that do not support specific message roles like "system". Defaults to converting all roles to "user" role to enable using all the Bedrock models. 
        flatten_messages_as_text (`bool`, default `False`):
            Whether to flatten messages as text.
        **kwargs:
            Additional keyword arguments passed directly to the underlying API calls.

    Example:
        Creating a model instance with default settings:
        >>> bedrock_model = AmazonBedrockServerModel(
        ...     model_id='us.amazon.nova-pro-v1:0'
        ... )

        Creating a model instance with a custom boto3 client:
        >>> import boto3
        >>> client = boto3.client('bedrock-runtime', region_name='us-west-2')
        >>> bedrock_model = AmazonBedrockServerModel(
        ...     model_id='us.amazon.nova-pro-v1:0',
        ...     client=client
        ... )

        Creating a model instance with client_kwargs for internal client creation:
        >>> bedrock_model = AmazonBedrockServerModel(
        ...     model_id='us.amazon.nova-pro-v1:0',
        ...     client_kwargs={'region_name': 'us-west-2', 'endpoint_url': 'https://custom-endpoint.com'}
        ... )

        Creating a model instance with inference and guardrail configurations:
        >>> additional_api_config = {
        ...     "inferenceConfig": {
        ...         "maxTokens": 3000
        ...     },
        ...     "guardrailConfig": {
        ...         "guardrailIdentifier": "identify1",
        ...         "guardrailVersion": 'v1'
        ...     },
        ... }
        >>> bedrock_model = AmazonBedrockServerModel(
        ...     model_id='anthropic.claude-3-haiku-20240307-v1:0',
        ...     **additional_api_config
        ... )
    """

    def __init__(
        self,
        model_id: str,
        client=None,
        client_kwargs: dict[str, Any] | None = None,
        custom_role_conversions: dict[str, str] | None = None,
        **kwargs,
    ):
        self.client_kwargs = client_kwargs or {}

        # Bedrock only supports `assistant` and `user` roles.
        # Many Bedrock models do not allow conversations to start with the `assistant` role, so the default is set to `user/user`.
        # This parameter is retained for future model implementations and extended support.
        custom_role_conversions = custom_role_conversions or {
            MessageRole.SYSTEM: MessageRole.USER,
            MessageRole.ASSISTANT: MessageRole.USER,
            MessageRole.TOOL_CALL: MessageRole.USER,
            MessageRole.TOOL_RESPONSE: MessageRole.USER,
        }

        super().__init__(
            model_id=model_id,
            custom_role_conversions=custom_role_conversions,
            flatten_messages_as_text=False,  # The Bedrock API does not support flattened messages; content must stay a list of messages
            client=client,
            **kwargs,
        )

    def _prepare_completion_kwargs(
        self,
        messages: list[ChatMessage],
        stop_sequences: list[str] | None = None,
        response_format: dict[str, str] | None = None,
        tools_to_call_from: list[Tool] | None = None,
        custom_role_conversions: dict[str, str] | None = None,
        convert_images_to_image_urls: bool = False,
        tool_choice: str | dict[Any, Any] | None = None,
        **kwargs,
    ) -> dict:
        """
        Overrides the base method to handle Bedrock-specific configurations.

        This implementation adapts the completion keyword arguments to align with Bedrock's requirements,
        ensuring compatibility with its unique setup and constraints.
        """
        completion_kwargs = super()._prepare_completion_kwargs(
            messages=messages,
            stop_sequences=None,  # Bedrock supports stop sequences through its inference config instead
            tools_to_call_from=tools_to_call_from,
            custom_role_conversions=custom_role_conversions,
            convert_images_to_image_urls=convert_images_to_image_urls,
            **kwargs,
        )

        # Not all models in Bedrock support `toolConfig`. Also, smolagents already includes the tool call in the prompt,
        # so adding `toolConfig` could cause conflicts. We remove it to avoid issues.
        completion_kwargs.pop("toolConfig", None)

        # The Bedrock API does not support the `type` key in requests.
        # This block of code modifies the object to meet Bedrock's requirements.
for message in completion_kwargs.get("messages", []): for content in message.get("content", []): if "type" in content: del content["type"] return { "modelId": self.model_id, **completion_kwargs, } def create_client(self): try: import boto3 # type: ignore except ModuleNotFoundError as e: raise ModuleNotFoundError( "Please install 'bedrock' extra to use AmazonBedrockServerModel: `pip install 'smolagents[bedrock]'`" ) from e return boto3.client("bedrock-runtime", **self.client_kwargs) def generate( self, messages: list[ChatMessage], stop_sequences: list[str] | None = None, response_format: dict[str, str] | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, ) -> ChatMessage: if response_format is not None: raise ValueError("Amazon Bedrock does not support response_format") completion_kwargs: dict = self._prepare_completion_kwargs( messages=messages, tools_to_call_from=tools_to_call_from, custom_role_conversions=self.custom_role_conversions, convert_images_to_image_urls=True, **kwargs, ) # self.client is created in ApiModel class response = self.client.converse(**completion_kwargs) # Get first message response["output"]["message"]["content"] = response["output"]["message"]["content"][0]["text"] self._last_input_token_count = response["usage"]["inputTokens"] self._last_output_token_count = response["usage"]["outputTokens"] return ChatMessage.from_dict( response["output"]["message"], raw=response, token_usage=TokenUsage( input_tokens=response["usage"]["inputTokens"], output_tokens=response["usage"]["outputTokens"], ), ) AmazonBedrockModel = AmazonBedrockServerModel __all__ = [ "MessageRole", "tool_role_conversions", "get_clean_message_list", "Model", "MLXModel", "TransformersModel", "ApiModel", "InferenceClientModel", "LiteLLMModel", "LiteLLMRouterModel", "OpenAIServerModel", "OpenAIModel", "VLLMModel", "AzureOpenAIServerModel", "AzureOpenAIModel", "AmazonBedrockServerModel", "AmazonBedrockModel", "ChatMessage", ]
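# Usage sketch: rebuilding a full ChatMessage from a streamed generation with
# `agglomerate_stream_deltas` (illustrative only: the model ID, API key, and prompt below are
# placeholder values, and running it needs network access to an OpenAI-compatible server):
#
#     model = OpenAIServerModel(model_id="gpt-4o-mini", api_key="...")
#     user_message = ChatMessage(role=MessageRole.USER, content=[{"type": "text", "text": "Say hi"}])
#     deltas = list(model.generate_stream([user_message]))
#     final_message = agglomerate_stream_deltas(deltas)
#     print(final_message.content, final_message.token_usage)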