Upload folder using huggingface_hub
- dataset.py +1 -0
- evaluate_cli.py +828 -0
- inference.py +233 -112
- llm_as_judge.py +4 -4
- metric.py +1 -0
- metrics.py +106 -95
- parsing_utils.py +2 -2
- processors.py +70 -16
- version.py +1 -1
dataset.py
CHANGED
@@ -20,6 +20,7 @@ from .dialog_operators import __file__ as _
 from .dict_utils import __file__ as _
 from .error_utils import __file__ as _
 from .eval_utils import __file__ as _
+from .evaluate_cli import __file__ as _
 from .file_utils import __file__ as _
 from .formats import __file__ as _
 from .fusion import __file__ as _
evaluate_cli.py
ADDED
@@ -0,0 +1,828 @@
# evaluate_cli.py
import argparse
import importlib.metadata
import json
import logging
import os
import platform
import subprocess
import sys
from datetime import datetime
from functools import partial
from typing import Any, Dict, List, Optional, Tuple, Union

from datasets import Dataset as HFDataset

from . import evaluate, get_logger, load_dataset
from .artifact import UnitxtArtifactNotFoundError
from .benchmark import Benchmark

# Use HFAutoModelInferenceEngine for local models
from .inference import (
    CrossProviderInferenceEngine,
    HFAutoModelInferenceEngine,
    InferenceEngine,
)
from .metric_utils import EvaluationResults
from .parsing_utils import parse_key_equals_value_string_to_dict
from .settings_utils import settings
from .standard import DatasetRecipe

# Define logger early so it can be used in initial error handling
# Basic config for initial messages, will be reconfigured in main()
logger = get_logger()


def try_parse_json(value: str) -> Union[str, dict, None]:
    """Attempts to parse a string as JSON or key=value pairs.

    Returns the original string if parsing fails
    and the string doesn't look like JSON/kv pairs.
    Raises ArgumentTypeError if it looks like JSON but is invalid.
    """
    if value is None:
        return None
    try:
        # Handle simple key-value pairs like "key=value,key2=value2"
        if "=" in value and "{" not in value:
            parsed_dict = parse_key_equals_value_string_to_dict(value)
            if parsed_dict:
                return parsed_dict

        # Attempt standard JSON parsing
        return json.loads(value)

    except json.JSONDecodeError as e:
        if value.strip().startswith("{") or value.strip().startswith("["):
            raise argparse.ArgumentTypeError(
                f"Invalid JSON: '{value}'. Hint: Use double quotes for JSON strings and check syntax."
            ) from e
        return value  # Return as string if not JSON-like
    except Exception as e:
        logger.error(f"Error parsing argument '{value}': {e}")
        raise argparse.ArgumentTypeError(f"Could not parse argument: '{value}'") from e


def setup_parser() -> argparse.ArgumentParser:
    """Sets up the argument parser."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="CLI utility for running evaluations with unitxt.",
    )

    # --- Task/Dataset Arguments ---
    parser.add_argument(
        "--tasks",  # Changed to plural to better reflect it holds a list
        "-t",
        dest="tasks",  # Explicitly set the attribute name to 'tasks'
        type=partial(str.split, sep="+"),  # Use the custom function for type conversion
        required=True,
        help="Plus-separated (+) list of Unitxt task/dataset identifier strings.\n"
        "Each task format: 'card=<card_ref>,template=<template_ref>,...'\n"
        "Example: 'card=cards.mmlu,t=t.mmlu.all+card=cards.hellaswag,t=t.hellaswag.no'",
    )

    parser.add_argument(
        "--split",
        type=str,
        default="test",
        help="Dataset split to use (e.g., 'train', 'validation', 'test'). Default: 'test'.",
    )
    parser.add_argument(
        "--num_fewshots",
        type=int,
        default=None,
        help="number of fewshots to use",
    )
    parser.add_argument(
        "--limit",
        "-L",
        type=int,
        default=None,
        metavar="N",
        help="Limit the number of examples per task/dataset.",
    )

    parser.add_argument(
        "--batch_size",
        "-b",
        type=int,
        default=1,
        help="Batch size for use in inference when selected model is hf. Default 1",
    )

    # --- Model Arguments (Explicit Types) ---
    parser.add_argument(
        "--model",
        "-m",
        type=str,
        default="hf",
        choices=["hf", "cross_provider"],
        help="Specifies the model type/engine.\n"
        "- 'hf': Local Hugging Face model via HFAutoModel (default). Requires 'pretrained=...' in --model_args.\n"
        "- 'cross_provider': Remote model via CrossProviderInferenceEngine. Requires 'model_name=...' in --model_args.",
    )
    parser.add_argument(
        "--model_args",
        "-a",
        type=try_parse_json,
        default={},
        help="Comma separated string or JSON formatted arguments for the model/inference engine.\n"
        "Examples:\n"
        "- For --model hf (default): 'pretrained=meta-llama/Llama-3.1-8B-Instruct,torch_dtype=bfloat16,device=cuda'\n"
        "  (Note: 'pretrained' key is REQUIRED. Other args like 'torch_dtype', 'device', generation params are passed too)\n"
        "- For --model generic_remote: 'model_name=llama-3-3-70b-instruct,max_tokens=256,temperature=0.7'\n"
        "  (Note: 'model_name' key is REQUIRED)\n"
        '- JSON format: \'{"pretrained": "my_model", "torch_dtype": "float32"}\' or \'{"model_name": "openai/gpt-4o"}\'',
    )

    parser.add_argument(
        "--gen_kwargs",
        type=try_parse_json,
        default=None,
        help=(
            "Comma delimited string for model generation on greedy_until tasks,"
            """ e.g. temperature=0,top_p=0.1."""
        ),
    )

    parser.add_argument(
        "--chat_template_kwargs",
        type=try_parse_json,
        default=None,
        help=(
            "Comma delimited string for tokenizer kwargs"
            "e.g. thinking=True (https://github.com/huggingface/transformers/blob/9a1c1fe7edaefdb25ab37116a979832df298d6ea/src/transformers/tokenization_utils_base.py#L1542)"
        ),
    )

    # --- Output and Logging Arguments ---
    parser.add_argument(
        "--output_path",
        "-o",
        type=str,
        default=".",
        help="Directory to save evaluation results and logs. Default: current directory.",
    )
    parser.add_argument(
        "--output_file_prefix",
        type=str,
        default="evaluation_results",
        help="Prefix for the output JSON file names. Default: 'evaluation_results'.",
    )
    parser.add_argument(
        "--log_samples",
        "-s",
        action="store_true",
        default=False,
        help="If True, save individual predictions and scores to a separate JSON file.",
    )
    parser.add_argument(
        "--verbosity",
        "-v",
        type=str.upper,
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Controls logging verbosity level. Default: INFO.",
    )

    parser.add_argument(
        "--apply_chat_template",
        action="store_true",
        default=False,
    )

    # --- Unitxt Settings ---
    parser.add_argument(
        "--trust_remote_code",
        action="store_true",
        default=False,
        help="Allow execution of unverified code from the HuggingFace Hub (used by datasets/unitxt).",
    )
    parser.add_argument(
        "--disable_hf_cache",
        action="store_true",
        default=False,
        help="Disable HuggingFace datasets caching.",
    )
    parser.add_argument(
        "--cache_dir",
        type=str,
        default=None,
        help="Directory for HuggingFace datasets cache (overrides default).",
    )

    return parser


def setup_logging(verbosity: str) -> None:
    """Configures logging based on verbosity level."""
    logging.basicConfig(
        level=verbosity,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        force=True,  # Ensures reconfiguration works if basicConfig was called before
    )
    # Re-get the logger instance after basicConfig is set
    global logger
    logger = get_logger()
    logger.setLevel(verbosity)


def prepare_output_paths(output_path: str, prefix: str) -> Tuple[str, str]:
    """Creates output directory and defines file paths.

    Args:
        output_path (str): The directory where output files will be saved.
        prefix (str): The prefix for the output file names.

    Returns:
        Tuple[str, str]: A tuple containing the path for the results summary file
        and the path for the detailed samples file.
    """
    os.makedirs(output_path, exist_ok=True)
    results_file_path = os.path.join(output_path, f"{prefix}.json")
    samples_file_path = os.path.join(output_path, f"{prefix}_samples.json")
    return results_file_path, samples_file_path


def configure_unitxt_settings(args: argparse.Namespace):
    """Configures unitxt settings and returns a context manager.

    Args:
        args (argparse.Namespace): Parsed command-line arguments.

    Returns:
        ContextManager: A context manager for applying unitxt settings.
    """
    unitxt_settings_dict = {
        "disable_hf_datasets_cache": args.disable_hf_cache,
        "allow_unverified_code": args.trust_remote_code,
    }
    if args.cache_dir:
        unitxt_settings_dict["hf_cache_dir"] = args.cache_dir
        # Also set environment variable as some HF parts might read it directly
        os.environ["HF_DATASETS_CACHE"] = args.cache_dir
        os.environ["HF_HOME"] = args.cache_dir
        logger.info(f"Set HF_DATASETS_CACHE to: {args.cache_dir}")

    if args.disable_hf_cache:
        os.environ["UNITXT_DISABLE_HF_DATASETS_CACHE"] = "True"

    logger.info(f"Applying unitxt settings: {unitxt_settings_dict}")
    return settings.context(**unitxt_settings_dict)


def cli_load_dataset(args: argparse.Namespace) -> HFDataset:
    """Loads the dataset based on command line arguments.

    Args:
        args (argparse.Namespace): Parsed command-line arguments.

    Returns:
        HFDataset: The loaded dataset.

    Raises:
        UnitxtArtifactNotFoundError: If the specified card or template artifact is not found.
        FileNotFoundError: If a specified file (e.g., in a local card path) is not found.
        AttributeError: If there's an issue accessing attributes during loading.
        ValueError: If there's a value-related error during loading (e.g., parsing).
    """
    logger.info(
        f"Loading task/dataset using identifier: '{args.tasks}' with split '{args.split}'"
    )

    benchmark_subsets = {}
    for task_str in args.tasks:
        dataset_args = task_str_to_dataset_args(task_str, args)

        benchmark_subsets[task_str] = DatasetRecipe(**dataset_args)

    benchmark = Benchmark(subsets=benchmark_subsets)

    test_dataset = load_dataset(benchmark, split=args.split)
    logger.info(
        f"Dataset loaded successfully. Number of instances: {len(test_dataset)}"
    )
    return test_dataset


def task_str_to_dataset_args(task_str, args):
    dataset_args = parse_key_equals_value_string_to_dict(task_str)

    if args.limit is not None:
        assert f"max_{args.split}_instances" not in dataset_args, (
            "limit was inputted both as an arg and as a task parameter"
        )
        # Check if limit or loader_limit is already present
        # dataset_args[f"max_{args.split}_instances"] = args.limit
        dataset_args[f"max_{args.split}_instances"] = args.limit
        # Use loader_limit for unitxt compatibility
        logger.info(
            f"Applying limit from --limit argument: max_{args.split}_instances={args.limit}"
        )

    if args.num_fewshots:
        assert "num_demos" not in dataset_args, (
            "num_demos was inputted both as an arg and as a task parameter"
        )
        dataset_args["num_demos"] = args.num_fewshots
        dataset_args.update(
            {
                "demos_taken_from": "train",
                "demos_pool_size": -1,
                "demos_removed_from_data": True,
            }
        )  # Use loader_limit for unitxt compatibility
        logger.info(
            f"Applying limit from --limit argument: num_demos={args.num_fewshots}"
        )

    if args.apply_chat_template:
        assert "format" not in dataset_args, (
            "format was inputted as a task parameter, but chat_api was requested"
        )
        dataset_args["format"] = "formats.chat_api"
        logger.info(
            "Applying chat template from --apply_chat_template argument: format=formats.chat_api"
        )

    return dataset_args


def prepare_kwargs(kwargs: dict) -> Dict[str, Any]:
    """Prepares the model arguments dictionary.

    Args:
        kwargs (dict): Parsed command-line arguments.

    Returns:
        Dict[str, Any]: The processed model arguments dictionary.
    """
    # Ensure model_args is a dictionary, handling potential string return from try_parse_json
    kwargs_dict = kwargs if isinstance(kwargs, dict) else {}
    if not isinstance(kwargs, dict) and kwargs is not None:
        logger.warning(
            f"Could not parse kwargs '{kwargs}' as JSON or key-value pairs. Treating as empty."
        )

    logger.info(f"Using kwargs: {kwargs_dict}")
    return kwargs_dict


def initialize_inference_engine(
    args: argparse.Namespace,
    model_args_dict: Dict[str, Any],
    chat_kwargs_dict: Dict[str, Any],
) -> InferenceEngine:
    """Initializes the appropriate inference engine based on arguments.

    Args:
        args (argparse.Namespace): Parsed command-line arguments.
        model_args_dict (Dict[str, Any]): Processed model arguments.
        chat_kwargs_dict (Dict[str, Any]): Processed chat arguments.

    Returns:
        InferenceEngine: The initialized inference engine instance.

    Raises:
        SystemExit: If required dependencies are missing for the selected model type.
        ValueError: If required keys are missing in model_args for the selected model type.
    """
    inference_model = None
    # --- Local Hugging Face Model (using HFAutoModelInferenceEngine) ---
    if args.model.lower() == "hf":
        if "pretrained" not in model_args_dict:
            logger.error(
                "Missing 'pretrained=<model_id_or_path>' in --model_args for '--model hf'."
            )
            raise ValueError(
                "Argument 'pretrained' is required in --model_args when --model is 'hf'"
            )

        local_model_name = model_args_dict.pop("pretrained")
        logger.info(
            f"Initializing HFAutoModelInferenceEngine for model: {local_model_name}"
        )

        model_args_dict.update({"batch_size": args.batch_size})
        logger.info(f"HFAutoModelInferenceEngine args: {model_args_dict}")

        inference_model = HFAutoModelInferenceEngine(
            model_name=local_model_name,
            **model_args_dict,
            chat_kwargs_dict=chat_kwargs_dict,
        )

    # --- Remote Model (CrossProviderInferenceEngine) ---
    elif args.model.lower() == "cross_provider":
        if "model_name" not in model_args_dict:
            logger.error(
                "Missing 'model_name=<provider/model_id>' in --model_args for '--model cross_provider'."
            )
            raise ValueError(
                "Argument 'model_name' is required in --model_args when --model is 'cross_provider'"
            )

        remote_model_name = model_args_dict.pop("model_name")
        logger.info(
            f"Initializing CrossProviderInferenceEngine for model: {remote_model_name}"
        )

        if (
            "max_tokens" not in model_args_dict
            and "max_new_tokens" not in model_args_dict
        ):
            logger.warning(
                f"'max_tokens' or 'max_new_tokens' not found in --model_args, {remote_model_name} might require it."
            )

        logger.info(f"CrossProviderInferenceEngine args: {model_args_dict}")

        # Note: CrossProviderInferenceEngine expects 'model' parameter, not 'model_name'
        inference_model = CrossProviderInferenceEngine(
            model=remote_model_name,
            **model_args_dict,
        )
    else:
        # This case should not be reached due to argparse choices
        logger.error(
            f"Invalid --model type specified: {args.model}. Use 'hf' or 'cross_provider'."
        )
        sys.exit(1)  # Exit here as it's an invalid configuration

    return inference_model


def run_inference(engine: InferenceEngine, dataset: HFDataset) -> List[Any]:
    """Runs inference using the initialized engine.

    Args:
        engine (InferenceEngine): The inference engine instance.
        dataset (HFDataset): The dataset to run inference on.

    Returns:
        List[Any]: A list of predictions.

    Raises:
        Exception: If an error occurs during inference.
    """
    logger.info("Starting inference...")
    try:
        predictions = engine.infer(dataset)
        logger.info("Inference completed.")
        if not predictions:
            logger.warning("Inference returned no predictions.")
            return []  # Return empty list if no predictions
        if len(predictions) != len(dataset):
            logger.error(
                f"Inference returned an unexpected number of predictions ({len(predictions)}). Expected {len(dataset)}."
            )
            # Don't exit, but log error. Evaluation might still work partially or fail later.
        return predictions
    except Exception:
        logger.exception("An error occurred during inference")  # Use logger.exception
        raise  # Re-raise after logging


def run_evaluation(predictions: List[Any], dataset: HFDataset) -> EvaluationResults:
    """Runs evaluation on the predictions.

    Args:
        predictions (List[Any]): The list of predictions from the model.
        dataset (HFDataset): The dataset containing references and other data.

    Returns:
        EvaluationResults: The evaluated dataset (list of instances with scores).

    Raises:
        RuntimeError: If evaluation returns no results or an unexpected type.
        Exception: If any other error occurs during evaluation.
    """
    logger.info("Starting evaluation...")
    if not predictions:
        logger.warning("Skipping evaluation as there are no predictions.")
        return []  # Return empty list if no predictions to evaluate

    try:
        evaluation_results = evaluate(predictions=predictions, data=dataset)
        logger.info("Evaluation completed.")
        if not evaluation_results:
            logger.error("Evaluation returned no results (empty list/None).")
            # Raise an error as this indicates a problem in the evaluation process
            raise RuntimeError("Evaluation returned no results.")
        if not isinstance(evaluation_results, EvaluationResults):
            logger.error(
                f"Evaluation returned unexpected type: {type(evaluation_results)}. Expected list."
            )
            raise RuntimeError(
                f"Evaluation returned unexpected type: {type(evaluation_results)}"
            )

        return evaluation_results
    except Exception:
        logger.exception("An error occurred during evaluation")  # Use logger.exception
        raise  # Re-raise after logging


def _get_unitxt_commit_hash() -> Optional[str]:
    """Tries to get the git commit hash of the installed unitxt package."""
    try:
        # Find the directory of the unitxt package
        # Use inspect to be more robust finding the package path

        current_script_path = os.path.abspath(__file__)
        package_dir = os.path.dirname(current_script_path)

        # Check if it's a git repository and get the commit hash
        # Use absolute path for git command
        git_command = ["git", "-C", os.path.abspath(package_dir), "rev-parse", "HEAD"]
        logger.debug(f"Running git command: {' '.join(git_command)}")
        result = subprocess.run(
            git_command,
            capture_output=True,
            text=True,
            check=False,  # Don't raise error if git command fails
            encoding="utf-8",
            errors="ignore",  # Ignore potential decoding errors
        )
        if result.returncode == 0:
            commit_hash = result.stdout.strip()
            logger.info(f"Found unitxt git commit hash: {commit_hash}")
            # Verify it looks like a hash (e.g., 40 hex chars)
            if len(commit_hash) == 40 and all(
                c in "0123456789abcdef" for c in commit_hash
            ):
                return commit_hash
            logger.warning(
                f"Git command output '{commit_hash}' doesn't look like a valid commit hash."
            )
            return None
        stderr_msg = result.stderr.strip() if result.stderr else "No stderr"
        logger.warning(
            f"Could not get unitxt git commit hash (git command failed with code {result.returncode}): {stderr_msg}"
        )
        return None
    except ImportError:
        logger.warning("unitxt package not found, cannot determine commit hash.")
        return None
    except FileNotFoundError:
        logger.warning(
            "'git' command not found in PATH. Cannot determine unitxt commit hash."
        )
        return None
    except Exception as e:
        logger.warning(
            f"Error getting unitxt commit hash: {e}", exc_info=True
        )  # Log traceback
        return None


def _get_installed_packages() -> Dict[str, str]:
    """Gets a dictionary of installed packages and their versions."""
    packages = {}
    try:
        for dist in importlib.metadata.distributions():
            # Handle potential missing metadata gracefully
            name = dist.metadata.get("Name")
            version = dist.metadata.get("Version")
            if name and version:
                packages[name] = version
            elif name:
                packages[name] = "N/A"  # Record package even if version is missing
                logger.debug(f"Could not find version for package: {name}")

        logger.info(f"Collected versions for {len(packages)} installed packages.")
    except Exception as e:
        logger.warning(f"Could not retrieve installed package list: {e}", exc_info=True)
    return packages


def _get_unitxt_version() -> str:
    """Gets the installed unitxt version using importlib.metadata."""
    try:
        version = importlib.metadata.version("unitxt")
        logger.info(f"Found unitxt version using importlib.metadata: {version}")
        return version
    except importlib.metadata.PackageNotFoundError:
        logger.warning(
            "Could not find 'unitxt' package version using importlib.metadata. Is it installed correctly?"
        )
        return "N/A"
    except Exception as e:
        logger.warning(
            f"Error getting unitxt version using importlib.metadata: {e}", exc_info=True
        )
        return "N/A"


def prepend_timestamp_to_path(original_path, timestamp):
    """Takes a path string and a timestamp string, prepends the timestamp to the filename part of the path, and returns the new path string."""
    directory, filename = os.path.split(original_path)
    # Use an f-string to create the new filename with the timestamp prepended
    new_filename = f"{timestamp}_{filename}"
    # Join the directory and the new filename back together
    return os.path.join(directory, new_filename)


def _save_results_to_disk(
    args: argparse.Namespace,
    global_scores: Dict[str, Any],
    all_samples_data: Dict[str, List[Dict[str, Any]]],
    results_path: str,
    samples_path: str,
) -> None:
    """Saves the configuration, environment info, global scores, and samples to JSON files.

    Args:
        args (argparse.Namespace): Parsed command-line arguments.
        global_scores (Dict[str, Any]): Dictionary of global scores.
        all_samples_data (Dict[str, List[Dict[str, Any]]]): List of processed sample data.
        results_path (str): Path to save the summary results JSON file.
        samples_path (str): Path to save the detailed samples JSON file.
    """
    # --- Gather Configuration ---
    config_to_save = {}
    for k, v in vars(args).items():
        # Ensure complex objects are represented as strings
        if isinstance(v, (str, int, float, bool, list, dict, type(None))):
            config_to_save[k] = v
        else:
            try:
                # Try standard repr first
                config_to_save[k] = repr(v)
            except Exception:
                # Fallback if repr fails
                config_to_save[k] = (
                    f"<Object of type {type(v).__name__} could not be represented>"
                )

    # --- Gather Environment Info ---
    unitxt_commit = _get_unitxt_commit_hash()
    # Get version using the dedicated function
    unitxt_pkg_version = _get_unitxt_version()

    environment_info = {
        "timestamp_utc": datetime.utcnow().isoformat() + "Z",
        "command_line_invocation": sys.argv,
        "parsed_arguments": config_to_save,  # Include parsed args here as well
        "unitxt_version": unitxt_pkg_version,  # Use version from importlib.metadata
        "unitxt_commit_hash": unitxt_commit if unitxt_commit else "N/A",
        "python_version": platform.python_version(),
        "system": platform.system(),
        "system_version": platform.version(),
        "installed_packages": _get_installed_packages(),
    }

    # --- Prepare Final Results Structure ---
    results_summary = {
        "environment_info": environment_info,
        "results": global_scores,
    }

    # prepend to the results_path name the time in a wat like this: 2025-04-04T11:37:32

    timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

    results_path = prepend_timestamp_to_path(results_path, timestamp)
    samples_path = prepend_timestamp_to_path(samples_path, timestamp)

    # --- Save Summary ---
    logger.info(f"Saving global results summary to: {results_path}")
    try:
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(results_summary, f, indent=4, ensure_ascii=False)
    except OSError as e:
        logger.error(f"Failed to write results summary file {results_path}: {e}")
    except TypeError as e:
        logger.error(
            f"Failed to serialize results summary to JSON: {e}. Check data types."
        )
        # Log the problematic structure if possible (might be large)
        # logger.debug(f"Problematic results_summary structure: {results_summary}")

    # --- Save Samples (if requested) ---
    if args.log_samples:
        logger.info(f"Saving detailed samples to: {samples_path}")
        # Structure samples file with environment info as well for self-containment
        samples_output = {
            "environment_info": environment_info,  # Repeat env info here
            "samples": all_samples_data,
        }
        try:
            with open(samples_path, "w", encoding="utf-8") as f:
                json.dump(samples_output, f, indent=4, ensure_ascii=False)
        except OSError as e:
            logger.error(f"Failed to write samples file {samples_path}: {e}")
        except TypeError as e:
            logger.error(f"Failed to serialize samples to JSON: {e}. Check data types.")


def process_and_save_results(
    args: argparse.Namespace,
    evaluation_results: EvaluationResults,
    results_path: str,
    samples_path: str,
) -> None:
    """Processes, prints, and saves the evaluation results.

    Args:
        args (argparse.Namespace): Parsed command-line arguments.
        evaluation_results (EvaluationResults): The list of evaluated instances.
        results_path (str): Path to save the summary results JSON file.
        samples_path (str): Path to save the detailed samples JSON file.

    Raises:
        Exception: If an error occurs during result processing or saving (re-raised).
    """
    try:
        # global_scores, all_samples_data = _extract_scores_and_samples(evaluated_dataset)

        subsets_scores = evaluation_results.subsets_scores
        instances_results = evaluation_results.instance_scores

        subset_instances = {}
        for instance in instances_results:
            if instance["subset"][0] not in subset_instances:
                subset_instances[instance["subset"][0]] = []
            del instance["postprocessors"]
            subset_instances[instance["subset"][0]].append(instance)

        logger.info(f"\n{subsets_scores.summary}")

        # --- Save Results ---
        # Pass all necessary data to the saving function
        _save_results_to_disk(
            args, subsets_scores, subset_instances, results_path, samples_path
        )

    except Exception:
        logger.exception(
            "An error occurred during result processing or saving"
        )  # Use logger.exception
        raise  # Re-raise after logging


def main():
    """Main function to parse arguments and run evaluation."""
    parser = setup_parser()
    args = parser.parse_args()

    # Setup logging ASAP
    setup_logging(args.verbosity)

    logger.info("Starting Unitxt Evaluation CLI")
    # Log raw and parsed args at DEBUG level
    logger.debug(f"Raw command line arguments: {sys.argv}")
    logger.debug(f"Parsed arguments: {vars(args)}")  # Log the vars(args) dict
    logger.debug(
        f"Parsed model_args type: {type(args.model_args)}, value: {args.model_args}"
    )

    try:
        results_path, samples_path = prepare_output_paths(
            args.output_path, args.output_file_prefix
        )

        # Apply unitxt settings within a context manager
        with configure_unitxt_settings(args):
            test_dataset = cli_load_dataset(args)
            model_args_dict = prepare_kwargs(args.model_args)
            gen_kwargs_dict = prepare_kwargs(args.gen_kwargs)
            chat_kwargs_dict = prepare_kwargs(args.chat_template_kwargs)

            model_args_dict.update(gen_kwargs_dict)
            inference_model = initialize_inference_engine(
                args, model_args_dict, chat_kwargs_dict
            )
            predictions = run_inference(inference_model, test_dataset)
            evaluation_results = run_evaluation(predictions, test_dataset)
            process_and_save_results(
                args, evaluation_results, results_path, samples_path
            )

    # --- More Specific Error Handling ---
    except (UnitxtArtifactNotFoundError, FileNotFoundError) as e:
        logger.exception(f"Error loading artifact or file: {e}")
        sys.exit(1)
    except (AttributeError, ValueError) as e:
        # Catch issues like missing keys in args, parsing errors, etc.
        logger.exception(f"Configuration or value error: {e}")
        sys.exit(1)
    except ImportError as e:
        # Catch missing optional dependencies
        logger.exception(f"Missing dependency: {e}")
        sys.exit(1)
    except RuntimeError as e:
        # Catch errors explicitly raised during execution (e.g., evaluation failure)
        logger.exception(f"Runtime error during processing: {e}")
        sys.exit(1)
    except Exception as e:
        # Catch any other unexpected errors
        logger.exception(f"An unexpected error occurred: {e}")
        sys.exit(1)

    logger.info("Unitxt Evaluation CLI finished successfully.")


if __name__ == "__main__":
    main()
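A minimal sketch of driving the new CLI programmatically, assuming the module is importable as unitxt.evaluate_cli (the entry point defined above); the card, template, and model references are illustrative placeholders based on the help strings, not verified artifact names.

# Hypothetical smoke test for the CLI added in this commit.
# Assumes `unitxt.evaluate_cli` is importable; task and model values are placeholders.
import sys

from unitxt.evaluate_cli import main

sys.argv = [
    "evaluate_cli",
    "--tasks", "card=cards.mmlu,template=templates.mmlu.all",  # illustrative task string
    "--model", "cross_provider",
    "--model_args", "model_name=llama-3-3-70b-instruct,max_tokens=256",
    "--limit", "8",              # cap the number of instances per task
    "--split", "test",
    "--log_samples",             # also write per-instance predictions and scores
    "--output_path", "./results",
]

main()

With these arguments the script builds one DatasetRecipe per plus-separated task string, runs CrossProviderInferenceEngine over the test split, and writes a timestamped results summary (plus a samples file, because --log_samples is set) under ./results.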
inference.py
CHANGED
@@ -61,6 +61,7 @@ def batched(lst, n):
     while batch := list(islice(it, n)):
         yield batch
 
+
 class StandardAPIParamsMixin(Artifact):
     model: str
     frequency_penalty: Optional[float] = None
@@ -157,6 +158,7 @@ class ListWithMetadata(List[T]):
 
 class InferenceEngine(Artifact):
     """Abstract base class for inference."""
+
     cache_batch_size: int = 100
     use_cache: bool = True
 
@@ -206,9 +208,9 @@ class InferenceEngine(Artifact):
         instance_str = json.dumps(record, sort_keys=True)
         return hashlib.md5(instance_str.encode()).hexdigest()
 
-    def verify_infer_inputs(
+    def verify_infer_inputs(
+        self, dataset: Union[List[Dict[str, Any]], Dataset], return_meta_data: bool
+    ):
         if not isoftype(dataset, Union[List[Dict[str, Any]], Dataset]):
             raise Exception(
                 "Dataset passed to infer() is not list of dictionaries or Huggingface Dataset"
@@ -238,33 +240,49 @@ class InferenceEngine(Artifact):
         if self.use_cache:
             number_of_batches = len(dataset) // self.cache_batch_size + 1
             result = []
-            for batch_index, batch in enumerate(
+            for batch_index, batch in enumerate(
+                batched(dataset, self.cache_batch_size)
                 cached_results = []
                 missing_examples = []
                 for i, item in enumerate(batch):
                     cache_key = self._get_cache_key(item)
                     cached_value = self._cache.get(cache_key)
                     if cached_value is not None:
-                        cached_results.append(
                     else:
-                        missing_examples.append(
                 # infare on missing examples only, without indices
 
-                logger.info(
                 # recombined to index and value
-                inferred_results = list(
                 # Add missing examples to cache
-                for (_, item), (_, prediction) in zip(
                     if prediction is None:
                         continue
                     cache_key = self._get_cache_key(item)
                     self._cache[cache_key] = prediction
                 else:
-                    inferred_results=[]
                 # Combine cached and inferred results in original order
-                batch_predictions = [
                 result.extend(batch_predictions)
         else:
             result = self._infer(dataset, return_meta_data)
@@ -414,6 +432,8 @@ class HFInferenceEngineBase(
     low_cpu_mem_usage: bool = True
     torch_dtype: str = "torch.float16"
 
     model: Any = InternalField(default=None, name="Inference object")
     processor: Any = InternalField(default=None, name="Input processor (tokenizer)")
 
@@ -618,16 +638,52 @@ class HFInferenceEngineBase(
 class HFAutoModelInferenceEngine(HFInferenceEngineBase):
     label: str = "hf_auto_model"
 
     def _init_processor(self):
         from transformers import AutoTokenizer
 
         self.processor = AutoTokenizer.from_pretrained(
             pretrained_model_name_or_path=self.model_name,
             use_fast=self.use_fast_tokenizer,
-            padding=True,
-            truncation=True,
         )
 
     def _init_model(self):
         from transformers import (
             AutoConfig,
@@ -641,11 +697,12 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
             else AutoModelForCausalLM
         )
 
         self.model = model_class.from_pretrained(
             pretrained_model_name_or_path=self.model_name,
             trust_remote_code=True,
-            torch_dtype=self._get_torch_dtype(),
         )
         if self.device_map is None:
             self.model.to(self.device)
@@ -653,13 +710,21 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
     def prepare_inputs(self, data: Iterable) -> Mapping:
         if isinstance(data[0], list):
             data = self.processor.apply_chat_template(
-                data,
             )
         return self.processor(
             data,
-            padding=True,
-            truncation=True,
             return_tensors="pt",
         ).to(self.device or self.device_map)
 
     def _infer_fn(
@@ -668,40 +733,81 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
         return_meta_data: bool,
         return_logprobs: bool,
     ) -> Union[List[str], List[Dict], List[TextGenerationInferenceOutput]]:
-            [instance["source"] for instance in dataset]
-        )
-        input_length = (
-            1
-            if self.model.config.is_encoder_decoder
-            else tokenized_inputs.input_ids.shape[1]
-        )
 
-            self.
         )
 
     def _infer(
         self,
@@ -885,10 +991,10 @@ class HFPeftInferenceEngine(HFAutoModelInferenceEngine):
 
         model_class = (
             AutoPeftModelForSeq2SeqLM
-            if AutoConfig.from_pretrained(self.
             else AutoPeftModelForCausalLM
         )
-        path = self.
         if settings.hf_offline_models_path is not None:
             path = os.path.join(settings.hf_offline_models_path, path)
 
@@ -899,6 +1005,7 @@ class HFPeftInferenceEngine(HFAutoModelInferenceEngine):
             low_cpu_mem_usage=self.low_cpu_mem_usage,
             torch_dtype=self._get_torch_dtype(),
         )
         if self.device_map is None:
             self.model.to(self.device)
 
@@ -949,19 +1056,27 @@ class HFPipelineBasedInferenceEngine(
         except Exception:
             try:
                 from peft import PeftConfig
                 # If full model loading fails, try loading as a PEFT adapter
                 peft_config = PeftConfig.from_pretrained(path)
 
                 if not peft_config.base_model_name_or_path:
-                    raise ValueError(
                 # Load the base model's config
-                config = AutoConfig.from_pretrained(
             except Exception as err2:
-                raise ValueError(
 
-        self.task =
 
     def _get_model_args(self) -> Dict[str, Any]:
         import torch
@@ -1306,9 +1421,9 @@ class OptionSelectingByLogProbsInferenceEngine:
                 for option in instance["task_data"]["options"]
             ]
 
-        dataset_with_options_logprobs: List[
 
         dataset_iterator = iter(dataset_with_options_logprobs)
 
@@ -1381,7 +1496,7 @@ class IbmGenAiInferenceEngine(
     def _get_credentials():
         from genai import Credentials
 
-        api_key_env_var_name = "GENAI_KEY"
         api_key = os.environ.get(api_key_env_var_name)
 
         assert api_key is not None, (
@@ -1467,9 +1582,9 @@ class IbmGenAiInferenceEngine(
         predict_results = []
         for prediction in predictions:
             result: TextGenerationResult = prediction.results[0]
-            assert isinstance(
-                result.generated_tokens
-            )
 
             predict_result = []
             for base_token in result.generated_tokens:
@@ -1714,6 +1829,7 @@ class OpenAiInferenceEngine(
     @run_with_imap
     def _get_chat_completion(self, instance, return_meta_data):
         import openai
         messages = self.to_messages(instance)
         try:
             response = self.client.chat.completions.create(
@@ -1725,13 +1841,17 @@ class OpenAiInferenceEngine(
             return self.get_return_object(prediction, response, return_meta_data)
         # catch in case of content_filtering failure
         except openai.BadRequestError as e:
-            logging.error(
 
     @run_with_imap
     def _get_logprobs(self, instance, return_meta_data):
         import openai
         messages = self.to_messages(instance)
         try:
             response = self.client.chat.completions.create(
@@ -1752,13 +1872,13 @@ class OpenAiInferenceEngine(
             return self.get_return_object(pred_output, response, return_meta_data)
         # catch in case of content_filtering failure
         except openai.BadRequestError as e:
-            logging.error(
 
     def get_return_object(self, predict_result, response, return_meta_data):
         if return_meta_data:
@@ -1792,9 +1912,9 @@ class AzureOpenAIInferenceEngine(OpenAiInferenceEngine):
         api_version = self.credentials.get(
             "api_version", os.environ.get("OPENAI_API_VERSION", None)
         )
-        assert (
-        )
         api_url = f"{azure_openapi_host}/openai/deployments/{self.model_name}/chat/completions?api-version={api_version}"
 
         return {"api_key": api_key, "api_url": api_url, "api_version": api_version}
@@ -1821,9 +1941,7 @@ class RITSInferenceEngine(
     label: str = "rits"
     data_classification_policy = ["public", "proprietary"]
 
-    model_names_dict = {
-        "microsoft/phi-4": "microsoft-phi-4"
-    }
 
     def get_default_headers(self):
         return {"RITS_API_KEY": self.credentials["api_key"]}
@@ -1891,7 +2009,7 @@ class TogetherAiInferenceEngine(
         from together import Together
         from together.types.models import ModelType
 
-        api_key_env_var_name = "TOGETHER_API_KEY"
         api_key = os.environ.get(api_key_env_var_name)
         assert api_key is not None, (
             f"Error while trying to run TogetherAiInferenceEngine."
@@ -1906,9 +2024,9 @@ class TogetherAiInferenceEngine(
             together_model.id: together_model.type for together_model in together_models
         }
         model_type = together_model_id_to_type.get(self.model_name)
-        assert (
-        )
         assert model_type in [ModelType.CHAT, ModelType.LANGUAGE, ModelType.CODE], (
             f"Together AI model type {model_type} is not supported; "
             "supported types are 'chat', 'language' and 'code'."
@@ -2087,11 +2205,11 @@ class WMLInferenceEngineBase(
     def verify(self):
         super().verify()
 
-        assert (
-            self.model_name
-        )
 
        # def process_data_before_dump(self, data):
        #     if "credentials" in data:
@@ -2110,11 +2228,11 @@ class WMLInferenceEngineBase(
         self._verify_wml_credentials(self.credentials)
         return APIClient(
             credentials=Credentials(
-                api_key=self.credentials["api_key"],
-                url=self.credentials["url"]
             ),
             project_id=self.credentials.get("project_id", None),
-            space_id=self.credentials.get("space_id", None)
 
     @staticmethod
     def _read_wml_credentials_from_env() -> CredentialsWML:
@@ -2182,9 +2300,9 @@ class WMLInferenceEngineBase(
             "['url', 'api_key', 'username', 'password']."
         )
 
-        assert credentials.get(
-            "url"
-        )
         assert "space_id" in credentials or "project_id" in credentials, (
             "Either 'space_id' or 'project_id' must be provided "
             "as keys for WML credentials dict."
@@ -2585,7 +2703,9 @@ class WMLInferenceEngineChat(WMLInferenceEngineBase, WMLChatParamsMixin):
         return True
 
     def to_messages(self, instance: Union[Dict, List]) -> List[List[Dict[str, Any]]]:
-        if isinstance(instance["source"], str) and self.check_instance_contains_image(
             return self._create_messages_from_instance(instance)
 
         messages = super().to_messages(instance)
@@ -2909,7 +3029,7 @@ class VLLMParamsMixin(Artifact):
 
 
 class VLLMInferenceEngine(InferenceEngine, PackageRequirementsMixin, VLLMParamsMixin):
-    label="vllm"
 
     def get_engine_id(self):
         return get_model_and_label_id(self.model, self.label)
@@ -3011,7 +3131,6 @@ class LiteLLMInferenceEngine(
         self.inference_type = "litellm"
         from litellm import acompletion
 
-
         self._completion = acompletion
         # Initialize a semaphore to limit concurrency
         self._semaphore = asyncio.Semaphore(round(self.max_requests_per_second))
@@ -3032,7 +3151,6 @@ class LiteLLMInferenceEngine(
             response = await self._completion(
                 messages=messages,
                 max_retries=self.max_retries,
-                caching=True,
                 drop_params=False,
                 **self.credentials,
                 **kwargs,
@@ -3123,10 +3241,10 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
 
     label: str = "cross_provider"
     provider: Optional[_supported_apis] = None
-    provider_specific_args: Optional[Dict[str, Dict[str,str]]] = None
 
     provider_model_map: Dict[_supported_apis, Dict[str, str]] = {
-        "watsonx-sdk": {
             "granite-20b-code-instruct": "ibm/granite-20b-code-instruct",
             "granite-3-2-8b-instruct": "ibm/granite-3-2-8b-instruct",
             "granite-3-2b-instruct": "ibm/granite-3-2b-instruct",
@@ -3153,7 +3271,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
             "llama-3-1-70b-instruct": "together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
             "llama-3-1-405b-instruct": "together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
             "llama-3-2-1b-instruct": "together_ai/togethercomputer/llama-3-2-1b-instruct",
-            "llama-3-3-70b-instruct": "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo"
         },
         "aws": {
             "llama-3-8b-instruct": "bedrock/meta.llama3-8b-instruct-v1:0",
@@ -3167,7 +3285,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
             "llama-3-1-405b-instruct": "llama3.1:405b",
             "llama-3-2-1b-instruct": "llama3.2:1b",
             "llama-3-2-3b-instruct": "llama3.2:3b",
-            "llama-3-3-70b-instruct": "llama3.3"
         },
         "bam": {
             "granite-3-8b-instruct": "ibm/granite-8b-instruct-preview-4k",
@@ -3242,12 +3360,14 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
             "llama-3-1-405b-instruct": "vertex_ai/meta/llama-3.1-405b-instruct-maas",
         },
         "replicate": {
-            "granite-
-            "granite-3-2b
-            "granite-3-8b-instruct": "replicate/ibm-granite/granite-3.0-8b-instruct",
-            "granite-3-1-2b-instruct": "replicate/ibm-granite/granite-3.1-2b-instruct",
             "granite-3-1-8b-instruct": "replicate/ibm-granite/granite-3.1-8b-instruct",
             "granite-8b-code-instruct-128k": "replicate/ibm-granite/granite-8b-code-instruct-128k",
             "llama-2-13b": "replicate/meta/llama-2-13b",
             "llama-2-13b-chat": "replicate/meta/llama-2-13b-chat",
             "llama-2-70b": "replicate/meta/llama-2-70b",
@@ -3264,7 +3384,9 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
             "mixtral-8x7b-instruct-v0.1": "replicate/mistralai/mixtral-8x7b-instruct-v0.1",
         },
     }
-    provider_model_map["watsonx"] = {
 
     _provider_to_base_class = {
         "watsonx": LiteLLMInferenceEngine,
@@ -3307,7 +3429,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
         args["model"] = self.provider_model_map[provider].get(self.model, self.model)
 
         if self.provider_specific_args is not None:
-            provider_args =
             if provider_args is not None:
                 args.update(provider_args)
@@ -3342,6 +3464,7 @@ class HFOptionSelectingInferenceEngine(InferenceEngine, TorchDeviceMixin):
 
     This class uses models from the HuggingFace Transformers library to calculate log probabilities for text inputs.
     """
     label = "hf_option_selection"
     model_name: str
     batch_size: int
@@ -3368,10 +3491,8 @@ class HFOptionSelectingInferenceEngine(InferenceEngine, TorchDeviceMixin):
             path,
         )
         self.model = AutoModelForCausalLM.from_pretrained(
-            self.device
-        )
         # Set pad_token if it doesn't exist
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token
|
| 245 |
+
):
|
| 246 |
cached_results = []
|
| 247 |
missing_examples = []
|
| 248 |
for i, item in enumerate(batch):
|
| 249 |
cache_key = self._get_cache_key(item)
|
| 250 |
cached_value = self._cache.get(cache_key)
|
| 251 |
if cached_value is not None:
|
| 252 |
+
cached_results.append(
|
| 253 |
+
(i, cached_value)
|
| 254 |
+
) # each element is index in batch, and value
|
| 255 |
else:
|
| 256 |
+
missing_examples.append(
|
| 257 |
+
(i, item)
|
| 258 |
+
) # each element is index in batch and example
|
| 259 |
# infare on missing examples only, without indices
|
| 260 |
|
| 261 |
+
logger.info(
|
| 262 |
+
f"Inferring batch {batch_index + 1} / {number_of_batches} with {len(missing_examples)} instances (found {len(cached_results)} instances in {self._cache.directory})"
|
| 263 |
+
)
|
| 264 |
+
if len(missing_examples) > 0:
|
| 265 |
+
inferred_results = self._infer(
|
| 266 |
+
[e[1] for e in missing_examples], return_meta_data
|
| 267 |
+
)
|
| 268 |
# recombined to index and value
|
| 269 |
+
inferred_results = list(
|
| 270 |
+
zip([e[0] for e in missing_examples], inferred_results)
|
| 271 |
+
)
|
| 272 |
# Add missing examples to cache
|
| 273 |
+
for (_, item), (_, prediction) in zip(
|
| 274 |
+
missing_examples, inferred_results
|
| 275 |
+
):
|
| 276 |
if prediction is None:
|
| 277 |
continue
|
| 278 |
cache_key = self._get_cache_key(item)
|
| 279 |
self._cache[cache_key] = prediction
|
| 280 |
else:
|
| 281 |
+
inferred_results = []
|
| 282 |
# Combine cached and inferred results in original order
|
| 283 |
+
batch_predictions = [
|
| 284 |
+
p[1] for p in sorted(cached_results + inferred_results)
|
| 285 |
+
]
|
| 286 |
result.extend(batch_predictions)
|
| 287 |
else:
|
| 288 |
result = self._infer(dataset, return_meta_data)
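The cached branch above tags every batch element with its in-batch index, serves hits from the cache, runs `_infer` only on the misses, and then merges the two index-tagged lists back into the original order by sorting. A minimal standalone sketch of that merge, with a plain dict standing in for the on-disk cache and hashable items standing in for the md5 cache keys:

```python
# Standalone sketch of the index-tagged cache merge; not the engine class itself.
from itertools import islice


def batched(iterable, n):
    it = iter(iterable)
    while batch := list(islice(it, n)):
        yield batch


cache = {}  # stands in for the diskcache store


def infer_with_cache(items, infer_fn, batch_size=2):
    results = []
    for batch in batched(items, batch_size):
        cached = [(i, cache[x]) for i, x in enumerate(batch) if x in cache]
        missing = [(i, x) for i, x in enumerate(batch) if x not in cache]
        if missing:
            predictions = infer_fn([x for _, x in missing])
            inferred = list(zip([i for i, _ in missing], predictions))
            for (_, item), (_, pred) in zip(missing, inferred):
                cache[item] = pred  # fill the cache for next time
        else:
            inferred = []
        # sorting by the in-batch index restores the original order
        results.extend(pred for _, pred in sorted(cached + inferred))
    return results


print(infer_with_cache(["a", "b", "a"], lambda xs: [x.upper() for x in xs]))
# ['A', 'B', 'A']  (the second "a" is served from the cache)
```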
|
|
|
|
| 432 |
low_cpu_mem_usage: bool = True
|
| 433 |
torch_dtype: str = "torch.float16"
|
| 434 |
|
| 435 |
+
batch_size: int = 1
|
| 436 |
+
|
| 437 |
model: Any = InternalField(default=None, name="Inference object")
|
| 438 |
processor: Any = InternalField(default=None, name="Input processor (tokenizer)")
|
| 439 |
|
|
|
|
| 638 |
class HFAutoModelInferenceEngine(HFInferenceEngineBase):
|
| 639 |
label: str = "hf_auto_model"
|
| 640 |
|
| 641 |
+
use_fp16: bool = True
|
| 642 |
+
load_in_8bit: bool = False
|
| 643 |
+
|
| 644 |
+
device_map: Any = None
|
| 645 |
+
|
| 646 |
+
padding: bool = True
|
| 647 |
+
truncation: bool = True
|
| 648 |
+
padding_side: str = "left" # for decoder only models
|
| 649 |
+
|
| 650 |
+
chat_kwargs_dict: dict = {}
|
| 651 |
+
|
| 652 |
def _init_processor(self):
|
| 653 |
from transformers import AutoTokenizer
|
| 654 |
|
| 655 |
self.processor = AutoTokenizer.from_pretrained(
|
| 656 |
pretrained_model_name_or_path=self.model_name,
|
| 657 |
use_fast=self.use_fast_tokenizer,
|
|
|
|
|
|
|
| 658 |
)
|
| 659 |
|
| 660 |
+
def _get_model_args(self) -> Dict[str, Any]:
|
| 661 |
+
import torch
|
| 662 |
+
from transformers import BitsAndBytesConfig
|
| 663 |
+
|
| 664 |
+
args = {}
|
| 665 |
+
|
| 666 |
+
if self.load_in_8bit:
|
| 667 |
+
quantization_config = BitsAndBytesConfig(load_in_8bit=self.load_in_8bit)
|
| 668 |
+
args["quantization_config"] = quantization_config
|
| 669 |
+
elif self.use_fp16:
|
| 670 |
+
if self.device == torch.device("mps"):
|
| 671 |
+
args["torch_dtype"] = torch.float16
|
| 672 |
+
else:
|
| 673 |
+
args["torch_dtype"] = torch.bfloat16
|
| 674 |
+
|
| 675 |
+
# We do this, because in some cases, using device:auto will offload some weights to the cpu
|
| 676 |
+
# (even though the model might *just* fit to a single gpu), even if there is a gpu available, and this will
|
| 677 |
+
# cause an error because the data is always on the gpu
|
| 678 |
+
# if torch.cuda.device_count() > 1:
|
| 679 |
+
# assert self.device == torch.device(0)
|
| 680 |
+
args["device_map"] = "auto"
|
| 681 |
+
# else:
|
| 682 |
+
# if not self.load_in_8bit:
|
| 683 |
+
# args["device"] = self.device
|
| 684 |
+
|
| 685 |
+
return args
|
| 686 |
+
|
| 687 |
def _init_model(self):
|
| 688 |
from transformers import (
|
| 689 |
AutoConfig,
|
|
|
|
| 697 |
else AutoModelForCausalLM
|
| 698 |
)
|
| 699 |
|
| 700 |
+
model_args = self._get_model_args()
|
| 701 |
+
|
| 702 |
self.model = model_class.from_pretrained(
|
| 703 |
pretrained_model_name_or_path=self.model_name,
|
| 704 |
trust_remote_code=True,
|
| 705 |
+
**model_args,
|
|
|
|
| 706 |
)
|
| 707 |
if self.device_map is None:
|
| 708 |
self.model.to(self.device)
|
|
|
|
| 710 |
def prepare_inputs(self, data: Iterable) -> Mapping:
|
| 711 |
if isinstance(data[0], list):
|
| 712 |
data = self.processor.apply_chat_template(
|
| 713 |
+
data,
|
| 714 |
+
tokenize=False,
|
| 715 |
+
add_generation_prompt=True,
|
| 716 |
+
**self.chat_kwargs_dict,
|
| 717 |
)
|
| 718 |
+
|
| 719 |
+
if self.processor.pad_token is None:
|
| 720 |
+
self.processor.pad_token_id = self.model.config.eos_token_id[0]
|
| 721 |
+
|
| 722 |
return self.processor(
|
| 723 |
data,
|
|
|
|
|
|
|
| 724 |
return_tensors="pt",
|
| 725 |
+
padding=self.padding,
|
| 726 |
+
truncation=self.truncation,
|
| 727 |
+
padding_side=self.padding_side,
|
| 728 |
).to(self.device or self.device_map)
|
| 729 |
|
| 730 |
def _infer_fn(
|
|
|
|
| 733 |
return_meta_data: bool,
|
| 734 |
return_logprobs: bool,
|
| 735 |
) -> Union[List[str], List[Dict], List[TextGenerationInferenceOutput]]:
|
| 736 |
+
"""Performs inference on the dataset in batches.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 737 |
|
| 738 |
+
Args:
|
| 739 |
+
dataset: A list of dictionaries or a Dataset object containing the input data.
|
| 740 |
+
Each item should have a "source" key.
|
| 741 |
+
return_meta_data: Whether to include metadata in the output.
|
| 742 |
+
return_logprobs: Whether to return log probabilities along with the output.
|
| 743 |
|
| 744 |
+
Returns:
|
| 745 |
+
A list of outputs, which can be strings, dictionaries (if metadata is returned),
|
| 746 |
+
or TextGenerationInferenceOutput objects (if logprobs are returned).
|
| 747 |
+
"""
|
| 748 |
+
all_final_outputs = [] # List to store results from all batches
|
| 749 |
|
| 750 |
+
for i in tqdm(
|
| 751 |
+
range(0, len(dataset), self.batch_size),
|
| 752 |
+
desc=f"Running inference in batches of {self.batch_size}",
|
| 753 |
+
):
|
| 754 |
+
# Get the current batch
|
| 755 |
+
batch_data = dataset[i : i + self.batch_size]
|
| 756 |
+
batch_sources = [instance["source"] for instance in batch_data]
|
| 757 |
+
|
| 758 |
+
# --- Process the current batch ---
|
| 759 |
+
# 1. Tokenize inputs for the batch
|
| 760 |
+
tokenized_inputs = self.prepare_inputs(batch_sources)
|
| 761 |
+
|
| 762 |
+
# 2. Determine input length (handle encoder-decoder models)
|
| 763 |
+
input_length = (
|
| 764 |
+
1
|
| 765 |
+
if self.model.config.is_encoder_decoder
|
| 766 |
+
else tokenized_inputs.input_ids.shape[1]
|
| 767 |
+
)
|
| 768 |
|
| 769 |
+
# 3. Make predictions for the batch
|
| 770 |
+
predictions = self.make_predictions(tokenized_inputs)
|
| 771 |
+
sequences = predictions.sequences # Sequences for the current batch
|
| 772 |
+
|
| 773 |
+
# 4. Decode tokens for the batch
|
| 774 |
+
string_tokens_batch = [
|
| 775 |
+
self.decode_tokens(sequence, input_length) for sequence in sequences
|
| 776 |
+
]
|
| 777 |
+
|
| 778 |
+
# 5. Calculate logprobs or create strings for the batch
|
| 779 |
+
final_outputs_batch = (
|
| 780 |
+
self.get_logprobs(predictions, string_tokens_batch)
|
| 781 |
+
if return_logprobs
|
| 782 |
+
else [
|
| 783 |
+
self.create_string_from_tokens(strings)
|
| 784 |
+
for strings in string_tokens_batch
|
| 785 |
+
]
|
| 786 |
)
|
| 787 |
+
|
| 788 |
+
# 6. Create return objects for the batch
|
| 789 |
+
batch_results = [
|
| 790 |
+
self.get_return_object(
|
| 791 |
+
output=final_outputs_batch[
|
| 792 |
+
j
|
| 793 |
+
], # Output for the j-th item in the batch
|
| 794 |
+
output_tokens=len(string_tokens_batch[j]),
|
| 795 |
+
inp=batch_data[j]["source"], # Original input for the j-th item
|
| 796 |
+
inp_tokens=len(tokenized_inputs.encodings[j].tokens)
|
| 797 |
+
if tokenized_inputs.encodings is not None
|
| 798 |
+
else None,
|
| 799 |
+
return_meta_data=return_meta_data,
|
| 800 |
+
)
|
| 801 |
+
for j in range(
|
| 802 |
+
len(sequences)
|
| 803 |
+
) # Iterate through items in the current batch
|
| 804 |
+
]
|
| 805 |
+
|
| 806 |
+
# Add results from this batch to the overall list
|
| 807 |
+
all_final_outputs.extend(batch_results)
|
| 808 |
+
# --- End of batch processing ---
|
| 809 |
+
|
| 810 |
+
return all_final_outputs
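In `_infer_fn` above, `input_length` is 1 for encoder-decoder models because `generate()` returns only the target sequence there, while decoder-only models echo the whole prompt before the continuation. A toy illustration (made-up token ids) of why slicing off the first `input_length` tokens recovers just the generated part:

```python
# Toy tensors only; in the engine these come from the tokenizer and model.generate().
import torch

input_ids = torch.tensor([[101, 2009, 2003]])             # 3 prompt tokens
sequences = torch.tensor([[101, 2009, 2003, 2204, 102]])  # prompt + 2 generated tokens
input_length = input_ids.shape[1]                          # 3 here; would be 1 for encoder-decoder
generated_only = sequences[0][input_length:]
print(generated_only.tolist())  # [2204, 102]
```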
|
| 811 |
|
| 812 |
def _infer(
|
| 813 |
self,
|
|
|
|
| 991 |
|
| 992 |
model_class = (
|
| 993 |
AutoPeftModelForSeq2SeqLM
|
| 994 |
+
if AutoConfig.from_pretrained(self.peft_config.base_model_name_or_path).is_encoder_decoder
|
| 995 |
else AutoPeftModelForCausalLM
|
| 996 |
)
|
| 997 |
+
path = self.model_name
|
| 998 |
if settings.hf_offline_models_path is not None:
|
| 999 |
path = os.path.join(settings.hf_offline_models_path, path)
|
| 1000 |
|
|
|
|
| 1005 |
low_cpu_mem_usage=self.low_cpu_mem_usage,
|
| 1006 |
torch_dtype=self._get_torch_dtype(),
|
| 1007 |
)
|
| 1008 |
+
self.model = self.model.to(dtype=self._get_torch_dtype()) # Make sure that base model and adapter use same dtype
|
| 1009 |
if self.device_map is None:
|
| 1010 |
self.model.to(self.device)
|
| 1011 |
|
|
|
|
| 1056 |
except Exception:
|
| 1057 |
try:
|
| 1058 |
from peft import PeftConfig
|
| 1059 |
+
|
| 1060 |
# If full model loading fails, try loading as a PEFT adapter
|
| 1061 |
peft_config = PeftConfig.from_pretrained(path)
|
| 1062 |
|
| 1063 |
if not peft_config.base_model_name_or_path:
|
| 1064 |
+
raise ValueError(
|
| 1065 |
+
f"Base model name not found in PEFT config for {path}"
|
| 1066 |
+
)
|
| 1067 |
|
| 1068 |
# Load the base model's config
|
| 1069 |
+
config = AutoConfig.from_pretrained(
|
| 1070 |
+
peft_config.base_model_name_or_path, trust_remote_code=True
|
| 1071 |
+
)
|
| 1072 |
except Exception as err2:
|
| 1073 |
+
raise ValueError(
|
| 1074 |
+
f"Could not determine model type for: {path}"
|
| 1075 |
+
) from err2
|
| 1076 |
|
| 1077 |
+
self.task = (
|
| 1078 |
+
"text2text-generation" if config.is_encoder_decoder else "text-generation"
|
| 1079 |
+
)
|
| 1080 |
|
| 1081 |
def _get_model_args(self) -> Dict[str, Any]:
|
| 1082 |
import torch
|
|
|
|
| 1421 |
for option in instance["task_data"]["options"]
|
| 1422 |
]
|
| 1423 |
|
| 1424 |
+
dataset_with_options_logprobs: List[List[Dict[str, Union[float, str]]]] = (
|
| 1425 |
+
self.get_options_log_probs(dataset_with_options)
|
| 1426 |
+
)
|
| 1427 |
|
| 1428 |
dataset_iterator = iter(dataset_with_options_logprobs)
|
| 1429 |
|
|
|
|
| 1496 |
def _get_credentials():
|
| 1497 |
from genai import Credentials
|
| 1498 |
|
| 1499 |
+
api_key_env_var_name = "GENAI_KEY" # pragma: allowlist secret
|
| 1500 |
api_key = os.environ.get(api_key_env_var_name)
|
| 1501 |
|
| 1502 |
assert api_key is not None, (
|
|
|
|
| 1582 |
predict_results = []
|
| 1583 |
for prediction in predictions:
|
| 1584 |
result: TextGenerationResult = prediction.results[0]
|
| 1585 |
+
assert isinstance(result.generated_tokens, list), (
|
| 1586 |
+
"result.generated_tokens should be a list"
|
| 1587 |
+
)
|
| 1588 |
|
| 1589 |
predict_result = []
|
| 1590 |
for base_token in result.generated_tokens:
|
|
|
|
| 1829 |
@run_with_imap
|
| 1830 |
def _get_chat_completion(self, instance, return_meta_data):
|
| 1831 |
import openai
|
| 1832 |
+
|
| 1833 |
messages = self.to_messages(instance)
|
| 1834 |
try:
|
| 1835 |
response = self.client.chat.completions.create(
|
|
|
|
| 1841 |
return self.get_return_object(prediction, response, return_meta_data)
|
| 1842 |
# catch in case of content_filtering failure
|
| 1843 |
except openai.BadRequestError as e:
|
| 1844 |
+
logging.error(
|
| 1845 |
+
f"Error predicting instance {messages}:{e}. Returning empty prediction"
|
| 1846 |
+
)
|
| 1847 |
+
return TextGenerationInferenceOutput(
|
| 1848 |
+
prediction="-", input_tokens=0, output_tokens=0
|
| 1849 |
+
)
|
| 1850 |
|
| 1851 |
@run_with_imap
|
| 1852 |
def _get_logprobs(self, instance, return_meta_data):
|
| 1853 |
import openai
|
| 1854 |
+
|
| 1855 |
messages = self.to_messages(instance)
|
| 1856 |
try:
|
| 1857 |
response = self.client.chat.completions.create(
|
|
|
|
| 1872 |
return self.get_return_object(pred_output, response, return_meta_data)
|
| 1873 |
# catch in case of content_filtering failure
|
| 1874 |
except openai.BadRequestError as e:
|
| 1875 |
+
logging.error(
|
| 1876 |
+
f"Error predicting instance {messages}:{e}. Returning empty prediction"
|
| 1877 |
+
)
|
| 1878 |
+
prediction = [{"top_tokens": [{"text": "-", "logprob": 0}]}]
|
| 1879 |
+
return TextGenerationInferenceOutput(
|
| 1880 |
+
prediction=prediction, input_tokens=0, output_tokens=0
|
| 1881 |
+
)
|
| 1882 |
|
| 1883 |
def get_return_object(self, predict_result, response, return_meta_data):
|
| 1884 |
if return_meta_data:
|
|
|
|
| 1912 |
api_version = self.credentials.get(
|
| 1913 |
"api_version", os.environ.get("OPENAI_API_VERSION", None)
|
| 1914 |
)
|
| 1915 |
+
assert api_version and azure_openapi_host, (
|
| 1916 |
+
"Error while trying to run AzureOpenAIInferenceEngine: Missing environment variable param AZURE_OPENAI_HOST or OPENAI_API_VERSION"
|
| 1917 |
+
)
|
| 1918 |
api_url = f"{azure_openapi_host}/openai/deployments/{self.model_name}/chat/completions?api-version={api_version}"
|
| 1919 |
|
| 1920 |
return {"api_key": api_key, "api_url": api_url, "api_version": api_version}
|
|
|
|
| 1941 |
label: str = "rits"
|
| 1942 |
data_classification_policy = ["public", "proprietary"]
|
| 1943 |
|
| 1944 |
+
model_names_dict = {"microsoft/phi-4": "microsoft-phi-4"}
|
|
|
|
|
|
|
| 1945 |
|
| 1946 |
def get_default_headers(self):
|
| 1947 |
return {"RITS_API_KEY": self.credentials["api_key"]}
|
|
|
|
| 2009 |
from together import Together
|
| 2010 |
from together.types.models import ModelType
|
| 2011 |
|
| 2012 |
+
api_key_env_var_name = "TOGETHER_API_KEY" # pragma: allowlist secret
|
| 2013 |
api_key = os.environ.get(api_key_env_var_name)
|
| 2014 |
assert api_key is not None, (
|
| 2015 |
f"Error while trying to run TogetherAiInferenceEngine."
|
|
|
|
| 2024 |
together_model.id: together_model.type for together_model in together_models
|
| 2025 |
}
|
| 2026 |
model_type = together_model_id_to_type.get(self.model_name)
|
| 2027 |
+
assert model_type is not None, (
|
| 2028 |
+
f"Could not find model {self.model_name} in Together AI model list"
|
| 2029 |
+
)
|
| 2030 |
assert model_type in [ModelType.CHAT, ModelType.LANGUAGE, ModelType.CODE], (
|
| 2031 |
f"Together AI model type {model_type} is not supported; "
|
| 2032 |
"supported types are 'chat', 'language' and 'code'."
|
|
|
|
| 2205 |
def verify(self):
|
| 2206 |
super().verify()
|
| 2207 |
|
| 2208 |
+
assert self.model_name or (
|
| 2209 |
+
self.deployment_id and not (self.model_name and self.deployment_id)
|
| 2210 |
+
), (
|
| 2211 |
+
"Either 'model_name' or 'deployment_id' must be specified, but not both at the same time."
|
| 2212 |
+
)
|
| 2213 |
|
| 2214 |
# def process_data_before_dump(self, data):
|
| 2215 |
# if "credentials" in data:
|
|
|
|
| 2228 |
self._verify_wml_credentials(self.credentials)
|
| 2229 |
return APIClient(
|
| 2230 |
credentials=Credentials(
|
| 2231 |
+
api_key=self.credentials["api_key"], url=self.credentials["url"]
|
|
|
|
| 2232 |
),
|
| 2233 |
project_id=self.credentials.get("project_id", None),
|
| 2234 |
+
space_id=self.credentials.get("space_id", None),
|
| 2235 |
+
)
|
| 2236 |
|
| 2237 |
@staticmethod
|
| 2238 |
def _read_wml_credentials_from_env() -> CredentialsWML:
|
|
|
|
| 2300 |
"['url', 'api_key', 'username', 'password']."
|
| 2301 |
)
|
| 2302 |
|
| 2303 |
+
assert credentials.get("url"), (
|
| 2304 |
+
"'url' is a mandatory key for WML credentials dict."
|
| 2305 |
+
)
|
| 2306 |
assert "space_id" in credentials or "project_id" in credentials, (
|
| 2307 |
"Either 'space_id' or 'project_id' must be provided "
|
| 2308 |
"as keys for WML credentials dict."
|
|
|
|
| 2703 |
return True
|
| 2704 |
|
| 2705 |
def to_messages(self, instance: Union[Dict, List]) -> List[List[Dict[str, Any]]]:
|
| 2706 |
+
if isinstance(instance["source"], str) and self.check_instance_contains_image(
|
| 2707 |
+
instance
|
| 2708 |
+
):
|
| 2709 |
return self._create_messages_from_instance(instance)
|
| 2710 |
|
| 2711 |
messages = super().to_messages(instance)
|
|
|
|
| 3029 |
|
| 3030 |
|
| 3031 |
class VLLMInferenceEngine(InferenceEngine, PackageRequirementsMixin, VLLMParamsMixin):
|
| 3032 |
+
label = "vllm"
|
| 3033 |
|
| 3034 |
def get_engine_id(self):
|
| 3035 |
return get_model_and_label_id(self.model, self.label)
|
|
|
|
| 3131 |
self.inference_type = "litellm"
|
| 3132 |
from litellm import acompletion
|
| 3133 |
|
|
|
|
| 3134 |
self._completion = acompletion
|
| 3135 |
# Initialize a semaphore to limit concurrency
|
| 3136 |
self._semaphore = asyncio.Semaphore(round(self.max_requests_per_second))
|
|
|
|
| 3151 |
response = await self._completion(
|
| 3152 |
messages=messages,
|
| 3153 |
max_retries=self.max_retries,
|
|
|
|
| 3154 |
drop_params=False,
|
| 3155 |
**self.credentials,
|
| 3156 |
**kwargs,
|
|
|
|
| 3241 |
|
| 3242 |
label: str = "cross_provider"
|
| 3243 |
provider: Optional[_supported_apis] = None
|
| 3244 |
+
provider_specific_args: Optional[Dict[str, Dict[str, str]]] = None
|
| 3245 |
|
| 3246 |
provider_model_map: Dict[_supported_apis, Dict[str, str]] = {
|
| 3247 |
+
"watsonx-sdk": { # checked from ibm_watsonx_ai.APIClient().foundation_models.ChatModels
|
| 3248 |
"granite-20b-code-instruct": "ibm/granite-20b-code-instruct",
|
| 3249 |
"granite-3-2-8b-instruct": "ibm/granite-3-2-8b-instruct",
|
| 3250 |
"granite-3-2b-instruct": "ibm/granite-3-2b-instruct",
|
|
|
|
| 3271 |
"llama-3-1-70b-instruct": "together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
|
| 3272 |
"llama-3-1-405b-instruct": "together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
|
| 3273 |
"llama-3-2-1b-instruct": "together_ai/togethercomputer/llama-3-2-1b-instruct",
|
| 3274 |
+
"llama-3-3-70b-instruct": "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
| 3275 |
},
|
| 3276 |
"aws": {
|
| 3277 |
"llama-3-8b-instruct": "bedrock/meta.llama3-8b-instruct-v1:0",
|
|
|
|
| 3285 |
"llama-3-1-405b-instruct": "llama3.1:405b",
|
| 3286 |
"llama-3-2-1b-instruct": "llama3.2:1b",
|
| 3287 |
"llama-3-2-3b-instruct": "llama3.2:3b",
|
| 3288 |
+
"llama-3-3-70b-instruct": "llama3.3",
|
| 3289 |
},
|
| 3290 |
"bam": {
|
| 3291 |
"granite-3-8b-instruct": "ibm/granite-8b-instruct-preview-4k",
|
|
|
|
| 3360 |
"llama-3-1-405b-instruct": "vertex_ai/meta/llama-3.1-405b-instruct-maas",
|
| 3361 |
},
|
| 3362 |
"replicate": {
|
| 3363 |
+
"granite-3-2-8b-instruct": "replicate/ibm-granite/granite-3.2-8b-instruct",
|
| 3364 |
+
"granite-vision-3-2-2b": "replicate/ibm-granite/granite-vision-3.2-2b",
|
|
|
|
|
|
|
| 3365 |
"granite-3-1-8b-instruct": "replicate/ibm-granite/granite-3.1-8b-instruct",
|
| 3366 |
+
"granite-3-1-2b-instruct": "replicate/ibm-granite/granite-3.1-2b-instruct",
|
| 3367 |
+
"granite-3-8b-instruct": "replicate/ibm-granite/granite-3.0-8b-instruct",
|
| 3368 |
+
"granite-3-2b-instruct": "replicate/ibm-granite/granite-3.0-2b-instruct",
|
| 3369 |
"granite-8b-code-instruct-128k": "replicate/ibm-granite/granite-8b-code-instruct-128k",
|
| 3370 |
+
"granite-20b-code-instruct-8k": "replicate/ibm-granite/granite-20b-code-instruct-8k",
|
| 3371 |
"llama-2-13b": "replicate/meta/llama-2-13b",
|
| 3372 |
"llama-2-13b-chat": "replicate/meta/llama-2-13b-chat",
|
| 3373 |
"llama-2-70b": "replicate/meta/llama-2-70b",
|
|
|
|
| 3384 |
"mixtral-8x7b-instruct-v0.1": "replicate/mistralai/mixtral-8x7b-instruct-v0.1",
|
| 3385 |
},
|
| 3386 |
}
|
| 3387 |
+
provider_model_map["watsonx"] = {
|
| 3388 |
+
k: f"watsonx/{v}" for k, v in provider_model_map["watsonx-sdk"].items()
|
| 3389 |
+
}
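The comprehension above derives the LiteLLM-style "watsonx" map from the "watsonx-sdk" map purely by prefixing each value. A two-entry illustration using ids that appear in the map:

```python
# Toy subset of the "watsonx-sdk" map shown earlier.
watsonx_sdk = {
    "granite-20b-code-instruct": "ibm/granite-20b-code-instruct",
    "granite-3-2b-instruct": "ibm/granite-3-2b-instruct",
}
watsonx = {k: f"watsonx/{v}" for k, v in watsonx_sdk.items()}
print(watsonx["granite-3-2b-instruct"])  # watsonx/ibm/granite-3-2b-instruct
```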
|
| 3390 |
|
| 3391 |
_provider_to_base_class = {
|
| 3392 |
"watsonx": LiteLLMInferenceEngine,
|
|
|
|
| 3429 |
args["model"] = self.provider_model_map[provider].get(self.model, self.model)
|
| 3430 |
|
| 3431 |
if self.provider_specific_args is not None:
|
| 3432 |
+
provider_args = self.provider_specific_args.get(provider)
|
| 3433 |
if provider_args is not None:
|
| 3434 |
args.update(provider_args)
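Because `provider_specific_args` is keyed by provider name, only the entry for the active provider is merged into the request arguments. A hedged usage sketch (field names are taken from the class above, everything else is illustrative):

```python
# Hedged sketch; additional constructor arguments may be required in practice.
engine = CrossProviderInferenceEngine(
    model="llama-3-3-70b-instruct",
    provider="watsonx",
    provider_specific_args={
        "watsonx": {"space_id": "<YOUR_SPACE_ID>"},  # merged only when provider == "watsonx"
    },
)
```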
|
| 3435 |
|
|
|
|
| 3464 |
|
| 3465 |
This class uses models from the HuggingFace Transformers library to calculate log probabilities for text inputs.
|
| 3466 |
"""
|
| 3467 |
+
|
| 3468 |
label = "hf_option_selection"
|
| 3469 |
model_name: str
|
| 3470 |
batch_size: int
|
|
|
|
| 3491 |
path,
|
| 3492 |
)
|
| 3493 |
self.model = AutoModelForCausalLM.from_pretrained(
|
| 3494 |
+
path,
|
| 3495 |
+
).to(self.device)
|
|
|
|
|
|
|
| 3496 |
# Set pad_token if it doesn't exist
|
| 3497 |
if self.tokenizer.pad_token is None:
|
| 3498 |
self.tokenizer.pad_token = self.tokenizer.eos_token
|
llm_as_judge.py
CHANGED
|
@@ -240,7 +240,7 @@ class LLMJudgeDirect(LLMJudge):
 240     main_score = "llm_as_judge"
 241     """The primary score name used in the results. By default, it will take the value of the criteria name (if only one criteria is being used for evaluation) or "llm_as_judge" otherwise."""
 242     reduction_map = {"mean": ["llm_as_judge"]}
 243 -   """A mapping used for score aggregation. By default, it will take the value of
 244
 245     def prepare(self):
 246         super().prepare()

@@ -420,7 +420,7 @@ class LLMJudgeDirect(LLMJudge):
 420     This method evaluates the quality of of the predictions by calculating scores for each instance based on a criterion.
 421
 422     Returns:
 423 -
 424     List[Dict]
 425         A list of dictionaries containing the evaluation results for each instance. The results include the computed scores for each prediction. Each result will have the `score_name` as a prefix, which may be the criterion name if only one used, or "llm_as_judge" if several criteria were used.
 426

@@ -647,7 +647,7 @@ class LLMJudgePairwise(LLMJudge):
 647     main_score = "1_winrate"
 648     """The main score metric for pairwise evaluation. By default, its value is `1_winrate`, and will take the value of the winrate of the first system."""
 649     reduction_map = {"mean": ["score"]}
 650 -   """A mapping specifying how scores should be reduced. By default, it will be
 651
 652     def prepare(self):
 653         """Prepares the pairwise comparison by initializing the necessary templates and tasks. These tasks will be used to assess, summarize, and select options from candidate responses."""

@@ -937,7 +937,7 @@ class LLMJudgePairwise(LLMJudge):
 937         task_data (List[Dict[str, str]]): Task data to be used for evaluation.
 938
 939     Returns:
 940 -
 941     List[Dict[str,Dict]]
 942         The results of the evaluation, including winrate, ranking, and other metrics.
 943
|
|
|
|
| 240 |
main_score = "llm_as_judge"
|
| 241 |
"""The primary score name used in the results. By default, it will take the value of the criteria name (if only one criteria is being used for evaluation) or "llm_as_judge" otherwise."""
|
| 242 |
reduction_map = {"mean": ["llm_as_judge"]}
|
| 243 |
+
"""A mapping used for score aggregation. By default, it will take the value of ``{'mean': [<default_main_score_name>]}`` ."""
|
| 244 |
|
| 245 |
def prepare(self):
|
| 246 |
super().prepare()
|
|
|
|
| 420 |
This method evaluates the quality of of the predictions by calculating scores for each instance based on a criterion.
|
| 421 |
|
| 422 |
Returns:
|
| 423 |
+
--------
|
| 424 |
List[Dict]
|
| 425 |
A list of dictionaries containing the evaluation results for each instance. The results include the computed scores for each prediction. Each result will have the `score_name` as a prefix, which may be the criterion name if only one used, or "llm_as_judge" if several criteria were used.
|
| 426 |
|
|
|
|
| 647 |
main_score = "1_winrate"
|
| 648 |
"""The main score metric for pairwise evaluation. By default, its value is `1_winrate`, and will take the value of the winrate of the first system."""
|
| 649 |
reduction_map = {"mean": ["score"]}
|
| 650 |
+
"""A mapping specifying how scores should be reduced. By default, it will be ``{'main': ['score']}`` ."""
|
| 651 |
|
| 652 |
def prepare(self):
|
| 653 |
"""Prepares the pairwise comparison by initializing the necessary templates and tasks. These tasks will be used to assess, summarize, and select options from candidate responses."""
|
|
|
|
| 937 |
task_data (List[Dict[str, str]]): Task data to be used for evaluation.
|
| 938 |
|
| 939 |
Returns:
|
| 940 |
+
--------
|
| 941 |
List[Dict[str,Dict]]
|
| 942 |
The results of the evaluation, including winrate, ranking, and other metrics.
|
| 943 |
|
metric.py
CHANGED
@@ -18,6 +18,7 @@ from .dialog_operators import __file__ as _
   18  from .dict_utils import __file__ as _
   19  from .error_utils import __file__ as _
   20  from .eval_utils import __file__ as _
   21 +from .evaluate_cli import __file__ as _
   22  from .file_utils import __file__ as _
   23  from .formats import __file__ as _
   24  from .fusion import __file__ as _
|
metrics.py
CHANGED
|
@@ -71,6 +71,7 @@ settings = get_settings()
 71
 72  warnings.filterwarnings("ignore", category=DegenerateDataWarning)
 73
 74  @retry_connection_with_exponential_backoff(backoff_factor=2)
 75  def hf_evaluate_load(path: str, *args, **kwargs):
 76      if settings.hf_offline_metrics_path is not None:

@@ -792,6 +793,7 @@ class MetricWithConfidenceInterval(Metric):
 792     n_resamples: int = None
 793     confidence_level: float = 0.95
 794     ci_scores: List[str] = None
 795
 796     @staticmethod
 797     def new_random_generator():

@@ -907,6 +909,7 @@ class MetricWithConfidenceInterval(Metric):
 907             n_resamples=self.n_resamples,
 908             confidence_level=self.confidence_level,
 909             random_state=self.new_random_generator(),
 910         ).confidence_interval
 911         full_score_name = ci_score_prefix + score_name
 912         result[f"{full_score_name}_ci_low"] = ci.low

@@ -1007,6 +1010,7 @@ class MetricWithConfidenceInterval(Metric):
 1007            n_resamples=self.n_resamples,
 1008            confidence_level=self.confidence_level,
 1009            random_state=random_gen,
 1010        ).confidence_interval
 1011        result["score_ci_low"] = float(ci.low)
 1012        result["score_ci_high"] = float(ci.high)

@@ -1193,9 +1197,9 @@ class BulkInstanceMetric(StreamOperator, MetricWithConfidenceInterval):
 1193        )
 1194
 1195        for reduction, fields in self.reduction_map.items():
 1196 -          assert (
 1197 -              reduction
 1198 -          )
 1199
 1200            if reduction == "mean":
 1201                for field_name in fields:

@@ -1464,12 +1468,12 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
 1464    def _validate_group_mean_task_data(self, instance):
 1465        # instances need to all have task_data field with field group_id
 1466        assert "task_data" in instance, "each instance must have an task_data field"
 1467 -      assert isinstance(
 1468 -          instance
 1469 -      )
 1470 -      assert (
 1471 -          "
 1472 -      )
 1473
 1474    def _validate_group_mean_reduction(self):
 1475        """Ensure that group_mean reduction_map is properly formatted.

@@ -1522,30 +1526,30 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
 1522        2 'Why are ants eating my food?' 'original'
 1523        """
 1524        # validate the reduction_map
 1525 -      assert (
 1526 -          "group_mean"
 1527 -      )
 1528        fields = self.reduction_map["group_mean"]
 1529        # for group_mean, expects a dict
 1530        assert isinstance(fields, dict)
 1531 -      assert (
 1532 -          "agg_func
 1533 -      )
 1534 -      assert isinstance(
 1535 -          fields[
 1536 -      )
 1537 -      assert (
 1538 -
 1539 -      )
 1540 -      assert isinstance(
 1541 -          fields[
 1542 -      )
 1543 -      assert callable(
 1544 -          fields[
 1545 -      )
 1546 -      assert isinstance(
 1547 -          fields[
 1548 -      )
 1549        if "score_fields" in fields:
 1550            assert isinstance(fields["score_fields"], list)
 1551

@@ -1553,9 +1557,9 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
 1553        instance_scores = self.compute_instance_scores(stream)
 1554        global_score = {"num_of_instances": len(instance_scores)}
 1555        for reduction_type, reduction_params in self.reduction_map.items():
 1556 -          assert (
 1557 -              reduction_type
 1558 -          )
 1559
 1560            field_name_full_prefix = ""
 1561            # used for passing to the bootstrapping, depends on whether the groups are fixed or not

@@ -1653,7 +1657,9 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
 1653        assert (
 1654            "task_data" in instance
 1655            and self.subgroup_column in instance["task_data"]
 1656 -      ),
 1657
 1658        task_data = instance["task_data"] if "task_data" in instance else {}
 1659

@@ -2249,15 +2255,15 @@ class MetricPipeline(MultiStreamOperator, Metric):
 2249
 2250    def verify(self):
 2251        super().verify()
 2252 -      assert (
 2253 -
 2254 -      )
 2255 -      assert (
 2256 -
 2257 -      )
 2258 -      assert isinstance(
 2259 -          self.metric
 2260 -      )
 2261        if self.postpreprocess_steps is not None:
 2262            depr_message = "Field 'postpreprocess_steps' is deprecated. Please use 'postprocess_steps' for the same purpose."
 2263            warnings.warn(depr_message, DeprecationWarning, stacklevel=2)

@@ -2278,9 +2284,9 @@ class MetricPipeline(MultiStreamOperator, Metric):
 2278            and isinstance(self.postprocess_steps, list)
 2279            and len(self.postprocess_steps) > 0
 2280        )
 2281 -      assert not (
 2282 -
 2283 -      )
 2284        if has_postpreprocess:
 2285            self.postprocess_steps = self.postpreprocess_steps
 2286            self.prepare_score = SequentialOperator(

@@ -2357,10 +2363,14 @@ class HuggingfaceMetric(GlobalMetric):
 2357
 2358        assert self.hf_additional_input_fields is None or isoftype(
 2359            self.hf_additional_input_fields, List[str]
 2360 -      ),
 2361        assert self.hf_additional_input_fields_pass_one_value is None or isoftype(
 2362            self.hf_additional_input_fields_pass_one_value, List[str]
 2363 -      ),
 2364
 2365        return super().verify()
 2366

@@ -2377,25 +2387,25 @@ class HuggingfaceMetric(GlobalMetric):
 2377        ) -> dict:
 2378            passed_task_data = {}
 2379            for additional_input_field in self.hf_additional_input_fields:
 2380 -              assert (
 2381 -                  additional_input_field in task_data[0]
 2382 -              )
 2383                passed_task_data[additional_input_field] = [
 2384                    additional_input[additional_input_field]
 2385                    for additional_input in task_data
 2386                ]
 2387            for additional_input_field in self.hf_additional_input_fields_pass_one_value:
 2388 -              assert (
 2389 -                  additional_input_field in task_data[0]
 2390 -              )
 2391
 2392                values = {
 2393                    additional_input[additional_input_field]
 2394                    for additional_input in task_data
 2395                }
 2396 -              assert (
 2397 -
 2398 -              )
 2399
 2400                passed_task_data[additional_input_field] = next(iter(values))
 2401

@@ -2410,22 +2420,22 @@ class HuggingfaceMetric(GlobalMetric):
 2410            result[self.main_score] = float(result[self.hf_main_score])
 2411            del result[self.hf_main_score]
 2412        if self.scale != 1.0:
 2413 -          assert (
 2414 -              self.
 2415 -          )
 2416            for key in self.scaled_fields:
 2417 -              assert (
 2418 -                  key in result
 2419 -              )
 2420                if isinstance(result[key], list):
 2421 -                  assert all(
 2422 -
 2423 -                  )
 2424                    result[key] = [v / self.scale for v in result[key]]
 2425                else:
 2426 -                  assert isinstance(
 2427 -                      result[key]
 2428 -                  )
 2429                    result[key] /= self.scale
 2430        if self.main_score in result:
 2431            result[self.main_score] = float(result[self.main_score])

@@ -2452,9 +2462,9 @@ class HuggingfaceBulkMetric(BulkInstanceMetric):
 2452        ) -> List[Dict[str, Any]]:
 2453            passed_task_data = {}
 2454            for additional_input_field in self.hf_additional_input_fields:
 2455 -              assert (
 2456 -                  additional_input_field in task_data[0]
 2457 -              )
 2458                passed_task_data[additional_input_field] = [
 2459                    additional_input[additional_input_field]
 2460                    for additional_input in task_data

@@ -2791,9 +2801,9 @@ class FinQAEval(InstanceMetric):
 2791        response = requests.get(url)
 2792        response.raise_for_status()
 2793        content = response.content
 2794 -      assert (
 2795 -
 2796 -      )
 2797
 2798        with open(local_path, "wb") as file:
 2799            file.write(content)

@@ -2925,9 +2935,9 @@ class F1MultiLabel(GlobalMetric, PackageRequirementsMixin):
 2925            labels=labels_param,
 2926        )
 2927        if isinstance(result[self.metric], numpy.ndarray):
 2928 -          assert len(result[self.metric]) == len(
 2929 -              labels
 2930 -          )
 2931            final_result = {self.main_score: nan_mean(result[self.metric])}
 2932            for i, label in enumerate(labels):
 2933                final_result[self.metric + "_" + label] = result[self.metric][i]

@@ -3414,7 +3424,6 @@ class CustomF1(GlobalMetric):
 3414
 3415
 3416 class KeyValueExtraction(GlobalMetric):
 3417 -
 3418    prediction_type = Dict[str, str]
 3419    metric: Metric
 3420    single_reference_per_prediction = True

@@ -3978,9 +3987,9 @@ class LlamaIndexLLMMetric(InstanceMetric):
 3978    prediction_type = str
 3979    reduction_map: Dict[str, List[str]] = None
 3980    openai_models: List[str] = ["gpt-3.5-turbo"]
 3981 -  anthropic_models: List[
 3982 -
 3983 -
 3984    mock_models: List[str] = ["mock"]
 3985    external_api_models = openai_models + anthropic_models
 3986    data_classification_policy = ["public"]

@@ -4819,12 +4828,12 @@ def validate_subgroup_types(
 4819            for subgroup_name, score_list in subgroup_scores_dict.items()
 4820        }
 4821    )
 4822 -  assert isinstance(
 4823 -      control_subgroup_types
 4824 -  )
 4825 -  assert isinstance(
 4826 -      comparison_subgroup_types
 4827 -  )
 4828    # make sure each list is unique, so that labels aren't double-counted
 4829    control_subgroup_types = list(set(control_subgroup_types))
 4830    comparison_subgroup_types = list(set(comparison_subgroup_types))

@@ -4979,9 +4988,9 @@ def normalized_cohens_h(
 4979
 4980    # requires scores to be in [0,1]
 4981    for subgroup_name, score_list in subgroup_scores_dict.items():
 4982 -      assert all(
 4983 -
 4984 -      )
 4985
 4986    # combine all scores from each label (if there are more than 1 in each group) into a list
 4987    group_scores_list = [

@@ -5967,9 +5976,9 @@ class RandomForestMetricsEnsemble(MetricsEnsemble):
 5967        return json.load(file)
 5968
 5969    def ensemble(self, instance):
 5970 -      assert (
 5971 -          self.weights
 5972 -      )
 5973        ensemble_model = self.decode_forest(self.weights)
 5974
 5975        prediction_lst = []

@@ -6378,7 +6387,7 @@ class SQLExecutionAccuracy(InstanceMetric):
 6378    ]
 6379
 6380    prediction_type = "Any"  # string representation is compared
 6381 -  sql_timeout =
 6382
 6383    _requirements_list = ["sqlglot", "func_timeout"]
 6384

@@ -6445,6 +6454,7 @@ class SQLExecutionAccuracy(InstanceMetric):
 6445
 6446    Comparison is column order independent, and could optionally be row order independent.
 6447    We interpret "subset" as follows:
 6448    - For each row in df1, there must be a matching (or superset) row in df2, i.e. the set of values
 6449      in the df1 row is a subset of the set of values in that df2 row. Then do the same check in reverse.
 6450    - If either condition (df1 is subset of df2 OR df2 is subset of df1) is satisfied, return True.

@@ -6458,6 +6468,7 @@ class SQLExecutionAccuracy(InstanceMetric):
 6458
 6459    Returns:
 6460        bool: True if df1 is a subset of df2 or vice versa, based on the specified row-order condition.
 6461    """
 6462    df1_array = df1.values.astype(str)
 6463    df2_array = df2.values.astype(str)
|
|
|
|
| 71 |
|
| 72 |
warnings.filterwarnings("ignore", category=DegenerateDataWarning)
|
| 73 |
|
| 74 |
+
|
| 75 |
@retry_connection_with_exponential_backoff(backoff_factor=2)
|
| 76 |
def hf_evaluate_load(path: str, *args, **kwargs):
|
| 77 |
if settings.hf_offline_metrics_path is not None:
|
|
|
|
| 793 |
n_resamples: int = None
|
| 794 |
confidence_level: float = 0.95
|
| 795 |
ci_scores: List[str] = None
|
| 796 |
+
ci_method: str = "BCa"
|
| 797 |
|
| 798 |
@staticmethod
|
| 799 |
def new_random_generator():
|
|
|
|
| 909 |
n_resamples=self.n_resamples,
|
| 910 |
confidence_level=self.confidence_level,
|
| 911 |
random_state=self.new_random_generator(),
|
| 912 |
+
method=self.ci_method
|
| 913 |
).confidence_interval
|
| 914 |
full_score_name = ci_score_prefix + score_name
|
| 915 |
result[f"{full_score_name}_ci_low"] = ci.low
|
|
|
|
| 1010 |
n_resamples=self.n_resamples,
|
| 1011 |
confidence_level=self.confidence_level,
|
| 1012 |
random_state=random_gen,
|
| 1013 |
+
method=self.ci_method
|
| 1014 |
).confidence_interval
|
| 1015 |
result["score_ci_low"] = float(ci.low)
|
| 1016 |
result["score_ci_high"] = float(ci.high)
|
|
|
|
| 1197 |
)
|
| 1198 |
|
| 1199 |
for reduction, fields in self.reduction_map.items():
|
| 1200 |
+
assert reduction in self.implemented_reductions, (
|
| 1201 |
+
f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}"
|
| 1202 |
+
)
|
| 1203 |
|
| 1204 |
if reduction == "mean":
|
| 1205 |
for field_name in fields:
|
|
|
|
| 1468 |
def _validate_group_mean_task_data(self, instance):
|
| 1469 |
# instances need to all have task_data field with field group_id
|
| 1470 |
assert "task_data" in instance, "each instance must have an task_data field"
|
| 1471 |
+
assert isinstance(instance["task_data"], dict), (
|
| 1472 |
+
"each instance must have an task_data field that is a dict"
|
| 1473 |
+
)
|
| 1474 |
+
assert "group_id" in instance["task_data"], (
|
| 1475 |
+
"each instance task_data dict must have a key group_id"
|
| 1476 |
+
)
|
| 1477 |
|
| 1478 |
def _validate_group_mean_reduction(self):
|
| 1479 |
"""Ensure that group_mean reduction_map is properly formatted.
|
|
|
|
| 1526 |
2 'Why are ants eating my food?' 'original'
|
| 1527 |
"""
|
| 1528 |
# validate the reduction_map
|
| 1529 |
+
assert "group_mean" in self.reduction_map, (
|
| 1530 |
+
"reduction_map must have a 'group_mean' key"
|
| 1531 |
+
)
|
| 1532 |
fields = self.reduction_map["group_mean"]
|
| 1533 |
# for group_mean, expects a dict
|
| 1534 |
assert isinstance(fields, dict)
|
| 1535 |
+
assert "agg_func" in fields, (
|
| 1536 |
+
"fields should have a key 'agg_func' whose value is a 3-element list of a function name, function definition, and a boolean indicator"
|
| 1537 |
+
)
|
| 1538 |
+
assert isinstance(fields["agg_func"], list), (
|
| 1539 |
+
"fields['agg_func'] should be a list"
|
| 1540 |
+
)
|
| 1541 |
+
assert len(fields["agg_func"]) == 3, (
|
| 1542 |
+
"fields['agg_func'] should be a 3-element list"
|
| 1543 |
+
)
|
| 1544 |
+
assert isinstance(fields["agg_func"][0], str), (
|
| 1545 |
+
"first item in fields['agg_func'] should be a string name of a function"
|
| 1546 |
+
)
|
| 1547 |
+
assert callable(fields["agg_func"][1]), (
|
| 1548 |
+
"second item in fields['agg_func'] should be a callable function"
|
| 1549 |
+
)
|
| 1550 |
+
assert isinstance(fields["agg_func"][2], bool), (
|
| 1551 |
+
"third item in fields['agg_func'] should be a boolean value"
|
| 1552 |
+
)
|
| 1553 |
if "score_fields" in fields:
|
| 1554 |
assert isinstance(fields["score_fields"], list)
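Taken together, these checks define the expected shape of a `group_mean` reduction_map: an "agg_func" triple of aggregation name, callable, and a boolean for whether groups stay fixed during bootstrapping, plus an optional "score_fields" list. A hedged example of a map that would pass them (the score field names are illustrative):

```python
# Illustrative reduction_map that satisfies the asserts above.
import numpy as np

reduction_map = {
    "group_mean": {
        # [aggregation name, aggregation callable, fixed-groups flag]
        "agg_func": ["mean", np.nanmean, False],
        "score_fields": ["f1", "recall"],  # optional
    }
}
```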
|
| 1555 |
|
|
|
|
| 1557 |
instance_scores = self.compute_instance_scores(stream)
|
| 1558 |
global_score = {"num_of_instances": len(instance_scores)}
|
| 1559 |
for reduction_type, reduction_params in self.reduction_map.items():
|
| 1560 |
+
assert reduction_type in self.implemented_reductions, (
|
| 1561 |
+
f"Reduction {reduction_type} is not implemented, use one of {self.implemented_reductions}"
|
| 1562 |
+
)
|
| 1563 |
|
| 1564 |
field_name_full_prefix = ""
|
| 1565 |
# used for passing to the bootstrapping, depends on whether the groups are fixed or not
|
|
|
|
| 1657 |
assert (
|
| 1658 |
"task_data" in instance
|
| 1659 |
and self.subgroup_column in instance["task_data"]
|
| 1660 |
+
), (
|
| 1661 |
+
f"each instance task_data dict must have a key {self.subgroup_column}"
|
| 1662 |
+
)
|
| 1663 |
|
| 1664 |
task_data = instance["task_data"] if "task_data" in instance else {}
|
| 1665 |
|
|
|
|
| 2255 |
|
| 2256 |
def verify(self):
|
| 2257 |
super().verify()
|
| 2258 |
+
assert self.metric is not None, (
|
| 2259 |
+
f"'metric' is not set in {self.get_metric_name()}"
|
| 2260 |
+
)
|
| 2261 |
+
assert self.main_score is not None, (
|
| 2262 |
+
f"'main_score' is not set in {self.get_metric_name()}"
|
| 2263 |
+
)
|
| 2264 |
+
assert isinstance(self.metric, Metric), (
|
| 2265 |
+
f"'metric' is not set to a Metric class in {self.get_metric_name()} (type{self.metric})"
|
| 2266 |
+
)
|
| 2267 |
if self.postpreprocess_steps is not None:
|
| 2268 |
depr_message = "Field 'postpreprocess_steps' is deprecated. Please use 'postprocess_steps' for the same purpose."
|
| 2269 |
warnings.warn(depr_message, DeprecationWarning, stacklevel=2)
|
|
|
|
| 2284 |
and isinstance(self.postprocess_steps, list)
|
| 2285 |
and len(self.postprocess_steps) > 0
|
| 2286 |
)
|
| 2287 |
+
assert not (has_postpreprocess and has_postprocess), (
|
| 2288 |
+
"Must define at most one of postpreprocess_steps (which is deprecated) and postprocess_steps (to be used from now on)"
|
| 2289 |
+
)
|
| 2290 |
if has_postpreprocess:
|
| 2291 |
self.postprocess_steps = self.postpreprocess_steps
|
| 2292 |
self.prepare_score = SequentialOperator(
|
|
|
|
| 2363 |
|
| 2364 |
assert self.hf_additional_input_fields is None or isoftype(
|
| 2365 |
self.hf_additional_input_fields, List[str]
|
| 2366 |
+
), (
|
| 2367 |
+
f"Argument hf_additional_input_fields should be either None or List[str]. It is now: {self.hf_additional_input_fields}."
|
| 2368 |
+
)
|
| 2369 |
assert self.hf_additional_input_fields_pass_one_value is None or isoftype(
|
| 2370 |
self.hf_additional_input_fields_pass_one_value, List[str]
|
| 2371 |
+
), (
|
| 2372 |
+
f"Argument hf_additional_input_fields_pass_one_value should be either None or List[str]. It is now: {self.hf_additional_input_fields_pass_one_value}."
|
| 2373 |
+
)
|
| 2374 |
|
| 2375 |
return super().verify()
|
| 2376 |
|
|
|
|
| 2387 |
) -> dict:
|
| 2388 |
passed_task_data = {}
|
| 2389 |
for additional_input_field in self.hf_additional_input_fields:
|
| 2390 |
+
assert additional_input_field in task_data[0], (
|
| 2391 |
+
f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in task_data: {task_data[0]}"
|
| 2392 |
+
)
|
| 2393 |
passed_task_data[additional_input_field] = [
|
| 2394 |
additional_input[additional_input_field]
|
| 2395 |
for additional_input in task_data
|
| 2396 |
]
|
| 2397 |
for additional_input_field in self.hf_additional_input_fields_pass_one_value:
|
| 2398 |
+
assert additional_input_field in task_data[0], (
|
| 2399 |
+
f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in task_data: {task_data[0]}"
|
| 2400 |
+
)
|
| 2401 |
|
| 2402 |
values = {
|
| 2403 |
additional_input[additional_input_field]
|
| 2404 |
for additional_input in task_data
|
| 2405 |
}
|
| 2406 |
+
assert len(values) == 1, (
|
| 2407 |
+
f"Values of '{additional_input_field}' field required by {__class__.__name__} should all be the same, but have multiple values {values}"
|
| 2408 |
+
)
|
| 2409 |
|
| 2410 |
passed_task_data[additional_input_field] = next(iter(values))
|
| 2411 |
|
|
|
|
| 2420 |
result[self.main_score] = float(result[self.hf_main_score])
|
| 2421 |
del result[self.hf_main_score]
|
| 2422 |
if self.scale != 1.0:
|
| 2423 |
+
assert self.scaled_fields is not None, (
|
| 2424 |
+
f"Scaling factor was set to {self.scale}, but no fields specified"
|
| 2425 |
+
)
|
| 2426 |
for key in self.scaled_fields:
|
| 2427 |
+
assert key in result, (
|
| 2428 |
+
f"Trying to scale field '{key}' which is not in results of metrics: {result}"
|
| 2429 |
+
)
|
| 2430 |
if isinstance(result[key], list):
|
| 2431 |
+
assert all(isinstance(v, float) for v in result[key]), (
|
| 2432 |
+
"Not all scaled field '{key}' values are floats: {result[key]}"
|
| 2433 |
+
)
|
| 2434 |
result[key] = [v / self.scale for v in result[key]]
|
| 2435 |
else:
|
| 2436 |
+
assert isinstance(result[key], float), (
|
| 2437 |
+
"Scaled field '{key}' is not float: {result[key]}"
|
| 2438 |
+
)
|
| 2439 |
result[key] /= self.scale
|
| 2440 |
if self.main_score in result:
|
| 2441 |
result[self.main_score] = float(result[self.main_score])
|
|
|
|
| 2462 |
) -> List[Dict[str, Any]]:
|
| 2463 |
passed_task_data = {}
|
| 2464 |
for additional_input_field in self.hf_additional_input_fields:
|
| 2465 |
+
assert additional_input_field in task_data[0], (
|
| 2466 |
+
f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in task_data: {task_data[0]}"
|
| 2467 |
+
)
|
| 2468 |
passed_task_data[additional_input_field] = [
|
| 2469 |
additional_input[additional_input_field]
|
| 2470 |
for additional_input in task_data
|
|
|
|
| 2801 |
response = requests.get(url)
|
| 2802 |
response.raise_for_status()
|
| 2803 |
content = response.content
|
| 2804 |
+
assert hashlib.md5(content).hexdigest() == hash_of_script, (
|
| 2805 |
+
f'URL ("{url}") is different than expected. Make sure you added the right one.'
|
| 2806 |
+
)
|
| 2807 |
|
| 2808 |
with open(local_path, "wb") as file:
|
| 2809 |
file.write(content)
|
|
|
|
| 2935 |
labels=labels_param,
|
| 2936 |
)
|
| 2937 |
if isinstance(result[self.metric], numpy.ndarray):
|
| 2938 |
+
assert len(result[self.metric]) == len(labels), (
|
| 2939 |
+
f"F1 result ({result[self.metric]}) has more entries than labels ({labels})"
|
| 2940 |
+
)
|
| 2941 |
final_result = {self.main_score: nan_mean(result[self.metric])}
|
| 2942 |
for i, label in enumerate(labels):
|
| 2943 |
final_result[self.metric + "_" + label] = result[self.metric][i]
|
|
|
|
| 3424 |
|
| 3425 |
|
| 3426 |
class KeyValueExtraction(GlobalMetric):
|
|
|
|
| 3427 |
prediction_type = Dict[str, str]
|
| 3428 |
metric: Metric
|
| 3429 |
single_reference_per_prediction = True
|
|
|
|
| 3987 |
prediction_type = str
|
| 3988 |
reduction_map: Dict[str, List[str]] = None
|
| 3989 |
openai_models: List[str] = ["gpt-3.5-turbo"]
|
| 3990 |
+
anthropic_models: List[
|
| 3991 |
+
str
|
| 3992 |
+
] = [] # this is here for the sake of documentation for future models
|
| 3993 |
mock_models: List[str] = ["mock"]
|
| 3994 |
external_api_models = openai_models + anthropic_models
|
| 3995 |
data_classification_policy = ["public"]
|
|
|
|
| 4828 |
for subgroup_name, score_list in subgroup_scores_dict.items()
|
| 4829 |
}
|
| 4830 |
)
|
| 4831 |
+
assert isinstance(control_subgroup_types, list), (
|
| 4832 |
+
"control_subgroup_types must be a list"
|
| 4833 |
+
)
|
| 4834 |
+
assert isinstance(comparison_subgroup_types, list), (
|
| 4835 |
+
"comparison_subgroup_types must be a list"
|
| 4836 |
+
)
|
| 4837 |
# make sure each list is unique, so that labels aren't double-counted
|
| 4838 |
control_subgroup_types = list(set(control_subgroup_types))
|
| 4839 |
comparison_subgroup_types = list(set(comparison_subgroup_types))
|
|
|
|
| 4988 |
|
| 4989 |
# requires scores to be in [0,1]
|
| 4990 |
for subgroup_name, score_list in subgroup_scores_dict.items():
|
| 4991 |
+
assert all(0 <= score <= 1 for score in score_list), (
|
| 4992 |
+
f"all {subgroup_name} scores must be in [0,1]"
|
| 4993 |
+
)
|
| 4994 |
|
| 4995 |
# combine all scores from each label (if there are more than 1 in each group) into a list
|
| 4996 |
group_scores_list = [
|
|
|
|
| 5976 |
return json.load(file)
|
| 5977 |
|
| 5978 |
def ensemble(self, instance):
|
| 5979 |
+
assert self.weights is not None, (
|
| 5980 |
+
"RandomForestMetricsEnsemble must set self.weights before it can be used"
|
| 5981 |
+
)
|
| 5982 |
ensemble_model = self.decode_forest(self.weights)
|
| 5983 |
|
| 5984 |
prediction_lst = []
|
|
|
|
| 6387 |
]
|
| 6388 |
|
| 6389 |
prediction_type = "Any" # string representation is compared
|
| 6390 |
+
sql_timeout = 30.0
|
| 6391 |
|
| 6392 |
_requirements_list = ["sqlglot", "func_timeout"]
|
| 6393 |
|
|
|
|
| 6454 |
|
| 6455 |
Comparison is column order independent, and could optionally be row order independent.
|
| 6456 |
We interpret "subset" as follows:
|
| 6457 |
+
|
| 6458 |
- For each row in df1, there must be a matching (or superset) row in df2, i.e. the set of values
|
| 6459 |
in the df1 row is a subset of the set of values in that df2 row. Then do the same check in reverse.
|
| 6460 |
- If either condition (df1 is subset of df2 OR df2 is subset of df1) is satisfied, return True.
|
|
|
|
| 6468 |
|
| 6469 |
Returns:
|
| 6470 |
bool: True if df1 is a subset of df2 or vice versa, based on the specified row-order condition.
|
| 6471 |
+
|
| 6472 |
"""
|
| 6473 |
df1_array = df1.values.astype(str)
|
| 6474 |
df2_array = df2.values.astype(str)
|
parsing_utils.py
CHANGED
|
@@ -51,9 +51,9 @@ def consume_name_val(instring: str) -> Tuple[Any, str]:
 51      instring = instring[len(name_val) :].strip()
 52      name_val = name_val.strip()
 53
 54 -    if name_val == "
 55          return (True, instring)
 56 -    if name_val == "
 57          return (False, instring)
 58      if name_val == "None":
 59          return (None, instring)
|
|
|
|
| 51 |
instring = instring[len(name_val) :].strip()
|
| 52 |
name_val = name_val.strip()
|
| 53 |
|
| 54 |
+
if name_val.lower() == "true":
|
| 55 |
return (True, instring)
|
| 56 |
+
if name_val.lower() == "false":
|
| 57 |
return (False, instring)
|
| 58 |
if name_val == "None":
|
| 59 |
return (None, instring)
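With the lowercased comparison, "True", "true", and "TRUE" all parse to the boolean True, and likewise for False, while "None" stays case-sensitive. A small standalone illustration of the literal handling (not the full consume_name_val parser):

```python
# Standalone illustration of the case-insensitive literal handling above.
def parse_literal(name_val: str):
    if name_val.lower() == "true":
        return True
    if name_val.lower() == "false":
        return False
    if name_val == "None":
        return None
    return name_val  # anything else stays a string

print(parse_literal("TRUE"), parse_literal("false"), parse_literal("None"))
# True False None
```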
|
processors.py
CHANGED
|
@@ -430,32 +430,86 @@ class AddPrefix(FieldOperator):
|
|
| 430 |
|
| 431 |
|
| 432 |
class GetSQL(FieldOperator):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
def process_value(self, text: str) -> str:
|
| 434 |
-
"""Extracts the
|
| 435 |
|
| 436 |
Args:
|
| 437 |
-
|
|
|
|
| 438 |
|
| 439 |
Returns:
|
| 440 |
-
|
|
|
|
| 441 |
"""
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
text,
|
| 445 |
-
re.IGNORECASE | re.DOTALL,
|
| 446 |
-
)
|
| 447 |
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 455 |
else:
|
| 456 |
-
|
| 457 |
|
| 458 |
-
|
|
|
|
|
|
|
|
|
|
| 459 |
|
| 460 |
|
| 461 |
class ScaleNumberToZeroOneReturnZeroIfFails(FieldOperator):
|
|
|
|
| 430 |
|
| 431 |
|
| 432 |
class GetSQL(FieldOperator):
|
| 433 |
+
"""Operator to extract the most likely SQL query from text, often generated by language models.
|
| 434 |
+
|
| 435 |
+
It prioritizes SQL within markdown code blocks (```sql or ```)
|
| 436 |
+
and defaults to finding the last SELECT statement in the text
|
| 437 |
+
if no code blocks are found. It attempts to remove trailing text
|
| 438 |
+
after the first semicolon in the identified query.
|
| 439 |
+
"""
|
| 440 |
+
|
| 441 |
def process_value(self, text: str) -> str:
|
| 442 |
+
"""Extracts the most plausible SQL query from the given text.
|
| 443 |
|
| 444 |
Args:
|
| 445 |
+
text: The input string potentially containing an SQL query
|
| 446 |
+
and other text (e.g., explanations, markdown).
|
| 447 |
|
| 448 |
Returns:
|
| 449 |
+
The extracted SQL query string, or a message indicating
|
| 450 |
+
no query was found.
|
| 451 |
"""
|
| 452 |
+
if not isinstance(text, str):
|
| 453 |
+
return "Input must be a string" # Basic type check
|
|
|
|
|
|
|
|
|
|
| 454 |
|
| 455 |
+
sql_query_candidate = None # Renamed to indicate it might need cleanup
|
| 456 |
+
|
| 457 |
+
# 1. Try to find ```sql ... ``` code blocks
|
| 458 |
+
sql_blocks = re.findall(
|
| 459 |
+
r"```sql\s*(.*?)\s*```", text, re.DOTALL | re.IGNORECASE
|
| 460 |
+
)
|
| 461 |
+
if sql_blocks:
|
| 462 |
+
# Use the content of the last ```sql block
|
| 463 |
+
sql_query_candidate = sql_blocks[-1].strip()
|
| 464 |
+
else:
|
| 465 |
+
# 2. If no ```sql blocks, try to find generic ``` ... ``` blocks
|
| 466 |
+
generic_blocks = re.findall(r"```\s*(.*?)\s*```", text, re.DOTALL)
|
| 467 |
+
if generic_blocks:
|
| 468 |
+
# Check if the last block looks like SQL (starts with SELECT, INSERT, etc.)
|
| 469 |
+
last_block_content = generic_blocks[-1].strip()
|
| 470 |
+
# Allow common SQL starting keywords
|
| 471 |
+
sql_keywords = (
|
| 472 |
+
r"^(SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|WITH|DROP|TRUNCATE)\b"
|
| 473 |
+
)
|
| 474 |
+
if re.match(sql_keywords, last_block_content, re.IGNORECASE):
|
| 475 |
+
sql_query_candidate = last_block_content
|
| 476 |
+
|
| 477 |
+
# 3. If no suitable code blocks found, search the entire text for the last relevant SQL keyword
|
| 478 |
+
if sql_query_candidate is None:
|
| 479 |
+
# Find the start index of the *last* common SQL keyword (case-insensitive)
|
| 480 |
+
last_match = None
|
| 481 |
+
# Expand search beyond just SELECT for better fallback
|
| 482 |
+
sql_keywords_search = (
|
| 483 |
+
r"\b(SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|WITH|DROP|TRUNCATE)\b"
|
| 484 |
)
|
| 485 |
+
for match in re.finditer(sql_keywords_search, text, re.IGNORECASE):
|
| 486 |
+
last_match = match
|
| 487 |
+
|
| 488 |
+
if last_match:
|
| 489 |
+
# Extract from the last keyword to the end of the string
|
| 490 |
+
sql_query_candidate = text[last_match.start() :].strip()
|
| 491 |
+
|
| 492 |
+
# 4. Cleanup: Truncate at first semicolon and strip whitespace
|
| 493 |
+
if sql_query_candidate:
|
| 494 |
+
# Find the first semicolon in the candidate string
|
| 495 |
+
first_semicolon_index = sql_query_candidate.find(";")
|
| 496 |
+
if first_semicolon_index != -1:
|
| 497 |
+
# If found, take everything before it
|
| 498 |
+
sql_query = sql_query_candidate[:first_semicolon_index].strip()
|
| 499 |
+
else:
|
| 500 |
+
# If no semicolon, use the candidate as is (after stripping)
|
| 501 |
+
sql_query = sql_query_candidate.strip()
|
| 502 |
+
|
| 503 |
+
# clean the ```sql\n from the start and the \n``` in case it is there
|
| 504 |
+
sql_query = sql_query.replace("```sql", "").replace("```", "").strip()
|
| 505 |
+
|
| 506 |
else:
|
| 507 |
+
sql_query = None # Ensure sql_query is None if no candidate was found
|
| 508 |
|
| 509 |
+
# 5. Return result or 'not found' message
|
| 510 |
+
return (
|
| 511 |
+
sql_query if sql_query is not None else "No query found in generation"
|
| 512 |
+
) # Check for None explicitly
|
| 513 |
|
| 514 |
|
| 515 |
class ScaleNumberToZeroOneReturnZeroIfFails(FieldOperator):
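A quick check of the extraction order GetSQL documents above: a fenced sql code block wins over surrounding prose, and the result is cut at the first semicolon. The import path and constructing the operator with no arguments are assumptions:

```python
# Assumed import path and default construction; adjust to your installation.
from unitxt.processors import GetSQL

text = (
    "Sure, here is the query:\n"
    "```sql\nSELECT name, age FROM users WHERE age > 30;\n```\n"
    "Let me know if you need anything else."
)
print(GetSQL().process_value(text))
# SELECT name, age FROM users WHERE age > 30
```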
|
version.py
CHANGED
|
@@ -1 +1 @@
 1 -  version = "1.22.
 1 +  version = "1.22.2"
|