Skip to content

Ultralytics Models

sahi.models.ultralytics

Ultralytics detection model wrapper for SAHI.

Provides integration with Ultralytics YOLO models for object detection, instance segmentation, and oriented bounding box detection.

Classes

UltralyticsDetectionModel

Bases: DetectionModel

Detection model for Ultralytics YOLO models.

Supports PyTorch (.pt), ONNX (.onnx), OpenVINO (.xml or _openvino_model/), NCNN (.param or _ncnn_model/), and TorchScript (.torchscript) models.

Source code in sahi/models/ultralytics.py
class UltralyticsDetectionModel(DetectionModel):
    """Detection model for Ultralytics YOLO models.

    Supports PyTorch (.pt), ONNX (.onnx), OpenVINO (.xml or _openvino_model/),
    NCNN (.param or _ncnn_model/), and TorchScript (.torchscript) models.
    """

    def __init__(self, *args: object, fuse: bool = False, task: str | None = None, **kwargs: object) -> None:
        """Initialize the Ultralytics detection model.

        Accepts all arguments from ``DetectionModel.__init__`` plus the
        following keyword arguments.

        Args:
            *args: Variable length argument list passed to DetectionModel.
            fuse: If True, fuse Conv2d and BatchNorm2d layers for faster
                inference. Default: False.
            task: Ultralytics task type (e.g. ``"detect"``, ``"segment"``,
                ``"obb"``). When None, the task is inferred from the model.
                Default: None.
            **kwargs: Arbitrary keyword arguments passed to DetectionModel.
        """
        self.fuse: bool = fuse
        self.task: str | None = task
        # Extend (never overwrite) any packages a subclass may already require.
        existing_packages = getattr(self, "required_packages", None) or []
        self.required_packages = [*list(existing_packages), "ultralytics"]
        super().__init__(*args, **kwargs)  # type: ignore[misc, arg-type]

    def load_model(self) -> None:
        """Detection model is initialized and set to self.model.

        Raises:
            TypeError: If the model cannot be loaded from ``self.model_path``.
        """
        from ultralytics import YOLO

        try:
            # Explicit raise instead of assert so the check survives `python -O`;
            # the surrounding except still converts it to TypeError for callers.
            if self.model_path is None:
                raise ValueError("model_path must be provided for Ultralytics models")
            if self.task:
                model = YOLO(self.model_path, task=self.task)
            else:
                model = YOLO(self.model_path)

            # Only call .to(device) for PyTorch checkpoints; exported backends
            # (ONNX, OpenVINO, NCNN, TorchScript) manage devices themselves.
            if isinstance(self.model_path, str) and self.model_path.endswith(".pt"):
                model.to(self.device)
            self.set_model(model)
            if self.fuse and hasattr(model, "fuse"):
                model.fuse()

        except Exception as e:
            # Format the cause into the message (the previous tuple-style raise
            # produced a mangled message) and chain it for full tracebacks.
            raise TypeError(f"model_path is not a valid Ultralytics model path: {e}") from e

    def set_model(self, model: Any, **kwargs: Any) -> None:
        """Sets the underlying Ultralytics model.

        Args:
            model: Any
                An Ultralytics model.
            **kwargs: Any
                Additional keyword arguments for model setup.
        """
        self.model = model
        # Build an index -> name category_mapping from the model when the
        # caller did not provide one.
        if not self.category_mapping:
            category_mapping = {str(ind): category_name for ind, category_name in enumerate(self.category_names)}
            self.category_mapping = category_mapping

    def perform_inference(self, image: np.ndarray) -> None:
        """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

        Args:
            image: np.ndarray
                A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.
        """
        self.perform_batch_inference([image])

    def _extract_predictions(self, prediction_result: Any) -> list:
        """Extracts predictions from YOLO result objects into the internal format.

        Args:
            prediction_result: list of YOLO Result objects from self.model()

        Returns:
            list of extracted predictions (tensors or tuple of tensors per image)
        """
        import torch

        if self.has_mask:
            from ultralytics.engine.results import Masks

            for result in prediction_result:
                if not result.masks:
                    # Substitute an empty Masks object so downstream unpacking
                    # of (boxes, masks) stays uniform for mask-less results.
                    device = getattr(self.model, "device", "cpu")
                    result.masks = Masks(torch.tensor([], device=device), result.boxes.orig_shape)

            return [
                (
                    result.boxes.data,
                    result.masks.data,
                )
                for result in prediction_result
            ]
        elif self.is_obb:
            device = getattr(self.model, "device", "cpu")
            # Pack OBBs as (xyxy + conf + cls, corner points); empty tensors
            # preserve the per-image tuple shape when a result has no OBBs.
            return [
                (
                    torch.cat(
                        [
                            result.obb.xyxy,
                            result.obb.conf.unsqueeze(-1),
                            result.obb.cls.unsqueeze(-1),
                        ],
                        dim=1,
                    )
                    if result.obb is not None
                    else torch.empty((0, 6), device=device),
                    result.obb.xyxyxyxy if result.obb is not None else torch.empty((0, 4, 2), device=device),
                )
                for result in prediction_result
            ]
        else:
            return [result.boxes.data for result in prediction_result]

    def perform_batch_inference(self, images: list[np.ndarray]) -> None:
        """Performs inference on a batch of images using native YOLO batch support.

        Args:
            images: list[np.ndarray]
                List of numpy arrays (H, W, C) in RGB order.

        Raises:
            ValueError: If the model has not been loaded yet.
        """
        if self.model is None:
            raise ValueError("Model is not loaded, load it by calling .load_model()")

        kwargs = {"cfg": self.config_path, "verbose": False, "conf": self.confidence_threshold, "device": self.device}

        if self.image_size is not None:
            kwargs = {"imgsz": self.image_size, **kwargs}

        # YOLO expects BGR — convert each image and pass the list for native batch inference
        images_bgr = [img[:, :, ::-1] for img in images]
        prediction_result = self.model(images_bgr, **kwargs)

        self._original_predictions = self._extract_predictions(prediction_result)
        self._original_shapes = [img.shape for img in images]

    @property
    def category_names(self) -> list:
        """Returns the list of category names from the model.

        Falls back to ``category_mapping`` values when model metadata is
        unavailable (e.g. ONNX models without embedded names).

        Raises:
            ValueError: If neither model names nor category_mapping are available.
        """
        # For ONNX models, names might not be available, use category_mapping
        assert self.model is not None
        if hasattr(self.model, "names") and self.model.names:
            return list(self.model.names.values())
        elif self.category_mapping:
            return list(self.category_mapping.values())
        else:
            raise ValueError("Category names not available. Please provide category_mapping for ONNX models.")

    @property
    def num_categories(self) -> int:
        """Returns number of categories."""
        assert self.model is not None
        if hasattr(self.model, "names") and self.model.names:
            return len(self.model.names)
        elif self.category_mapping:
            return len(self.category_mapping)
        else:
            raise ValueError("Cannot determine number of categories. Please provide category_mapping for ONNX models.")

    def _matches_task(self, task_name: str, path_token: str | None = None) -> bool:
        """Return True when the loaded model's task equals ``task_name``.

        Consolidates the detection order shared by ``has_mask`` and ``is_obb``:
        the Ultralytics ``overrides`` dict, then a ``task`` attribute (used by
        exported models), then a substring heuristic on the model path.

        Args:
            task_name: Ultralytics task string, e.g. "segment" or "obb".
            path_token: Substring searched for in the lowercased model path as
                a last resort. Defaults to ``task_name``.
        """
        assert self.model is not None
        # Check if model has 'task' in overrides (for both .pt and .onnx models)
        if hasattr(self.model, "overrides") and "task" in self.model.overrides:
            return self.model.overrides["task"] == task_name
        # For ONNX models, task might be stored as a plain attribute
        if hasattr(self.model, "task"):
            return self.model.task == task_name
        # For exported models without task info, fall back to the model path
        if self.model_path and isinstance(self.model_path, str):
            return (path_token or task_name) in self.model_path.lower()
        return False

    @property
    def has_mask(self) -> bool:
        """Returns if model output contains segmentation mask."""
        # "seg" (not "segment") so paths like "yolo11n-seg.onnx" still match.
        return self._matches_task("segment", path_token="seg")

    @property
    def is_obb(self) -> bool:
        """Returns if model output contains oriented bounding boxes."""
        return self._matches_task("obb")

    def _create_object_prediction_list_from_original_predictions(
        self,
        shift_amount_list: list[list[int | float]] | None = [[0, 0]],  # noqa: B006 — never mutated; kept for parent-interface compatibility
        full_shape_list: list[list[int | float]] | None = None,
    ) -> None:
        """Convert predictions to ObjectPrediction list.

        self._original_predictions is converted to a list of prediction.ObjectPrediction and set to
        self._object_prediction_list_per_image.

        Args:
            shift_amount_list: list of list
                To shift the box and mask predictions from sliced image to full sized image, should
                be in the form of List[[shift_x, shift_y],[shift_x, shift_y],...]
            full_shape_list: list of list
                Size of the full image after shifting, should be in the form of
                List[[height, width],[height, width],...]
        """
        assert self._original_predictions is not None
        assert self._original_shapes is not None
        original_predictions = self._original_predictions

        # compatibility for sahi v0.8.15
        shift_amount_list_typed: list[list[int | float]] = fix_shift_amount_list(shift_amount_list)
        full_shape_list_typed: list[list[int | float]] | None = fix_full_shape_list(full_shape_list)

        # handle all predictions
        object_prediction_list_per_image = []

        for image_ind, image_predictions in enumerate(original_predictions):
            shift_amount = [int(x) for x in shift_amount_list_typed[image_ind]]
            full_shape = None if full_shape_list_typed is None else [int(x) for x in full_shape_list_typed[image_ind]]
            image_shape = self._original_shapes[image_ind]
            object_prediction_list = []

            # Extract boxes and optional masks/obb
            if self.has_mask or self.is_obb:
                boxes = image_predictions[0].cpu().detach().numpy()
                masks_or_points = image_predictions[1].cpu().detach().numpy()
            else:
                boxes = image_predictions.data.cpu().detach().numpy()
                masks_or_points = None

            # Process each prediction
            for pred_ind, prediction in enumerate(boxes):
                # Rows are [x1, y1, x2, y2, conf, cls]
                bbox = prediction[:4].tolist()
                # Cast the numpy scalar to a plain Python float for downstream use.
                score = float(prediction[4])
                category_id = int(prediction[5])
                assert self.category_mapping is not None
                category_name = self.category_mapping[str(category_id)]

                # Clamp box coordinates to the valid image region
                bbox = [max(0, coord) for coord in bbox]
                if full_shape is not None:
                    bbox[0] = min(full_shape[1], bbox[0])
                    bbox[1] = min(full_shape[0], bbox[1])
                    bbox[2] = min(full_shape[1], bbox[2])
                    bbox[3] = min(full_shape[0], bbox[3])

                # Ignore degenerate (zero-area) boxes produced by clamping
                if not (bbox[0] < bbox[2]) or not (bbox[1] < bbox[3]):
                    logger.warning(f"ignoring invalid prediction with bbox: {bbox}")
                    continue

                # Get segmentation or OBB points
                segmentation = None
                if masks_or_points is not None:
                    if self.has_mask:
                        bool_mask = masks_or_points[pred_ind]
                        # Resize mask to original image size (cv2 takes (width, height))
                        bool_mask = cv2.resize(bool_mask.astype(np.uint8), (image_shape[1], image_shape[0]))
                        segmentation = get_coco_segmentation_from_bool_mask(bool_mask)
                    else:  # is_obb
                        obb_points = masks_or_points[pred_ind]  # Get OBB points for this prediction
                        segmentation = [obb_points.reshape(-1).tolist()]

                    # Skip detections whose mask produced no polygon
                    if len(segmentation) == 0:
                        continue

                # Create and append object prediction
                object_prediction = ObjectPrediction(
                    bbox=bbox,
                    category_id=category_id,
                    score=score,
                    segmentation=segmentation,
                    category_name=category_name,
                    shift_amount=shift_amount,
                    full_shape=list(image_shape[:2]) if full_shape is None else full_shape,  # (height, width)
                )
                object_prediction_list.append(object_prediction)

            object_prediction_list_per_image.append(object_prediction_list)

        self._object_prediction_list_per_image = object_prediction_list_per_image
Attributes
category_names property

Returns the list of category names from the model.

Falls back to category_mapping values when model metadata is unavailable (e.g. ONNX models without embedded names).

Raises:

Type Description
ValueError

If neither model names nor category_mapping are available.

has_mask property

Returns if model output contains segmentation mask.

is_obb property

Returns if model output contains oriented bounding boxes.

num_categories property

Returns number of categories.

Functions
__init__(*args, fuse=False, task=None, **kwargs)

Initialize the Ultralytics detection model.

Accepts all arguments from DetectionModel.__init__ plus the following keyword arguments.

Parameters:

Name Type Description Default
*args object

Variable length argument list passed to DetectionModel.

()
fuse bool

If True, fuse Conv2d and BatchNorm2d layers for faster inference. Default: False.

False
task str | None

Ultralytics task type (e.g. "detect", "segment", "obb"). When None, the task is inferred from the model. Default: None.

None
**kwargs object

Arbitrary keyword arguments passed to DetectionModel.

{}
Source code in sahi/models/ultralytics.py
def __init__(self, *args: object, fuse: bool = False, task: str | None = None, **kwargs: object) -> None:
    """Set up the Ultralytics detection model wrapper.

    All positional and extra keyword arguments are forwarded to
    ``DetectionModel.__init__``; the keyword arguments below are consumed here.

    Args:
        *args: Variable length argument list passed to DetectionModel.
        fuse: If True, fuse Conv2d and BatchNorm2d layers for faster
            inference. Default: False.
        task: Ultralytics task type (e.g. ``"detect"``, ``"segment"``,
            ``"obb"``). When None, the task is inferred from the model.
            Default: None.
        **kwargs: Arbitrary keyword arguments passed to DetectionModel.
    """
    self.fuse: bool = fuse
    self.task: str | None = task
    # Append "ultralytics" to whatever requirements were already declared.
    declared = getattr(self, "required_packages", None) or []
    self.required_packages = list(declared) + ["ultralytics"]
    super().__init__(*args, **kwargs)  # type: ignore[misc, arg-type]
load_model()

Detection model is initialized and set to self.model.

Source code in sahi/models/ultralytics.py
def load_model(self) -> None:
    """Detection model is initialized and set to self.model."""
    from ultralytics import YOLO

    try:
        assert self.model_path is not None, "model_path must be provided for Ultralytics models"
        model = YOLO(self.model_path, task=self.task) if self.task else YOLO(self.model_path)

        # Exported backends (ONNX, OpenVINO, NCNN) handle device placement
        # internally, so .to(device) is reserved for PyTorch checkpoints.
        is_torch_checkpoint = isinstance(self.model_path, str) and self.model_path.endswith(".pt")
        if self.model_path and is_torch_checkpoint:
            model.to(self.device)
        self.set_model(model)
        if self.fuse and hasattr(model, "fuse"):
            model.fuse()

    except Exception as e:
        raise TypeError("model_path is not a valid Ultralytics model path: ", e)
perform_batch_inference(images)

Performs inference on a batch of images using native YOLO batch support.

Parameters:

Name Type Description Default
images list[ndarray]

list[np.ndarray] List of numpy arrays (H, W, C) in RGB order.

required
Source code in sahi/models/ultralytics.py
def perform_batch_inference(self, images: list[np.ndarray]) -> None:
    """Run native YOLO batch inference over a list of RGB images.

    Args:
        images: list[np.ndarray]
            List of numpy arrays (H, W, C) in RGB order.
    """
    if self.model is None:
        raise ValueError("Model is not loaded, load it by calling .load_model()")

    predict_args = {
        "cfg": self.config_path,
        "verbose": False,
        "conf": self.confidence_threshold,
        "device": self.device,
    }
    if self.image_size is not None:
        predict_args["imgsz"] = self.image_size

    # Ultralytics consumes BGR arrays, so reverse the channel axis of each RGB input
    # and hand the whole list over in one call for native batching.
    results = self.model([rgb[:, :, ::-1] for rgb in images], **predict_args)

    self._original_predictions = self._extract_predictions(results)
    self._original_shapes = [rgb.shape for rgb in images]
perform_inference(image)

Prediction is performed using self.model and the prediction result is set to self._original_predictions.

Parameters:

Name Type Description Default
image ndarray

np.ndarray A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

required
Source code in sahi/models/ultralytics.py
def perform_inference(self, image: np.ndarray) -> None:
    """Run prediction on a single image by delegating to the batch path.

    The result is stored on ``self._original_predictions`` by
    ``perform_batch_inference``.

    Args:
        image: np.ndarray
            A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.
    """
    self.perform_batch_inference([image])
set_model(model, **kwargs)

Sets the underlying Ultralytics model.

Parameters:

Name Type Description Default
model Any

Any An Ultralytics model

required
**kwargs Any

Any Additional keyword arguments for model setup.

{}
Source code in sahi/models/ultralytics.py
def set_model(self, model: Any, **kwargs: Any) -> None:
    """Attach an Ultralytics model and derive category_mapping when absent.

    Args:
        model: Any
            An Ultralytics model.
        **kwargs: Any
            Additional keyword arguments for model setup.
    """
    self.model = model
    # Derive an index -> name mapping from the model only if the caller
    # did not already supply one.
    if not self.category_mapping:
        self.category_mapping = {str(idx): name for idx, name in enumerate(self.category_names)}

Functions