Skip to content

Ultralytics Models

sahi.models.ultralytics

Ultralytics detection model wrapper for SAHI.

Provides integration with Ultralytics YOLO models for object detection, instance segmentation, and oriented bounding box detection.

Classes

UltralyticsDetectionModel

Bases: DetectionModel

Detection model for Ultralytics YOLO models.

Supports PyTorch (.pt), ONNX (.onnx), OpenVINO (.xml or _openvino_model/), NCNN (.param or _ncnn_model/), and TorchScript (.torchscript) models.

Source code in sahi/models/ultralytics.py
class UltralyticsDetectionModel(DetectionModel):
    """Detection model for Ultralytics YOLO models.

    Supports PyTorch (.pt), ONNX (.onnx), OpenVINO (.xml or _openvino_model/),
    NCNN (.param or _ncnn_model/), and TorchScript (.torchscript) models.
    """

    def __init__(self, *args: object, fuse: bool = False, task: str | None = None, **kwargs: object) -> None:
        """Initialize the Ultralytics detection model.

        Accepts all arguments from ``DetectionModel.__init__`` plus the
        following keyword arguments.

        Args:
            *args: Variable length argument list passed to DetectionModel.
            fuse: If True, fuse Conv2d and BatchNorm2d layers for faster
                inference. Default: False.
            task: Ultralytics task type (e.g. ``"detect"``, ``"segment"``,
                ``"obb"``). When None, the task is inferred from the model.
                Default: None.
            **kwargs: Arbitrary keyword arguments passed to DetectionModel.
        """
        self.fuse: bool = fuse
        self.task: str | None = task
        # Extend (never overwrite) any packages a subclass may already require.
        existing_packages = getattr(self, "required_packages", None) or []
        self.required_packages = [*list(existing_packages), "ultralytics"]
        super().__init__(*args, **kwargs)  # type: ignore[misc, arg-type]

    def load_model(self) -> None:
        """Detection model is initialized and set to self.model.

        Raises:
            TypeError: If the model cannot be loaded from ``self.model_path``.
        """
        from ultralytics import YOLO

        try:
            # Explicit raise instead of assert so the check survives `python -O`;
            # the surrounding except still converts it to TypeError for callers.
            if self.model_path is None:
                raise ValueError("model_path must be provided for Ultralytics models")
            if self.task:
                model = YOLO(self.model_path, task=self.task)
            else:
                model = YOLO(self.model_path)

            # Only call .to(device) for PyTorch checkpoints; exported backends
            # (ONNX, OpenVINO, NCNN, TorchScript) manage devices themselves.
            if isinstance(self.model_path, str) and self.model_path.endswith(".pt"):
                model.to(self.device)
            self.set_model(model)
            if self.fuse and hasattr(model, "fuse"):
                model.fuse()

        except Exception as e:
            # Format the cause into the message (the previous tuple-style raise
            # produced a mangled message) and chain it for full tracebacks.
            raise TypeError(f"model_path is not a valid Ultralytics model path: {e}") from e

    def set_model(self, model: Any, **kwargs: Any) -> None:
        """Sets the underlying Ultralytics model.

        Args:
            model: Any
                An Ultralytics model.
            **kwargs: Any
                Additional keyword arguments for model setup.
        """
        self.model = model
        # Build an index -> name category_mapping from the model when the
        # caller did not provide one.
        if not self.category_mapping:
            category_mapping = {str(ind): category_name for ind, category_name in enumerate(self.category_names)}
            self.category_mapping = category_mapping

    def perform_inference(self, image: np.ndarray) -> None:
        """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

        Args:
            image: np.ndarray
                A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.
        """
        self.perform_batch_inference([image])

    def _extract_predictions(self, prediction_result: Any) -> list:
        """Extracts predictions from YOLO result objects into the internal format.

        Args:
            prediction_result: list of YOLO Result objects from self.model()

        Returns:
            list of extracted predictions (tensors or tuple of tensors per image)
        """
        import torch

        if self.has_mask:
            from ultralytics.engine.results import Masks

            for result in prediction_result:
                if not result.masks:
                    # Substitute an empty Masks object so downstream unpacking
                    # of (boxes, masks) stays uniform for mask-less results.
                    device = getattr(self.model, "device", "cpu")
                    result.masks = Masks(torch.tensor([], device=device), result.boxes.orig_shape)

            return [
                (
                    result.boxes.data,
                    result.masks.data,
                )
                for result in prediction_result
            ]
        elif self.is_obb:
            device = getattr(self.model, "device", "cpu")
            # Pack OBBs as (xyxy + conf + cls, corner points); empty tensors
            # preserve the per-image tuple shape when a result has no OBBs.
            return [
                (
                    torch.cat(
                        [
                            result.obb.xyxy,
                            result.obb.conf.unsqueeze(-1),
                            result.obb.cls.unsqueeze(-1),
                        ],
                        dim=1,
                    )
                    if result.obb is not None
                    else torch.empty((0, 6), device=device),
                    result.obb.xyxyxyxy if result.obb is not None else torch.empty((0, 4, 2), device=device),
                )
                for result in prediction_result
            ]
        else:
            return [result.boxes.data for result in prediction_result]

    def perform_batch_inference(self, images: list[np.ndarray]) -> None:
        """Performs inference on a batch of images using native YOLO batch support.

        Args:
            images: list[np.ndarray]
                List of numpy arrays (H, W, C) in RGB order.

        Raises:
            ValueError: If the model has not been loaded yet.
        """
        if self.model is None:
            raise ValueError("Model is not loaded, load it by calling .load_model()")

        kwargs = {"cfg": self.config_path, "verbose": False, "conf": self.confidence_threshold, "device": self.device}

        if self.image_size is not None:
            kwargs = {"imgsz": self.image_size, **kwargs}

        # YOLO expects BGR — convert each image and pass the list for native batch inference
        images_bgr = [img[:, :, ::-1] for img in images]
        prediction_result = self.model(images_bgr, **kwargs)

        self._original_predictions = self._extract_predictions(prediction_result)
        self._original_shapes = [img.shape for img in images]

    @property
    def category_names(self) -> list:
        """Returns the list of category names from the model.

        Falls back to ``category_mapping`` values when model metadata is
        unavailable (e.g. ONNX models without embedded names).

        Raises:
            ValueError: If neither model names nor category_mapping are available.
        """
        # For ONNX models, names might not be available, use category_mapping
        assert self.model is not None
        if hasattr(self.model, "names") and self.model.names:
            return list(self.model.names.values())
        elif self.category_mapping:
            return list(self.category_mapping.values())
        else:
            raise ValueError("Category names not available. Please provide category_mapping for ONNX models.")

    @property
    def num_categories(self) -> int:
        """Returns number of categories."""
        assert self.model is not None
        if hasattr(self.model, "names") and self.model.names:
            return len(self.model.names)
        elif self.category_mapping:
            return len(self.category_mapping)
        else:
            raise ValueError("Cannot determine number of categories. Please provide category_mapping for ONNX models.")

    def _matches_task(self, task_name: str, path_token: str | None = None) -> bool:
        """Return True when the loaded model's task equals ``task_name``.

        Consolidates the detection order shared by ``has_mask`` and ``is_obb``:
        the Ultralytics ``overrides`` dict, then a ``task`` attribute (used by
        exported models), then a substring heuristic on the model path.

        Args:
            task_name: Ultralytics task string, e.g. "segment" or "obb".
            path_token: Substring searched for in the lowercased model path as
                a last resort. Defaults to ``task_name``.
        """
        assert self.model is not None
        # Check if model has 'task' in overrides (for both .pt and .onnx models)
        if hasattr(self.model, "overrides") and "task" in self.model.overrides:
            return self.model.overrides["task"] == task_name
        # For ONNX models, task might be stored as a plain attribute
        if hasattr(self.model, "task"):
            return self.model.task == task_name
        # For exported models without task info, fall back to the model path
        if self.model_path and isinstance(self.model_path, str):
            return (path_token or task_name) in self.model_path.lower()
        return False

    @property
    def has_mask(self) -> bool:
        """Returns if model output contains segmentation mask."""
        # "seg" (not "segment") so paths like "yolo11n-seg.onnx" still match.
        return self._matches_task("segment", path_token="seg")

    @property
    def is_obb(self) -> bool:
        """Returns if model output contains oriented bounding boxes."""
        return self._matches_task("obb")

    def _create_object_prediction_list_from_original_predictions(
        self,
        shift_amount_list: list[list[int | float]] | None = [[0, 0]],  # noqa: B006 — never mutated; kept for parent-interface compatibility
        full_shape_list: list[list[int | float]] | None = None,
    ) -> None:
        """Convert predictions to ObjectPrediction list.

        self._original_predictions is converted to a list of prediction.ObjectPrediction and set to
        self._object_prediction_list_per_image.

        Args:
            shift_amount_list: list of list
                To shift the box and mask predictions from sliced image to full sized image, should
                be in the form of List[[shift_x, shift_y],[shift_x, shift_y],...]
            full_shape_list: list of list
                Size of the full image after shifting, should be in the form of
                List[[height, width],[height, width],...]
        """
        assert self._original_predictions is not None
        assert self._original_shapes is not None
        original_predictions = self._original_predictions

        # compatibility for sahi v0.8.15
        shift_amount_list_typed: list[list[int | float]] = fix_shift_amount_list(shift_amount_list)
        full_shape_list_typed: list[list[int | float]] | None = fix_full_shape_list(full_shape_list)

        # handle all predictions
        object_prediction_list_per_image = []

        for image_ind, image_predictions in enumerate(original_predictions):
            shift_amount = [int(x) for x in shift_amount_list_typed[image_ind]]
            full_shape = None if full_shape_list_typed is None else [int(x) for x in full_shape_list_typed[image_ind]]
            image_shape = self._original_shapes[image_ind]
            object_prediction_list = []

            # Extract boxes and optional masks/obb
            if self.has_mask or self.is_obb:
                boxes = image_predictions[0].cpu().detach().numpy()
                masks_or_points = image_predictions[1].cpu().detach().numpy()
            else:
                boxes = image_predictions.data.cpu().detach().numpy()
                masks_or_points = None

            # Process each prediction
            for pred_ind, prediction in enumerate(boxes):
                # Rows are [x1, y1, x2, y2, conf, cls]
                bbox = prediction[:4].tolist()
                # Cast the numpy scalar to a plain Python float for downstream use.
                score = float(prediction[4])
                category_id = int(prediction[5])
                assert self.category_mapping is not None
                category_name = self.category_mapping[str(category_id)]

                # Clamp box coordinates to the valid image region
                bbox = [max(0, coord) for coord in bbox]
                if full_shape is not None:
                    bbox[0] = min(full_shape[1], bbox[0])
                    bbox[1] = min(full_shape[0], bbox[1])
                    bbox[2] = min(full_shape[1], bbox[2])
                    bbox[3] = min(full_shape[0], bbox[3])

                # Ignore degenerate (zero-area) boxes produced by clamping
                if not (bbox[0] < bbox[2]) or not (bbox[1] < bbox[3]):
                    logger.warning(f"ignoring invalid prediction with bbox: {bbox}")
                    continue

                # Get segmentation or OBB points
                segmentation = None
                if masks_or_points is not None:
                    if self.has_mask:
                        bool_mask = masks_or_points[pred_ind]
                        # Resize mask to original image size (cv2 takes (width, height))
                        bool_mask = cv2.resize(bool_mask.astype(np.uint8), (image_shape[1], image_shape[0]))
                        segmentation = get_coco_segmentation_from_bool_mask(bool_mask)
                    else:  # is_obb
                        obb_points = masks_or_points[pred_ind]  # Get OBB points for this prediction
                        segmentation = [obb_points.reshape(-1).tolist()]

                    # Skip detections whose mask produced no polygon
                    if len(segmentation) == 0:
                        continue

                # Create and append object prediction
                object_prediction = ObjectPrediction(
                    bbox=bbox,
                    category_id=category_id,
                    score=score,
                    segmentation=segmentation,
                    category_name=category_name,
                    shift_amount=shift_amount,
                    full_shape=list(image_shape[:2]) if full_shape is None else full_shape,  # (height, width)
                )
                object_prediction_list.append(object_prediction)

            object_prediction_list_per_image.append(object_prediction_list)

        self._object_prediction_list_per_image = object_prediction_list_per_image
Attributes
category_names property

Returns the list of category names from the model.

Falls back to category_mapping values when model metadata is unavailable (e.g. ONNX models without embedded names).

Raises:

Type Description
ValueError

If neither model names nor category_mapping are available.

has_mask property

Returns if model output contains segmentation mask.

is_obb property

Returns if model output contains oriented bounding boxes.

num_categories property

Returns number of categories.

Functions
__init__(*args, fuse=False, task=None, **kwargs)

Initialize the Ultralytics detection model.

Accepts all arguments from DetectionModel.__init__ plus the following keyword arguments.

Parameters:

Name Type Description Default
*args object

Variable length argument list passed to DetectionModel.

()
fuse bool

If True, fuse Conv2d and BatchNorm2d layers for faster inference. Default: False.

False
task str | None

Ultralytics task type (e.g. "detect", "segment", "obb"). When None, the task is inferred from the model. Default: None.

None
**kwargs object

Arbitrary keyword arguments passed to DetectionModel.

{}
Source code in sahi/models/ultralytics.py
def __init__(self, *args: object, fuse: bool = False, task: str | None = None, **kwargs: object) -> None:
    """Set up the Ultralytics detection model wrapper.

    All positional and extra keyword arguments are forwarded to
    ``DetectionModel.__init__``; the keyword arguments below are consumed here.

    Args:
        *args: Variable length argument list passed to DetectionModel.
        fuse: If True, fuse Conv2d and BatchNorm2d layers for faster
            inference. Default: False.
        task: Ultralytics task type (e.g. ``"detect"``, ``"segment"``,
            ``"obb"``). When None, the task is inferred from the model.
            Default: None.
        **kwargs: Arbitrary keyword arguments passed to DetectionModel.
    """
    self.fuse: bool = fuse
    self.task: str | None = task
    # Append "ultralytics" to whatever requirements were already declared.
    declared = getattr(self, "required_packages", None) or []
    self.required_packages = list(declared) + ["ultralytics"]
    super().__init__(*args, **kwargs)  # type: ignore[misc, arg-type]
load_model()

Detection model is initialized and set to self.model.

Source code in sahi/models/ultralytics.py
def load_model(self) -> None:
    """Detection model is initialized and set to self.model."""
    from ultralytics import YOLO

    try:
        assert self.model_path is not None, "model_path must be provided for Ultralytics models"
        model = YOLO(self.model_path, task=self.task) if self.task else YOLO(self.model_path)

        # Exported backends (ONNX, OpenVINO, NCNN) handle device placement
        # internally, so .to(device) is reserved for PyTorch checkpoints.
        is_torch_checkpoint = isinstance(self.model_path, str) and self.model_path.endswith(".pt")
        if self.model_path and is_torch_checkpoint:
            model.to(self.device)
        self.set_model(model)
        if self.fuse and hasattr(model, "fuse"):
            model.fuse()

    except Exception as e:
        raise TypeError("model_path is not a valid Ultralytics model path: ", e)
perform_batch_inference(images)

Performs inference on a batch of images using native YOLO batch support.

Parameters:

Name Type Description Default
images list[ndarray]

list[np.ndarray] List of numpy arrays (H, W, C) in RGB order.

required
Source code in sahi/models/ultralytics.py
def perform_batch_inference(self, images: list[np.ndarray]) -> None:
    """Run native YOLO batch inference over a list of RGB images.

    Args:
        images: list[np.ndarray]
            List of numpy arrays (H, W, C) in RGB order.
    """
    if self.model is None:
        raise ValueError("Model is not loaded, load it by calling .load_model()")

    predict_args = {
        "cfg": self.config_path,
        "verbose": False,
        "conf": self.confidence_threshold,
        "device": self.device,
    }
    if self.image_size is not None:
        predict_args["imgsz"] = self.image_size

    # Ultralytics consumes BGR arrays, so reverse the channel axis of each RGB input
    # and hand the whole list over in one call for native batching.
    results = self.model([rgb[:, :, ::-1] for rgb in images], **predict_args)

    self._original_predictions = self._extract_predictions(results)
    self._original_shapes = [rgb.shape for rgb in images]
perform_inference(image)

Prediction is performed using self.model and the prediction result is set to self._original_predictions.

Parameters:

Name Type Description Default
image ndarray

np.ndarray A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

required
Source code in sahi/models/ultralytics.py
def perform_inference(self, image: np.ndarray) -> None:
    """Run prediction on a single image by delegating to the batch path.

    The result is stored on ``self._original_predictions`` by
    ``perform_batch_inference``.

    Args:
        image: np.ndarray
            A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.
    """
    self.perform_batch_inference([image])
set_model(model, **kwargs)

Sets the underlying Ultralytics model.

Parameters:

Name Type Description Default
model Any

Any An Ultralytics model

required
**kwargs Any

Any Additional keyword arguments for model setup.

{}
Source code in sahi/models/ultralytics.py
def set_model(self, model: Any, **kwargs: Any) -> None:
    """Attach an Ultralytics model and derive category_mapping when absent.

    Args:
        model: Any
            An Ultralytics model.
        **kwargs: Any
            Additional keyword arguments for model setup.
    """
    self.model = model
    # Derive an index -> name mapping from the model only if the caller
    # did not already supply one.
    if not self.category_mapping:
        self.category_mapping = {str(idx): name for idx, name in enumerate(self.category_names)}

Functions