Skip to content

sahi

Classes

AutoDetectionModel

Source code in sahi/auto_model.py
class AutoDetectionModel:
    @staticmethod
    def from_pretrained(
        model_type: str,
        model_path: str | None = None,
        model: Any | None = None,
        config_path: str | None = None,
        device: str | None = None,
        mask_threshold: float = 0.5,
        confidence_threshold: float = 0.3,
        category_mapping: dict | None = None,
        category_remapping: dict | None = None,
        load_at_init: bool = True,
        image_size: int | None = None,
        **kwargs,
    ) -> DetectionModel:
        """Instantiate the DetectionModel subclass matching the requested framework.

        Args:
            model_type: Name of the detection framework (example: "ultralytics", "huggingface", "torchvision").
            model_path: Path of the detection model (ex. 'model.pt').
            model: A pre-initialized model instance, if available.
            config_path: Path of the config file (ex. 'mmdet/configs/cascade_rcnn_r50_fpn_1x.py').
            device: Device, "cpu" or "cuda:0".
            mask_threshold: Value to threshold mask pixels, should be between 0 and 1.
            confidence_threshold: All predictions with score < confidence_threshold will be discarded.
            category_mapping: Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}.
            category_remapping: Remap category ids based on category names, after performing inference
                e.g. {"car": 3}.
            load_at_init: If True, automatically loads the model at initialization.
            image_size: Inference input size.
            **kwargs: Extra keyword arguments forwarded to the resolved model class.

        Returns:
            An instance of the resolved DetectionModel subclass.

        Raises:
            ImportError: If given {model_type} framework is not installed.
        """
        # Every ultralytics-family alias is served by the single "ultralytics" loader.
        resolved_type = "ultralytics" if model_type in ULTRALYTICS_MODEL_NAMES else model_type
        class_name = MODEL_TYPE_TO_MODEL_CLASS_NAME[resolved_type]
        model_class = import_model_class(resolved_type, class_name)

        return model_class(
            model_path=model_path,
            model=model,
            config_path=config_path,
            device=device,
            mask_threshold=mask_threshold,
            confidence_threshold=confidence_threshold,
            category_mapping=category_mapping,
            category_remapping=category_remapping,
            load_at_init=load_at_init,
            image_size=image_size,
            **kwargs,
        )
Functions
from_pretrained(model_type, model_path=None, model=None, config_path=None, device=None, mask_threshold=0.5, confidence_threshold=0.3, category_mapping=None, category_remapping=None, load_at_init=True, image_size=None, **kwargs) staticmethod

Loads a DetectionModel from given path.

Parameters:

Name Type Description Default
model_type str

str Name of the detection framework (example: "ultralytics", "huggingface", "torchvision")

required
model_path str | None

str Path of the detection model (ex. 'model.pt')

None
model Any | None

Any A pre-initialized model instance, if available

None
config_path str | None

str Path of the config file (ex. 'mmdet/configs/cascade_rcnn_r50_fpn_1x.py')

None
device str | None

str Device, "cpu" or "cuda:0"

None
mask_threshold float

float Value to threshold mask pixels, should be between 0 and 1

0.5
confidence_threshold float

float All predictions with score < confidence_threshold will be discarded

0.3
category_mapping dict | None

dict: str to str Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}

None
category_remapping dict | None

dict: str to int Remap category ids based on category names, after performing inference e.g. {"car": 3}

None
load_at_init bool

bool If True, automatically loads the model at initialization

True
image_size int | None

int Inference input size.

None

Returns:

Type Description
DetectionModel

Returns an instance of a DetectionModel

Raises:

Type Description
ImportError

If given {model_type} framework is not installed

Source code in sahi/auto_model.py
@staticmethod
def from_pretrained(
    model_type: str,
    model_path: str | None = None,
    model: Any | None = None,
    config_path: str | None = None,
    device: str | None = None,
    mask_threshold: float = 0.5,
    confidence_threshold: float = 0.3,
    category_mapping: dict | None = None,
    category_remapping: dict | None = None,
    load_at_init: bool = True,
    image_size: int | None = None,
) -> DetectionModel:
    """Instantiate the DetectionModel subclass matching the requested framework.

    Args:
        model_type: Name of the detection framework (example: "ultralytics", "huggingface", "torchvision").
        model_path: Path of the detection model (ex. 'model.pt').
        model: A pre-initialized model instance, if available.
        config_path: Path of the config file (ex. 'mmdet/configs/cascade_rcnn_r50_fpn_1x.py').
        device: Device, "cpu" or "cuda:0".
        mask_threshold: Value to threshold mask pixels, should be between 0 and 1.
        confidence_threshold: All predictions with score < confidence_threshold will be discarded.
        category_mapping: Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}.
        category_remapping: Remap category ids based on category names, after performing inference
            e.g. {"car": 3}.
        load_at_init: If True, automatically loads the model at initialization.
        image_size: Inference input size.
        **kwargs: Extra keyword arguments forwarded to the resolved model class.

    Returns:
        An instance of the resolved DetectionModel subclass.

    Raises:
        ImportError: If given {model_type} framework is not installed.
    """
    # Every ultralytics-family alias is served by the single "ultralytics" loader.
    resolved_type = "ultralytics" if model_type in ULTRALYTICS_MODEL_NAMES else model_type
    class_name = MODEL_TYPE_TO_MODEL_CLASS_NAME[resolved_type]
    model_class = import_model_class(resolved_type, class_name)

    return model_class(
        model_path=model_path,
        model=model,
        config_path=config_path,
        device=device,
        mask_threshold=mask_threshold,
        confidence_threshold=confidence_threshold,
        category_mapping=category_mapping,
        category_remapping=category_remapping,
        load_at_init=load_at_init,
        image_size=image_size,
        **kwargs,
    )

BoundingBox dataclass

BoundingBox represents a rectangular region in 2D space, typically used for object detection annotations.

Attributes:

Name Type Description
box Tuple[float, float, float, float]

The bounding box coordinates in the format (minx, miny, maxx, maxy). - minx (float): Minimum x-coordinate (left). - miny (float): Minimum y-coordinate (top). - maxx (float): Maximum x-coordinate (right). - maxy (float): Maximum y-coordinate (bottom).

shift_amount Tuple[int, int]

The amount to shift the bounding box in the x and y directions. Defaults to (0, 0).

BoundingBox Usage Example

bbox = BoundingBox((10.0, 20.0, 50.0, 80.0))
area = bbox.area
expanded_bbox = bbox.get_expanded_box(ratio=0.2)
shifted_bbox = bbox.get_shifted_box()
coco_format = bbox.to_coco_bbox()
Source code in sahi/annotation.py
@dataclass(frozen=True)
class BoundingBox:
    """BoundingBox represents a rectangular region in 2D space, typically used for object detection annotations.

    Attributes:
        box (Tuple[float, float, float, float]): The bounding box coordinates in the format (minx, miny, maxx, maxy).
            - minx (float): Minimum x-coordinate (left).
            - miny (float): Minimum y-coordinate (top).
            - maxx (float): Maximum x-coordinate (right).
            - maxy (float): Maximum y-coordinate (bottom).
        shift_amount (Tuple[int, int], optional): The amount to shift the bounding box in the x and y directions.
            Defaults to (0, 0).

    !!! example "BoundingBox Usage Example"
        ```python
        bbox = BoundingBox((10.0, 20.0, 50.0, 80.0))
        area = bbox.area
        expanded_bbox = bbox.get_expanded_box(ratio=0.2)
        shifted_bbox = bbox.get_shifted_box()
        coco_format = bbox.to_coco_bbox()
        ```
    """

    box: tuple[float, float, float, float] | list[float]
    shift_amount: tuple[int, int] = (0, 0)

    def __post_init__(self):
        if len(self.box) != 4 or any(coord < 0 for coord in self.box):
            raise ValueError("box must be 4 non-negative floats: [minx, miny, maxx, maxy]")
        if len(self.shift_amount) != 2:
            raise ValueError("shift_amount must be 2 integers: [shift_x, shift_y]")

    @property
    def minx(self):
        return self.box[0]

    @property
    def miny(self):
        return self.box[1]

    @property
    def maxx(self):
        return self.box[2]

    @property
    def maxy(self):
        return self.box[3]

    @property
    def shift_x(self):
        return self.shift_amount[0]

    @property
    def shift_y(self):
        return self.shift_amount[1]

    @property
    def area(self):
        return (self.maxx - self.minx) * (self.maxy - self.miny)

    def get_expanded_box(self, ratio: float = 0.1, max_x: int | None = None, max_y: int | None = None):
        """Returns an expanded bounding box by increasing its size by a given ratio. The expansion is applied equally in
        all directions. Optionally, the expanded box can be clipped to maximum x and y boundaries.

        Args:
            ratio (float, optional): The proportion by which to expand the box size.
                Default is 0.1 (10%).
            max_x (int, optional): The maximum allowed x-coordinate for the expanded box.
                If None, no maximum is applied.
            max_y (int, optional): The maximum allowed y-coordinate for the expanded box.
                If None, no maximum is applied.

        Returns:
            BoundingBox: A new BoundingBox instance representing the expanded box.
        """

        w = self.maxx - self.minx
        h = self.maxy - self.miny
        y_mar = int(h * ratio)
        x_mar = int(w * ratio)
        maxx = min(max_x, self.maxx + x_mar) if max_x else self.maxx + x_mar
        minx = max(0, self.minx - x_mar)
        maxy = min(max_y, self.maxy + y_mar) if max_y else self.maxy + y_mar
        miny = max(0, self.miny - y_mar)
        box: list[float] = [minx, miny, maxx, maxy]
        return BoundingBox(box)

    def to_xywh(self):
        """Returns [xmin, ymin, width, height]

        Returns:
            List[float]: A list containing the bounding box in the format [xmin, ymin, width, height].
        """

        return [self.minx, self.miny, self.maxx - self.minx, self.maxy - self.miny]

    def to_coco_bbox(self):
        """
        Returns the bounding box in COCO format: [xmin, ymin, width, height]

        Returns:
            List[float]: A list containing the bounding box in COCO format.
        """
        return self.to_xywh()

    def to_xyxy(self):
        """
        Returns: [xmin, ymin, xmax, ymax]

        Returns:
            List[float]: A list containing the bounding box in the format [xmin, ymin, xmax, ymax].
        """
        return [self.minx, self.miny, self.maxx, self.maxy]

    def to_voc_bbox(self):
        """
        Returns the bounding box in VOC format: [xmin, ymin, xmax, ymax]

        Returns:
            List[float]: A list containing the bounding box in VOC format.
        """
        return self.to_xyxy()

    def get_shifted_box(self):
        """Returns shifted BoundingBox.

        Returns:
            BoundingBox: A new BoundingBox instance representing the shifted box.
        """
        box = [
            self.minx + self.shift_x,
            self.miny + self.shift_y,
            self.maxx + self.shift_x,
            self.maxy + self.shift_y,
        ]
        return BoundingBox(box)

    def __repr__(self):
        return (
            f"BoundingBox: <{(self.minx, self.miny, self.maxx, self.maxy)}, "
            f"w: {self.maxx - self.minx}, h: {self.maxy - self.miny}>"
        )
Functions
get_expanded_box(ratio=0.1, max_x=None, max_y=None)

Returns an expanded bounding box by increasing its size by a given ratio. The expansion is applied equally in all directions. Optionally, the expanded box can be clipped to maximum x and y boundaries.

Parameters:

Name Type Description Default
ratio float

The proportion by which to expand the box size. Default is 0.1 (10%).

0.1
max_x int

The maximum allowed x-coordinate for the expanded box. If None, no maximum is applied.

None
max_y int

The maximum allowed y-coordinate for the expanded box. If None, no maximum is applied.

None

Returns:

Name Type Description
BoundingBox

A new BoundingBox instance representing the expanded box.

Source code in sahi/annotation.py
def get_expanded_box(self, ratio: float = 0.1, max_x: int | None = None, max_y: int | None = None):
    """Returns an expanded bounding box by increasing its size by a given ratio. The expansion is applied equally in
    all directions. Optionally, the expanded box can be clipped to maximum x and y boundaries.

    Args:
        ratio (float, optional): The proportion by which to expand the box size.
            Default is 0.1 (10%).
        max_x (int, optional): The maximum allowed x-coordinate for the expanded box.
            If None, no maximum is applied.
        max_y (int, optional): The maximum allowed y-coordinate for the expanded box.
            If None, no maximum is applied.

    Returns:
        BoundingBox: A new BoundingBox instance representing the expanded box.
    """
    w = self.maxx - self.minx
    h = self.maxy - self.miny
    # Margins are truncated to whole pixels.
    y_mar = int(h * ratio)
    x_mar = int(w * ratio)
    # Compare against None explicitly: a limit of 0 is falsy but is still a
    # valid clip boundary and must not be mistaken for "no limit".
    maxx = self.maxx + x_mar if max_x is None else min(max_x, self.maxx + x_mar)
    minx = max(0, self.minx - x_mar)
    maxy = self.maxy + y_mar if max_y is None else min(max_y, self.maxy + y_mar)
    miny = max(0, self.miny - y_mar)
    box: list[float] = [minx, miny, maxx, maxy]
    return BoundingBox(box)
get_shifted_box()

Returns shifted BoundingBox.

Returns:

Name Type Description
BoundingBox

A new BoundingBox instance representing the shifted box.

Source code in sahi/annotation.py
def get_shifted_box(self):
    """Return a new BoundingBox translated by this box's shift amount.

    Returns:
        BoundingBox: A new BoundingBox instance representing the shifted box.
    """
    dx = self.shift_x
    dy = self.shift_y
    shifted_coords = [
        self.minx + dx,
        self.miny + dy,
        self.maxx + dx,
        self.maxy + dy,
    ]
    return BoundingBox(shifted_coords)
to_coco_bbox()

Returns the bounding box in COCO format: [xmin, ymin, width, height]

Returns:

Type Description

List[float]: A list containing the bounding box in COCO format.

Source code in sahi/annotation.py
def to_coco_bbox(self):
    """Return the bounding box in COCO format: [xmin, ymin, width, height].

    Returns:
        List[float]: A list containing the bounding box in COCO format.
    """
    # COCO uses the same xywh layout, so delegate directly.
    return self.to_xywh()
to_voc_bbox()

Returns the bounding box in VOC format: [xmin, ymin, xmax, ymax]

Returns:

Type Description

List[float]: A list containing the bounding box in VOC format.

Source code in sahi/annotation.py
def to_voc_bbox(self):
    """Return the bounding box in VOC format: [xmin, ymin, xmax, ymax].

    Returns:
        List[float]: A list containing the bounding box in VOC format.
    """
    # VOC uses the same corner layout, so delegate directly.
    return self.to_xyxy()
to_xywh()

Returns [xmin, ymin, width, height]

Returns:

Type Description

List[float]: A list containing the bounding box in the format [xmin, ymin, width, height].

Source code in sahi/annotation.py
def to_xywh(self):
    """Return the bounding box as [xmin, ymin, width, height].

    Returns:
        List[float]: A list containing the bounding box in the format [xmin, ymin, width, height].
    """
    width = self.maxx - self.minx
    height = self.maxy - self.miny
    return [self.minx, self.miny, width, height]
to_xyxy()

Returns: [xmin, ymin, xmax, ymax]

Returns:

Type Description

List[float]: A list containing the bounding box in the format [xmin, ymin, xmax, ymax].

Source code in sahi/annotation.py
def to_xyxy(self):
    """Return the bounding box as [xmin, ymin, xmax, ymax].

    Returns:
        List[float]: A list containing the bounding box in the format [xmin, ymin, xmax, ymax].
    """
    corners = (self.minx, self.miny, self.maxx, self.maxy)
    return list(corners)

Category dataclass

Category of the annotation.

Attributes:

Name Type Description
id int

Unique identifier for the category.

name str

Name of the category.

Source code in sahi/annotation.py
@dataclass(frozen=True)
class Category:
    """Immutable category (label) of an annotation.

    Attributes:
        id (int): Unique identifier for the category.
        name (str): Name of the category.
    """

    id: int
    name: str

    def __post_init__(self):
        # Fail fast on wrongly-typed fields so bad categories never circulate.
        if not isinstance(self.id, int):
            raise TypeError("id should be integer")
        if not isinstance(self.name, str):
            raise TypeError("name should be string")

    def __repr__(self):
        return "Category: <id: {}, name: {}>".format(self.id, self.name)

DetectionModel

Source code in sahi/models/base.py
class DetectionModel:
    """Framework-agnostic base class for detection / instance segmentation models.

    Holds shared configuration (thresholds, category mappings, device) and the
    post-processing pipeline; subclasses implement loading and inference.
    """

    # Packages checked by check_dependencies() by default; subclasses override.
    required_packages: list[str] | None = None

    def __init__(
        self,
        model_path: str | None = None,
        model: Any | None = None,
        config_path: str | None = None,
        device: str | None = None,
        mask_threshold: float = 0.5,
        confidence_threshold: float = 0.3,
        category_mapping: dict | None = None,
        category_remapping: dict | None = None,
        load_at_init: bool = True,
        image_size: int | None = None,
    ):
        """Init object detection/instance segmentation model.

        Args:
            model_path: str
                Path for the instance segmentation model weight
            model: Any
                A pre-initialized model instance; when provided and load_at_init
                is True, it is passed to set_model() instead of loading from model_path.
            config_path: str
                Path for the mmdetection instance segmentation model config file
            device: Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.
            mask_threshold: float
                Value to threshold mask pixels, should be between 0 and 1
            confidence_threshold: float
                All predictions with score < confidence_threshold will be discarded
            category_mapping: dict: str to str
                Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}
            category_remapping: dict: str to int
                Remap category ids based on category names, after performing inference e.g. {"car": 3}
            load_at_init: bool
                If True, automatically loads the model at initialization
            image_size: int
                Inference input size.
        """

        self.model_path = model_path
        self.config_path = config_path
        self.model = None
        self.mask_threshold = mask_threshold
        self.confidence_threshold = confidence_threshold
        self.category_mapping = category_mapping
        self.category_remapping = category_remapping
        self.image_size = image_size
        self._original_predictions = None
        self._object_prediction_list_per_image = None
        self.set_device(device)

        # automatically ensure dependencies
        self.check_dependencies()

        # automatically load model if load_at_init is True
        if load_at_init:
            # Compare against None explicitly: truthiness of arbitrary model
            # objects is unreliable (frameworks may override __bool__/__len__).
            if model is not None:
                self.set_model(model)
            else:
                self.load_model()

    def check_dependencies(self, packages: list[str] | None = None) -> None:
        """Ensures required dependencies are installed.

        If 'packages' is None, uses self.required_packages. Subclasses may still call with a custom list for dynamic
        needs.
        """
        pkgs = packages if packages is not None else getattr(self, "required_packages", [])
        if pkgs:
            check_requirements(pkgs)

    def load_model(self):
        """Initialize the detection model and assign it to self.model.

        Abstract: subclasses must override.
        (self.model_path, self.config_path, and self.device should be utilized)

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def set_model(self, model: Any, **kwargs):
        """Configure this DetectionModel from an already loaded framework model.

        Abstract: subclasses must override.

        Args:
            model: Any
                Loaded model
        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def set_device(self, device: str | None = None):
        """Sets the device pytorch should use for the model.

        Args:
            device: Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.
        """
        # Delegate resolution of the (possibly None) device string to select_device.
        self.device = select_device(device)

    def unload_model(self):
        """Unloads the model from CPU/GPU."""
        self.model = None
        # Free any cached GPU memory held by the dropped model.
        empty_cuda_cache()

    def perform_inference(self, image: np.ndarray):
        """Run the model on `image` and store raw output in self._original_predictions.

        Abstract: subclasses must override.

        Args:
            image: np.ndarray
                A numpy array that contains the image to be predicted.
        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def _create_object_prediction_list_from_original_predictions(
        self,
        shift_amount_list: list[list[int]] | None = None,
        full_shape_list: list[list[int]] | None = None,
    ):
        """Convert self._original_predictions into per-image ObjectPrediction lists.

        Abstract: subclasses must override and set self._object_prediction_list_per_image.
        self.mask_threshold can also be utilized. A None shift_amount_list should be
        treated as [[0, 0]] (no shift); the None default replaces the previous
        mutable default list [[0, 0]].

        Args:
            shift_amount_list: list of list
                To shift the box and mask predictions from sliced image to full sized image, should
                be in the form of List[[shift_x, shift_y],[shift_x, shift_y],...]
            full_shape_list: list of list
                Size of the full image after shifting, should be in the form of
                List[[height, width],[height, width],...]
        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def _apply_category_remapping(self):
        """Applies category remapping based on mapping given in self.category_remapping."""
        # confirm self.category_remapping is not None
        if self.category_remapping is None:
            raise ValueError("self.category_remapping cannot be None")
        # remap categories
        if not isinstance(self._object_prediction_list_per_image, list):
            logger.error(
                f"Unknown type for self._object_prediction_list_per_image: "
                f"{type(self._object_prediction_list_per_image)}"
            )
            return
        for object_prediction_list in self._object_prediction_list_per_image:  # type: ignore
            for object_prediction in object_prediction_list:
                old_category_id_str = str(object_prediction.category.id)
                new_category_id_int = self.category_remapping[old_category_id_str]
                object_prediction.category = Category(id=new_category_id_int, name=object_prediction.category.name)

    def convert_original_predictions(
        self,
        shift_amount: list[list[int]] | None = None,
        full_shape: list[list[int]] | None = None,
    ):
        """Converts original predictions of the detection model to a list of prediction.ObjectPrediction object.

        Should be called after perform_inference().

        Args:
            shift_amount: list
                To shift the box and mask predictions from sliced image to full sized image,
                    should be in the form of [shift_x, shift_y]; None means no shift ([[0, 0]])
            full_shape: list
                Size of the full image after shifting, should be in the form of [height, width]
        """
        # None sentinel instead of a shared mutable default list; resolve to the
        # historical default here so subclass implementations see a concrete value.
        if shift_amount is None:
            shift_amount = [[0, 0]]
        self._create_object_prediction_list_from_original_predictions(
            shift_amount_list=shift_amount,
            full_shape_list=full_shape,
        )
        if self.category_remapping:
            self._apply_category_remapping()

    @property
    def object_prediction_list(self) -> list[ObjectPrediction]:
        # Convenience accessor for single-image inference: the first image's
        # predictions (flat list, not list-of-lists — annotation fixed accordingly).
        if self._object_prediction_list_per_image is None:
            return []
        if len(self._object_prediction_list_per_image) == 0:
            return []
        return self._object_prediction_list_per_image[0]

    @property
    def object_prediction_list_per_image(self) -> list[list[ObjectPrediction]]:
        return self._object_prediction_list_per_image or []

    @property
    def original_predictions(self):
        # Raw framework output captured by the last perform_inference() call (or None).
        return self._original_predictions
Functions
__init__(model_path=None, model=None, config_path=None, device=None, mask_threshold=0.5, confidence_threshold=0.3, category_mapping=None, category_remapping=None, load_at_init=True, image_size=None)

Init object detection/instance segmentation model.

Parameters:

Name Type Description Default
model_path str | None

str Path for the instance segmentation model weight

None
config_path str | None

str Path for the mmdetection instance segmentation model config file

None
device str | None

Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.

None
mask_threshold float

float Value to threshold mask pixels, should be between 0 and 1

0.5
confidence_threshold float

float All predictions with score < confidence_threshold will be discarded

0.3
category_mapping dict | None

dict: str to str Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}

None
category_remapping dict | None

dict: str to int Remap category ids based on category names, after performing inference e.g. {"car": 3}

None
load_at_init bool

bool If True, automatically loads the model at initialization

True
image_size int | None

int Inference input size.

None
Source code in sahi/models/base.py
def __init__(
    self,
    model_path: str | None = None,
    model: Any | None = None,
    config_path: str | None = None,
    device: str | None = None,
    mask_threshold: float = 0.5,
    confidence_threshold: float = 0.3,
    category_mapping: dict | None = None,
    category_remapping: dict | None = None,
    load_at_init: bool = True,
    image_size: int | None = None,
):
    """Init object detection/instance segmentation model.

    Args:
        model_path: str
            Path for the instance segmentation model weight
        model: Any
            A pre-initialized model instance; when provided and load_at_init
            is True, it is passed to set_model() instead of loading from model_path.
        config_path: str
            Path for the mmdetection instance segmentation model config file
        device: Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.
        mask_threshold: float
            Value to threshold mask pixels, should be between 0 and 1
        confidence_threshold: float
            All predictions with score < confidence_threshold will be discarded
        category_mapping: dict: str to str
            Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}
        category_remapping: dict: str to int
            Remap category ids based on category names, after performing inference e.g. {"car": 3}
        load_at_init: bool
            If True, automatically loads the model at initialization
        image_size: int
            Inference input size.
    """

    self.model_path = model_path
    self.config_path = config_path
    self.model = None
    self.mask_threshold = mask_threshold
    self.confidence_threshold = confidence_threshold
    self.category_mapping = category_mapping
    self.category_remapping = category_remapping
    self.image_size = image_size
    self._original_predictions = None
    self._object_prediction_list_per_image = None
    self.set_device(device)

    # automatically ensure dependencies
    self.check_dependencies()

    # automatically load model if load_at_init is True
    if load_at_init:
        # Compare against None explicitly: truthiness of arbitrary model
        # objects is unreliable (frameworks may override __bool__/__len__).
        if model is not None:
            self.set_model(model)
        else:
            self.load_model()
check_dependencies(packages=None)

Ensures required dependencies are installed.

If 'packages' is None, uses self.required_packages. Subclasses may still call with a custom list for dynamic needs.

Source code in sahi/models/base.py
def check_dependencies(self, packages: list[str] | None = None) -> None:
    """Ensures required dependencies are installed.

    If 'packages' is None, uses self.required_packages. Subclasses may still call with a custom list for dynamic
    needs.
    """
    pkgs = packages if packages is not None else getattr(self, "required_packages", [])
    if pkgs:
        check_requirements(pkgs)
convert_original_predictions(shift_amount=[[0, 0]], full_shape=None)

Converts original predictions of the detection model to a list of prediction.ObjectPrediction object.

Should be called after perform_inference().

Parameters: shift_amount (list) — shifts the box and mask predictions from the sliced image to the full-sized image; given as [shift_x, shift_y]. full_shape (list) — size of the full image after shifting; given as [height, width].

Source code in sahi/models/base.py
def convert_original_predictions(
    self,
    shift_amount: list[list[int]] | None = [[0, 0]],
    full_shape: list[list[int]] | None = None,
):
    """Converts original predictions of the detection model to a list of prediction.ObjectPrediction object.

    Should be called after perform_inference().
    Args:
        shift_amount: list
            To shift the box and mask predictions from sliced image to full sized image,
                should be in the form of [shift_x, shift_y]
        full_shape: list
            Size of the full image after shifting, should be in the form of [height, width]
    """
    self._create_object_prediction_list_from_original_predictions(
        shift_amount_list=shift_amount,
        full_shape_list=full_shape,
    )
    if self.category_remapping:
        self._apply_category_remapping()
load_model()

This function should be implemented in a way that detection model should be initialized and set to self.model.

(self.model_path, self.config_path, and self.device should be utilized)

Source code in sahi/models/base.py
def load_model(self):
    """Initialize the detection model and assign it to self.model.

    Abstract: subclasses must override.
    (self.model_path, self.config_path, and self.device should be utilized)

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError()
perform_inference(image)

This function should be implemented in a way that prediction should be performed using self.model and the prediction result should be set to self._original_predictions.

Parameters:

Name Type Description Default
image ndarray

np.ndarray A numpy array that contains the image to be predicted.

required
Source code in sahi/models/base.py
def perform_inference(self, image: np.ndarray):
    """Run the detection model on `image` and store the raw framework output in self._original_predictions.

    Abstract: subclasses must override.

    Args:
        image: np.ndarray
            A numpy array that contains the image to be predicted.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError()
set_device(device=None)

Sets the device pytorch should use for the model.

Parameters:

Name Type Description Default
device str | None

Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.

None
Source code in sahi/models/base.py
def set_device(self, device: str | None = None):
    """Sets the device pytorch should use for the model.

    Args:
        device: Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.
    """

    # Delegate resolution of the (possibly None) device string to select_device.
    self.device = select_device(device)
set_model(model, **kwargs)

This function should be implemented to instantiate a DetectionModel out of an already loaded model.

Parameters: model (Any) — the loaded model instance.

Source code in sahi/models/base.py
def set_model(self, model: Any, **kwargs):
    """Adopt an already-instantiated model into this DetectionModel.

    Concrete subclasses must override this.

    Args:
        model: Any
            The loaded model object to wrap.
    """
    raise NotImplementedError()
unload_model()

Unloads the model from CPU/GPU.

Source code in sahi/models/base.py
def unload_model(self):
    """Drop the model reference and release cached CUDA memory."""
    # Clearing the reference first lets the GC reclaim the model before
    # the CUDA cache is emptied.
    self.model = None
    empty_cuda_cache()

Mask

Init Mask from coco segmentation representation.

Parameters:

Name Type Description Default
segmentation

List[List] [ [x1, y1, x2, y2, x3, y3, ...], [x1, y1, x2, y2, x3, y3, ...], ... ]

required
full_shape
list[int]

List[int] Size of the full image, should be in the form of [height, width]

required
shift_amount
list

List[int] To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
Source code in sahi/annotation.py
class Mask:
    """Init Mask from coco segmentation representation.

    Args:
        segmentation : List[List]
            [
                [x1, y1, x2, y2, x3, y3, ...],
                [x1, y1, x2, y2, x3, y3, ...],
                ...
            ]
        full_shape: List[int]
            Size of the full image, should be in the form of [height, width]
        shift_amount: List[int]
            To shift the box and mask predictions from sliced image to full
            sized image, should be in the form of [shift_x, shift_y]
    """

    def __init__(
        self,
        segmentation: list[list[float]],
        full_shape: list[int],
        shift_amount: list | None = None,
    ):
        """Store segmentation polygons plus shift/full-shape metadata.

        Raises:
            ValueError: If full_shape is None.
        """
        if full_shape is None:
            raise ValueError("full_shape must be provided")  # pyright: ignore[reportUnreachable]

        # None sentinel avoids the shared mutable-default-argument pitfall;
        # behavior is identical to the previous [0, 0] default.
        if shift_amount is None:
            shift_amount = [0, 0]
        self.shift_x = shift_amount[0]
        self.shift_y = shift_amount[1]
        self.full_shape_height = full_shape[0]
        self.full_shape_width = full_shape[1]
        self.segmentation = segmentation

    @classmethod
    def from_float_mask(
        cls,
        mask: np.ndarray,
        full_shape: list[int],
        mask_threshold: float = 0.5,
        shift_amount: list | None = None,
    ):
        """
        Args:
            mask: np.ndarray of np.float elements
                Mask values between 0 and 1 (should have a shape of height*width)
            mask_threshold: float
                Value to threshold mask pixels between 0 and 1
            shift_amount: List
                To shift the box and mask predictions from sliced image
                to full sized image, should be in the form of [shift_x, shift_y]
            full_shape: List[int]
                Size of the full image after shifting, should be in the form of [height, width]
        """
        bool_mask = mask > mask_threshold
        # None shift_amount falls through to the [0, 0] default in __init__.
        return cls(
            segmentation=get_coco_segmentation_from_bool_mask(bool_mask),
            shift_amount=shift_amount,
            full_shape=full_shape,
        )

    @classmethod
    def from_bool_mask(
        cls,
        bool_mask: np.ndarray,
        full_shape: list[int],
        shift_amount: list | None = None,
    ):
        """
        Args:
            bool_mask: np.ndarray with bool elements
                2D mask of object, should have a shape of height*width
            full_shape: List[int]
                Size of the full image, should be in the form of [height, width]
            shift_amount: List[int]
                To shift the box and mask predictions from sliced image to full
                sized image, should be in the form of [shift_x, shift_y]
        """
        # None shift_amount falls through to the [0, 0] default in __init__.
        return cls(
            segmentation=get_coco_segmentation_from_bool_mask(bool_mask),
            shift_amount=shift_amount,
            full_shape=full_shape,
        )

    @property
    def bool_mask(self) -> np.ndarray:
        # Rasterizes the stored polygons onto a full_shape-sized boolean grid.
        return get_bool_mask_from_coco_segmentation(
            self.segmentation, width=self.full_shape[1], height=self.full_shape[0]
        )

    @property
    def shape(self) -> list[int]:
        """Returns mask shape as [height, width]"""
        return [self.bool_mask.shape[0], self.bool_mask.shape[1]]

    @property
    def full_shape(self) -> list[int]:
        """Returns full mask shape after shifting as [height, width]"""
        return [self.full_shape_height, self.full_shape_width]

    @property
    def shift_amount(self):
        """Returns the shift amount of the mask slice as [shift_x, shift_y]"""
        return [self.shift_x, self.shift_y]

    def get_shifted_mask(self) -> Mask:
        """Return a copy shifted by (shift_x, shift_y), clipped to full_shape."""
        # Confirm full_shape is specified
        if (self.full_shape_height is None) or (self.full_shape_width is None):
            raise ValueError("full_shape is None")
        shifted_segmentation = []
        for s in self.segmentation:
            # Even indices are x coordinates, odd indices are y coordinates.
            xs = [min(self.shift_x + s[i], self.full_shape_width) for i in range(0, len(s) - 1, 2)]
            ys = [min(self.shift_y + s[i], self.full_shape_height) for i in range(1, len(s), 2)]
            shifted_segmentation.append([j for i in zip(xs, ys) for j in i])
        return Mask(
            segmentation=shifted_segmentation,
            shift_amount=[0, 0],
            full_shape=self.full_shape,
        )
Attributes
full_shape property

Returns full mask shape after shifting as [height, width]

shape property

Returns mask shape as [height, width]

shift_amount property

Returns the shift amount of the mask slice as [shift_x, shift_y]

Functions
from_bool_mask(bool_mask, full_shape, shift_amount=[0, 0]) classmethod

Parameters:

Name Type Description Default
bool_mask ndarray

np.ndarray with bool elements 2D mask of object, should have a shape of height*width

required
full_shape list[int]

List[int] Size of the full image, should be in the form of [height, width]

required
shift_amount list

List[int] To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
Source code in sahi/annotation.py
@classmethod
def from_bool_mask(
    cls,
    bool_mask: np.ndarray,
    full_shape: list[int],
    shift_amount: list = [0, 0],
):
    """Build a Mask from a 2D boolean array.

    Args:
        bool_mask: 2D (height x width) np.ndarray of bools marking object pixels.
        full_shape: Full image size as [height, width].
        shift_amount: [shift_x, shift_y] offset mapping sliced-image
            predictions back onto the full image.
    """
    # Convert the raster mask to COCO polygon form before constructing.
    segmentation = get_coco_segmentation_from_bool_mask(bool_mask)
    return cls(
        segmentation=segmentation,
        shift_amount=shift_amount,
        full_shape=full_shape,
    )
from_float_mask(mask, full_shape, mask_threshold=0.5, shift_amount=[0, 0]) classmethod

Parameters:

Name Type Description Default
mask ndarray

np.ndarray of np.float elements Mask values between 0 and 1 (should have a shape of height*width)

required
mask_threshold float

float Value to threshold mask pixels between 0 and 1

0.5
shift_amount list

List To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
full_shape list[int]

List[int] Size of the full image after shifting, should be in the form of [height, width]

required
Source code in sahi/annotation.py
@classmethod
def from_float_mask(
    cls,
    mask: np.ndarray,
    full_shape: list[int],
    mask_threshold: float = 0.5,
    shift_amount: list = [0, 0],
):
    """Build a Mask from a float probability map by thresholding.

    Args:
        mask: 2D (height x width) np.ndarray of floats in [0, 1].
        mask_threshold: Pixels strictly above this value become part of the mask.
        shift_amount: [shift_x, shift_y] offset mapping sliced-image
            predictions back onto the full image.
        full_shape: Full image size after shifting, as [height, width].
    """
    # Threshold first, then convert the boolean raster to COCO polygons.
    thresholded = mask > mask_threshold
    return cls(
        segmentation=get_coco_segmentation_from_bool_mask(thresholded),
        shift_amount=shift_amount,
        full_shape=full_shape,
    )

ObjectPrediction

Bases: ObjectAnnotation

Class for handling detection model predictions.

Source code in sahi/prediction.py
class ObjectPrediction(ObjectAnnotation):
    """Class for handling detection model predictions."""

    def __init__(
        self,
        bbox: list[int] | None = None,
        category_id: int | None = None,
        category_name: str | None = None,
        segmentation: list[list[float]] | None = None,
        score: float = 0.0,
        shift_amount: list[int] | None = [0, 0],
        full_shape: list[int] | None = None,
    ):
        """Create an ObjectPrediction from bbox, score, category and segmentation data.

        Args:
            bbox: [minx, miny, maxx, maxy] pixel coordinates.
            score: Prediction confidence between 0 and 1.
            category_id: ID of the object category.
            category_name: Name of the object category.
            segmentation: COCO-style polygon list
                [[x1, y1, x2, y2, ...], [x1, y1, x2, y2, ...], ...].
            shift_amount: [shift_x, shift_y] offset mapping sliced-image
                predictions back onto the full image.
            full_shape: [height, width] of the full image after shifting.
        """
        # Score is wrapped first; the remaining fields are handled by the base class.
        self.score = PredictionScore(score)
        super().__init__(
            bbox=bbox,
            category_id=category_id,
            segmentation=segmentation,
            category_name=category_name,
            shift_amount=shift_amount,
            full_shape=full_shape,
        )

    def get_shifted_object_prediction(self):
        """Return a copy with bbox/mask coordinates shifted onto the full image.

        Used for mapping sliced predictions back over the full image.
        """
        shifted_mask = self.mask.get_shifted_mask() if self.mask else None
        return ObjectPrediction(
            bbox=self.bbox.get_shifted_box().to_xyxy(),
            category_id=self.category.id,
            score=self.score.value,
            segmentation=shifted_mask.segmentation if shifted_mask else None,
            category_name=self.category.name,
            shift_amount=[0, 0],
            full_shape=shifted_mask.full_shape if shifted_mask else None,
        )

    def to_coco_prediction(self, image_id=None):
        """Return the sahi.utils.coco.CocoPrediction representation."""
        shared = dict(
            category_id=self.category.id,
            category_name=self.category.name,
            score=self.score.value,
            image_id=image_id,
        )
        # Prefer the polygon form when a mask is available.
        if self.mask:
            return CocoPrediction.from_coco_segmentation(segmentation=self.mask.segmentation, **shared)
        return CocoPrediction.from_coco_bbox(bbox=self.bbox.to_xywh(), **shared)

    def to_fiftyone_detection(self, image_height: int, image_width: int):
        """Return the fiftyone.Detection representation of this prediction."""
        try:
            import fiftyone as fo
        except ImportError:
            raise ImportError('Please run "pip install -U fiftyone" to install fiftyone first for fiftyone conversion.')

        x1, y1, x2, y2 = self.bbox.to_xyxy()
        # fiftyone expects [x, y, w, h] relative to the image dimensions.
        rel_box = [x1 / image_width, y1 / image_height, (x2 - x1) / image_width, (y2 - y1) / image_height]
        return fo.Detection(label=self.category.name, bounding_box=rel_box, confidence=self.score.value)

    def __repr__(self):
        return f"""ObjectPrediction<
    bbox: {self.bbox},
    mask: {self.mask},
    score: {self.score},
    category: {self.category}>"""
Functions
__init__(bbox=None, category_id=None, category_name=None, segmentation=None, score=0.0, shift_amount=[0, 0], full_shape=None)

Creates ObjectPrediction from bbox, score, category_id, category_name, segmentation.

Parameters:

Name Type Description Default
bbox list[int] | None

list [minx, miny, maxx, maxy]

None
score float

float Prediction score between 0 and 1

0.0
category_id int | None

int ID of the object category

None
category_name str | None

str Name of the object category

None
segmentation list[list[float]] | None

List[List] [ [x1, y1, x2, y2, x3, y3, ...], [x1, y1, x2, y2, x3, y3, ...], ... ]

None
shift_amount list[int] | None

list To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
full_shape list[int] | None

list Size of the full image after shifting, should be in the form of [height, width]

None
Source code in sahi/prediction.py
def __init__(
    self,
    bbox: list[int] | None = None,
    category_id: int | None = None,
    category_name: str | None = None,
    segmentation: list[list[float]] | None = None,
    score: float = 0.0,
    shift_amount: list[int] | None = [0, 0],
    full_shape: list[int] | None = None,
):
    """Create an ObjectPrediction from bbox, score, category and segmentation data.

    Args:
        bbox: [minx, miny, maxx, maxy] pixel coordinates.
        score: Prediction confidence between 0 and 1.
        category_id: ID of the object category.
        category_name: Name of the object category.
        segmentation: COCO-style polygon list
            [[x1, y1, x2, y2, ...], [x1, y1, x2, y2, ...], ...].
        shift_amount: [shift_x, shift_y] offset mapping sliced-image
            predictions back onto the full image.
        full_shape: [height, width] of the full image after shifting.
    """
    # Score is wrapped first; the remaining fields are handled by the base class.
    self.score = PredictionScore(score)
    super().__init__(
        bbox=bbox,
        category_id=category_id,
        category_name=category_name,
        segmentation=segmentation,
        shift_amount=shift_amount,
        full_shape=full_shape,
    )
get_shifted_object_prediction()

Returns shifted version ObjectPrediction.

Shifts bbox and mask coords. Used for mapping sliced predictions over full image.

Source code in sahi/prediction.py
def get_shifted_object_prediction(self):
    """Return a copy with bbox/mask coordinates shifted onto the full image.

    Used for mapping sliced predictions back over the full image.
    """
    shifted_mask = self.mask.get_shifted_mask() if self.mask else None
    return ObjectPrediction(
        bbox=self.bbox.get_shifted_box().to_xyxy(),
        category_id=self.category.id,
        score=self.score.value,
        segmentation=shifted_mask.segmentation if shifted_mask else None,
        category_name=self.category.name,
        shift_amount=[0, 0],
        full_shape=shifted_mask.full_shape if shifted_mask else None,
    )
to_coco_prediction(image_id=None)

Returns sahi.utils.coco.CocoPrediction representation of ObjectAnnotation.

Source code in sahi/prediction.py
def to_coco_prediction(self, image_id=None):
    """Return the sahi.utils.coco.CocoPrediction representation."""
    shared = dict(
        category_id=self.category.id,
        category_name=self.category.name,
        score=self.score.value,
        image_id=image_id,
    )
    # Prefer the polygon form when a mask is available.
    if self.mask:
        return CocoPrediction.from_coco_segmentation(segmentation=self.mask.segmentation, **shared)
    return CocoPrediction.from_coco_bbox(bbox=self.bbox.to_xywh(), **shared)
to_fiftyone_detection(image_height, image_width)

Returns fiftyone.Detection representation of ObjectPrediction.

Source code in sahi/prediction.py
def to_fiftyone_detection(self, image_height: int, image_width: int):
    """Return the fiftyone.Detection representation of this prediction."""
    try:
        import fiftyone as fo
    except ImportError:
        raise ImportError('Please run "pip install -U fiftyone" to install fiftyone first for fiftyone conversion.')

    x1, y1, x2, y2 = self.bbox.to_xyxy()
    # fiftyone expects [x, y, w, h] relative to the image dimensions.
    rel_box = [x1 / image_width, y1 / image_height, (x2 - x1) / image_width, (y2 - y1) / image_height]
    return fo.Detection(label=self.category.name, bounding_box=rel_box, confidence=self.score.value)

Modules

annotation

Classes
BoundingBox dataclass

BoundingBox represents a rectangular region in 2D space, typically used for object detection annotations.

Attributes:

Name Type Description
box Tuple[float, float, float, float]

The bounding box coordinates in the format (minx, miny, maxx, maxy). - minx (float): Minimum x-coordinate (left). - miny (float): Minimum y-coordinate (top). - maxx (float): Maximum x-coordinate (right). - maxy (float): Maximum y-coordinate (bottom).

shift_amount Tuple[int, int]

The amount to shift the bounding box in the x and y directions. Defaults to (0, 0).

BoundingBox Usage Example

bbox = BoundingBox((10.0, 20.0, 50.0, 80.0))
area = bbox.area
expanded_bbox = bbox.get_expanded_box(ratio=0.2)
shifted_bbox = bbox.get_shifted_box()
coco_format = bbox.to_coco_bbox()
Source code in sahi/annotation.py
@dataclass(frozen=True)
class BoundingBox:
    """Axis-aligned rectangle used for object-detection annotations.

    Attributes:
        box (Tuple[float, float, float, float]): Coordinates as (minx, miny, maxx, maxy).
            - minx (float): Minimum x-coordinate (left).
            - miny (float): Minimum y-coordinate (top).
            - maxx (float): Maximum x-coordinate (right).
            - maxy (float): Maximum y-coordinate (bottom).
        shift_amount (Tuple[int, int], optional): (shift_x, shift_y) offset applied by
            get_shifted_box(). Defaults to (0, 0).

    !!! example "BoundingBox Usage Example"
        ```python
        bbox = BoundingBox((10.0, 20.0, 50.0, 80.0))
        area = bbox.area
        expanded_bbox = bbox.get_expanded_box(ratio=0.2)
        shifted_bbox = bbox.get_shifted_box()
        coco_format = bbox.to_coco_bbox()
        ```
    """

    box: tuple[float, float, float, float] | list[float]
    shift_amount: tuple[int, int] = (0, 0)

    def __post_init__(self):
        # Validate eagerly so a malformed box fails at construction time.
        if len(self.box) != 4 or any(value < 0 for value in self.box):
            raise ValueError("box must be 4 non-negative floats: [minx, miny, maxx, maxy]")
        if len(self.shift_amount) != 2:
            raise ValueError("shift_amount must be 2 integers: [shift_x, shift_y]")

    @property
    def minx(self):
        return self.box[0]

    @property
    def miny(self):
        return self.box[1]

    @property
    def maxx(self):
        return self.box[2]

    @property
    def maxy(self):
        return self.box[3]

    @property
    def shift_x(self):
        return self.shift_amount[0]

    @property
    def shift_y(self):
        return self.shift_amount[1]

    @property
    def area(self):
        # Width times height.
        return (self.maxx - self.minx) * (self.maxy - self.miny)

    def get_expanded_box(self, ratio: float = 0.1, max_x: int | None = None, max_y: int | None = None):
        """Return a new box grown by *ratio* in every direction.

        Args:
            ratio (float, optional): Proportion by which to expand. Default is 0.1 (10%).
            max_x (int, optional): Clip the expanded box at this x-coordinate, if given.
            max_y (int, optional): Clip the expanded box at this y-coordinate, if given.

        Returns:
            BoundingBox: A new BoundingBox instance representing the expanded box.
        """
        width = self.maxx - self.minx
        height = self.maxy - self.miny
        margin_x = int(width * ratio)
        margin_y = int(height * ratio)
        new_minx = max(0, self.minx - margin_x)
        new_miny = max(0, self.miny - margin_y)
        new_maxx = self.maxx + margin_x
        new_maxy = self.maxy + margin_y
        # Clipping is skipped when max_x/max_y is falsy, matching the original
        # truthiness-based behavior.
        if max_x:
            new_maxx = min(max_x, new_maxx)
        if max_y:
            new_maxy = min(max_y, new_maxy)
        expanded: list[float] = [new_minx, new_miny, new_maxx, new_maxy]
        return BoundingBox(expanded)

    def to_xywh(self):
        """Return the box as [xmin, ymin, width, height].

        Returns:
            List[float]: The box in [xmin, ymin, width, height] form.
        """
        return [self.minx, self.miny, self.maxx - self.minx, self.maxy - self.miny]

    def to_coco_bbox(self):
        """Return the box in COCO format: [xmin, ymin, width, height].

        Returns:
            List[float]: The box in COCO format.
        """
        return self.to_xywh()

    def to_xyxy(self):
        """Return the box as [xmin, ymin, xmax, ymax].

        Returns:
            List[float]: The box in [xmin, ymin, xmax, ymax] form.
        """
        return [self.minx, self.miny, self.maxx, self.maxy]

    def to_voc_bbox(self):
        """Return the box in VOC format: [xmin, ymin, xmax, ymax].

        Returns:
            List[float]: The box in VOC format.
        """
        return self.to_xyxy()

    def get_shifted_box(self):
        """Return a copy translated by (shift_x, shift_y).

        Returns:
            BoundingBox: The shifted box (carrying a fresh zero shift_amount).
        """
        translated = [
            self.minx + self.shift_x,
            self.miny + self.shift_y,
            self.maxx + self.shift_x,
            self.maxy + self.shift_y,
        ]
        return BoundingBox(translated)

    def __repr__(self):
        return (
            f"BoundingBox: <{(self.minx, self.miny, self.maxx, self.maxy)}, "
            f"w: {self.maxx - self.minx}, h: {self.maxy - self.miny}>"
        )
Functions
get_expanded_box(ratio=0.1, max_x=None, max_y=None)

Returns an expanded bounding box by increasing its size by a given ratio. The expansion is applied equally in all directions. Optionally, the expanded box can be clipped to maximum x and y boundaries.

Parameters:

Name Type Description Default
ratio float

The proportion by which to expand the box size. Default is 0.1 (10%).

0.1
max_x int

The maximum allowed x-coordinate for the expanded box. If None, no maximum is applied.

None
max_y int

The maximum allowed y-coordinate for the expanded box. If None, no maximum is applied.

None

Returns:

Name Type Description
BoundingBox

A new BoundingBox instance representing the expanded box.

Source code in sahi/annotation.py
def get_expanded_box(self, ratio: float = 0.1, max_x: int | None = None, max_y: int | None = None):
    """Return a new box grown by *ratio* in every direction.

    Args:
        ratio (float, optional): Proportion by which to expand. Default is 0.1 (10%).
        max_x (int, optional): Clip the expanded box at this x-coordinate, if given.
        max_y (int, optional): Clip the expanded box at this y-coordinate, if given.

    Returns:
        BoundingBox: A new BoundingBox instance representing the expanded box.
    """
    width = self.maxx - self.minx
    height = self.maxy - self.miny
    margin_x = int(width * ratio)
    margin_y = int(height * ratio)
    new_minx = max(0, self.minx - margin_x)
    new_miny = max(0, self.miny - margin_y)
    new_maxx = self.maxx + margin_x
    new_maxy = self.maxy + margin_y
    # Clipping is skipped when max_x/max_y is falsy, matching the original
    # truthiness-based behavior.
    if max_x:
        new_maxx = min(max_x, new_maxx)
    if max_y:
        new_maxy = min(max_y, new_maxy)
    expanded: list[float] = [new_minx, new_miny, new_maxx, new_maxy]
    return BoundingBox(expanded)
get_shifted_box()

Returns shifted BoundingBox.

Returns:

Name Type Description
BoundingBox

A new BoundingBox instance representing the shifted box.

Source code in sahi/annotation.py
def get_shifted_box(self):
    """Return a copy translated by (shift_x, shift_y).

    Returns:
        BoundingBox: The shifted box (carrying a fresh zero shift_amount).
    """
    translated = [
        self.minx + self.shift_x,
        self.miny + self.shift_y,
        self.maxx + self.shift_x,
        self.maxy + self.shift_y,
    ]
    return BoundingBox(translated)
to_coco_bbox()

Returns the bounding box in COCO format: [xmin, ymin, width, height]

Returns:

Type Description

List[float]: A list containing the bounding box in COCO format.

Source code in sahi/annotation.py
def to_coco_bbox(self):
    """Return the box in COCO format: [xmin, ymin, width, height].

    Returns:
        List[float]: The box in COCO format.
    """
    # COCO's [x, y, w, h] is exactly the xywh form.
    return self.to_xywh()
to_voc_bbox()

Returns the bounding box in VOC format: [xmin, ymin, xmax, ymax]

Returns:

Type Description

List[float]: A list containing the bounding box in VOC format.

Source code in sahi/annotation.py
def to_voc_bbox(self):
    """Return the box in VOC format: [xmin, ymin, xmax, ymax].

    Returns:
        List[float]: The box in VOC format.
    """
    # VOC's corner form is exactly the xyxy form.
    return self.to_xyxy()
to_xywh()

Returns [xmin, ymin, width, height]

Returns:

Type Description

List[float]: A list containing the bounding box in the format [xmin, ymin, width, height].

Source code in sahi/annotation.py
def to_xywh(self):
    """Return the box as [xmin, ymin, width, height].

    Returns:
        List[float]: The box in [xmin, ymin, width, height] form.
    """
    width = self.maxx - self.minx
    height = self.maxy - self.miny
    return [self.minx, self.miny, width, height]
to_xyxy()

Returns: [xmin, ymin, xmax, ymax]

Returns:

Type Description

List[float]: A list containing the bounding box in the format [xmin, ymin, xmax, ymax].

Source code in sahi/annotation.py
def to_xyxy(self):
    """Return the box as [xmin, ymin, xmax, ymax].

    Returns:
        List[float]: The box in [xmin, ymin, xmax, ymax] form.
    """
    corners = [self.minx, self.miny, self.maxx, self.maxy]
    return corners
Category dataclass

Category of the annotation.

Attributes:

Name Type Description
id int

Unique identifier for the category.

name str

Name of the category.

Source code in sahi/annotation.py
@dataclass(frozen=True)
class Category:
    """Immutable annotation category.

    Attributes:
        id (int): Unique identifier for the category.
        name (str): Human-readable name of the category.
    """

    id: int
    name: str

    def __post_init__(self):
        # Enforce concrete types eagerly; dataclass annotations alone are not checked.
        if not isinstance(self.id, int):
            raise TypeError("id should be integer")
        if not isinstance(self.name, str):
            raise TypeError("name should be string")

    def __repr__(self):
        return f"Category: <id: {self.id}, name: {self.name}>"
Mask

Init Mask from coco segmentation representation.

Parameters:

Name Type Description Default
segmentation

List[List] [ [x1, y1, x2, y2, x3, y3, ...], [x1, y1, x2, y2, x3, y3, ...], ... ]

required
full_shape list[int]

List[int] Size of the full image, should be in the form of [height, width]

required
shift_amount list

List[int] To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
Source code in sahi/annotation.py
class Mask:
    """Init Mask from coco segmentation representation.

    Args:
        segmentation : List[List]
            [
                [x1, y1, x2, y2, x3, y3, ...],
                [x1, y1, x2, y2, x3, y3, ...],
                ...
            ]
        full_shape: List[int]
            Size of the full image, should be in the form of [height, width]
        shift_amount: List[int]
            To shift the box and mask predictions from sliced image to full
            sized image, should be in the form of [shift_x, shift_y]
    """

    def __init__(
        self,
        segmentation: list[list[float]],
        full_shape: list[int],
        shift_amount: list | None = None,
    ):
        """Store segmentation polygons plus shift/full-shape metadata.

        Raises:
            ValueError: If full_shape is None.
        """
        if full_shape is None:
            raise ValueError("full_shape must be provided")  # pyright: ignore[reportUnreachable]

        # None sentinel avoids the shared mutable-default-argument pitfall;
        # behavior is identical to the previous [0, 0] default.
        if shift_amount is None:
            shift_amount = [0, 0]
        self.shift_x = shift_amount[0]
        self.shift_y = shift_amount[1]
        self.full_shape_height = full_shape[0]
        self.full_shape_width = full_shape[1]
        self.segmentation = segmentation

    @classmethod
    def from_float_mask(
        cls,
        mask: np.ndarray,
        full_shape: list[int],
        mask_threshold: float = 0.5,
        shift_amount: list | None = None,
    ):
        """
        Args:
            mask: np.ndarray of np.float elements
                Mask values between 0 and 1 (should have a shape of height*width)
            mask_threshold: float
                Value to threshold mask pixels between 0 and 1
            shift_amount: List
                To shift the box and mask predictions from sliced image
                to full sized image, should be in the form of [shift_x, shift_y]
            full_shape: List[int]
                Size of the full image after shifting, should be in the form of [height, width]
        """
        bool_mask = mask > mask_threshold
        # None shift_amount falls through to the [0, 0] default in __init__.
        return cls(
            segmentation=get_coco_segmentation_from_bool_mask(bool_mask),
            shift_amount=shift_amount,
            full_shape=full_shape,
        )

    @classmethod
    def from_bool_mask(
        cls,
        bool_mask: np.ndarray,
        full_shape: list[int],
        shift_amount: list | None = None,
    ):
        """
        Args:
            bool_mask: np.ndarray with bool elements
                2D mask of object, should have a shape of height*width
            full_shape: List[int]
                Size of the full image, should be in the form of [height, width]
            shift_amount: List[int]
                To shift the box and mask predictions from sliced image to full
                sized image, should be in the form of [shift_x, shift_y]
        """
        # None shift_amount falls through to the [0, 0] default in __init__.
        return cls(
            segmentation=get_coco_segmentation_from_bool_mask(bool_mask),
            shift_amount=shift_amount,
            full_shape=full_shape,
        )

    @property
    def bool_mask(self) -> np.ndarray:
        # Rasterizes the stored polygons onto a full_shape-sized boolean grid.
        return get_bool_mask_from_coco_segmentation(
            self.segmentation, width=self.full_shape[1], height=self.full_shape[0]
        )

    @property
    def shape(self) -> list[int]:
        """Returns mask shape as [height, width]"""
        return [self.bool_mask.shape[0], self.bool_mask.shape[1]]

    @property
    def full_shape(self) -> list[int]:
        """Returns full mask shape after shifting as [height, width]"""
        return [self.full_shape_height, self.full_shape_width]

    @property
    def shift_amount(self):
        """Returns the shift amount of the mask slice as [shift_x, shift_y]"""
        return [self.shift_x, self.shift_y]

    def get_shifted_mask(self) -> Mask:
        """Return a copy shifted by (shift_x, shift_y), clipped to full_shape."""
        # Confirm full_shape is specified
        if (self.full_shape_height is None) or (self.full_shape_width is None):
            raise ValueError("full_shape is None")
        shifted_segmentation = []
        for s in self.segmentation:
            # Even indices are x coordinates, odd indices are y coordinates.
            xs = [min(self.shift_x + s[i], self.full_shape_width) for i in range(0, len(s) - 1, 2)]
            ys = [min(self.shift_y + s[i], self.full_shape_height) for i in range(1, len(s), 2)]
            shifted_segmentation.append([j for i in zip(xs, ys) for j in i])
        return Mask(
            segmentation=shifted_segmentation,
            shift_amount=[0, 0],
            full_shape=self.full_shape,
        )
Attributes
full_shape property

Returns full mask shape after shifting as [height, width]

shape property

Returns mask shape as [height, width]

shift_amount property

Returns the shift amount of the mask slice as [shift_x, shift_y]

Functions
from_bool_mask(bool_mask, full_shape, shift_amount=[0, 0]) classmethod

Parameters:

Name Type Description Default
bool_mask ndarray

np.ndarray with bool elements 2D mask of object, should have a shape of height*width

required
full_shape list[int]

List[int] Size of the full image, should be in the form of [height, width]

required
shift_amount list

List[int] To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
Source code in sahi/annotation.py
@classmethod
def from_bool_mask(
    cls,
    bool_mask: np.ndarray,
    full_shape: list[int],
    shift_amount: list = [0, 0],
):
    """Build a Mask from a 2D boolean array by converting it to COCO polygons.

    Args:
        bool_mask: np.ndarray with bool elements
            2D mask of object, should have a shape of height*width
        full_shape: List[int]
            Size of the full image, should be in the form of [height, width]
        shift_amount: List[int]
            To shift the box and mask predictions from sliced image to full
            sized image, should be in the form of [shift_x, shift_y]
    """
    coco_segmentation = get_coco_segmentation_from_bool_mask(bool_mask)
    return cls(
        segmentation=coco_segmentation,
        full_shape=full_shape,
        shift_amount=shift_amount,
    )
from_float_mask(mask, full_shape, mask_threshold=0.5, shift_amount=[0, 0]) classmethod

Parameters:

Name Type Description Default
mask ndarray

np.ndarray of np.float elements Mask values between 0 and 1 (should have a shape of height*width)

required
mask_threshold float

float Value to threshold mask pixels between 0 and 1

0.5
shift_amount list

List To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
full_shape list[int]

List[int] Size of the full image after shifting, should be in the form of [height, width]

required
Source code in sahi/annotation.py
@classmethod
def from_float_mask(
    cls,
    mask: np.ndarray,
    full_shape: list[int],
    mask_threshold: float = 0.5,
    shift_amount: list = [0, 0],
):
    """Build a Mask by thresholding a float-valued mask into a boolean one.

    Args:
        mask: np.ndarray of float elements
            Mask values between 0 and 1 (should have a shape of height*width)
        full_shape: List[int]
            Size of the full image after shifting, should be in the form of [height, width]
        mask_threshold: float
            Value to threshold mask pixels between 0 and 1
        shift_amount: List
            To shift the box and mask predictions from sliced image
            to full sized image, should be in the form of [shift_x, shift_y]
    """
    # Pixels strictly above the threshold are kept as foreground.
    thresholded = mask > mask_threshold
    return cls(
        segmentation=get_coco_segmentation_from_bool_mask(thresholded),
        full_shape=full_shape,
        shift_amount=shift_amount,
    )
ObjectAnnotation

All about an annotation such as Mask, Category, BoundingBox.

Source code in sahi/annotation.py
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
class ObjectAnnotation:
    """All about an annotation such as Mask, Category, BoundingBox."""

    def __init__(
        self,
        bbox: list[int] | None = None,
        segmentation: list | None = None,
        category_id: int | None = None,
        category_name: str | None = None,
        shift_amount: list[int] | None = [0, 0],
        full_shape: list[int] | None = None,
    ):
        """
        Args:
            bbox: List
                [minx, miny, maxx, maxy]
            segmentation: List[List]
                COCO-style polygons:
                [
                    [x1, y1, x2, y2, x3, y3, ...],
                    [x1, y1, x2, y2, x3, y3, ...],
                    ...
                ]
            category_id: int
                ID of the object category
            category_name: str
                Name of the object category
            shift_amount: List
                To shift the box and mask predictions from sliced image
                to full sized image, should be in the form of [shift_x, shift_y]
            full_shape: List
                Size of the full image after shifting, should be in
                the form of [height, width]

        Raises:
            ValueError: if category_id is not an int, if neither bbox nor
                segmentation is given, or if the segmentation yields no bbox.
        """
        if not isinstance(category_id, int):
            raise ValueError("category_id must be an integer")
        if (bbox is None) and (segmentation is None):
            raise ValueError("you must provide a bbox or segmentation")

        self.mask: Mask | None = None
        if segmentation is not None:
            self.mask = Mask(
                segmentation=segmentation,
                shift_amount=shift_amount,
                full_shape=full_shape,
            )
            # When a segmentation is given, the bbox is always recomputed from
            # it so the two stay consistent.
            # https://github.com/obss/sahi/issues/235
            bbox_from_segmentation = get_bbox_from_coco_segmentation(segmentation)
            if bbox_from_segmentation is not None:
                bbox = bbox_from_segmentation
            else:
                raise ValueError("Invalid segmentation mask.")

        # if bbox is a numpy object, convert it to python List[float]
        if type(bbox).__module__ == "numpy":
            bbox = copy.deepcopy(bbox).tolist()

        # make sure bbox coords lie inside [0, image_size]; the upper bound can
        # only be enforced when the full image size is known
        xmin = max(bbox[0], 0)
        ymin = max(bbox[1], 0)
        if full_shape:
            xmax = min(bbox[2], full_shape[1])
            ymax = min(bbox[3], full_shape[0])
        else:
            xmax = bbox[2]
            ymax = bbox[3]
        bbox = [xmin, ymin, xmax, ymax]
        # set bbox
        self.bbox = BoundingBox(bbox, shift_amount)

        # fall back to the stringified id when no explicit name is given
        category_name = category_name if category_name else str(category_id)
        self.category = Category(
            id=category_id,
            name=category_name,
        )

        self.merged = None

    @classmethod
    def from_bool_mask(
        cls,
        bool_mask,
        category_id: int | None = None,
        category_name: str | None = None,
        shift_amount: list[int] | None = [0, 0],
        full_shape: list[int] | None = None,
    ):
        """Creates ObjectAnnotation from bool_mask (2D np.ndarray)

        Args:
            bool_mask: np.ndarray with bool elements
                2D mask of object, should have a shape of height*width
            category_id: int
                ID of the object category
            category_name: str
                Name of the object category
            full_shape: List
                Size of the full image, should be in the form of [height, width]
            shift_amount: List
                To shift the box and mask predictions from sliced image to full
                sized image, should be in the form of [shift_x, shift_y]
        """
        segmentation = get_coco_segmentation_from_bool_mask(bool_mask)
        return cls(
            category_id=category_id,
            segmentation=segmentation,
            category_name=category_name,
            shift_amount=shift_amount,
            full_shape=full_shape,
        )

    @classmethod
    def from_coco_segmentation(
        cls,
        segmentation,
        full_shape: list[int],
        category_id: int | None = None,
        category_name: str | None = None,
        shift_amount: list[int] | None = [0, 0],
    ):
        """
        Creates ObjectAnnotation from coco segmentation:
        [
            [x1, y1, x2, y2, x3, y3, ...],
            [x1, y1, x2, y2, x3, y3, ...],
            ...
        ]

        Args:
            segmentation: List[List]
                [
                    [x1, y1, x2, y2, x3, y3, ...],
                    [x1, y1, x2, y2, x3, y3, ...],
                    ...
                ]
            category_id: int
                ID of the object category
            category_name: str
                Name of the object category
            full_shape: List
                Size of the full image, should be in the form of [height, width]
            shift_amount: List
                To shift the box and mask predictions from sliced image to full
                sized image, should be in the form of [shift_x, shift_y]
        """
        return cls(
            category_id=category_id,
            segmentation=segmentation,
            category_name=category_name,
            shift_amount=shift_amount,
            full_shape=full_shape,
        )

    @classmethod
    def from_coco_bbox(
        cls,
        bbox: list[int],
        category_id: int | None = None,
        category_name: str | None = None,
        shift_amount: list[int] | None = [0, 0],
        full_shape: list[int] | None = None,
    ):
        """Creates ObjectAnnotation from coco bbox [minx, miny, width, height]

        Args:
            bbox: List
                [minx, miny, width, height]
            category_id: int
                ID of the object category
            category_name: str
                Name of the object category
            full_shape: List
                Size of the full image, should be in the form of [height, width]
            shift_amount: List
                To shift the box and mask predictions from sliced image to full
                sized image, should be in the form of [shift_x, shift_y]
        """
        # convert COCO xywh to the xyxy form __init__ expects
        xmin = bbox[0]
        ymin = bbox[1]
        xmax = bbox[0] + bbox[2]
        ymax = bbox[1] + bbox[3]
        bbox = [xmin, ymin, xmax, ymax]
        return cls(
            category_id=category_id,
            bbox=bbox,
            category_name=category_name,
            shift_amount=shift_amount,
            full_shape=full_shape,
        )

    @classmethod
    def from_coco_annotation_dict(
        cls,
        annotation_dict: dict,
        full_shape: list[int],
        category_name: str | None = None,
        shift_amount: list[int] | None = [0, 0],
    ):
        """Creates ObjectAnnotation object from category name and COCO formatted annotation dict (with fields "bbox",
        "segmentation", "category_id").

        Args:
            annotation_dict: dict
                COCO formatted annotation dict (with fields "bbox", "segmentation", "category_id")
            category_name: str
                Category name of the annotation
            full_shape: List
                Size of the full image, should be in the form of [height, width]
            shift_amount: List
                To shift the box and mask predictions from sliced image to full
                sized image, should be in the form of [shift_x, shift_y]
        """
        # Prefer the segmentation when present (non-empty); otherwise fall back
        # to the bbox-only constructor.
        if annotation_dict["segmentation"]:
            return cls.from_coco_segmentation(
                segmentation=annotation_dict["segmentation"],
                category_id=annotation_dict["category_id"],
                category_name=category_name,
                shift_amount=shift_amount,
                full_shape=full_shape,
            )
        else:
            return cls.from_coco_bbox(
                bbox=annotation_dict["bbox"],
                category_id=annotation_dict["category_id"],
                category_name=category_name,
                shift_amount=shift_amount,
                full_shape=full_shape,
            )

    @classmethod
    def from_shapely_annotation(
        cls,
        annotation: ShapelyAnnotation,
        full_shape: list[int],
        category_id: int | None = None,
        category_name: str | None = None,
        shift_amount: list[int] | None = [0, 0],
    ):
        """Creates ObjectAnnotation from shapely_utils.ShapelyAnnotation.

        Args:
            annotation: shapely_utils.ShapelyAnnotation
            category_id: int
                ID of the object category
            category_name: str
                Name of the object category
            full_shape: List
                Size of the full image, should be in the form of [height, width]
            shift_amount: List
                To shift the box and mask predictions from sliced image to full
                sized image, should be in the form of [shift_x, shift_y]
        """
        return cls(
            category_id=category_id,
            segmentation=annotation.to_coco_segmentation(),
            category_name=category_name,
            shift_amount=shift_amount,
            full_shape=full_shape,
        )

    @classmethod
    def from_imantics_annotation(
        cls,
        annotation,
        shift_amount: list[int] | None = [0, 0],
        full_shape: list[int] | None = None,
    ):
        """Creates ObjectAnnotation from imantics.annotation.Annotation.

        Args:
            annotation: imantics.annotation.Annotation
            shift_amount: List
                To shift the box and mask predictions from sliced image to full
                sized image, should be in the form of [shift_x, shift_y]
            full_shape: List
                Size of the full image, should be in the form of [height, width]
        """
        # __init__ accepts `segmentation`, not `bool_mask` (the previous
        # bool_mask=... call raised TypeError). Convert the imantics boolean
        # mask to COCO polygons first, mirroring from_bool_mask().
        return cls(
            category_id=annotation.category.id,
            segmentation=get_coco_segmentation_from_bool_mask(annotation.mask.array),
            category_name=annotation.category.name,
            shift_amount=shift_amount,
            full_shape=full_shape,
        )

    def to_coco_annotation(self) -> CocoAnnotation:
        """Returns sahi.utils.coco.CocoAnnotation representation of ObjectAnnotation."""
        if self.mask:
            coco_annotation = CocoAnnotation.from_coco_segmentation(
                segmentation=self.mask.segmentation,
                category_id=self.category.id,
                category_name=self.category.name,
            )
        else:
            coco_annotation = CocoAnnotation.from_coco_bbox(
                bbox=self.bbox.to_xywh(),
                category_id=self.category.id,
                category_name=self.category.name,
            )
        return coco_annotation

    def to_coco_prediction(self) -> CocoPrediction:
        """Returns sahi.utils.coco.CocoPrediction representation of ObjectAnnotation.

        Annotations are treated as ground truth, hence the fixed score of 1.
        """
        if self.mask:
            coco_prediction = CocoPrediction.from_coco_segmentation(
                segmentation=self.mask.segmentation,
                category_id=self.category.id,
                category_name=self.category.name,
                score=1,
            )
        else:
            coco_prediction = CocoPrediction.from_coco_bbox(
                bbox=self.bbox.to_xywh(),
                category_id=self.category.id,
                category_name=self.category.name,
                score=1,
            )
        return coco_prediction

    def to_shapely_annotation(self) -> ShapelyAnnotation:
        """Returns sahi.utils.shapely.ShapelyAnnotation representation of ObjectAnnotation."""
        if self.mask:
            shapely_annotation = ShapelyAnnotation.from_coco_segmentation(
                segmentation=self.mask.segmentation,
            )
        else:
            shapely_annotation = ShapelyAnnotation.from_coco_bbox(
                bbox=self.bbox.to_xywh(),
            )
        return shapely_annotation

    def to_imantics_annotation(self):
        """Returns imantics.annotation.Annotation representation of ObjectAnnotation.

        Raises:
            ImportError: if the optional `imantics` package is not installed.
        """
        try:
            import imantics
        except ImportError:
            raise ImportError('Please run "pip install -U imantics" to install imantics first for imantics conversion.')

        imantics_category = imantics.Category(id=self.category.id, name=self.category.name)
        if self.mask is not None:
            imantics_mask = imantics.Mask.create(self.mask.bool_mask)
            imantics_annotation = imantics.annotation.Annotation.from_mask(
                mask=imantics_mask, category=imantics_category
            )
        else:
            imantics_bbox = imantics.BBox.create(self.bbox.to_xyxy())
            imantics_annotation = imantics.annotation.Annotation.from_bbox(
                bbox=imantics_bbox, category=imantics_category
            )
        return imantics_annotation

    def deepcopy(self):
        """
        Returns: deepcopy of current ObjectAnnotation instance
        """
        return copy.deepcopy(self)

    @classmethod
    def get_empty_mask(cls):
        # NOTE(review): the Mask constructor visible in this module takes
        # `segmentation`/`full_shape`/`shift_amount`, not `bool_mask`, so this
        # call looks like it raises TypeError — kept as-is; confirm against
        # sahi.annotation.Mask before relying on it.
        return Mask(bool_mask=None)

    def get_shifted_object_annotation(self):
        """Return a copy of this annotation with bbox/mask moved to full-image coordinates."""
        if self.mask:
            shifted_mask = self.mask.get_shifted_mask()
            return ObjectAnnotation(
                bbox=self.bbox.get_shifted_box().to_xyxy(),
                category_id=self.category.id,
                segmentation=shifted_mask.segmentation,
                category_name=self.category.name,
                shift_amount=[0, 0],
                full_shape=shifted_mask.full_shape,
            )
        else:
            # __init__ has no `bool_mask` parameter; passing bool_mask=None
            # here raised TypeError, so only bbox-related fields are forwarded.
            return ObjectAnnotation(
                bbox=self.bbox.get_shifted_box().to_xyxy(),
                category_id=self.category.id,
                category_name=self.category.name,
                shift_amount=[0, 0],
                full_shape=None,
            )

    def __repr__(self):
        return f"""ObjectAnnotation<
    bbox: {self.bbox},
    mask: {self.mask},
    category: {self.category}>"""
Functions
__init__(bbox=None, segmentation=None, category_id=None, category_name=None, shift_amount=[0, 0], full_shape=None)

Parameters:

Name Type Description Default
bbox list[int] | None

List [minx, miny, maxx, maxy]

None
segmentation ndarray | None

List[List] [ [x1, y1, x2, y2, x3, y3, ...], [x1, y1, x2, y2, x3, y3, ...], ... ]

None
category_id int | None

int ID of the object category

None
category_name str | None

str Name of the object category

None
shift_amount list[int] | None

List To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
full_shape list[int] | None

List Size of the full image after shifting, should be in the form of [height, width]

None
Source code in sahi/annotation.py
def __init__(
    self,
    bbox: list[int] | None = None,
    segmentation: list | None = None,  # COCO-style polygons, not an ndarray
    category_id: int | None = None,
    category_name: str | None = None,
    shift_amount: list[int] | None = [0, 0],
    full_shape: list[int] | None = None,
):
    """
    Args:
        bbox: List
            [minx, miny, maxx, maxy]
        segmentation: List[List]
            [
                [x1, y1, x2, y2, x3, y3, ...],
                [x1, y1, x2, y2, x3, y3, ...],
                ...
            ]
        category_id: int
            ID of the object category
        category_name: str
            Name of the object category
        shift_amount: List
            To shift the box and mask predictions from sliced image
            to full sized image, should be in the form of [shift_x, shift_y]
        full_shape: List
            Size of the full image after shifting, should be in
            the form of [height, width]

    Raises:
        ValueError: if category_id is not an int, if neither bbox nor
            segmentation is given, or if the segmentation yields no bbox.
    """
    if not isinstance(category_id, int):
        raise ValueError("category_id must be an integer")
    if (bbox is None) and (segmentation is None):
        raise ValueError("you must provide a bbox or segmentation")

    self.mask: Mask | None = None
    if segmentation is not None:
        self.mask = Mask(
            segmentation=segmentation,
            shift_amount=shift_amount,
            full_shape=full_shape,
        )
        # When a segmentation is given, the bbox is always recomputed from it
        # so the two stay consistent.
        bbox_from_segmentation = get_bbox_from_coco_segmentation(segmentation)
        # https://github.com/obss/sahi/issues/235
        if bbox_from_segmentation is not None:
            bbox = bbox_from_segmentation
        else:
            raise ValueError("Invalid segmentation mask.")

    # if bbox is a numpy object, convert it to python List[float]
    if type(bbox).__module__ == "numpy":
        bbox = copy.deepcopy(bbox).tolist()

    # make sure bbox coords lie inside [0, image_size]; the upper bound can
    # only be clamped when the full image size is known
    xmin = max(bbox[0], 0)
    ymin = max(bbox[1], 0)
    if full_shape:
        xmax = min(bbox[2], full_shape[1])
        ymax = min(bbox[3], full_shape[0])
    else:
        xmax = bbox[2]
        ymax = bbox[3]
    bbox = [xmin, ymin, xmax, ymax]
    # set bbox
    self.bbox = BoundingBox(bbox, shift_amount)

    # fall back to the stringified id when no explicit name is given
    category_name = category_name if category_name else str(category_id)
    self.category = Category(
        id=category_id,
        name=category_name,
    )

    self.merged = None
deepcopy()

Returns: deepcopy of current ObjectAnnotation instance

Source code in sahi/annotation.py
def deepcopy(self):
    """Return a deep copy of this ObjectAnnotation, sharing no mutable state with the original."""
    duplicate = copy.deepcopy(self)
    return duplicate
from_bool_mask(bool_mask, category_id=None, category_name=None, shift_amount=[0, 0], full_shape=None) classmethod

Creates ObjectAnnotation from bool_mask (2D np.ndarray)

Parameters:

Name Type Description Default
bool_mask

np.ndarray with bool elements 2D mask of object, should have a shape of height*width

required
category_id int | None

int ID of the object category

None
category_name str | None

str Name of the object category

None
full_shape list[int] | None

List Size of the full image, should be in the form of [height, width]

None
shift_amount list[int] | None

List To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
Source code in sahi/annotation.py
@classmethod
def from_bool_mask(
    cls,
    bool_mask,
    category_id: int | None = None,
    category_name: str | None = None,
    shift_amount: list[int] | None = [0, 0],
    full_shape: list[int] | None = None,
):
    """Creates ObjectAnnotation from bool_mask (2D np.ndarray)

    The boolean mask is converted to COCO polygons before construction.

    Args:
        bool_mask: np.ndarray with bool elements
            2D mask of object, should have a shape of height*width
        category_id: int
            ID of the object category
        category_name: str
            Name of the object category
        full_shape: List
            Size of the full image, should be in the form of [height, width]
        shift_amount: List
            To shift the box and mask predictions from sliced image to full
            sized image, should be in the form of [shift_x, shift_y]
    """
    return cls(
        category_id=category_id,
        segmentation=get_coco_segmentation_from_bool_mask(bool_mask),
        category_name=category_name,
        shift_amount=shift_amount,
        full_shape=full_shape,
    )
from_coco_annotation_dict(annotation_dict, full_shape, category_name=None, shift_amount=[0, 0]) classmethod

Creates ObjectAnnotation object from category name and COCO formatted annotation dict (with fields "bbox", "segmentation", "category_id").

Parameters:

Name Type Description Default
annotation_dict dict

dict COCO formatted annotation dict (with fields "bbox", "segmentation", "category_id")

required
category_name str | None

str Category name of the annotation

None
full_shape list[int]

List Size of the full image, should be in the form of [height, width]

required
shift_amount list[int] | None

List To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
Source code in sahi/annotation.py
@classmethod
def from_coco_annotation_dict(
    cls,
    annotation_dict: dict,
    full_shape: list[int],
    category_name: str | None = None,
    shift_amount: list[int] | None = [0, 0],
):
    """Creates ObjectAnnotation object from category name and COCO formatted annotation dict (with fields "bbox",
    "segmentation", "category_id").

    Args:
        annotation_dict: dict
            COCO formatted annotation dict (with fields "bbox", "segmentation", "category_id")
        category_name: str
            Category name of the annotation
        full_shape: List
            Size of the full image, should be in the form of [height, width]
        shift_amount: List
            To shift the box and mask predictions from sliced image to full
            sized image, should be in the form of [shift_x, shift_y]
    """
    # Guard clause: no (or empty) segmentation -> build from the bbox alone.
    if not annotation_dict["segmentation"]:
        return cls.from_coco_bbox(
            bbox=annotation_dict["bbox"],
            category_id=annotation_dict["category_id"],
            category_name=category_name,
            shift_amount=shift_amount,
            full_shape=full_shape,
        )
    return cls.from_coco_segmentation(
        segmentation=annotation_dict["segmentation"],
        category_id=annotation_dict["category_id"],
        category_name=category_name,
        shift_amount=shift_amount,
        full_shape=full_shape,
    )
from_coco_bbox(bbox, category_id=None, category_name=None, shift_amount=[0, 0], full_shape=None) classmethod

Creates ObjectAnnotation from coco bbox [minx, miny, width, height]

Parameters:

Name Type Description Default
bbox list[int]

List [minx, miny, width, height]

required
category_id int | None

int ID of the object category

None
category_name str | None

str Name of the object category

None
full_shape list[int] | None

List Size of the full image, should be in the form of [height, width]

None
shift_amount list[int] | None

List To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
Source code in sahi/annotation.py
@classmethod
def from_coco_bbox(
    cls,
    bbox: list[int],
    category_id: int | None = None,
    category_name: str | None = None,
    shift_amount: list[int] | None = [0, 0],
    full_shape: list[int] | None = None,
):
    """Creates ObjectAnnotation from coco bbox [minx, miny, width, height]

    Args:
        bbox: List
            [minx, miny, width, height]
        category_id: int
            ID of the object category
        category_name: str
            Name of the object category
        full_shape: List
            Size of the full image, should be in the form of [height, width]
        shift_amount: List
            To shift the box and mask predictions from sliced image to full
            sized image, should be in the form of [shift_x, shift_y]
    """
    # convert COCO xywh to the xyxy form the constructor expects
    minx, miny, box_width, box_height = bbox[0], bbox[1], bbox[2], bbox[3]
    bbox = [minx, miny, minx + box_width, miny + box_height]
    return cls(
        category_id=category_id,
        bbox=bbox,
        category_name=category_name,
        shift_amount=shift_amount,
        full_shape=full_shape,
    )
from_coco_segmentation(segmentation, full_shape, category_id=None, category_name=None, shift_amount=[0, 0]) classmethod

Creates ObjectAnnotation from coco segmentation: [ [x1, y1, x2, y2, x3, y3, ...], [x1, y1, x2, y2, x3, y3, ...], ... ]

Parameters:

Name Type Description Default
segmentation

List[List] [ [x1, y1, x2, y2, x3, y3, ...], [x1, y1, x2, y2, x3, y3, ...], ... ]

required
category_id int | None

int ID of the object category

None
category_name str | None

str Name of the object category

None
full_shape list[int]

List Size of the full image, should be in the form of [height, width]

required
shift_amount list[int] | None

List To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
Source code in sahi/annotation.py
@classmethod
def from_coco_segmentation(
    cls,
    segmentation,
    full_shape: list[int],
    category_id: int | None = None,
    category_name: str | None = None,
    shift_amount: list[int] | None = [0, 0],
):
    """Creates ObjectAnnotation from a COCO segmentation.

    A COCO segmentation is a list of flat polygons:
    [
        [x1, y1, x2, y2, x3, y3, ...],
        [x1, y1, x2, y2, x3, y3, ...],
        ...
    ]

    Args:
        segmentation: List[List]
            COCO-style list of flat [x1, y1, x2, y2, ...] polygons
        category_id: int
            ID of the object category
        category_name: str
            Name of the object category
        full_shape: List
            Size of the full image, should be in the form of [height, width]
        shift_amount: List
            To shift the box and mask predictions from sliced image to full
            sized image, should be in the form of [shift_x, shift_y]
    """
    # Thin forwarding constructor: all validation happens in __init__.
    return cls(
        segmentation=segmentation,
        category_id=category_id,
        category_name=category_name,
        shift_amount=shift_amount,
        full_shape=full_shape,
    )
from_imantics_annotation(annotation, shift_amount=[0, 0], full_shape=None) classmethod

Creates ObjectAnnotation from imantics.annotation.Annotation.

Parameters:

Name Type Description Default
annotation

imantics.annotation.Annotation

required
shift_amount list[int] | None

List To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
full_shape list[int] | None

List Size of the full image, should be in the form of [height, width]

None
Source code in sahi/annotation.py
@classmethod
def from_imantics_annotation(
    cls,
    annotation,
    shift_amount: list[int] | None = [0, 0],
    full_shape: list[int] | None = None,
):
    """Creates ObjectAnnotation from imantics.annotation.Annotation.

    Args:
        annotation: imantics.annotation.Annotation
        shift_amount: List
            To shift the box and mask predictions from sliced image to full
            sized image, should be in the form of [shift_x, shift_y]
        full_shape: List
            Size of the full image, should be in the form of [height, width]
    """
    # __init__ accepts `segmentation`, not `bool_mask` (the previous
    # bool_mask=... call raised TypeError). Convert the imantics boolean
    # mask to COCO polygons first, mirroring from_bool_mask().
    return cls(
        category_id=annotation.category.id,
        segmentation=get_coco_segmentation_from_bool_mask(annotation.mask.array),
        category_name=annotation.category.name,
        shift_amount=shift_amount,
        full_shape=full_shape,
    )
from_shapely_annotation(annotation, full_shape, category_id=None, category_name=None, shift_amount=[0, 0]) classmethod

Creates ObjectAnnotation from shapely_utils.ShapelyAnnotation.

Parameters:

Name Type Description Default
annotation ShapelyAnnotation

shapely_utils.ShapelyAnnotation

required
category_id int | None

int ID of the object category

None
category_name str | None

str Name of the object category

None
full_shape list[int]

List Size of the full image, should be in the form of [height, width]

required
shift_amount list[int] | None

List To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
Source code in sahi/annotation.py
@classmethod
def from_shapely_annotation(
    cls,
    annotation: ShapelyAnnotation,
    full_shape: list[int],
    category_id: int | None = None,
    category_name: str | None = None,
    shift_amount: list[int] | None = [0, 0],
):
    """Creates ObjectAnnotation from shapely_utils.ShapelyAnnotation.

    Args:
        annotation: shapely_utils.ShapelyAnnotation
        category_id: int
            ID of the object category
        category_name: str
            Name of the object category
        full_shape: List
            Size of the full image, should be in the form of [height, width]
        shift_amount: List
            To shift the box and mask predictions from sliced image to full
            sized image, should be in the form of [shift_x, shift_y]
    """
    # Extract the COCO polygons from the shapely wrapper, then delegate.
    coco_segmentation = annotation.to_coco_segmentation()
    return cls(
        category_id=category_id,
        segmentation=coco_segmentation,
        category_name=category_name,
        shift_amount=shift_amount,
        full_shape=full_shape,
    )
to_coco_annotation()

Returns sahi.utils.coco.CocoAnnotation representation of ObjectAnnotation.

Source code in sahi/annotation.py
def to_coco_annotation(self) -> CocoAnnotation:
    """Convert this ObjectAnnotation into a sahi.utils.coco.CocoAnnotation."""
    # Prefer the segmentation representation when a mask is present.
    if self.mask:
        return CocoAnnotation.from_coco_segmentation(
            segmentation=self.mask.segmentation,
            category_id=self.category.id,
            category_name=self.category.name,
        )
    # Otherwise fall back to a bbox-only annotation.
    return CocoAnnotation.from_coco_bbox(
        bbox=self.bbox.to_xywh(),
        category_id=self.category.id,
        category_name=self.category.name,
    )
to_coco_prediction()

Returns sahi.utils.coco.CocoPrediction representation of ObjectAnnotation.

Source code in sahi/annotation.py
def to_coco_prediction(self) -> CocoPrediction:
    """Convert this ObjectAnnotation into a sahi.utils.coco.CocoPrediction.

    The score is hardcoded to 1 since annotations carry no confidence value.
    """
    if self.mask:
        # Mask present: emit a segmentation-based prediction.
        return CocoPrediction.from_coco_segmentation(
            segmentation=self.mask.segmentation,
            category_id=self.category.id,
            category_name=self.category.name,
            score=1,
        )
    # No mask: emit a bbox-based prediction.
    return CocoPrediction.from_coco_bbox(
        bbox=self.bbox.to_xywh(),
        category_id=self.category.id,
        category_name=self.category.name,
        score=1,
    )
to_imantics_annotation()

Returns imantics.annotation.Annotation representation of ObjectAnnotation.

Source code in sahi/annotation.py
def to_imantics_annotation(self):
    """Convert this ObjectAnnotation into an imantics.annotation.Annotation."""
    # imantics is an optional dependency; fail with an actionable message.
    try:
        import imantics
    except ImportError:
        raise ImportError('Please run "pip install -U imantics" to install imantics first for imantics conversion.')

    category = imantics.Category(id=self.category.id, name=self.category.name)
    if self.mask is None:
        # Box-only annotation.
        box = imantics.BBox.create(self.bbox.to_xyxy())
        return imantics.annotation.Annotation.from_bbox(bbox=box, category=category)
    # Mask available: build the annotation from the boolean mask.
    mask = imantics.Mask.create(self.mask.bool_mask)
    return imantics.annotation.Annotation.from_mask(mask=mask, category=category)
to_shapely_annotation()

Returns sahi.utils.shapely.ShapelyAnnotation representation of ObjectAnnotation.

Source code in sahi/annotation.py
def to_shapely_annotation(self) -> ShapelyAnnotation:
    """Convert this ObjectAnnotation into a sahi.utils.shapely.ShapelyAnnotation."""
    # Build from the COCO segmentation when a mask exists, else from the bbox.
    if self.mask:
        return ShapelyAnnotation.from_coco_segmentation(segmentation=self.mask.segmentation)
    return ShapelyAnnotation.from_coco_bbox(bbox=self.bbox.to_xywh())
Functions

auto_model

Classes
AutoDetectionModel
Source code in sahi/auto_model.py
class AutoDetectionModel:
    """Factory that resolves a model-type string to a concrete DetectionModel."""

    @staticmethod
    def from_pretrained(
        model_type: str,
        model_path: str | None = None,
        model: Any | None = None,
        config_path: str | None = None,
        device: str | None = None,
        mask_threshold: float = 0.5,
        confidence_threshold: float = 0.3,
        category_mapping: dict | None = None,
        category_remapping: dict | None = None,
        load_at_init: bool = True,
        image_size: int | None = None,
        **kwargs,
    ) -> DetectionModel:
        """Build and return the DetectionModel matching ``model_type``.

        Args:
            model_type: str
                Name of the detection framework (example: "ultralytics", "huggingface", "torchvision")
            model_path: str
                Path of the detection model (ex. 'model.pt')
            model: Any
                A pre-initialized model instance, if available
            config_path: str
                Path of the config file (ex. 'mmdet/configs/cascade_rcnn_r50_fpn_1x.py')
            device: str
                Device, "cpu" or "cuda:0"
            mask_threshold: float
                Value to threshold mask pixels, should be between 0 and 1
            confidence_threshold: float
                All predictions with score < confidence_threshold will be discarded
            category_mapping: dict: str to str
                Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}
            category_remapping: dict: str to int
                Remap category ids based on category names, after performing inference e.g. {"car": 3}
            load_at_init: bool
                If True, automatically loads the model at initialization
            image_size: int
                Inference input size.

        Returns:
            An instance of the resolved DetectionModel subclass.

        Raises:
            ImportError: If given {model_type} framework is not installed
        """
        # Aliases listed in ULTRALYTICS_MODEL_NAMES all resolve to the
        # generic "ultralytics" backend.
        if model_type in ULTRALYTICS_MODEL_NAMES:
            model_type = "ultralytics"
        # Local name chosen so it does not shadow the DetectionModel type above.
        model_class = import_model_class(model_type, MODEL_TYPE_TO_MODEL_CLASS_NAME[model_type])

        return model_class(
            model_path=model_path,
            model=model,
            config_path=config_path,
            device=device,
            mask_threshold=mask_threshold,
            confidence_threshold=confidence_threshold,
            category_mapping=category_mapping,
            category_remapping=category_remapping,
            load_at_init=load_at_init,
            image_size=image_size,
            **kwargs,
        )
Functions
from_pretrained(model_type, model_path=None, model=None, config_path=None, device=None, mask_threshold=0.5, confidence_threshold=0.3, category_mapping=None, category_remapping=None, load_at_init=True, image_size=None, **kwargs) staticmethod

Loads a DetectionModel from given path.

Parameters:

Name Type Description Default
model_type str

str Name of the detection framework (example: "ultralytics", "huggingface", "torchvision")

required
model_path str | None

str Path of the detection model (ex. 'model.pt')

None
model Any | None

Any A pre-initialized model instance, if available

None
config_path str | None

str Path of the config file (ex. 'mmdet/configs/cascade_rcnn_r50_fpn_1x.py')

None
device str | None

str Device, "cpu" or "cuda:0"

None
mask_threshold float

float Value to threshold mask pixels, should be between 0 and 1

0.5
confidence_threshold float

float All predictions with score < confidence_threshold will be discarded

0.3
category_mapping dict | None

dict: str to str Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}

None
category_remapping dict | None

dict: str to int Remap category ids based on category names, after performing inference e.g. {"car": 3}

None
load_at_init bool

bool If True, automatically loads the model at initialization

True
image_size int | None

int Inference input size.

None

Returns:

Type Description
DetectionModel

Returns an instance of a DetectionModel

Raises:

Type Description
ImportError

If given {model_type} framework is not installed

Source code in sahi/auto_model.py
@staticmethod
def from_pretrained(
    model_type: str,
    model_path: str | None = None,
    model: Any | None = None,
    config_path: str | None = None,
    device: str | None = None,
    mask_threshold: float = 0.5,
    confidence_threshold: float = 0.3,
    category_mapping: dict | None = None,
    category_remapping: dict | None = None,
    load_at_init: bool = True,
    image_size: int | None = None,
    **kwargs,
) -> DetectionModel:
    """Resolve ``model_type`` to a DetectionModel subclass and instantiate it.

    Args:
        model_type: str
            Name of the detection framework (example: "ultralytics", "huggingface", "torchvision")
        model_path: str
            Path of the detection model (ex. 'model.pt')
        model: Any
            A pre-initialized model instance, if available
        config_path: str
            Path of the config file (ex. 'mmdet/configs/cascade_rcnn_r50_fpn_1x.py')
        device: str
            Device, "cpu" or "cuda:0"
        mask_threshold: float
            Value to threshold mask pixels, should be between 0 and 1
        confidence_threshold: float
            All predictions with score < confidence_threshold will be discarded
        category_mapping: dict: str to str
            Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}
        category_remapping: dict: str to int
            Remap category ids based on category names, after performing inference e.g. {"car": 3}
        load_at_init: bool
            If True, automatically loads the model at initialization
        image_size: int
            Inference input size.

    Returns:
        Returns an instance of a DetectionModel

    Raises:
        ImportError: If given {model_type} framework is not installed
    """
    # Normalize every Ultralytics-family alias to the single backend name.
    if model_type in ULTRALYTICS_MODEL_NAMES:
        model_type = "ultralytics"
    class_name = MODEL_TYPE_TO_MODEL_CLASS_NAME[model_type]
    detection_model_class = import_model_class(model_type, class_name)

    return detection_model_class(
        model_path=model_path,
        model=model,
        config_path=config_path,
        device=device,
        mask_threshold=mask_threshold,
        confidence_threshold=confidence_threshold,
        category_mapping=category_mapping,
        category_remapping=category_remapping,
        load_at_init=load_at_init,
        image_size=image_size,
        **kwargs,
    )
Functions

cli

Functions
app()

Cli app.

Source code in sahi/cli.py
def app() -> None:
    """Command line entry point; dispatches arguments via Fire."""
    # fire builds the CLI directly from the sahi_app object.
    fire.Fire(sahi_app)

logger

Classes
BaseSahiLogger

Bases: Logger, ABC

Source code in sahi/logger.py
class BaseSahiLogger(logging.Logger, ABC):
    """Abstract Logger that additionally exposes a PKG_INFO-level logging hook."""

    @abstractmethod
    def pkg_info(self, message: str, *args, **kws) -> None:
        """Log a package info message at PKG_INFO level."""
        # Concrete subclasses must provide the implementation.
        raise NotImplementedError
Functions
pkg_info(message, *args, **kws) abstractmethod

Log a package info message at PKG_INFO level.

Source code in sahi/logger.py
@abstractmethod
def pkg_info(self, message: str, *args, **kws) -> None:
    """Emit ``message`` at the custom PKG_INFO logging level.

    Abstract: concrete logger subclasses must override this hook.
    """
    raise NotImplementedError

models

Modules
base
Classes
DetectionModel
Source code in sahi/models/base.py
class DetectionModel:
    # Packages that must be importable before the model can be used;
    # subclasses extend this list (e.g. ["torch", "transformers"]).
    required_packages: list[str] | None = None

    def __init__(
        self,
        model_path: str | None = None,
        model: Any | None = None,
        config_path: str | None = None,
        device: str | None = None,
        mask_threshold: float = 0.5,
        confidence_threshold: float = 0.3,
        category_mapping: dict | None = None,
        category_remapping: dict | None = None,
        load_at_init: bool = True,
        image_size: int | None = None,
    ):
        """Init object detection/instance segmentation model.

        Args:
            model_path: str
                Path for the instance segmentation model weight
            model: Any
                A pre-initialized model instance; used instead of loading from
                model_path when load_at_init is True
            config_path: str
                Path for the mmdetection instance segmentation model config file
            device: Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.
            mask_threshold: float
                Value to threshold mask pixels, should be between 0 and 1
            confidence_threshold: float
                All predictions with score < confidence_threshold will be discarded
            category_mapping: dict: str to str
                Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}
            category_remapping: dict: str to int
                Remap category ids based on category names, after performing inference e.g. {"car": 3}
            load_at_init: bool
                If True, automatically loads the model at initialization
            image_size: int
                Inference input size.
        """

        self.model_path = model_path
        self.config_path = config_path
        self.model = None
        self.mask_threshold = mask_threshold
        self.confidence_threshold = confidence_threshold
        self.category_mapping = category_mapping
        self.category_remapping = category_remapping
        self.image_size = image_size
        self._original_predictions = None
        self._object_prediction_list_per_image = None
        self.set_device(device)

        # automatically ensure dependencies
        self.check_dependencies()

        # automatically load model if load_at_init is True
        if load_at_init:
            if model:
                self.set_model(model)
            else:
                self.load_model()

    def check_dependencies(self, packages: list[str] | None = None) -> None:
        """Ensures required dependencies are installed.

        If 'packages' is None, uses self.required_packages. Subclasses may still call with a custom list for dynamic
        needs.
        """
        pkgs = packages if packages is not None else getattr(self, "required_packages", [])
        if pkgs:
            check_requirements(pkgs)

    def load_model(self):
        """This function should be implemented in a way that detection model should be initialized and set to
        self.model.

        (self.model_path, self.config_path, and self.device should be utilized)
        """
        raise NotImplementedError()

    def set_model(self, model: Any, **kwargs):
        """
        This function should be implemented to instantiate a DetectionModel out of an already loaded model
        Args:
            model: Any
                Loaded model
        """
        raise NotImplementedError()

    def set_device(self, device: str | None = None):
        """Sets the device pytorch should use for the model.

        Args:
            device: Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.
        """

        self.device = select_device(device)

    def unload_model(self):
        """Unloads the model from CPU/GPU."""
        self.model = None
        empty_cuda_cache()

    def perform_inference(self, image: np.ndarray):
        """This function should be implemented in a way that prediction should be performed using self.model and the
        prediction result should be set to self._original_predictions.

        Args:
            image: np.ndarray
                A numpy array that contains the image to be predicted.
        """
        raise NotImplementedError()

    def _create_object_prediction_list_from_original_predictions(
        self,
        shift_amount_list: list[list[int]] | None = None,
        full_shape_list: list[list[int]] | None = None,
    ):
        """This function should be implemented in a way that self._original_predictions should be converted to a list of
        prediction.ObjectPrediction and set to self._object_prediction_list.

        self.mask_threshold can also be utilized.
        Args:
            shift_amount_list: list of list
                To shift the box and mask predictions from sliced image to full sized image, should
                be in the form of List[[shift_x, shift_y],[shift_x, shift_y],...].
                None is treated as [[0, 0]] (no shift).
            full_shape_list: list of list
                Size of the full image after shifting, should be in the form of
                List[[height, width],[height, width],...]
        """
        raise NotImplementedError()

    def _apply_category_remapping(self):
        """Applies category remapping based on mapping given in self.category_remapping."""
        # confirm self.category_remapping is not None
        if self.category_remapping is None:
            raise ValueError("self.category_remapping cannot be None")
        # remap categories
        if not isinstance(self._object_prediction_list_per_image, list):
            logger.error(
                f"Unknown type for self._object_prediction_list_per_image: "
                f"{type(self._object_prediction_list_per_image)}"
            )
            return
        for object_prediction_list in self._object_prediction_list_per_image:  # type: ignore
            for object_prediction in object_prediction_list:
                old_category_id_str = str(object_prediction.category.id)
                new_category_id_int = self.category_remapping[old_category_id_str]
                object_prediction.category = Category(id=new_category_id_int, name=object_prediction.category.name)

    def convert_original_predictions(
        self,
        shift_amount: list[list[int]] | None = None,
        full_shape: list[list[int]] | None = None,
    ):
        """Converts original predictions of the detection model to a list of prediction.ObjectPrediction object.

        Should be called after perform_inference().
        Args:
            shift_amount: list
                To shift the box and mask predictions from sliced image to full sized image,
                    should be in the form of [shift_x, shift_y]. Defaults to [[0, 0]].
            full_shape: list
                Size of the full image after shifting, should be in the form of [height, width]
        """
        # Materialize the default here instead of using a mutable default
        # argument, so a shared list is never reused across calls.
        if shift_amount is None:
            shift_amount = [[0, 0]]
        self._create_object_prediction_list_from_original_predictions(
            shift_amount_list=shift_amount,
            full_shape_list=full_shape,
        )
        if self.category_remapping:
            self._apply_category_remapping()

    @property
    def object_prediction_list(self) -> list[list[ObjectPrediction]]:
        # Predictions for the first image; empty until conversion has run.
        if not self._object_prediction_list_per_image:
            return []
        return self._object_prediction_list_per_image[0]

    @property
    def object_prediction_list_per_image(self) -> list[list[ObjectPrediction]]:
        return self._object_prediction_list_per_image or []

    @property
    def original_predictions(self):
        return self._original_predictions
Functions
__init__(model_path=None, model=None, config_path=None, device=None, mask_threshold=0.5, confidence_threshold=0.3, category_mapping=None, category_remapping=None, load_at_init=True, image_size=None)

Init object detection/instance segmentation model.

Parameters:

Name Type Description Default
model_path str | None

str Path for the instance segmentation model weight

None
config_path str | None

str Path for the mmdetection instance segmentation model config file

None
device str | None

Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.

None
mask_threshold float

float Value to threshold mask pixels, should be between 0 and 1

0.5
confidence_threshold float

float All predictions with score < confidence_threshold will be discarded

0.3
category_mapping dict | None

dict: str to str Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}

None
category_remapping dict | None

dict: str to int Remap category ids based on category names, after performing inference e.g. {"car": 3}

None
load_at_init bool

bool If True, automatically loads the model at initialization

True
image_size int | None

int Inference input size.

None
Source code in sahi/models/base.py
def __init__(
    self,
    model_path: str | None = None,
    model: Any | None = None,
    config_path: str | None = None,
    device: str | None = None,
    mask_threshold: float = 0.5,
    confidence_threshold: float = 0.3,
    category_mapping: dict | None = None,
    category_remapping: dict | None = None,
    load_at_init: bool = True,
    image_size: int | None = None,
):
    """Init object detection/instance segmentation model.

    Args:
        model_path: str
            Path for the instance segmentation model weight
        model: Any
            A pre-initialized model instance, attached instead of loading from
            model_path when load_at_init is True
        config_path: str
            Path for the mmdetection instance segmentation model config file
        device: Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.
        mask_threshold: float
            Value to threshold mask pixels, should be between 0 and 1
        confidence_threshold: float
            All predictions with score < confidence_threshold will be discarded
        category_mapping: dict: str to str
            Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}
        category_remapping: dict: str to int
            Remap category ids based on category names, after performing inference e.g. {"car": 3}
        load_at_init: bool
            If True, automatically loads the model at initialization
        image_size: int
            Inference input size.
    """

    # Plain configuration attributes; the model object itself is attached below.
    self.model_path = model_path
    self.config_path = config_path
    self.model = None
    self.mask_threshold = mask_threshold
    self.confidence_threshold = confidence_threshold
    self.category_mapping = category_mapping
    self.category_remapping = category_remapping
    self.image_size = image_size
    self._original_predictions = None
    self._object_prediction_list_per_image = None
    self.set_device(device)

    # Fail fast when required third-party packages are missing.
    self.check_dependencies()

    if not load_at_init:
        return
    # Reuse a caller-supplied model instance when given, otherwise load from disk.
    if model:
        self.set_model(model)
    else:
        self.load_model()
check_dependencies(packages=None)

Ensures required dependencies are installed.

If 'packages' is None, uses self.required_packages. Subclasses may still call with a custom list for dynamic needs.

Source code in sahi/models/base.py
def check_dependencies(self, packages: list[str] | None = None) -> None:
    """Ensure that required third-party dependencies are installed.

    Falls back to ``self.required_packages`` when ``packages`` is None;
    subclasses may still pass a custom list for dynamic needs.
    """
    if packages is None:
        packages = getattr(self, "required_packages", [])
    if packages:
        check_requirements(packages)
convert_original_predictions(shift_amount=[[0, 0]], full_shape=None)

Converts original predictions of the detection model to a list of prediction.ObjectPrediction object.

Should be called after perform_inference(). Args: shift_amount: list To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y] full_shape: list Size of the full image after shifting, should be in the form of [height, width]

Source code in sahi/models/base.py
def convert_original_predictions(
    self,
    shift_amount: list[list[int]] | None = [[0, 0]],
    full_shape: list[list[int]] | None = None,
):
    """Convert the model's raw output into prediction.ObjectPrediction lists.

    Should be called after perform_inference().

    Args:
        shift_amount: list
            [shift_x, shift_y] offset that maps predictions from the sliced
            image back onto the full sized image.
        full_shape: list
            [height, width] of the full image after shifting.
    """
    self._create_object_prediction_list_from_original_predictions(
        shift_amount_list=shift_amount,
        full_shape_list=full_shape,
    )
    # Optionally remap category ids after conversion.
    if self.category_remapping:
        self._apply_category_remapping()
load_model()

This function should be implemented in a way that detection model should be initialized and set to self.model.

(self.model_path, self.config_path, and self.device should be utilized)

Source code in sahi/models/base.py
def load_model(self):
    """Initialize the detection model and assign it to ``self.model``.

    Subclasses must implement this using ``self.model_path``,
    ``self.config_path`` and ``self.device``.
    """
    raise NotImplementedError()
perform_inference(image)

This function should be implemented in a way that prediction should be performed using self.model and the prediction result should be set to self._original_predictions.

Parameters:

Name Type Description Default
image ndarray

np.ndarray A numpy array that contains the image to be predicted.

required
Source code in sahi/models/base.py
def perform_inference(self, image: np.ndarray):
    """Run the model on ``image`` and store the raw output.

    Subclasses must implement this so that inference runs through
    ``self.model`` and the result is stored on ``self._original_predictions``.

    Args:
        image: np.ndarray
            A numpy array that contains the image to be predicted.
    """
    raise NotImplementedError()
set_device(device=None)

Sets the device pytorch should use for the model.

Parameters:

Name Type Description Default
device str | None

Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.

None
Source code in sahi/models/base.py
def set_device(self, device: str | None = None):
    """Resolve ``device`` and store the torch device on ``self.device``.

    Args:
        device: Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.
    """
    # select_device handles the None/auto case.
    self.device = select_device(device)
set_model(model, **kwargs)

This function should be implemented to instantiate a DetectionModel out of an already loaded model Args: model: Any Loaded model

Source code in sahi/models/base.py
def set_model(self, model: Any, **kwargs):
    """Attach an already-loaded model to this DetectionModel.

    Subclasses must implement this.

    Args:
        model: Any
            Loaded model
    """
    raise NotImplementedError()
unload_model()

Unloads the model from CPU/GPU.

Source code in sahi/models/base.py
def unload_model(self):
    """Drop the loaded model and release cached CUDA memory."""
    self.model = None
    empty_cuda_cache()
Functions
detectron2
Classes
Detectron2DetectionModel

Bases: DetectionModel

Source code in sahi/models/detectron2.py
class Detectron2DetectionModel(DetectionModel):
    def __init__(self, *args, **kwargs):
        """Init detectron2 model; registers "torch" and "detectron2" as required packages."""
        existing_packages = getattr(self, "required_packages", None) or []
        self.required_packages = [*list(existing_packages), "torch", "detectron2"]
        super().__init__(*args, **kwargs)

    def load_model(self):
        """Build a detectron2 DefaultPredictor (from model zoo or local config) and set self.model."""
        from detectron2.config import get_cfg
        from detectron2.data import MetadataCatalog
        from detectron2.engine import DefaultPredictor
        from detectron2.model_zoo import model_zoo

        cfg = get_cfg()

        try:  # try to load from model zoo
            config_file = model_zoo.get_config_file(self.config_path)
            cfg.set_new_allowed(True)
            cfg.merge_from_file(config_file)
            cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(self.config_path)
        except Exception as e:  # try to load from local
            # Log (instead of print) for consistency with the metadata fallback below.
            logger.warning(e)
            if self.config_path is not None:
                cfg.set_new_allowed(True)
                cfg.merge_from_file(self.config_path)
            cfg.MODEL.WEIGHTS = self.model_path

        # set model device
        cfg.MODEL.DEVICE = self.device.type
        # set input image size
        if self.image_size is not None:
            cfg.INPUT.MIN_SIZE_TEST = self.image_size
            cfg.INPUT.MAX_SIZE_TEST = self.image_size
        # init predictor
        self.model = DefaultPredictor(cfg)

        # detectron2 category mapping
        if self.category_mapping is None:
            try:  # try to parse category names from metadata
                metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
                self.category_names = metadata.thing_classes
                self.category_mapping = {
                    str(ind): category_name for ind, category_name in enumerate(self.category_names)
                }
            except Exception as e:
                logger.warning(e)
                # https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html#update-the-config-for-new-datasets
                if cfg.MODEL.META_ARCHITECTURE == "RetinaNet":
                    num_categories = cfg.MODEL.RETINANET.NUM_CLASSES
                else:  # fasterrcnn/maskrcnn etc
                    num_categories = cfg.MODEL.ROI_HEADS.NUM_CLASSES
                self.category_names = [str(category_id) for category_id in range(num_categories)]
                self.category_mapping = {
                    str(ind): category_name for ind, category_name in enumerate(self.category_names)
                }
        else:
            self.category_names = list(self.category_mapping.values())

    def perform_inference(self, image: np.ndarray):
        """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

        Args:
            image: np.ndarray
                A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.
        """

        # Confirm model is loaded
        if self.model is None:
            raise RuntimeError("Model is not loaded, load it by calling .load_model()")

        if isinstance(image, np.ndarray) and self.model.input_format == "BGR":
            # convert RGB image to BGR format
            image = image[:, :, ::-1]

        self._original_predictions = self.model(image)

    @property
    def num_categories(self):
        """Returns number of categories."""
        return len(self.category_mapping)

    def _create_object_prediction_list_from_original_predictions(
        self,
        shift_amount_list: list[list[int]] | None = None,
        full_shape_list: list[list[int]] | None = None,
    ):
        """self._original_predictions is converted to a list of prediction.ObjectPrediction and set to
        self._object_prediction_list_per_image.

        Args:
            shift_amount_list: list of list
                To shift the box and mask predictions from sliced image to full sized image, should
                be in the form of List[[shift_x, shift_y],[shift_x, shift_y],...].
                None is treated as [[0, 0]] (no shift).
            full_shape_list: list of list
                Size of the full image after shifting, should be in the form of
                List[[height, width],[height, width],...]
        """

        # Avoid a mutable default argument: None means "no shift".
        if shift_amount_list is None:
            shift_amount_list = [[0, 0]]

        original_predictions = self._original_predictions

        # compatibility for sahi v0.8.15: accept a flat [x, y] / [h, w]
        if isinstance(shift_amount_list[0], int):
            shift_amount_list = [shift_amount_list]
        if full_shape_list is not None and isinstance(full_shape_list[0], int):
            full_shape_list = [full_shape_list]

        # detectron2 DefaultPredictor supports single image
        shift_amount = shift_amount_list[0]
        full_shape = None if full_shape_list is None else full_shape_list[0]

        # parse boxes, masks, scores, category_ids from predictions
        instances = original_predictions["instances"]
        boxes = instances.pred_boxes.tensor
        scores = instances.scores
        category_ids = instances.pred_classes

        # check if predictions contain mask
        try:
            masks = instances.pred_masks
        except AttributeError:
            masks = None

        # filter predictions with low confidence
        keep = scores >= self.confidence_threshold
        boxes = boxes[keep]
        scores = scores[keep]
        category_ids = category_ids[keep]
        if masks is not None:
            masks = masks[keep]

        object_prediction_list: list[ObjectPrediction] = []
        if masks is None:
            for box, score, category_id in zip(boxes, scores, category_ids):
                cat_id = category_id.item()
                object_prediction_list.append(
                    ObjectPrediction(
                        bbox=box.tolist(),
                        segmentation=None,
                        category_id=cat_id,
                        category_name=self.category_mapping[str(cat_id)],
                        shift_amount=shift_amount,
                        score=score.item(),
                        full_shape=full_shape,
                    )
                )
        else:
            for box, score, category_id, mask in zip(boxes, scores, category_ids, masks):
                bool_mask = mask.detach().cpu().numpy()
                # Compute the COCO segmentation exactly once per mask (the
                # previous comprehension recomputed it for both the filter
                # and the constructor).
                segmentation = get_coco_segmentation_from_bool_mask(bool_mask)
                # Skip degenerate masks that yield no polygon or no bbox.
                if len(segmentation) == 0 or get_bbox_from_bool_mask(bool_mask) is None:
                    continue
                cat_id = category_id.item()
                object_prediction_list.append(
                    ObjectPrediction(
                        bbox=None,
                        segmentation=segmentation,
                        category_id=cat_id,
                        category_name=self.category_mapping[str(cat_id)],
                        shift_amount=shift_amount,
                        score=score.item(),
                        full_shape=full_shape,
                    )
                )

        # detectron2 DefaultPredictor supports single image
        self._object_prediction_list_per_image = [object_prediction_list]
Attributes
num_categories property

Returns number of categories.

Functions
perform_inference(image)

Prediction is performed using self.model and the prediction result is set to self._original_predictions.

Parameters:

Name Type Description Default
image ndarray

np.ndarray A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

required
Source code in sahi/models/detectron2.py
def perform_inference(self, image: np.ndarray):
    """Run the detectron2 predictor on `image` and stash the raw output.

    The prediction result is stored in self._original_predictions.

    Args:
        image: np.ndarray
            3 channel image to be predicted, in RGB order.

    Raises:
        RuntimeError: If the model has not been loaded yet.
    """
    if self.model is None:
        raise RuntimeError("Model is not loaded, load it by calling .load_model()")

    # detectron2 predictors configured for BGR input need the channel order reversed.
    needs_bgr = isinstance(image, np.ndarray) and self.model.input_format == "BGR"
    model_input = image[:, :, ::-1] if needs_bgr else image

    self._original_predictions = self.model(model_input)
Functions
huggingface
Classes
HuggingfaceDetectionModel

Bases: DetectionModel

Source code in sahi/models/huggingface.py
class HuggingfaceDetectionModel(DetectionModel):
    """SAHI DetectionModel wrapper for Hugging Face `transformers` object-detection models."""

    def __init__(
        self,
        model_path: str | None = None,
        model: Any | None = None,
        processor: Any | None = None,
        config_path: str | None = None,
        device: str | None = None,
        mask_threshold: float = 0.5,
        confidence_threshold: float = 0.3,
        category_mapping: dict | None = None,
        category_remapping: dict | None = None,
        load_at_init: bool = True,
        image_size: int | None = None,
        token: str | None = None,
    ):
        """Initialize the Hugging Face detection model.

        Args:
            model_path: Hub model id or local path of the detection model.
            model: A pre-initialized transformers ObjectDetection model, if available.
            processor: A pre-initialized transformers image processor, if available.
            config_path: Path of the model config file.
            device: Torch device string, e.g. "cpu" or "cuda:0".
            mask_threshold: Value to threshold mask pixels, should be between 0 and 1.
            confidence_threshold: Predictions with score < confidence_threshold are discarded.
            category_mapping: Mapping from category id (str) to category name (str).
            category_remapping: Remap category ids based on category names after inference.
            load_at_init: If True, automatically loads the model at initialization.
            image_size: Inference input size.
            token: Hugging Face Hub access token (HF_TOKEN env var takes precedence
                in load_model()).
        """
        self._processor = processor
        # Shapes of the image(s) passed to the most recent perform_inference() call.
        self._image_shapes: list = []
        self._token = token
        existing_packages = getattr(self, "required_packages", None) or []
        self.required_packages = [*list(existing_packages), "torch", "transformers"]
        ensure_package_minimum_version("transformers", "4.42.0")
        super().__init__(
            model_path,
            model,
            config_path,
            device,
            mask_threshold,
            confidence_threshold,
            category_mapping,
            category_remapping,
            load_at_init,
            image_size,
        )

    @property
    def processor(self):
        """Returns the transformers image processor instance."""
        return self._processor

    @property
    def image_shapes(self):
        """Returns the shapes of the image(s) used in the last inference call."""
        return self._image_shapes

    @property
    def num_categories(self) -> int:
        """Returns number of categories."""
        return self.model.config.num_labels

    def load_model(self):
        """Loads model and processor from self.model_path and registers them via set_model()."""
        from transformers import AutoModelForObjectDetection, AutoProcessor

        # The HF_TOKEN environment variable takes precedence over the token given at init.
        hf_token = os.getenv("HF_TOKEN", self._token)
        model = AutoModelForObjectDetection.from_pretrained(self.model_path, token=hf_token)
        if self.image_size is not None:
            if model.base_model_prefix == "rt_detr_v2":
                # rt_detr_v2 processors expect an explicit height/width size dict.
                size = {"height": self.image_size, "width": self.image_size}
            else:
                size = {"shortest_edge": self.image_size, "longest_edge": None}
            # use_fast=True raises error: AttributeError: 'SizeDict' object has no attribute 'keys'
            processor = AutoProcessor.from_pretrained(
                self.model_path, size=size, do_resize=True, use_fast=False, token=hf_token
            )
        else:
            processor = AutoProcessor.from_pretrained(self.model_path, use_fast=False, token=hf_token)
        self.set_model(model, processor)

    def set_model(self, model: Any, processor: Any = None, **kwargs):
        """Sets the underlying model/processor pair and moves the model to self.device.

        Raises:
            ValueError: If no processor is available, or the given objects are not a
                transformers ObjectDetection model / ImageProcessor pair.
        """
        processor = processor or self.processor
        if processor is None:
            raise ValueError(f"'processor' is required to be set, got {processor}.")
        elif "ObjectDetection" not in model.__class__.__name__ or "ImageProcessor" not in processor.__class__.__name__:
            raise ValueError(
                "Given 'model' is not an ObjectDetectionModel or 'processor' is not a valid ImageProcessor."
            )
        self.model = model
        self.model.to(self.device)
        self._processor = processor
        # id2label supplies the category id -> name mapping used by SAHI.
        self.category_mapping = self.model.config.id2label

    def perform_inference(self, image: list | np.ndarray):
        """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

        Args:
            image: np.ndarray or list of np.ndarray
                A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

        Raises:
            RuntimeError: If the model or processor has not been loaded yet.
        """
        import torch

        # Confirm model is loaded
        if self.model is None or self.processor is None:
            raise RuntimeError("Model is not loaded, load it by calling .load_model()")

        # Preprocess, move tensors to the model's device, and run a gradient-free forward pass.
        with torch.no_grad():
            inputs = self.processor(images=image, return_tensors="pt")
            inputs["pixel_values"] = inputs.pixel_values.to(self.device)
            # Some processors (e.g. DETR-style) also emit a pixel_mask for padded batches.
            if hasattr(inputs, "pixel_mask"):
                inputs["pixel_mask"] = inputs.pixel_mask.to(self.device)
            outputs = self.model(**inputs)

        # Remember the original image shape(s) so boxes can be rescaled in postprocessing.
        if isinstance(image, list):
            self._image_shapes = [img.shape for img in image]
        else:
            self._image_shapes = [image.shape]
        self._original_predictions = outputs

    def get_valid_predictions(self, logits, pred_boxes) -> tuple:
        """Filter raw per-query outputs down to confident, in-range detections.

        Args:
            logits: torch.Tensor
            pred_boxes: torch.Tensor
        Returns:
            scores: torch.Tensor
            cat_ids: torch.Tensor
            boxes: torch.Tensor
        """
        import torch

        probs = logits.softmax(-1)
        scores = probs.max(-1).values
        cat_ids = probs.argmax(-1)
        # Queries whose best class index is >= num_categories hit the "no object"
        # class and are discarded, as are low-confidence queries.
        valid_detections = torch.where(cat_ids < self.num_categories, 1, 0)
        valid_confidences = torch.where(scores >= self.confidence_threshold, 1, 0)
        valid_mask = valid_detections.logical_and(valid_confidences)
        scores = scores[valid_mask]
        cat_ids = cat_ids[valid_mask]
        boxes = pred_boxes[valid_mask]
        return scores, cat_ids, boxes

    def _create_object_prediction_list_from_original_predictions(
        self,
        shift_amount_list: list[list[int]] | None = [[0, 0]],
        full_shape_list: list[list[int]] | None = None,
    ):
        """self._original_predictions is converted to a list of prediction.ObjectPrediction and set to
        self._object_prediction_list_per_image.

        Args:
            shift_amount_list: list of list
                To shift the box and mask predictions from sliced image to full sized image, should
                be in the form of List[[shift_x, shift_y],[shift_x, shift_y],...]
            full_shape_list: list of list
                Size of the full image after shifting, should be in the form of
                List[[height, width],[height, width],...]
        """
        original_predictions = self._original_predictions

        # compatibility for sahi v0.8.15
        shift_amount_list = fix_shift_amount_list(shift_amount_list)
        full_shape_list = fix_full_shape_list(full_shape_list)

        n_image = original_predictions.logits.shape[0]
        object_prediction_list_per_image = []
        for image_ind in range(n_image):
            image_height, image_width, _ = self.image_shapes[image_ind]
            scores, cat_ids, boxes = self.get_valid_predictions(
                logits=original_predictions.logits[image_ind], pred_boxes=original_predictions.pred_boxes[image_ind]
            )

            # create object_prediction_list
            object_prediction_list = []

            shift_amount = shift_amount_list[image_ind]
            full_shape = None if full_shape_list is None else full_shape_list[image_ind]

            for ind in range(len(boxes)):
                category_id = cat_ids[ind].item()
                yolo_bbox = boxes[ind].tolist()
                # Model outputs are normalized yolo-style (cx, cy, w, h) boxes;
                # convert to absolute voc-style (xmin, ymin, xmax, ymax) pixels.
                bbox = list(
                    pbf.convert_bbox(
                        yolo_bbox,
                        from_type="yolo",
                        to_type="voc",
                        image_size=(image_width, image_height),
                        return_values=True,
                        strict=False,
                    )
                )

                # fix negative box coords
                bbox[0] = max(0, bbox[0])
                bbox[1] = max(0, bbox[1])
                # clip out-of-image box coords
                bbox[2] = min(bbox[2], image_width)
                bbox[3] = min(bbox[3], image_height)

                object_prediction = ObjectPrediction(
                    bbox=bbox,
                    segmentation=None,
                    category_id=category_id,
                    category_name=self.category_mapping[category_id],
                    shift_amount=shift_amount,
                    score=scores[ind].item(),
                    full_shape=full_shape,
                )
                object_prediction_list.append(object_prediction)
            object_prediction_list_per_image.append(object_prediction_list)

        self._object_prediction_list_per_image = object_prediction_list_per_image
Attributes
num_categories property

Returns number of categories.

Functions
get_valid_predictions(logits, pred_boxes)

Parameters:

Name Type Description Default
logits

torch.Tensor

required
pred_boxes

torch.Tensor

required

Returns: scores: torch.Tensor cat_ids: torch.Tensor boxes: torch.Tensor

Source code in sahi/models/huggingface.py
def get_valid_predictions(self, logits, pred_boxes) -> tuple:
    """Filter raw per-query model outputs down to confident, in-range detections.

    Args:
        logits: torch.Tensor
        pred_boxes: torch.Tensor
    Returns:
        scores: torch.Tensor
        cat_ids: torch.Tensor
        boxes: torch.Tensor
    """
    class_probs = logits.softmax(-1)
    scores = class_probs.max(-1).values
    cat_ids = class_probs.argmax(-1)
    # Keep a query only when its best class is a real category (an index of
    # num_categories or above is the "no object" class) and its confidence
    # clears the configured threshold.
    keep = (cat_ids < self.num_categories) & (scores >= self.confidence_threshold)
    return scores[keep], cat_ids[keep], pred_boxes[keep]
perform_inference(image)

Prediction is performed using self.model and the prediction result is set to self._original_predictions.

Parameters:

Name Type Description Default
image list | ndarray

np.ndarray A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

required
Source code in sahi/models/huggingface.py
def perform_inference(self, image: list | np.ndarray):
    """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

    Args:
        image: np.ndarray or list of np.ndarray
            A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

    Raises:
        RuntimeError: If the model or processor has not been loaded yet.
    """
    import torch

    # Confirm model is loaded
    if self.model is None or self.processor is None:
        raise RuntimeError("Model is not loaded, load it by calling .load_model()")

    # Preprocess, move tensors to the model's device, and run a gradient-free forward pass.
    with torch.no_grad():
        inputs = self.processor(images=image, return_tensors="pt")
        inputs["pixel_values"] = inputs.pixel_values.to(self.device)
        # Some processors (e.g. DETR-style) also emit a pixel_mask for padded batches.
        if hasattr(inputs, "pixel_mask"):
            inputs["pixel_mask"] = inputs.pixel_mask.to(self.device)
        outputs = self.model(**inputs)

    # Remember the original image shape(s) so boxes can be rescaled in postprocessing.
    if isinstance(image, list):
        self._image_shapes = [img.shape for img in image]
    else:
        self._image_shapes = [image.shape]
    self._original_predictions = outputs
Functions
mmdet
Classes
DetInferencerWrapper

Bases: DetInferencer

Source code in sahi/models/mmdet.py
class DetInferencerWrapper(DetInferencer):
    """Thin wrapper over mmdet's DetInferencer that disables the progress bar output
    and optionally forces a fixed square inference resolution."""

    def __init__(
        self,
        model: ModelType | str | None = None,
        weights: str | None = None,
        device: str | None = None,
        scope: str | None = "mmdet",
        palette: str = "none",
        image_size: int | None = None,
    ) -> None:
        # image_size must be set before super().__init__ because the base class
        # builds the test pipeline (calling _init_pipeline below) during init.
        self.image_size = image_size
        super().__init__(model, weights, device, scope, palette)

    def __call__(self, images: list[np.ndarray], batch_size: int = 1) -> dict:
        """
        Emulate DetInferencer(images) without progressbar
        Args:
            images: list of np.ndarray
                A list of numpy array that contains the image to be predicted. 3 channel image should be in RGB order.
            batch_size: int
                Inference batch size. Defaults to 1.
        """
        inputs = self.preprocess(images, batch_size=batch_size)
        results_dict = {"predictions": [], "visualization": []}
        for _, data in inputs:
            preds = self.forward(data)
            # Postprocess without printing results or writing predictions to disk.
            results = self.postprocess(
                preds,
                visualization=None,
                return_datasample=False,
                print_result=False,
                no_save_pred=True,
                pred_out_dir=None,
            )
            results_dict["predictions"].extend(results["predictions"])
        return results_dict

    def _init_pipeline(self, cfg: ConfigType) -> Compose:
        """Initialize the test pipeline, adapting it for in-memory inference."""
        pipeline_cfg = cfg.test_dataloader.dataset.pipeline

        # For inference, the key of ``img_id`` is not used.
        if "meta_keys" in pipeline_cfg[-1]:
            pipeline_cfg[-1]["meta_keys"] = tuple(
                meta_key for meta_key in pipeline_cfg[-1]["meta_keys"] if meta_key != "img_id"
            )

        # Replace file loading with a loader that accepts in-memory arrays.
        load_img_idx = self._get_transform_idx(pipeline_cfg, "LoadImageFromFile")
        if load_img_idx == -1:
            raise ValueError("LoadImageFromFile is not found in the test pipeline")
        pipeline_cfg[load_img_idx]["type"] = "mmdet.InferencerLoader"

        resize_idx = self._get_transform_idx(pipeline_cfg, "Resize")
        if resize_idx == -1:
            raise ValueError("Resize is not found in the test pipeline")
        if self.image_size is not None:
            # Force a square inference resolution when image_size was given.
            pipeline_cfg[resize_idx]["scale"] = (self.image_size, self.image_size)
        return Compose(pipeline_cfg)
Functions
__call__(images, batch_size=1)

Emulate DetInferencer(images) without progressbar Args: images: list of np.ndarray A list of numpy array that contains the image to be predicted. 3 channel image should be in RGB order. batch_size: int Inference batch size. Defaults to 1.

Source code in sahi/models/mmdet.py
def __call__(self, images: list[np.ndarray], batch_size: int = 1) -> dict:
    """
    Emulate DetInferencer(images) without progressbar
    Args:
        images: list of np.ndarray
            A list of numpy array that contains the image to be predicted. 3 channel image should be in RGB order.
        batch_size: int
            Inference batch size. Defaults to 1.
    """
    inputs = self.preprocess(images, batch_size=batch_size)
    results_dict = {"predictions": [], "visualization": []}
    for _, data in inputs:
        preds = self.forward(data)
        results = self.postprocess(
            preds,
            visualization=None,
            return_datasample=False,
            print_result=False,
            no_save_pred=True,
            pred_out_dir=None,
        )
        results_dict["predictions"].extend(results["predictions"])
    return results_dict
MmdetDetectionModel

Bases: DetectionModel

Source code in sahi/models/mmdet.py
class MmdetDetectionModel(DetectionModel):
    """SAHI DetectionModel wrapper for MMDetection models, driven through
    DetInferencerWrapper."""

    def __init__(
        self,
        model_path: str | None = None,
        model: Any | None = None,
        config_path: str | None = None,
        device: str | None = None,
        mask_threshold: float = 0.5,
        confidence_threshold: float = 0.3,
        category_mapping: dict | None = None,
        category_remapping: dict | None = None,
        load_at_init: bool = True,
        image_size: int | None = None,
        scope: str = "mmdet",
    ):
        """Initialize the MMDetection model.

        Args:
            model_path: Path of the detection model weights (ex. 'model.pth').
            model: A pre-initialized model instance, if available.
            config_path: Path of the mmdetection config file.
            device: Torch device string, e.g. "cpu" or "cuda:0".
            mask_threshold: Value to threshold mask pixels, should be between 0 and 1.
            confidence_threshold: Predictions with score < confidence_threshold are discarded.
            category_mapping: Mapping from category id (str) to category name (str).
            category_remapping: Remap category ids based on category names after inference.
            load_at_init: If True, automatically loads the model at initialization.
            image_size: Inference input size.
            scope: Registry scope passed to the mmdet inferencer (default "mmdet").
        """
        self.scope = scope
        self.image_size = image_size
        existing_packages = getattr(self, "required_packages", None) or []
        self.required_packages = [*list(existing_packages), "mmdet", "mmcv", "torch"]
        super().__init__(
            model_path,
            model,
            config_path,
            device,
            mask_threshold,
            confidence_threshold,
            category_mapping,
            category_remapping,
            load_at_init,
            image_size,
        )

    def load_model(self):
        """Detection model is initialized and set to self.model."""

        # create model
        model = DetInferencerWrapper(
            self.config_path, self.model_path, device=self.device, scope=self.scope, image_size=self.image_size
        )

        self.set_model(model)

    def set_model(self, model: Any):
        """Sets the underlying MMDetection model.

        Args:
            model: Any
                A MMDetection model
        """

        # set self.model
        self.model = model

        # set category_mapping from the model's own class list unless already provided
        if not self.category_mapping:
            category_mapping = {str(ind): category_name for ind, category_name in enumerate(self.category_names)}
            self.category_mapping = category_mapping

    def perform_inference(self, image: np.ndarray):
        """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

        Args:
            image: np.ndarray | list[np.ndarray]
                A numpy array (3 channel, RGB order) or a list of such arrays.

        Raises:
            ValueError: If the model has not been loaded yet.
        """

        # Confirm model is loaded
        if self.model is None:
            raise ValueError("Model is not loaded, load it by calling .load_model()")

        # Supports only batch of 1

        # perform inference
        if isinstance(image, np.ndarray):
            # MMDetection expects BGR input
            # https://github.com/obss/sahi/issues/265
            image = image[:, :, ::-1]
        # compatibility with sahi v0.8.15: the inferencer consumes a list of images.
        # (Previously a list input left `image_list` undefined, raising NameError.)
        image_list = image if isinstance(image, list) else [image]
        prediction_result = self.model(image_list)

        self._original_predictions = prediction_result["predictions"]

    @property
    def num_categories(self):
        """Returns number of categories."""
        return len(self.category_names)

    @property
    def has_mask(self):
        """Returns if model output contains segmentation mask.

        Considers both single dataset and ConcatDataset scenarios.
        """

        def check_pipeline_for_mask(pipeline):
            # A pipeline item such as {"with_mask": True} marks a mask-producing model.
            return any(
                isinstance(item, dict) and any("mask" in key and value is True for key, value in item.items())
                for item in pipeline
            )

        # Access the dataset from the configuration
        dataset_config = self.model.cfg["train_dataloader"]["dataset"]

        if dataset_config["type"] == "ConcatDataset":
            # If using ConcatDataset, check each dataset individually
            datasets = dataset_config["datasets"]
            for dataset in datasets:
                if check_pipeline_for_mask(dataset["pipeline"]):
                    return True
        else:
            # Otherwise, assume a single dataset with its own pipeline
            if check_pipeline_for_mask(dataset_config["pipeline"]):
                return True

        return False

    @property
    def category_names(self):
        """Returns the class names declared in the model's dataset metadata."""
        classes = self.model.model.dataset_meta["classes"]
        if isinstance(classes, str):
            # https://github.com/open-mmlab/mmdetection/pull/4973
            return (classes,)
        else:
            return classes

    def _create_object_prediction_list_from_original_predictions(
        self,
        shift_amount_list: list[list[int]] | None = [[0, 0]],
        full_shape_list: list[list[int]] | None = None,
    ):
        """self._original_predictions is converted to a list of prediction.ObjectPrediction and set to
        self._object_prediction_list_per_image.

        Args:
            shift_amount_list: list of list
                To shift the box and mask predictions from sliced image to full sized image, should
                be in the form of List[[shift_x, shift_y],[shift_x, shift_y],...]
            full_shape_list: list of list
                Size of the full image after shifting, should be in the form of
                List[[height, width],[height, width],...]
        """
        # pycocotools is only needed for RLE-encoded masks; degrade gracefully without it.
        try:
            from pycocotools import mask as mask_utils

            can_decode_rle = True
        except ImportError:
            can_decode_rle = False
        original_predictions = self._original_predictions
        category_mapping = self.category_mapping

        # compatibility for sahi v0.8.15
        shift_amount_list = fix_shift_amount_list(shift_amount_list)
        full_shape_list = fix_full_shape_list(full_shape_list)

        # parse boxes and masks from predictions
        object_prediction_list_per_image = []
        for image_ind, original_prediction in enumerate(original_predictions):
            shift_amount = shift_amount_list[image_ind]
            full_shape = None if full_shape_list is None else full_shape_list[image_ind]

            boxes = original_prediction["bboxes"]
            scores = original_prediction["scores"]
            labels = original_prediction["labels"]
            if self.has_mask:
                masks = original_prediction["masks"]

            object_prediction_list = []

            n_detects = len(labels)
            # process predictions
            for i in range(n_detects):
                if self.has_mask:
                    mask = masks[i]

                bbox = boxes[i]
                score = scores[i]
                category_id = labels[i]
                category_name = category_mapping[str(category_id)]

                # ignore low scored predictions
                if score < self.confidence_threshold:
                    continue

                # parse prediction mask
                if self.has_mask:
                    # Masks may arrive RLE-encoded (dict with "counts") or as bool arrays.
                    if "counts" in mask:
                        if can_decode_rle:
                            bool_mask = mask_utils.decode(mask)
                        else:
                            raise ValueError(
                                "Can not decode rle mask. Please install pycocotools. ex: 'pip install pycocotools'"
                            )
                    else:
                        bool_mask = mask
                    # check if mask is valid
                    # https://github.com/obss/sahi/discussions/696
                    if get_bbox_from_bool_mask(bool_mask) is None:
                        continue
                    segmentation = get_coco_segmentation_from_bool_mask(bool_mask)
                else:
                    segmentation = None

                # fix negative box coords
                bbox[0] = max(0, bbox[0])
                bbox[1] = max(0, bbox[1])
                bbox[2] = max(0, bbox[2])
                bbox[3] = max(0, bbox[3])

                # fix out of image box coords
                if full_shape is not None:
                    bbox[0] = min(full_shape[1], bbox[0])
                    bbox[1] = min(full_shape[0], bbox[1])
                    bbox[2] = min(full_shape[1], bbox[2])
                    bbox[3] = min(full_shape[0], bbox[3])

                # ignore invalid predictions
                if not (bbox[0] < bbox[2]) or not (bbox[1] < bbox[3]):
                    logger.warning(f"ignoring invalid prediction with bbox: {bbox}")
                    continue

                object_prediction = ObjectPrediction(
                    bbox=bbox,
                    category_id=category_id,
                    score=score,
                    segmentation=segmentation,
                    category_name=category_name,
                    shift_amount=shift_amount,
                    full_shape=full_shape,
                )
                object_prediction_list.append(object_prediction)
            object_prediction_list_per_image.append(object_prediction_list)
        self._object_prediction_list_per_image = object_prediction_list_per_image
Attributes
has_mask property

Returns if model output contains segmentation mask.

Considers both single dataset and ConcatDataset scenarios.

num_categories property

Returns number of categories.

Functions
load_model()

Detection model is initialized and set to self.model.

Source code in sahi/models/mmdet.py
def load_model(self):
    """Build a DetInferencerWrapper from the configured paths and register it as self.model."""
    # The wrapper handles config parsing, weight loading and device placement.
    self.set_model(
        DetInferencerWrapper(
            self.config_path,
            self.model_path,
            device=self.device,
            scope=self.scope,
            image_size=self.image_size,
        )
    )
perform_inference(image)

Prediction is performed using self.model and the prediction result is set to self._original_predictions.

Parameters:

Name Type Description Default
image ndarray

np.ndarray A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

required
Source code in sahi/models/mmdet.py
def perform_inference(self, image: np.ndarray):
    """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

    Args:
        image: np.ndarray | list[np.ndarray]
            A numpy array (3 channel, RGB order) or a list of such arrays.

    Raises:
        ValueError: If the model has not been loaded yet.
    """

    # Confirm model is loaded
    if self.model is None:
        raise ValueError("Model is not loaded, load it by calling .load_model()")

    # perform inference
    if isinstance(image, np.ndarray):
        # MMDetection expects BGR input
        # https://github.com/obss/sahi/issues/265
        image = image[:, :, ::-1]
    # compatibility with sahi v0.8.15: the inferencer consumes a list of images.
    # (Previously a list input left `image_list` undefined, raising NameError.)
    image_list = image if isinstance(image, list) else [image]
    prediction_result = self.model(image_list)

    self._original_predictions = prediction_result["predictions"]
set_model(model)

Sets the underlying MMDetection model.

Parameters:

Name Type Description Default
model Any

Any A MMDetection model

required
Source code in sahi/models/mmdet.py
def set_model(self, model: Any):
    """Sets the underlying MMDetection model.

    Args:
        model: Any
            A MMDetection model
    """
    self.model = model

    # Keep any category_mapping supplied by the caller.
    if self.category_mapping:
        return
    # Otherwise derive it from the model's own class list, keyed by stringified index.
    self.category_mapping = {str(index): name for index, name in enumerate(self.category_names)}
Functions
roboflow
Classes
RoboflowDetectionModel

Bases: DetectionModel

Source code in sahi/models/roboflow.py
class RoboflowDetectionModel(DetectionModel):
    def __init__(
        self,
        model: Any | None = None,
        model_path: str | None = None,
        config_path: str | None = None,
        device: str | None = None,
        mask_threshold: float = 0.5,
        confidence_threshold: float = 0.3,
        category_mapping: dict | None = None,
        category_remapping: dict | None = None,
        load_at_init: bool = True,
        image_size: int | None = None,
        api_key: str | None = None,
    ):
        """Initialize the RoboflowDetectionModel with the given parameters.

        Args:
            model_path: str
                Path for the instance segmentation model weight
            config_path: str
                Path for the mmdetection instance segmentation model config file
            device: Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.
            mask_threshold: float
                Value to threshold mask pixels, should be between 0 and 1
            confidence_threshold: float
                All predictions with score < confidence_threshold will be discarded
            category_mapping: dict: str to str
                Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}
            category_remapping: dict: str to int
                Remap category ids based on category names, after performing inference e.g. {"car": 3}
            load_at_init: bool
                If True, automatically loads the model at initialization
            image_size: int
                Inference input size.
        """
        self._use_universe = model and isinstance(model, str)
        self._model = model
        self._device = device
        self._api_key = api_key

        if self._use_universe:
            existing_packages = getattr(self, "required_packages", None) or []
            self.required_packages = [*list(existing_packages), "inference"]
        else:
            existing_packages = getattr(self, "required_packages", None) or []
            self.required_packages = [*list(existing_packages), "rfdetr"]

        super().__init__(
            model=model,
            model_path=model_path,
            config_path=config_path,
            device=device,
            mask_threshold=mask_threshold,
            confidence_threshold=confidence_threshold,
            category_mapping=category_mapping,
            category_remapping=category_remapping,
            load_at_init=False,
            image_size=image_size,
        )

        if load_at_init:
            self.load_model()

    def set_model(self, model: Any, **kwargs):
        """
        This function should be implemented to instantiate a DetectionModel out of an already loaded model
        Args:
            model: Any
                Loaded model
        """
        self.model = model

    def load_model(self):
        """This function should be implemented in a way that detection model should be initialized and set to
        self.model.

        (self.model_path, self.config_path, and self.device should be utilized)
        """
        if self._use_universe:
            from inference import get_model
            from inference.core.env import API_KEY
            from inference.core.exceptions import RoboflowAPINotAuthorizedError

            api_key = self._api_key or API_KEY

            try:
                model = get_model(self._model, api_key=api_key)
            except RoboflowAPINotAuthorizedError as e:
                raise ValueError(
                    "Authorization failed. Please pass a valid API key with "
                    "the `api_key` parameter or set the `ROBOFLOW_API_KEY` environment variable."
                ) from e

            assert model.task_type == "object-detection", "Roboflow model must be an object detection model."

        else:
            from rfdetr.detr import RFDETRBase, RFDETRLarge, RFDETRMedium, RFDETRNano, RFDETRSmall

            model, model_path = self._model, self.model_path
            model_names = ("RFDETRBase", "RFDETRNano", "RFDETRSmall", "RFDETRMedium", "RFDETRLarge")
            if hasattr(model, "__name__") and model.__name__ in model_names:
                model_params = dict(
                    resolution=int(self.image_size) if self.image_size else 560,
                    device=self._device,
                    num_classes=len(self.category_mapping.keys()) if self.category_mapping else None,
                )
                if model_path:
                    model_params["pretrain_weights"] = model_path

                model = model(**model_params)
            elif isinstance(model, (RFDETRBase, RFDETRNano, RFDETRSmall, RFDETRMedium, RFDETRLarge)):
                model = model
            else:
                raise ValueError(
                    f"Model must be a Roboflow model string or one of {model_names} models, got {self.model}."
                )

        self.set_model(model)

    def perform_inference(
        self,
        image: np.ndarray,
    ):
        """This function should be implemented in a way that prediction should be performed using self.model and the
        prediction result should be set to self._original_predictions.

        Args:
            image: np.ndarray
                A numpy array that contains the image to be predicted.
        """
        if self._use_universe:
            self._original_predictions = self.model.infer(image, confidence=self.confidence_threshold)
        else:
            self._original_predictions = [self.model.predict(image, threshold=self.confidence_threshold)]

    def _create_object_prediction_list_from_original_predictions(
        self,
        shift_amount_list: list[list[int]] | None = [[0, 0]],
        full_shape_list: list[list[int]] | None = None,
    ):
        """This function should be implemented in a way that self._original_predictions should be converted to a list of
        prediction.ObjectPrediction and set to self._object_prediction_list.

        self.mask_threshold can also be utilized.
        Args:
            shift_amount_list: list of list
                To shift the box and mask predictions from sliced image to full sized image, should
                be in the form of List[[shift_x, shift_y],[shift_x, shift_y],...]
            full_shape_list: list of list
                Size of the full image after shifting, should be in the form of
                List[[height, width],[height, width],...]
        """
        # compatibility for sahi v0.8.15
        shift_amount_list = fix_shift_amount_list(shift_amount_list)
        full_shape_list = fix_full_shape_list(full_shape_list)

        object_prediction_list: list[ObjectPrediction] = []

        if self._use_universe:
            # Hosted (Roboflow Universe) path: results are pydantic response
            # objects whose boxes use center-based (x, y, width, height) coords.
            from inference.core.entities.responses.inference import (
                ObjectDetectionInferenceResponse as InferenceObjectDetectionInferenceResponse,
            )
            from inference.core.entities.responses.inference import (
                ObjectDetectionPrediction as InferenceObjectDetectionPrediction,
            )

            original_reponses: list[InferenceObjectDetectionInferenceResponse] = self._original_predictions

            assert len(original_reponses) == len(shift_amount_list) == len(full_shape_list), (
                "Length mismatch between original responses, shift amounts, and full shapes."
            )

            for original_reponse, shift_amount, full_shape in zip(
                original_reponses,
                shift_amount_list,
                full_shape_list,
            ):
                for prediction in original_reponse.predictions:
                    prediction: InferenceObjectDetectionPrediction
                    # Convert center-based (x, y, w, h) to (x_min, y_min, x_max, y_max).
                    bbox = [
                        prediction.x - prediction.width / 2,
                        prediction.y - prediction.height / 2,
                        prediction.x + prediction.width / 2,
                        prediction.y + prediction.height / 2,
                    ]
                    object_prediction = ObjectPrediction(
                        bbox=bbox,
                        category_id=prediction.class_id,
                        category_name=prediction.class_name,
                        score=prediction.confidence,
                        shift_amount=shift_amount,
                        full_shape=full_shape,
                    )
                    object_prediction_list.append(object_prediction)

        else:
            # Local RF-DETR path: results are supervision Detections whose boxes
            # are already in (x_min, y_min, x_max, y_max) format.
            from supervision.detection.core import Detections

            original_detections: list[Detections] = self._original_predictions

            assert len(original_detections) == len(shift_amount_list) == len(full_shape_list), (
                "Length mismatch between original responses, shift amounts, and full shapes."
            )

            for original_detection, shift_amount, full_shape in zip(
                original_detections,
                shift_amount_list,
                full_shape_list,
            ):
                for xyxy, confidence, class_id in zip(
                    original_detection.xyxy,
                    original_detection.confidence,
                    original_detection.class_id,
                ):
                    object_prediction = ObjectPrediction(
                        bbox=xyxy,
                        category_id=int(class_id),
                        # category name may be None when class_id is absent from the mapping
                        category_name=self.category_mapping.get(int(class_id), None),
                        score=float(confidence),
                        shift_amount=shift_amount,
                        full_shape=full_shape,
                    )
                    object_prediction_list.append(object_prediction)

        # NOTE(review): all predictions are grouped under a single image entry;
        # presumably this model always processes one image per call — confirm upstream.
        object_prediction_list_per_image = [object_prediction_list]
        self._object_prediction_list_per_image = object_prediction_list_per_image
Functions
__init__(model=None, model_path=None, config_path=None, device=None, mask_threshold=0.5, confidence_threshold=0.3, category_mapping=None, category_remapping=None, load_at_init=True, image_size=None, api_key=None)

Initialize the RoboflowDetectionModel with the given parameters.

Parameters:

Name Type Description Default
model_path str | None

str Path for the detection model weight

None
config_path str | None

str Path for the detection model config file

None
device str | None

Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.

None
mask_threshold float

float Value to threshold mask pixels, should be between 0 and 1

0.5
confidence_threshold float

float All predictions with score < confidence_threshold will be discarded

0.3
category_mapping dict | None

dict: str to str Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}

None
category_remapping dict | None

dict: str to int Remap category ids based on category names, after performing inference e.g. {"car": 3}

None
load_at_init bool

bool If True, automatically loads the model at initialization

True
image_size int | None

int Inference input size.

None
Source code in sahi/models/roboflow.py
def __init__(
    self,
    model: Any | None = None,
    model_path: str | None = None,
    config_path: str | None = None,
    device: str | None = None,
    mask_threshold: float = 0.5,
    confidence_threshold: float = 0.3,
    category_mapping: dict | None = None,
    category_remapping: dict | None = None,
    load_at_init: bool = True,
    image_size: int | None = None,
    api_key: str | None = None,
):
    """Initialize the RoboflowDetectionModel with the given parameters.

    Args:
        model: Any
            Either a Roboflow Universe model id (str), which triggers hosted
            inference, or an RF-DETR model class/instance for local inference.
        model_path: str
            Path for the detection model weight
        config_path: str
            Path for the model config file
        device: Torch device, "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.
        mask_threshold: float
            Value to threshold mask pixels, should be between 0 and 1
        confidence_threshold: float
            All predictions with score < confidence_threshold will be discarded
        category_mapping: dict: str to str
            Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}
        category_remapping: dict: str to int
            Remap category ids based on category names, after performing inference e.g. {"car": 3}
        load_at_init: bool
            If True, automatically loads the model at initialization
        image_size: int
            Inference input size.
        api_key: str
            Roboflow API key used for hosted (Universe) inference; when omitted,
            the ROBOFLOW_API_KEY environment variable is used instead.
    """
    # Fix: coerce to a real bool — `model and isinstance(model, str)` could
    # leave a non-bool value (e.g. None) on this attribute.
    self._use_universe = bool(model) and isinstance(model, str)
    self._model = model
    self._device = device
    self._api_key = api_key

    # Hosted inference needs the `inference` package; local RF-DETR needs `rfdetr`.
    existing_packages = getattr(self, "required_packages", None) or []
    extra_package = "inference" if self._use_universe else "rfdetr"
    self.required_packages = [*list(existing_packages), extra_package]

    super().__init__(
        model=model,
        model_path=model_path,
        config_path=config_path,
        device=device,
        mask_threshold=mask_threshold,
        confidence_threshold=confidence_threshold,
        category_mapping=category_mapping,
        category_remapping=category_remapping,
        load_at_init=False,  # defer loading so load_model below controls it
        image_size=image_size,
    )

    if load_at_init:
        self.load_model()
load_model()

This function should be implemented in a way that detection model should be initialized and set to self.model.

(self.model_path, self.config_path, and self.device should be utilized)

Source code in sahi/models/roboflow.py
def load_model(self):
    """This function should be implemented in a way that detection model should be initialized and set to
    self.model.

    (self.model_path, self.config_path, and self.device should be utilized)
    """
    if self._use_universe:
        # Hosted inference: `self._model` is a Roboflow Universe model id string.
        from inference import get_model
        from inference.core.env import API_KEY
        from inference.core.exceptions import RoboflowAPINotAuthorizedError

        # Explicit api_key wins; otherwise fall back to the environment-derived key.
        api_key = self._api_key or API_KEY

        try:
            model = get_model(self._model, api_key=api_key)
        except RoboflowAPINotAuthorizedError as e:
            raise ValueError(
                "Authorization failed. Please pass a valid API key with "
                "the `api_key` parameter or set the `ROBOFLOW_API_KEY` environment variable."
            ) from e

        assert model.task_type == "object-detection", "Roboflow model must be an object detection model."

    else:
        # Local inference: `self._model` is an RF-DETR class or instance.
        from rfdetr.detr import RFDETRBase, RFDETRLarge, RFDETRMedium, RFDETRNano, RFDETRSmall

        model, model_path = self._model, self.model_path
        model_names = ("RFDETRBase", "RFDETRNano", "RFDETRSmall", "RFDETRMedium", "RFDETRLarge")
        if hasattr(model, "__name__") and model.__name__ in model_names:
            # A class (not an instance) was supplied: instantiate it ourselves.
            model_params = dict(
                # 560 appears to be the RF-DETR default resolution — confirm against rfdetr docs
                resolution=int(self.image_size) if self.image_size else 560,
                device=self._device,
                num_classes=len(self.category_mapping.keys()) if self.category_mapping else None,
            )
            if model_path:
                model_params["pretrain_weights"] = model_path

            model = model(**model_params)
        elif isinstance(model, (RFDETRBase, RFDETRNano, RFDETRSmall, RFDETRMedium, RFDETRLarge)):
            # Already instantiated: use as-is.
            model = model
        else:
            raise ValueError(
                f"Model must be a Roboflow model string or one of {model_names} models, got {self.model}."
            )

    self.set_model(model)
perform_inference(image)

This function should be implemented in a way that prediction should be performed using self.model and the prediction result should be set to self._original_predictions.

Parameters:

Name Type Description Default
image ndarray

np.ndarray A numpy array that contains the image to be predicted.

required
Source code in sahi/models/roboflow.py
def perform_inference(
    self,
    image: np.ndarray,
):
    """Run the wrapped Roboflow model on ``image``.

    The raw framework output is stored in ``self._original_predictions`` for
    later conversion into ObjectPrediction instances.

    Args:
        image: np.ndarray
            A numpy array that contains the image to be predicted.
    """
    threshold = self.confidence_threshold
    if self._use_universe:
        raw_result = self.model.infer(image, confidence=threshold)
    else:
        raw_result = [self.model.predict(image, threshold=threshold)]
    self._original_predictions = raw_result
set_model(model, **kwargs)

This function should be implemented to instantiate a DetectionModel out of an already loaded model Args: model: Any Loaded model

Source code in sahi/models/roboflow.py
def set_model(self, model: Any, **kwargs):
    """Attach an already-loaded model to this detection wrapper.

    Args:
        model: Any
            Loaded model
    """
    # No post-processing required; just keep a reference to the model.
    self.model = model
rtdetr
Classes
RTDetrDetectionModel

Bases: UltralyticsDetectionModel

Source code in sahi/models/rtdetr.py
class RTDetrDetectionModel(UltralyticsDetectionModel):
    def load_model(self):
        """Initialize the RT-DETR model and set it to self.model.

        Falls back to the pretrained "rtdetr-l.pt" checkpoint when no
        model_path is given.

        Raises:
            TypeError: If the model path cannot be loaded as an RT-DETR model.
        """
        from ultralytics import RTDETR

        try:
            model_source = self.model_path or "rtdetr-l.pt"
            model = RTDETR(model_source)
            model.to(self.device)
            self.set_model(model)
        except Exception as e:
            # Fix: chain the original exception and raise a single formatted
            # message instead of a (message, exception) tuple.
            raise TypeError(f"model_path is not a valid rtdetr model path: {e}") from e
Functions
load_model()

Detection model is initialized and set to self.model.

Source code in sahi/models/rtdetr.py
def load_model(self):
    """Detection model is initialized and set to self.model."""
    from ultralytics import RTDETR

    try:
        # Fall back to the pretrained "rtdetr-l.pt" checkpoint when no path is given.
        model_source = self.model_path or "rtdetr-l.pt"
        model = RTDETR(model_source)
        model.to(self.device)
        self.set_model(model)
    except Exception as e:
        # NOTE(review): raising TypeError with a (message, exception) tuple loses
        # traceback chaining; consider `raise ... from e`.
        raise TypeError("model_path is not a valid rtdetr model path: ", e)
torchvision
Classes
TorchVisionDetectionModel

Bases: DetectionModel

Source code in sahi/models/torchvision.py
class TorchVisionDetectionModel(DetectionModel):
    def __init__(self, *args, **kwargs):
        # Register runtime requirements before the base class validates them.
        existing_packages = getattr(self, "required_packages", None) or []
        self.required_packages = [*list(existing_packages), "torch", "torchvision"]
        super().__init__(*args, **kwargs)

    def load_model(self):
        """Initialize the torchvision detection model and set it to self.model.

        Optional `model_name` and `num_classes` are read from the YAML file at
        self.config_path; weights are loaded from self.model_path when given,
        otherwise torchvision's pretrained COCO weights are used.

        Raises:
            RuntimeError: If the YAML config cannot be parsed.
            TypeError: If self.model_path is not a valid torchvision checkpoint.
        """
        import torch

        # read config params
        model_name = None
        num_classes = None
        if self.config_path is not None:
            with open(self.config_path) as stream:
                try:
                    config = yaml.safe_load(stream)
                except yaml.YAMLError as exc:
                    raise RuntimeError(exc)

            model_name = config.get("model_name", None)
            num_classes = config.get("num_classes", None)

        # complete params if not provided in config
        if not model_name:
            model_name = "fasterrcnn_resnet50_fpn"
            # Fix: removed a stray trailing quote from the log message.
            logger.warning(f"model_name not provided in config, using default model_type: {model_name}")
        if num_classes is None:
            logger.warning("num_classes not provided in config, using default num_classes: 91")
            num_classes = 91
        if self.model_path is None:
            logger.warning("model_path not provided in config, using pretrained weights and default num_classes: 91.")
            weights = "DEFAULT"
            num_classes = 91
        else:
            weights = None

        # load model
        # Note: torchvision >= 0.13 is required for the 'weights' parameter
        model = MODEL_NAME_TO_CONSTRUCTOR[model_name](num_classes=num_classes, weights=weights)
        if self.model_path:
            try:
                model.load_state_dict(torch.load(self.model_path))
            except Exception as e:
                logger.error(f"Invalid {self.model_path=}")
                raise TypeError("model_path is not a valid torchvision model path: ", e)

        self.set_model(model)

    def set_model(self, model: Any):
        """Sets the underlying TorchVision model.

        Args:
            model: Any
                A TorchVision model
        """
        # Inference-only usage: disable training-mode layers, then move to device.
        model.eval()
        self.model = model.to(self.device)

        # Default category_mapping to the COCO class names when not supplied.
        if self.category_mapping is None:
            category_names = {str(i): COCO_CLASSES[i] for i in range(len(COCO_CLASSES))}
            self.category_mapping = category_names

    def perform_inference(self, image: np.ndarray, image_size: int | None = None):
        """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

        Args:
            image: np.ndarray
                A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.
            image_size: int
                Inference input size.
        """
        from sahi.utils.torch_utils import to_float_tensor

        # arrange model input size
        if self.image_size is not None:
            # get min and max of image height and width
            min_shape, max_shape = min(image.shape[:2]), max(image.shape[:2])
            # torchvision resize transform scales the shorter dimension to the target size
            # we want to scale the longer dimension to the target size
            image_size = self.image_size * min_shape / max_shape
            self.model.transform.min_size = (image_size,)  # default is (800,)
            self.model.transform.max_size = image_size  # default is 1333

        image = to_float_tensor(image)
        image = image.to(self.device)
        prediction_result = self.model([image])

        self._original_predictions = prediction_result

    @property
    def num_categories(self):
        """Returns number of categories."""
        return len(self.category_mapping)

    @property
    def has_mask(self):
        """Returns if model output contains segmentation mask."""
        return hasattr(self.model, "roi_heads") and hasattr(self.model.roi_heads, "mask_predictor")

    @property
    def category_names(self):
        """Returns the list of category names."""
        return list(self.category_mapping.values())

    def _create_object_prediction_list_from_original_predictions(
        self,
        shift_amount_list: list[list[int]] | None = [[0, 0]],
        full_shape_list: list[list[int]] | None = None,
    ):
        """self._original_predictions is converted to a list of prediction.ObjectPrediction and set to
        self._object_prediction_list_per_image.

        Args:
            shift_amount_list: list of list
                To shift the box and mask predictions from sliced image to full sized image, should
                be in the form of List[[shift_x, shift_y],[shift_x, shift_y],...]
            full_shape_list: list of list
                Size of the full image after shifting, should be in the form of
                List[[height, width],[height, width],...]
        """
        original_predictions = self._original_predictions

        # compatibility for sahi v0.8.20
        if isinstance(shift_amount_list[0], int):
            shift_amount_list = [shift_amount_list]
        if full_shape_list is not None and isinstance(full_shape_list[0], int):
            full_shape_list = [full_shape_list]

        # Fix: initialize once, outside the loop — previously this list was
        # re-created per image, so only the last image's predictions survived.
        object_prediction_list_per_image = []

        for image_predictions in original_predictions:
            # get indices of boxes with score > confidence_threshold
            scores = image_predictions["scores"].cpu().detach().numpy()
            selected_indices = np.where(scores > self.confidence_threshold)[0]

            # parse boxes, masks, scores, category_ids from predictions
            category_ids = list(image_predictions["labels"][selected_indices].cpu().detach().numpy())
            boxes = list(image_predictions["boxes"][selected_indices].cpu().detach().numpy())
            scores = scores[selected_indices]

            # threshold soft masks into boolean masks when the model returns them
            masks = image_predictions.get("masks", None)
            if masks is not None:
                masks = list(
                    (image_predictions["masks"][selected_indices] > self.mask_threshold).cpu().detach().numpy()
                )

            # create object_prediction_list
            object_prediction_list = []

            # NOTE(review): shift/shape are taken from the first entry only;
            # presumably a single image is processed per call — confirm upstream.
            shift_amount = shift_amount_list[0]
            full_shape = None if full_shape_list is None else full_shape_list[0]

            for ind in range(len(boxes)):
                if masks is not None:
                    segmentation = get_coco_segmentation_from_bool_mask(np.array(masks[ind]))
                else:
                    segmentation = None

                object_prediction = ObjectPrediction(
                    bbox=boxes[ind],
                    segmentation=segmentation,
                    category_id=int(category_ids[ind]),
                    category_name=self.category_mapping[str(int(category_ids[ind]))],
                    shift_amount=shift_amount,
                    score=scores[ind],
                    full_shape=full_shape,
                )
                object_prediction_list.append(object_prediction)
            object_prediction_list_per_image.append(object_prediction_list)

        self._object_prediction_list_per_image = object_prediction_list_per_image
Attributes
has_mask property

Returns if model output contains segmentation mask.

num_categories property

Returns number of categories.

Functions
perform_inference(image, image_size=None)

Prediction is performed using self.model and the prediction result is set to self._original_predictions.

Parameters:

Name Type Description Default
image ndarray

np.ndarray A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

required
image_size int | None

int Inference input size.

None
Source code in sahi/models/torchvision.py
def perform_inference(self, image: np.ndarray, image_size: int | None = None):
    """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

    Args:
        image: np.ndarray
            A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.
        image_size: int
            Inference input size.
    """
    from sahi.utils.torch_utils import to_float_tensor

    # arrange model input size
    if self.image_size is not None:
        # get min and max of image height and width
        min_shape, max_shape = min(image.shape[:2]), max(image.shape[:2])
        # torchvision resize transform scales the shorter dimension to the target size
        # we want to scale the longer dimension to the target size
        # NOTE(review): this division yields a float; torchvision appears to
        # tolerate it, but an int may be intended — confirm.
        image_size = self.image_size * min_shape / max_shape
        self.model.transform.min_size = (image_size,)  # default is (800,)
        self.model.transform.max_size = image_size  # default is 1333

    image = to_float_tensor(image)
    image = image.to(self.device)
    prediction_result = self.model([image])

    self._original_predictions = prediction_result
set_model(model)

Sets the underlying TorchVision model.

Parameters:

Name Type Description Default
model Any

Any A TorchVision model

required
Source code in sahi/models/torchvision.py
def set_model(self, model: Any):
    """Sets the underlying TorchVision model.

    Args:
        model: Any
            A TorchVision model
    """
    # Inference-only usage: disable training-mode layers, then move to device.
    model.eval()
    self.model = model.to(self.device)

    # Default to the COCO class names when no mapping was supplied.
    if self.category_mapping is None:
        self.category_mapping = {str(i): COCO_CLASSES[i] for i in range(len(COCO_CLASSES))}
Functions
ultralytics
Classes
UltralyticsDetectionModel

Bases: DetectionModel

Detection model for Ultralytics YOLO models.

Supports both PyTorch (.pt) and ONNX (.onnx) models.

Source code in sahi/models/ultralytics.py
class UltralyticsDetectionModel(DetectionModel):
    """Detection model for Ultralytics YOLO models.

    Supports both PyTorch (.pt) and ONNX (.onnx) models.
    """

    def __init__(self, *args, **kwargs):
        # `fuse` is SAHI-specific; pop it before forwarding kwargs to the base class.
        self.fuse: bool = kwargs.pop("fuse", False)
        existing_packages = getattr(self, "required_packages", None) or []
        self.required_packages = [*list(existing_packages), "ultralytics"]
        super().__init__(*args, **kwargs)

    def load_model(self):
        """Detection model is initialized and set to self.model.

        Supports both PyTorch (.pt) and ONNX (.onnx) models.
        """

        from ultralytics import YOLO

        # ONNX checkpoints additionally require the onnx runtime packages.
        if self.model_path and ".onnx" in self.model_path:
            check_requirements(["onnx", "onnxruntime"])

        try:
            model = YOLO(self.model_path)
            # Only call .to(device) for PyTorch models, not ONNX
            if self.model_path and not self.model_path.endswith(".onnx"):
                model.to(self.device)
            self.set_model(model)
            if self.fuse and hasattr(model, "fuse"):
                model.fuse()

        except Exception as e:
            raise TypeError("model_path is not a valid Ultralytics model path: ", e)

    def set_model(self, model: Any, **kwargs):
        """Sets the underlying Ultralytics model.

        Args:
            model: Any
                A Ultralytics model
        """

        self.model = model
        # set category_mapping
        if not self.category_mapping:
            category_mapping = {str(ind): category_name for ind, category_name in enumerate(self.category_names)}
            self.category_mapping = category_mapping

    def perform_inference(self, image: np.ndarray):
        """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

        Args:
            image: np.ndarray
                A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.
        """

        # Confirm model is loaded

        import torch

        if self.model is None:
            raise ValueError("Model is not loaded, load it by calling .load_model()")

        kwargs = {"cfg": self.config_path, "verbose": False, "conf": self.confidence_threshold, "device": self.device}

        if self.image_size is not None:
            kwargs = {"imgsz": self.image_size, **kwargs}

        prediction_result = self.model(image[:, :, ::-1], **kwargs)  # YOLO expects numpy arrays to have BGR

        # Handle different result types for PyTorch vs ONNX models
        # ONNX models might return results in a different format
        if self.has_mask:
            from ultralytics.engine.results import Masks

            if not prediction_result[0].masks:
                # Create empty masks if none exist
                if hasattr(self.model, "device"):
                    device = self.model.device
                else:
                    device = "cpu"  # Default for ONNX models
                prediction_result[0].masks = Masks(
                    torch.tensor([], device=device), prediction_result[0].boxes.orig_shape
                )

            # We do not filter results again as confidence threshold is already applied above
            prediction_result = [
                (
                    result.boxes.data,
                    result.masks.data,
                )
                for result in prediction_result
            ]
        elif self.is_obb:
            # For OBB task, get OBB points in xyxyxyxy format
            device = getattr(self.model, "device", "cpu")
            prediction_result = [
                (
                    # Get OBB data: xyxy, conf, cls
                    torch.cat(
                        [
                            result.obb.xyxy,  # box coordinates
                            result.obb.conf.unsqueeze(-1),  # confidence scores
                            result.obb.cls.unsqueeze(-1),  # class ids
                        ],
                        dim=1,
                    )
                    if result.obb is not None
                    else torch.empty((0, 6), device=device),
                    # Get OBB points in (N, 4, 2) format
                    result.obb.xyxyxyxy if result.obb is not None else torch.empty((0, 4, 2), device=device),
                )
                for result in prediction_result
            ]
        else:  # If model doesn't do segmentation or OBB then no need to check masks
            # We do not filter results again as confidence threshold is already applied above
            prediction_result = [result.boxes.data for result in prediction_result]

        self._original_predictions = prediction_result
        self._original_shape = image.shape

    @property
    def category_names(self):
        """Returns category names, preferring the model's own `names` mapping."""
        # For ONNX models, names might not be available, use category_mapping
        if hasattr(self.model, "names") and self.model.names:
            # Fix: return a list (consistent with the other branch) instead of a
            # non-indexable dict_values view.
            return list(self.model.names.values())
        elif self.category_mapping:
            return list(self.category_mapping.values())
        else:
            raise ValueError("Category names not available. Please provide category_mapping for ONNX models.")

    @property
    def num_categories(self):
        """Returns number of categories."""
        if hasattr(self.model, "names") and self.model.names:
            return len(self.model.names)
        elif self.category_mapping:
            return len(self.category_mapping)
        else:
            raise ValueError("Cannot determine number of categories. Please provide category_mapping for ONNX models.")

    @property
    def has_mask(self):
        """Returns if model output contains segmentation mask."""
        # Check if model has 'task' attribute (for both .pt and .onnx models)
        if hasattr(self.model, "overrides") and "task" in self.model.overrides:
            return self.model.overrides["task"] == "segment"
        # For ONNX models, task might be stored differently
        elif hasattr(self.model, "task"):
            return self.model.task == "segment"
        # For ONNX models without task info, check model path
        elif self.model_path and isinstance(self.model_path, str):
            return "seg" in self.model_path.lower()
        return False

    @property
    def is_obb(self):
        """Returns if model output contains oriented bounding boxes."""
        # Check if model has 'task' attribute (for both .pt and .onnx models)
        if hasattr(self.model, "overrides") and "task" in self.model.overrides:
            return self.model.overrides["task"] == "obb"
        # For ONNX models, task might be stored differently
        elif hasattr(self.model, "task"):
            return self.model.task == "obb"
        # For ONNX models without task info, check model path
        elif self.model_path and isinstance(self.model_path, str):
            return "obb" in self.model_path.lower()
        return False

    def _create_object_prediction_list_from_original_predictions(
        self,
        shift_amount_list: list[list[int]] | None = [[0, 0]],
        full_shape_list: list[list[int]] | None = None,
    ):
        """self._original_predictions is converted to a list of prediction.ObjectPrediction and set to
        self._object_prediction_list_per_image.

        Args:
            shift_amount_list: list of list
                To shift the box and mask predictions from sliced image to full sized image, should
                be in the form of List[[shift_x, shift_y],[shift_x, shift_y],...]
            full_shape_list: list of list
                Size of the full image after shifting, should be in the form of
                List[[height, width],[height, width],...]
        """
        original_predictions = self._original_predictions

        # compatibility for sahi v0.8.15
        shift_amount_list = fix_shift_amount_list(shift_amount_list)
        full_shape_list = fix_full_shape_list(full_shape_list)

        # handle all predictions
        object_prediction_list_per_image = []

        for image_ind, image_predictions in enumerate(original_predictions):
            shift_amount = shift_amount_list[image_ind]
            full_shape = None if full_shape_list is None else full_shape_list[image_ind]
            object_prediction_list = []

            # Extract boxes and optional masks/obb
            if self.has_mask or self.is_obb:
                boxes = image_predictions[0].cpu().detach().numpy()
                masks_or_points = image_predictions[1].cpu().detach().numpy()
            else:
                boxes = image_predictions.data.cpu().detach().numpy()
                masks_or_points = None

            # Process each prediction
            for pred_ind, prediction in enumerate(boxes):
                # Get bbox coordinates
                bbox = prediction[:4].tolist()
                score = prediction[4]
                category_id = int(prediction[5])
                category_name = self.category_mapping[str(category_id)]

                # Fix box coordinates
                bbox = [max(0, coord) for coord in bbox]
                if full_shape is not None:
                    bbox[0] = min(full_shape[1], bbox[0])
                    bbox[1] = min(full_shape[0], bbox[1])
                    bbox[2] = min(full_shape[1], bbox[2])
                    bbox[3] = min(full_shape[0], bbox[3])

                # Ignore invalid predictions
                if not (bbox[0] < bbox[2]) or not (bbox[1] < bbox[3]):
                    logger.warning(f"ignoring invalid prediction with bbox: {bbox}")
                    continue

                # Get segmentation or OBB points
                segmentation = None
                if masks_or_points is not None:
                    if self.has_mask:
                        bool_mask = masks_or_points[pred_ind]
                        # Resize mask to original image size
                        bool_mask = cv2.resize(
                            bool_mask.astype(np.uint8), (self._original_shape[1], self._original_shape[0])
                        )
                        segmentation = get_coco_segmentation_from_bool_mask(bool_mask)
                    else:  # is_obb
                        obb_points = masks_or_points[pred_ind]  # Get OBB points for this prediction
                        segmentation = [obb_points.reshape(-1).tolist()]

                    if len(segmentation) == 0:
                        continue

                # Create and append object prediction
                object_prediction = ObjectPrediction(
                    bbox=bbox,
                    category_id=category_id,
                    score=score,
                    segmentation=segmentation,
                    category_name=category_name,
                    shift_amount=shift_amount,
                    full_shape=self._original_shape[:2] if full_shape is None else full_shape,  # (height, width)
                )
                object_prediction_list.append(object_prediction)

            object_prediction_list_per_image.append(object_prediction_list)

        self._object_prediction_list_per_image = object_prediction_list_per_image
Attributes
has_mask property

Returns if model output contains segmentation mask.

is_obb property

Returns if model output contains oriented bounding boxes.

num_categories property

Returns number of categories.

Functions
load_model()

Detection model is initialized and set to self.model.

Supports both PyTorch (.pt) and ONNX (.onnx) models.

Source code in sahi/models/ultralytics.py
def load_model(self):
    """Detection model is initialized and set to self.model.

    Supports both PyTorch (.pt) and ONNX (.onnx) models.

    Raises:
        TypeError: If self.model_path cannot be loaded as an Ultralytics model.
    """

    from ultralytics import YOLO

    # ONNX checkpoints additionally need the onnx runtime packages
    if self.model_path and ".onnx" in self.model_path:
        check_requirements(["onnx", "onnxruntime"])

    try:
        model = YOLO(self.model_path)
        # Only call .to(device) for PyTorch models, not ONNX
        if self.model_path and not self.model_path.endswith(".onnx"):
            model.to(self.device)
        self.set_model(model)
        if self.fuse and hasattr(model, "fuse"):
            model.fuse()

    except Exception as e:
        # f-string + explicit chaining instead of passing the exception as a
        # second TypeError argument (which rendered the message as a tuple)
        raise TypeError(f"model_path is not a valid Ultralytics model path: {e}") from e
perform_inference(image)

Prediction is performed using self.model and the prediction result is set to self._original_predictions.

Parameters:

Name Type Description Default
image ndarray

np.ndarray A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

required
Source code in sahi/models/ultralytics.py
def perform_inference(self, image: np.ndarray):
    """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

    Depending on the model task, self._original_predictions is set to a list of:
    - (boxes, masks) tuples when the model outputs segmentation masks,
    - (boxes, obb_points) tuples for oriented-bounding-box models,
    - plain box tensors otherwise.

    Args:
        image: np.ndarray
            A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

    Raises:
        ValueError: If the model has not been loaded via .load_model() yet.
    """

    # Confirm model is loaded

    import torch

    if self.model is None:
        raise ValueError("Model is not loaded, load it by calling .load_model()")

    # Shared options forwarded to the Ultralytics predictor call
    kwargs = {"cfg": self.config_path, "verbose": False, "conf": self.confidence_threshold, "device": self.device}

    if self.image_size is not None:
        kwargs = {"imgsz": self.image_size, **kwargs}

    prediction_result = self.model(image[:, :, ::-1], **kwargs)  # YOLO expects numpy arrays to have BGR

    # Handle different result types for PyTorch vs ONNX models
    # ONNX models might return results in a different format
    if self.has_mask:
        from ultralytics.engine.results import Masks

        # NOTE(review): relies on an absent/empty Masks object being falsy — confirm
        if not prediction_result[0].masks:
            # Create empty masks if none exist
            if hasattr(self.model, "device"):
                device = self.model.device
            else:
                device = "cpu"  # Default for ONNX models
            prediction_result[0].masks = Masks(
                torch.tensor([], device=device), prediction_result[0].boxes.orig_shape
            )

        # We do not filter results again as confidence threshold is already applied above
        prediction_result = [
            (
                result.boxes.data,
                result.masks.data,
            )
            for result in prediction_result
        ]
    elif self.is_obb:
        # For OBB task, get OBB points in xyxyxyxy format
        device = getattr(self.model, "device", "cpu")
        prediction_result = [
            (
                # Get OBB data: xyxy, conf, cls
                torch.cat(
                    [
                        result.obb.xyxy,  # box coordinates
                        result.obb.conf.unsqueeze(-1),  # confidence scores
                        result.obb.cls.unsqueeze(-1),  # class ids
                    ],
                    dim=1,
                )
                if result.obb is not None
                else torch.empty((0, 6), device=device),
                # Get OBB points in (N, 4, 2) format
                result.obb.xyxyxyxy if result.obb is not None else torch.empty((0, 4, 2), device=device),
            )
            for result in prediction_result
        ]
    else:  # If model doesn't do segmentation or OBB then no need to check masks
        # We do not filter results again as confidence threshold is already applied above
        prediction_result = [result.boxes.data for result in prediction_result]

    self._original_predictions = prediction_result
    # Original image shape is kept so masks can be resized back later
    self._original_shape = image.shape
set_model(model, **kwargs)

Sets the underlying Ultralytics model.

Parameters:

Name Type Description Default
model Any

Any An Ultralytics model

required
Source code in sahi/models/ultralytics.py
def set_model(self, model: Any, **kwargs):
    """Sets the underlying Ultralytics model.

    Args:
        model: Any
            An Ultralytics model
    """

    self.model = model
    # Derive a default index -> name mapping when the caller supplied none
    if not self.category_mapping:
        self.category_mapping = {str(idx): name for idx, name in enumerate(self.category_names)}
Functions
yolo-world
Classes
YOLOWorldDetectionModel

Bases: UltralyticsDetectionModel

Source code in sahi/models/yolo-world.py
class YOLOWorldDetectionModel(UltralyticsDetectionModel):
    """SAHI wrapper for YOLO-World open-vocabulary detection models."""

    def load_model(self):
        """Detection model is initialized and set to self.model.

        Falls back to the default "yolov8s-worldv2.pt" checkpoint when no
        model_path was given.

        Raises:
            TypeError: If self.model_path is not a valid YOLO-World model path.
        """

        from ultralytics import YOLOWorld

        try:
            model_source = self.model_path or "yolov8s-worldv2.pt"
            model = YOLOWorld(model_source)
            model.to(self.device)
            self.set_model(model)
        except Exception as e:
            # f-string + explicit chaining instead of passing the exception as a
            # second TypeError argument (consistent with YOLOEDetectionModel)
            raise TypeError(f"model_path is not a valid yolo world model path: {e}") from e
Functions
load_model()

Detection model is initialized and set to self.model.

Source code in sahi/models/yolo-world.py
def load_model(self):
    """Detection model is initialized and set to self.model.

    Falls back to the default "yolov8s-worldv2.pt" checkpoint when no
    model_path was given.

    Raises:
        TypeError: If self.model_path is not a valid YOLO-World model path.
    """

    from ultralytics import YOLOWorld

    try:
        model_source = self.model_path or "yolov8s-worldv2.pt"
        model = YOLOWorld(model_source)
        model.to(self.device)
        self.set_model(model)
    except Exception as e:
        # f-string + explicit chaining instead of passing the exception as a
        # second TypeError argument (which rendered the message as a tuple)
        raise TypeError(f"model_path is not a valid yolo world model path: {e}") from e
yoloe
Classes
YOLOEDetectionModel

Bases: UltralyticsDetectionModel

YOLOE Detection Model for open-vocabulary detection and segmentation.

YOLOE (Real-Time Seeing Anything) is a zero-shot, promptable YOLO model designed for open-vocabulary detection and segmentation. It supports text prompts, visual prompts, and prompt-free detection with internal vocabulary (1200+ categories).

Key Features
  • Open-vocabulary detection: Detect any object class via text prompts
  • Visual prompting: One-shot detection using reference images
  • Instance segmentation: Built-in segmentation for detected objects
  • Real-time performance: Maintains YOLO speed with no inference overhead
  • Prompt-free mode: Uses internal vocabulary for open-set recognition
Available Models

Text/Visual Prompt models: - yoloe-11s-seg.pt, yoloe-11m-seg.pt, yoloe-11l-seg.pt - yoloe-v8s-seg.pt, yoloe-v8m-seg.pt, yoloe-v8l-seg.pt

Prompt-free models: - yoloe-11s-seg-pf.pt, yoloe-11m-seg-pf.pt, yoloe-11l-seg-pf.pt - yoloe-v8s-seg-pf.pt, yoloe-v8m-seg-pf.pt, yoloe-v8l-seg-pf.pt

Usage Text Prompts

from sahi import AutoDetectionModel

# Load YOLOE model
detection_model = AutoDetectionModel.from_pretrained(
    model_type="yoloe",
    model_path="yoloe-11l-seg.pt",
    confidence_threshold=0.3,
    device="cuda:0"
)

# Set text prompts for specific classes
detection_model.model.set_classes(
    ["person", "car", "traffic light"],
    detection_model.model.get_text_pe(["person", "car", "traffic light"])
)

# Perform prediction
from sahi.predict import get_prediction
result = get_prediction("image.jpg", detection_model)

Usage for standard detection (no prompts)

from sahi import AutoDetectionModel

# Load YOLOE model (works like standard YOLO)
detection_model = AutoDetectionModel.from_pretrained(
    model_type="yoloe",
    model_path="yoloe-11l-seg.pt",
    confidence_threshold=0.3,
    device="cuda:0"
)

# Perform prediction without prompts (uses internal vocabulary)
from sahi.predict import get_sliced_prediction
result = get_sliced_prediction(
    "image.jpg",
    detection_model,
    slice_height=512,
    slice_width=512,
    overlap_height_ratio=0.2,
    overlap_width_ratio=0.2
)
Note
  • YOLOE models perform instance segmentation by default
  • When used without prompts, YOLOE performs like standard YOLO11 with identical speed
  • For visual prompting, see Ultralytics YOLOE documentation
  • YOLOE achieves +3.5 AP over YOLO-Worldv2 on LVIS with 1.4x faster inference
References
  • Paper: https://arxiv.org/abs/2503.07465
  • Docs: https://docs.ultralytics.com/models/yoloe/
  • GitHub: https://github.com/THU-MIG/yoloe
Source code in sahi/models/yoloe.py
class YOLOEDetectionModel(UltralyticsDetectionModel):
    """SAHI wrapper for YOLOE open-vocabulary detection and segmentation models.

    YOLOE ("Real-Time Seeing Anything") is a promptable, zero-shot YOLO variant.
    It can be driven by text prompts, by visual prompts, or run prompt-free
    against its built-in vocabulary of 1200+ categories, and it produces
    instance segmentation masks alongside boxes while keeping YOLO-level speed.

    Supported checkpoints:
        Text/visual prompt models:
            yoloe-11s-seg.pt, yoloe-11m-seg.pt, yoloe-11l-seg.pt,
            yoloe-v8s-seg.pt, yoloe-v8m-seg.pt, yoloe-v8l-seg.pt
        Prompt-free models:
            yoloe-11s-seg-pf.pt, yoloe-11m-seg-pf.pt, yoloe-11l-seg-pf.pt,
            yoloe-v8s-seg-pf.pt, yoloe-v8m-seg-pf.pt, yoloe-v8l-seg-pf.pt

    !!! example "Usage with text prompts"
        ```python
        from sahi import AutoDetectionModel

        detection_model = AutoDetectionModel.from_pretrained(
            model_type="yoloe",
            model_path="yoloe-11l-seg.pt",
            confidence_threshold=0.3,
            device="cuda:0",
        )

        # Register the classes of interest as text prompts
        detection_model.model.set_classes(
            ["person", "car", "traffic light"],
            detection_model.model.get_text_pe(["person", "car", "traffic light"]),
        )

        from sahi.predict import get_prediction
        result = get_prediction("image.jpg", detection_model)
        ```

    !!! example "Usage without prompts (internal vocabulary)"
        ```python
        from sahi import AutoDetectionModel
        from sahi.predict import get_sliced_prediction

        detection_model = AutoDetectionModel.from_pretrained(
            model_type="yoloe",
            model_path="yoloe-11l-seg.pt",
            confidence_threshold=0.3,
            device="cuda:0",
        )

        result = get_sliced_prediction(
            "image.jpg",
            detection_model,
            slice_height=512,
            slice_width=512,
            overlap_height_ratio=0.2,
            overlap_width_ratio=0.2,
        )
        ```

    Note:
        - YOLOE models perform instance segmentation by default.
        - Without prompts, YOLOE behaves like a standard YOLO11 detector with
          identical speed.
        - See the Ultralytics YOLOE documentation for visual prompting.
        - YOLOE reports +3.5 AP over YOLO-Worldv2 on LVIS with 1.4x faster inference.

    References:
        - Paper: https://arxiv.org/abs/2503.07465
        - Docs: https://docs.ultralytics.com/models/yoloe/
        - GitHub: https://github.com/THU-MIG/yoloe
    """

    def load_model(self):
        """Initialize the YOLOE model and assign it to self.model.

        Uses self.model_path when given, otherwise falls back to the default
        'yoloe-11s-seg.pt' checkpoint, then moves the model to self.device.

        By default YOLOE runs prompt-free on its internal vocabulary; text
        prompts can be registered after loading:

            model.set_classes(["person", "car"], model.get_text_pe(["person", "car"]))

        Raises:
            TypeError: If the checkpoint cannot be loaded as a YOLOE model
                (invalid path, or the installed ultralytics lacks YOLOE support).
        """
        from ultralytics import YOLOE

        try:
            checkpoint = self.model_path or "yoloe-11s-seg.pt"
            yoloe_model = YOLOE(checkpoint)
            yoloe_model.to(self.device)
            self.set_model(yoloe_model)
        except Exception as e:
            raise TypeError(f"model_path is not a valid YOLOE model path: {e}") from e
Functions
load_model()

Loads the YOLOE detection model from the specified path.

Initializes the YOLOE model with the given model path or uses the default 'yoloe-11s-seg.pt' if no path is provided. The model is then moved to the specified device (CPU/GPU).

By default, YOLOE works in prompt-free mode using its internal vocabulary of 1200+ categories. To use text prompts for specific classes, call model.set_classes() after loading:

model.set_classes(["person", "car"], model.get_text_pe(["person", "car"]))

Raises:

Type Description
TypeError

If the model_path is not a valid YOLOE model path or if the ultralytics package with YOLOE support is not installed.

Source code in sahi/models/yoloe.py
def load_model(self):
    """Initialize the YOLOE model and assign it to self.model.

    Uses self.model_path when given, otherwise falls back to the default
    'yoloe-11s-seg.pt' checkpoint, then moves the model to self.device.

    By default YOLOE runs prompt-free on its internal vocabulary; text
    prompts can be registered after loading:

        model.set_classes(["person", "car"], model.get_text_pe(["person", "car"]))

    Raises:
        TypeError: If the checkpoint cannot be loaded as a YOLOE model
            (invalid path, or the installed ultralytics lacks YOLOE support).
    """
    from ultralytics import YOLOE

    try:
        checkpoint = self.model_path or "yoloe-11s-seg.pt"
        yoloe_model = YOLOE(checkpoint)
        yoloe_model.to(self.device)
        self.set_model(yoloe_model)
    except Exception as e:
        raise TypeError(f"model_path is not a valid YOLOE model path: {e}") from e
yolov5
Classes
Yolov5DetectionModel

Bases: DetectionModel

Source code in sahi/models/yolov5.py
class Yolov5DetectionModel(DetectionModel):
    """SAHI detection model wrapper around a YOLOv5 model."""

    def __init__(self, *args, **kwargs):
        # Extend (never overwrite) packages a parent class may already require
        existing_packages = getattr(self, "required_packages", None) or []
        self.required_packages = [*list(existing_packages), "yolov5", "torch"]
        super().__init__(*args, **kwargs)

    def load_model(self):
        """Detection model is initialized and set to self.model.

        Raises:
            TypeError: If self.model_path is not a valid yolov5 model path.
        """
        import yolov5

        try:
            model = yolov5.load(self.model_path, device=self.device)
            self.set_model(model)
        except Exception as e:
            # f-string + explicit chaining instead of passing the exception as a
            # second TypeError argument (which rendered the message as a tuple)
            raise TypeError(f"model_path is not a valid yolov5 model path: {e}") from e

    def set_model(self, model: Any):
        """Sets the underlying YOLOv5 model.

        Args:
            model: Any
                A YOLOv5 model

        Raises:
            Exception: If the given object is not a yolov5 model.
        """

        if model.__class__.__module__ not in ["yolov5.models.common", "models.common"]:
            raise Exception(f"Not a yolov5 model: {type(model)}")

        # YOLOv5 applies the confidence threshold internally via model.conf
        model.conf = self.confidence_threshold
        self.model = model

        # set category_mapping
        if not self.category_mapping:
            category_mapping = {str(ind): category_name for ind, category_name in enumerate(self.category_names)}
            self.category_mapping = category_mapping

    def perform_inference(self, image: np.ndarray):
        """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

        Args:
            image: np.ndarray
                A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

        Raises:
            ValueError: If the model has not been loaded via .load_model() yet.
        """

        # Confirm model is loaded
        if self.model is None:
            raise ValueError("Model is not loaded, load it by calling .load_model()")
        if self.image_size is not None:
            prediction_result = self.model(image, size=self.image_size)
        else:
            prediction_result = self.model(image)

        self._original_predictions = prediction_result

    @property
    def num_categories(self):
        """Returns number of categories."""
        return len(self.model.names)

    @property
    def has_mask(self):
        """Returns if model output contains segmentation mask."""

        return False  # fix when yolov5 supports segmentation models

    @property
    def category_names(self):
        """Returns category names reported by the underlying model."""
        # yolov5>=6.2.0 stores names as a dict {id: name}; older versions use a list
        if check_package_minimum_version("yolov5", "6.2.0"):
            return list(self.model.names.values())
        else:
            return self.model.names

    def _create_object_prediction_list_from_original_predictions(
        self,
        shift_amount_list: list[list[int]] | None = [[0, 0]],
        full_shape_list: list[list[int]] | None = None,
    ):
        """self._original_predictions is converted to a list of prediction.ObjectPrediction and set to
        self._object_prediction_list_per_image.

        Args:
            shift_amount_list: list of list
                To shift the box and mask predictions from sliced image to full sized image, should
                be in the form of List[[shift_x, shift_y],[shift_x, shift_y],...]
            full_shape_list: list of list
                Size of the full image after shifting, should be in the form of
                List[[height, width],[height, width],...]
        """
        original_predictions = self._original_predictions

        # compatibility for sahi v0.8.15
        shift_amount_list = fix_shift_amount_list(shift_amount_list)
        full_shape_list = fix_full_shape_list(full_shape_list)

        # handle all predictions
        object_prediction_list_per_image = []
        for image_ind, image_predictions_in_xyxy_format in enumerate(original_predictions.xyxy):
            shift_amount = shift_amount_list[image_ind]
            full_shape = None if full_shape_list is None else full_shape_list[image_ind]
            object_prediction_list = []

            # process predictions: each row is [x1, y1, x2, y2, score, category_id]
            for prediction in image_predictions_in_xyxy_format.cpu().detach().numpy():
                x1 = prediction[0]
                y1 = prediction[1]
                x2 = prediction[2]
                y2 = prediction[3]
                bbox = [x1, y1, x2, y2]
                score = prediction[4]
                category_id = int(prediction[5])
                category_name = self.category_mapping[str(category_id)]

                # fix negative box coords
                bbox[0] = max(0, bbox[0])
                bbox[1] = max(0, bbox[1])
                bbox[2] = max(0, bbox[2])
                bbox[3] = max(0, bbox[3])

                # fix out of image box coords
                if full_shape is not None:
                    bbox[0] = min(full_shape[1], bbox[0])
                    bbox[1] = min(full_shape[0], bbox[1])
                    bbox[2] = min(full_shape[1], bbox[2])
                    bbox[3] = min(full_shape[0], bbox[3])

                # ignore invalid predictions (zero or negative width/height after clamping)
                if not (bbox[0] < bbox[2]) or not (bbox[1] < bbox[3]):
                    logger.warning(f"ignoring invalid prediction with bbox: {bbox}")
                    continue

                object_prediction = ObjectPrediction(
                    bbox=bbox,
                    category_id=category_id,
                    score=score,
                    segmentation=None,
                    category_name=category_name,
                    shift_amount=shift_amount,
                    full_shape=full_shape,
                )
                object_prediction_list.append(object_prediction)
            object_prediction_list_per_image.append(object_prediction_list)

        self._object_prediction_list_per_image = object_prediction_list_per_image
Attributes
has_mask property

Returns if model output contains segmentation mask.

num_categories property

Returns number of categories.

Functions
load_model()

Detection model is initialized and set to self.model.

Source code in sahi/models/yolov5.py
def load_model(self):
    """Detection model is initialized and set to self.model.

    Raises:
        TypeError: If self.model_path is not a valid yolov5 model path.
    """
    import yolov5

    try:
        model = yolov5.load(self.model_path, device=self.device)
        self.set_model(model)
    except Exception as e:
        # f-string + explicit chaining instead of passing the exception as a
        # second TypeError argument (which rendered the message as a tuple)
        raise TypeError(f"model_path is not a valid yolov5 model path: {e}") from e
perform_inference(image)

Prediction is performed using self.model and the prediction result is set to self._original_predictions.

Parameters:

Name Type Description Default
image ndarray

np.ndarray A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

required
Source code in sahi/models/yolov5.py
def perform_inference(self, image: np.ndarray):
    """Prediction is performed using self.model and the prediction result is set to self._original_predictions.

    Args:
        image: np.ndarray
            A numpy array that contains the image to be predicted. 3 channel image should be in RGB order.

    Raises:
        ValueError: If the model has not been loaded via .load_model() yet.
    """

    # A model must have been loaded before inference can run
    if self.model is None:
        raise ValueError("Model is not loaded, load it by calling .load_model()")

    # Forward the configured inference size only when one was set
    if self.image_size is None:
        prediction_result = self.model(image)
    else:
        prediction_result = self.model(image, size=self.image_size)

    self._original_predictions = prediction_result
set_model(model)

Sets the underlying YOLOv5 model.

Parameters:

Name Type Description Default
model Any

Any A YOLOv5 model

required
Source code in sahi/models/yolov5.py
def set_model(self, model: Any):
    """Sets the underlying YOLOv5 model.

    Args:
        model: Any
            A YOLOv5 model

    Raises:
        Exception: If the given object is not a yolov5 model.
    """

    # Accept models from both packaged and in-repo yolov5 module layouts
    accepted_modules = ("yolov5.models.common", "models.common")
    if model.__class__.__module__ not in accepted_modules:
        raise Exception(f"Not a yolov5 model: {type(model)}")

    # YOLOv5 applies the confidence threshold internally via model.conf
    model.conf = self.confidence_threshold
    self.model = model

    # Derive a default index -> name mapping when the caller supplied none
    if not self.category_mapping:
        self.category_mapping = {str(idx): name for idx, name in enumerate(self.category_names)}
Functions

postprocess

Modules
combine
Classes
PostprocessPredictions

Utilities for calculating IOU/IOS based match for given ObjectPredictions.

Source code in sahi/postprocess/combine.py
class PostprocessPredictions:
    """Utilities for calculating IOU/IOS based match for given ObjectPredictions."""

    def __init__(
        self,
        match_threshold: float = 0.5,
        match_metric: str = "IOU",
        class_agnostic: bool = True,
    ):
        # Matching configuration consumed by concrete __call__ implementations
        self.match_metric = match_metric
        self.match_threshold = match_threshold
        self.class_agnostic = class_agnostic

        # All concrete postprocessors rely on torch being installed
        check_requirements(["torch"])

    def __call__(self, predictions: list[ObjectPrediction]):
        # Subclasses implement the actual combine/suppress logic
        raise NotImplementedError()
Functions
batched_greedy_nmm(object_predictions_as_tensor, match_metric='IOU', match_threshold=0.5)

Apply greedy version of non-maximum merging per category to avoid detecting too many overlapping bounding boxes for a given object.

Parameters:

Name Type Description Default
object_predictions_as_tensor tensor

(tensor) The location preds for the image along with the class pred scores, Shape: [num_boxes,5].

required
match_metric str

(str) IOU or IOS

'IOU'
match_threshold float

(float) The overlap thresh for match metric.

0.5

Returns: keep_to_merge_list: (Dict[int:List[int]]) mapping from prediction indices to keep to a list of prediction indices to be merged.

Source code in sahi/postprocess/combine.py
def batched_greedy_nmm(
    object_predictions_as_tensor: torch.tensor,
    match_metric: str = "IOU",
    match_threshold: float = 0.5,
):
    """Run greedy non-maximum merging independently for every predicted category.

    Args:
        object_predictions_as_tensor: (tensor) box predictions with scores and
            category ids; column 5 holds the category id.
        match_metric: (str) "IOU" or "IOS".
        match_threshold: (float) overlap threshold for the match metric.

    Returns:
        keep_to_merge_list: (Dict[int:List[int]]) mapping from prediction indices
        to keep to a list of prediction indices to be merged.
    """
    labels = object_predictions_as_tensor[:, 5].squeeze()
    keep_to_merge_list = {}
    for label in torch.unique(labels):
        # Global indices of all predictions belonging to this category
        global_indices = torch.where(labels == label)[0]
        per_category_result = greedy_nmm(
            object_predictions_as_tensor[global_indices],
            match_metric,
            match_threshold,
        )
        # Translate category-local indices back to global ones
        index_lookup = global_indices.tolist()
        for local_keep, local_merges in per_category_result.items():
            keep_to_merge_list[index_lookup[local_keep]] = [index_lookup[m] for m in local_merges]
    return keep_to_merge_list
batched_nmm(object_predictions_as_tensor, match_metric='IOU', match_threshold=0.5)

Apply non-maximum merging per category to avoid detecting too many overlapping bounding boxes for a given object.

Parameters:

Name Type Description Default
object_predictions_as_tensor Tensor

(tensor) The location preds for the image along with the class pred scores, Shape: [num_boxes,5].

required
match_metric str

(str) IOU or IOS

'IOU'
match_threshold float

(float) The overlap thresh for match metric.

0.5

Returns: keep_to_merge_list: (Dict[int:List[int]]) mapping from prediction indices to keep to a list of prediction indices to be merged.

Source code in sahi/postprocess/combine.py
def batched_nmm(
    object_predictions_as_tensor: torch.Tensor,
    match_metric: str = "IOU",
    match_threshold: float = 0.5,
):
    """Run non-maximum merging independently for every predicted category.

    Args:
        object_predictions_as_tensor: (tensor) box predictions with scores and
            category ids; column 5 holds the category id.
        match_metric: (str) "IOU" or "IOS".
        match_threshold: (float) overlap threshold for the match metric.

    Returns:
        keep_to_merge_list: (Dict[int:List[int]]) mapping from prediction indices
        to keep to a list of prediction indices to be merged.
    """
    labels = object_predictions_as_tensor[:, 5].squeeze()
    keep_to_merge_list = {}
    for label in torch.unique(labels):
        # Global indices of all predictions belonging to this category
        global_indices = torch.where(labels == label)[0]
        per_category_result = nmm(
            object_predictions_as_tensor[global_indices],
            match_metric,
            match_threshold,
        )
        # Translate category-local indices back to global ones
        index_lookup = global_indices.tolist()
        for local_keep, local_merges in per_category_result.items():
            keep_to_merge_list[index_lookup[local_keep]] = [index_lookup[m] for m in local_merges]
    return keep_to_merge_list
batched_nms(predictions, match_metric='IOU', match_threshold=0.5)

Apply non-maximum suppression to avoid detecting too many overlapping bounding boxes for a given object.

Parameters:

Name Type Description Default
predictions tensor

(tensor) The location preds for the image along with the class pred scores, Shape: [num_boxes,5].

required
match_metric str

(str) IOU or IOS

'IOU'
match_threshold float

(float) The overlap thresh for match metric.

0.5

Returns: A list of filtered indexes, Shape: [ ,]

Source code in sahi/postprocess/combine.py
def batched_nms(predictions: torch.tensor, match_metric: str = "IOU", match_threshold: float = 0.5):
    """Run non-maximum suppression independently for every predicted category.

    Args:
        predictions: (tensor) box predictions with scores and category ids;
            column 4 holds the score, column 5 the category id.
        match_metric: (str) "IOU" or "IOS".
        match_threshold: (float) overlap threshold for the match metric.

    Returns:
        A list of kept prediction indices, sorted by descending score.
    """

    confidences = predictions[:, 4].squeeze()
    labels = predictions[:, 5].squeeze()
    selected = torch.zeros_like(labels, dtype=torch.bool)
    for label in torch.unique(labels):
        # Suppress within this category only, then mark survivors globally
        label_indices = torch.where(labels == label)[0]
        kept_local = nms(predictions[label_indices], match_metric, match_threshold)
        selected[label_indices[kept_local]] = True
    kept = torch.where(selected)[0]
    # Order the surviving indices by descending confidence
    order = confidences[kept].sort(descending=True)[1]
    return kept[order].tolist()
greedy_nmm(object_predictions_as_tensor, match_metric='IOU', match_threshold=0.5)

Optimized greedy non-maximum merging for axis-aligned bounding boxes using STRTree.

Parameters:

Name Type Description Default
object_predictions_as_tensor Tensor

(tensor) The location preds for the image along with the class pred scores, Shape: [num_boxes,5].

required
match_metric str

(str) IOU or IOS

'IOU'
match_threshold float

(float) The overlap thresh for match metric.

0.5

Returns: keep_to_merge_list: (dict[int, list[int]]) mapping from prediction indices to keep to a list of prediction indices to be merged.

Source code in sahi/postprocess/combine.py
def greedy_nmm(
    object_predictions_as_tensor: torch.Tensor,
    match_metric: str = "IOU",
    match_threshold: float = 0.5,
):
    """
    Optimized greedy non-maximum merging for axis-aligned bounding boxes using STRTree.

    Args:
        object_predictions_as_tensor: (tensor) The location preds for the image
            along with the class pred scores, Shape: [num_boxes,5].
        match_metric: (str) IOU or IOS
        match_threshold: (float) The overlap thresh for match metric.
    Returns:
        keep_to_merge_list: (dict[int, list[int]]) mapping from prediction indices
        to keep to a list of prediction indices to be merged.
    Raises:
        ValueError: If match_metric is neither "IOU" nor "IOS".
    """
    # Extract coordinates and scores as tensors
    x1 = object_predictions_as_tensor[:, 0]
    y1 = object_predictions_as_tensor[:, 1]
    x2 = object_predictions_as_tensor[:, 2]
    y2 = object_predictions_as_tensor[:, 3]
    scores = object_predictions_as_tensor[:, 4]

    # Calculate areas as tensor (vectorized operation)
    areas = (x2 - x1) * (y2 - y1)

    # Create Shapely boxes only once
    boxes = []
    for i in range(len(object_predictions_as_tensor)):
        boxes.append(
            box(
                x1[i].item(),  # Convert only individual values
                y1[i].item(),
                x2[i].item(),
                y2[i].item(),
            )
        )

    # Sort indices by score (descending) using torch
    sorted_idxs = torch.argsort(scores, descending=True).tolist()

    # Build STRtree
    tree = STRtree(boxes)

    keep_to_merge_list = {}
    # Indices already merged into a higher-scoring keeper; never become keepers themselves
    suppressed = set()

    for current_idx in sorted_idxs:
        if current_idx in suppressed:
            continue

        current_box = boxes[current_idx]
        current_area = areas[current_idx].item()  # Convert only when needed

        # Query potential intersections using STRtree
        # (returns candidate indices whose envelopes may intersect current_box)
        candidate_idxs = tree.query(current_box)

        merge_list = []
        for candidate_idx in candidate_idxs:
            if candidate_idx == current_idx or candidate_idx in suppressed:
                continue

            # Only consider candidates with lower or equal score
            if scores[candidate_idx] > scores[current_idx]:
                continue

            # For equal scores, use deterministic tie-breaking based on box coordinates
            if scores[candidate_idx] == scores[current_idx]:
                # Use box coordinates for stable ordering
                current_coords = (
                    x1[current_idx].item(),
                    y1[current_idx].item(),
                    x2[current_idx].item(),
                    y2[current_idx].item(),
                )
                candidate_coords = (
                    x1[candidate_idx].item(),
                    y1[candidate_idx].item(),
                    x2[candidate_idx].item(),
                    y2[candidate_idx].item(),
                )

                # Compare coordinates lexicographically; the lexicographically
                # larger box of a tied pair gets to act as the keeper later
                if candidate_coords > current_coords:
                    continue

            # Calculate intersection area
            candidate_box = boxes[candidate_idx]
            intersection = current_box.intersection(candidate_box).area

            # Calculate metric
            if match_metric == "IOU":
                # intersection over union
                union = current_area + areas[candidate_idx].item() - intersection
                metric = intersection / union if union > 0 else 0
            elif match_metric == "IOS":
                # intersection over the smaller of the two areas
                smaller = min(current_area, areas[candidate_idx].item())
                metric = intersection / smaller if smaller > 0 else 0
            else:
                raise ValueError("Invalid match_metric")

            # Add to merge list if overlap exceeds threshold
            if metric >= match_threshold:
                merge_list.append(candidate_idx)
                suppressed.add(candidate_idx)

        keep_to_merge_list[int(current_idx)] = [int(idx) for idx in merge_list]

    return keep_to_merge_list
nmm(object_predictions_as_tensor, match_metric='IOU', match_threshold=0.5)

Apply non-maximum merging to avoid detecting too many overlapping bounding boxes for a given object.

Parameters:

Name Type Description Default
object_predictions_as_tensor Tensor

(tensor) The location preds for the image along with the class pred scores, Shape: [num_boxes,5].

required
match_metric str

(str) IOU or IOS

'IOU'
match_threshold float

(float) The overlap thresh for match metric.

0.5

Returns: keep_to_merge_list: (Dict[int:List[int]]) mapping from prediction indices to keep to a list of prediction indices to be merged.

Source code in sahi/postprocess/combine.py
def nmm(
    object_predictions_as_tensor: torch.Tensor,
    match_metric: str = "IOU",
    match_threshold: float = 0.5,
):
    """Apply non-maximum merging to avoid detecting too many overlapping bounding boxes for a given object.

    Unlike NMS, overlapping predictions are not discarded: each kept
    (highest-score) prediction index is mapped to the lower-score prediction
    indices that should be merged into it. The `merge_to_keep` bookkeeping
    ensures every prediction index is assigned to at most one keep index.

    Args:
        object_predictions_as_tensor: (tensor) The location preds for the image
            along with the class predscores, Shape: [num_boxes,5]
            (rows are [x1, y1, x2, y2, score]).
        match_metric: (str) IOU (intersection over union) or
            IOS (intersection over smaller box area)
        match_threshold: (float) The overlap thresh for match metric;
            pairs with metric >= threshold are merged.
    Returns:
        keep_to_merge_list: (Dict[int:List[int]]) mapping from prediction indices
        to keep to a list of prediction indices to be merged.
    """
    # Extract coordinates and scores as tensors
    x1 = object_predictions_as_tensor[:, 0]
    y1 = object_predictions_as_tensor[:, 1]
    x2 = object_predictions_as_tensor[:, 2]
    y2 = object_predictions_as_tensor[:, 3]
    scores = object_predictions_as_tensor[:, 4]

    # Calculate all box areas once (vectorized operation)
    areas = (x2 - x1) * (y2 - y1)

    # Create Shapely boxes only once; .item() converts only the scalar
    # values actually needed by the `box` constructor
    boxes = []
    for i in range(len(object_predictions_as_tensor)):
        boxes.append(
            box(
                x1[i].item(),  # Convert only individual values
                y1[i].item(),
                x2[i].item(),
                y2[i].item(),
            )
        )

    # Sort indices by score (descending) using torch
    sorted_idxs = torch.argsort(scores, descending=True).tolist()

    # Build STRtree spatial index for fast intersection queries
    tree = STRtree(boxes)

    keep_to_merge_list = {}
    # Reverse mapping: merged prediction index -> its assigned keep index
    merge_to_keep = {}

    for current_idx in sorted_idxs:
        current_box = boxes[current_idx]
        current_area = areas[current_idx].item()  # Convert only when needed

        # Query potential intersections using STRtree
        candidate_idxs = tree.query(current_box)

        matched_box_indices = []
        for candidate_idx in candidate_idxs:
            if candidate_idx == current_idx:
                continue

            # Only consider candidates with lower or equal score
            if scores[candidate_idx] > scores[current_idx]:
                continue

            # For equal scores, use deterministic tie-breaking based on box coordinates
            if scores[candidate_idx] == scores[current_idx]:
                # Use box coordinates for stable ordering
                current_coords = (
                    x1[current_idx].item(),
                    y1[current_idx].item(),
                    x2[current_idx].item(),
                    y2[current_idx].item(),
                )
                candidate_coords = (
                    x1[candidate_idx].item(),
                    y1[candidate_idx].item(),
                    x2[candidate_idx].item(),
                    y2[candidate_idx].item(),
                )

                # Compare coordinates lexicographically so only one of the two
                # equal-score boxes processes the pair
                if candidate_coords > current_coords:
                    continue

            # Calculate intersection area
            candidate_box = boxes[candidate_idx]
            intersection = current_box.intersection(candidate_box).area

            # Calculate overlap metric (guarding against zero division)
            if match_metric == "IOU":
                union = current_area + areas[candidate_idx].item() - intersection
                metric = intersection / union if union > 0 else 0
            elif match_metric == "IOS":
                smaller = min(current_area, areas[candidate_idx].item())
                metric = intersection / smaller if smaller > 0 else 0
            else:
                raise ValueError("Invalid match_metric")

            # Add to matched list if overlap exceeds threshold
            if metric >= match_threshold:
                matched_box_indices.append(candidate_idx)

        # Convert current_idx to native Python int
        current_idx_native = int(current_idx)

        # Create keep_ind to merge_ind_list mapping
        if current_idx_native not in merge_to_keep:
            # current_idx is itself a keep box: claim its not-yet-assigned matches
            keep_to_merge_list[current_idx_native] = []

            for matched_box_idx in matched_box_indices:
                matched_box_idx_native = int(matched_box_idx)
                if matched_box_idx_native not in merge_to_keep:
                    keep_to_merge_list[current_idx_native].append(matched_box_idx_native)
                    merge_to_keep[matched_box_idx_native] = current_idx_native
        else:
            # current_idx was already merged into a keep box: forward its
            # not-yet-assigned matches to that same keep box
            keep_idx = merge_to_keep[current_idx_native]
            for matched_box_idx in matched_box_indices:
                matched_box_idx_native = int(matched_box_idx)
                if (
                    matched_box_idx_native not in keep_to_merge_list.get(keep_idx, [])
                    and matched_box_idx_native not in merge_to_keep
                ):
                    if keep_idx not in keep_to_merge_list:
                        keep_to_merge_list[keep_idx] = []
                    keep_to_merge_list[keep_idx].append(matched_box_idx_native)
                    merge_to_keep[matched_box_idx_native] = keep_idx

    return keep_to_merge_list
nms(predictions, match_metric='IOU', match_threshold=0.5)

Optimized non-maximum suppression for axis-aligned bounding boxes using STRTree.

Parameters:

Name Type Description Default
predictions Tensor

(tensor) The location preds for the image along with the class prediction scores, Shape: [num_boxes,5].

required
match_metric str

(str) IOU or IOS

'IOU'
match_threshold float

(float) The overlap thresh for match metric.

0.5

Returns:

Type Description

A list of filtered indexes, Shape: [ ,]

Source code in sahi/postprocess/combine.py
def nms(
    predictions: torch.Tensor,
    match_metric: str = "IOU",
    match_threshold: float = 0.5,
):
    """
    Optimized non-maximum suppression for axis-aligned bounding boxes using STRTree.

    Args:
        predictions: (tensor) The location preds for the image along with the class
            predscores, Shape: [num_boxes,5] (rows are [x1, y1, x2, y2, score]).
        match_metric: (str) IOU (intersection over union) or
            IOS (intersection over smaller box area)
        match_threshold: (float) The overlap thresh for match metric.

    Returns:
        A list of kept indexes as native Python ints, Shape: [ ,]
    """
    if len(predictions) == 0:
        return []

    # Ensure predictions are on CPU and convert to numpy
    if predictions.device.type != "cpu":
        predictions = predictions.cpu()

    predictions_np = predictions.numpy()

    # Extract coordinates and scores
    x1 = predictions_np[:, 0]
    y1 = predictions_np[:, 1]
    x2 = predictions_np[:, 2]
    y2 = predictions_np[:, 3]
    scores = predictions_np[:, 4]

    # Calculate areas
    areas = (x2 - x1) * (y2 - y1)

    # Create Shapely boxes (vectorized)
    boxes = box(x1, y1, x2, y2)

    # Sort indices by score (descending)
    sorted_idxs = np.argsort(scores)[::-1]

    # Build STRtree spatial index for fast intersection queries
    tree = STRtree(boxes)

    keep = []
    suppressed = set()

    for current_idx in sorted_idxs:
        if current_idx in suppressed:
            continue

        # Store native Python ints (not np.int64) for consistency with the
        # nmm variants and safe downstream use (e.g. serialization).
        keep.append(int(current_idx))
        current_box = boxes[current_idx]
        current_area = areas[current_idx]

        # Query potential intersections using STRtree
        candidate_idxs = tree.query(current_box)

        for candidate_idx in candidate_idxs:
            if candidate_idx == current_idx or candidate_idx in suppressed:
                continue

            # Skip candidates with higher scores (already processed)
            if scores[candidate_idx] > scores[current_idx]:
                continue

            # For equal scores, use deterministic tie-breaking based on box coordinates
            if scores[candidate_idx] == scores[current_idx]:
                # Use box coordinates for stable ordering
                current_coords = (
                    x1[current_idx],
                    y1[current_idx],
                    x2[current_idx],
                    y2[current_idx],
                )
                candidate_coords = (
                    x1[candidate_idx],
                    y1[candidate_idx],
                    x2[candidate_idx],
                    y2[candidate_idx],
                )

                # Compare coordinates lexicographically so only one of the two
                # equal-score boxes suppresses the other
                if candidate_coords > current_coords:
                    continue

            # Calculate intersection area
            candidate_box = boxes[candidate_idx]
            intersection = current_box.intersection(candidate_box).area

            # Calculate overlap metric (guarding against zero division)
            if match_metric == "IOU":
                union = current_area + areas[candidate_idx] - intersection
                metric = intersection / union if union > 0 else 0
            elif match_metric == "IOS":
                smaller = min(current_area, areas[candidate_idx])
                metric = intersection / smaller if smaller > 0 else 0
            else:
                raise ValueError("Invalid match_metric")

            # Suppress if overlap exceeds threshold
            if metric >= match_threshold:
                suppressed.add(candidate_idx)

    return keep
legacy
Modules
combine
Classes
PostprocessPredictions

Utilities for calculating IOU/IOS based match for given ObjectPredictions.

Source code in sahi/postprocess/legacy/combine.py
class PostprocessPredictions:
    """Utilities for calculating IOU/IOS based match for given ObjectPredictions."""

    def __init__(
        self,
        match_threshold: float = 0.5,
        match_metric: str = "IOU",
        class_agnostic: bool = True,
    ):
        self.match_threshold = match_threshold
        self.class_agnostic = class_agnostic
        # Bind the overlap-metric callable once up front; unknown metrics fail fast.
        metric_to_func = {
            "IOU": self.calculate_bbox_iou,
            "IOS": self.calculate_bbox_ios,
        }
        if match_metric not in metric_to_func:
            raise ValueError(f"'match_metric' should be one of ['IOU', 'IOS'] but given as {match_metric}")
        self.calculate_match = metric_to_func[match_metric]

    def _has_match(self, pred1: ObjectPrediction, pred2: ObjectPrediction) -> bool:
        # A pair matches when the overlap metric exceeds the threshold and,
        # unless class-agnostic, the category ids agree.
        overlap_ok = self.calculate_match(pred1, pred2) > self.match_threshold
        category_ok = self.class_agnostic or self.has_same_category_id(pred1, pred2)
        return overlap_ok and category_ok

    @staticmethod
    def get_score_func(object_prediction: ObjectPrediction):
        """Sort key for predictions: the confidence score value."""
        return object_prediction.score.value

    @staticmethod
    def has_same_category_id(pred1: ObjectPrediction, pred2: ObjectPrediction) -> bool:
        """True when both predictions carry the same category id."""
        return pred1.category.id == pred2.category.id

    @staticmethod
    def calculate_bbox_iou(pred1: ObjectPrediction, pred2: ObjectPrediction) -> float:
        """Returns the ratio of intersection area to the union."""
        xyxy1 = np.array(pred1.bbox.to_xyxy())
        xyxy2 = np.array(pred2.bbox.to_xyxy())
        intersect = calculate_intersection_area(xyxy1, xyxy2)
        union = calculate_area(xyxy1) + calculate_area(xyxy2) - intersect
        return intersect / union

    @staticmethod
    def calculate_bbox_ios(pred1: ObjectPrediction, pred2: ObjectPrediction) -> float:
        """Returns the ratio of intersection area to the smaller box's area."""
        xyxy1 = np.array(pred1.bbox.to_xyxy())
        xyxy2 = np.array(pred2.bbox.to_xyxy())
        intersect = calculate_intersection_area(xyxy1, xyxy2)
        return intersect / np.minimum(calculate_area(xyxy1), calculate_area(xyxy2))

    def __call__(self):
        raise NotImplementedError()
Functions
calculate_bbox_ios(pred1, pred2) staticmethod

Returns the ratio of intersection area to the smaller box's area.

Source code in sahi/postprocess/legacy/combine.py
@staticmethod
def calculate_bbox_ios(pred1: ObjectPrediction, pred2: ObjectPrediction) -> float:
    """Returns the ratio of intersection area to the smaller box's area."""
    box1 = np.array(pred1.bbox.to_xyxy())
    box2 = np.array(pred2.bbox.to_xyxy())
    area1 = calculate_area(box1)
    area2 = calculate_area(box2)
    intersect = calculate_intersection_area(box1, box2)
    smaller_area = np.minimum(area1, area2)
    return intersect / smaller_area
calculate_bbox_iou(pred1, pred2) staticmethod

Returns the ratio of intersection area to the union.

Source code in sahi/postprocess/legacy/combine.py
@staticmethod
def calculate_bbox_iou(pred1: ObjectPrediction, pred2: ObjectPrediction) -> float:
    """Returns the ratio of intersection area to the union."""
    box1 = np.array(pred1.bbox.to_xyxy())
    box2 = np.array(pred2.bbox.to_xyxy())
    area1 = calculate_area(box1)
    area2 = calculate_area(box2)
    intersect = calculate_intersection_area(box1, box2)
    return intersect / (area1 + area2 - intersect)
get_score_func(object_prediction) staticmethod

Used for sorting predictions.

Source code in sahi/postprocess/legacy/combine.py
@staticmethod
def get_score_func(object_prediction: ObjectPrediction):
    """Used for sorting predictions."""
    return object_prediction.score.value
Functions
utils
Classes Functions
calculate_area(box)

Parameters:

Name Type Description Default
box List[int]

[x1, y1, x2, y2]

required
Source code in sahi/postprocess/utils.py
def calculate_area(box: list[int] | np.ndarray) -> float:
    """Return the area of an axis-aligned box given as [x1, y1, x2, y2]."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
calculate_bbox_ios(pred1, pred2)

Returns the ratio of intersection area to the smaller box's area.

Source code in sahi/postprocess/utils.py
def calculate_bbox_ios(pred1: ObjectPrediction, pred2: ObjectPrediction) -> float:
    """Returns the ratio of intersection area to the smaller box's area."""
    xyxy1 = np.array(pred1.bbox.to_xyxy())
    xyxy2 = np.array(pred2.bbox.to_xyxy())
    intersect = calculate_intersection_area(xyxy1, xyxy2)
    # Normalize by whichever of the two boxes is smaller
    smaller_area = np.minimum(calculate_area(xyxy1), calculate_area(xyxy2))
    return intersect / smaller_area
calculate_bbox_iou(pred1, pred2)

Returns the ratio of intersection area to the union.

Source code in sahi/postprocess/utils.py
def calculate_bbox_iou(pred1: ObjectPrediction, pred2: ObjectPrediction) -> float:
    """Returns the ratio of intersection area to the union."""
    xyxy1 = np.array(pred1.bbox.to_xyxy())
    xyxy2 = np.array(pred2.bbox.to_xyxy())
    intersect = calculate_intersection_area(xyxy1, xyxy2)
    # Inclusion-exclusion: union = area1 + area2 - intersection
    union = calculate_area(xyxy1) + calculate_area(xyxy2) - intersect
    return intersect / union
calculate_box_union(box1, box2)

Parameters:

Name Type Description Default
box1 List[int]

[x1, y1, x2, y2]

required
box2 List[int]

[x1, y1, x2, y2]

required
Source code in sahi/postprocess/utils.py
def calculate_box_union(box1: list[int] | np.ndarray, box2: list[int] | np.ndarray) -> list[int]:
    """Return the smallest [x1, y1, x2, y2] box enclosing both input boxes."""
    a = np.array(box1)
    b = np.array(box2)
    # Top-left corner is the elementwise minimum, bottom-right the maximum
    top_left = np.minimum(a[:2], b[:2])
    bottom_right = np.maximum(a[2:], b[2:])
    return list(np.concatenate((top_left, bottom_right)))
calculate_intersection_area(box1, box2)

Parameters:

Name Type Description Default
box1 ndarray

np.array([x1, y1, x2, y2])

required
box2 ndarray

np.array([x1, y1, x2, y2])

required
Source code in sahi/postprocess/utils.py
def calculate_intersection_area(box1: np.ndarray, box2: np.ndarray) -> float:
    """Return the overlap area of two [x1, y1, x2, y2] boxes (0 when disjoint)."""
    overlap_min = np.maximum(box1[:2], box2[:2])
    overlap_max = np.minimum(box1[2:], box2[2:])
    # Clip negative extents to zero so non-overlapping boxes yield area 0
    extent = (overlap_max - overlap_min).clip(min=0)
    return extent[0] * extent[1]
coco_segmentation_to_shapely(segmentation)

Fix segment data in COCO format :param segmentation: segment data in COCO format :return:

Source code in sahi/postprocess/utils.py
def coco_segmentation_to_shapely(segmentation: list | list[list]):
    """Convert COCO segmentation data to a repaired shapely MultiPolygon.

    Accepts either a flat [x1, y1, x2, y2, ...] coordinate list (a single
    polygon) or a list of such lists (multiple polygons).
    """
    if not isinstance(segmentation, list):
        raise ValueError("segmentation must be List or List[List]")

    nested_flags = [isinstance(seg, list) for seg in segmentation]
    if not any(nested_flags):
        # Flat coordinate list: wrap into a single-polygon list
        segmentation = [segmentation]
    elif not all(nested_flags):
        # Mixture of lists and scalars is malformed
        raise ValueError("segmentation must be List or List[List]")

    polygon_list = []
    for coco_polygon in segmentation:
        # Pair up alternating x/y values into (x, y) points
        points = list(zip(coco_polygon[::2], coco_polygon[1::2]))
        polygon_list.append(repair_polygon(Polygon(points)))

    return repair_multipolygon(MultiPolygon(polygon_list))
object_prediction_list_to_numpy(object_prediction_list)

Returns:

Type Description
ndarray

np.ndarray of size N x [x1, y1, x2, y2, score, category_id]

Source code in sahi/postprocess/utils.py
def object_prediction_list_to_numpy(object_prediction_list: ObjectPredictionList) -> np.ndarray:
    """Convert an ObjectPredictionList to a numpy prediction matrix.

    Returns:
        np.ndarray of size N x [x1, y1, x2, y2, score, category_id]
    """
    num_predictions = len(object_prediction_list)
    numpy_predictions = np.zeros([num_predictions, 6], dtype=np.float32)
    for ind, object_prediction in enumerate(object_prediction_list):
        # Unwrap once per prediction instead of calling tolist() three times
        prediction = object_prediction.tolist()
        numpy_predictions[ind, :4] = np.array(prediction.bbox.to_xyxy(), dtype=np.float32)
        numpy_predictions[ind, 4] = prediction.score.value
        numpy_predictions[ind, 5] = prediction.category.id
    return numpy_predictions
object_prediction_list_to_torch(object_prediction_list)

Returns:

Type Description
tensor

torch.tensor of size N x [x1, y1, x2, y2, score, category_id]

Source code in sahi/postprocess/utils.py
def object_prediction_list_to_torch(object_prediction_list: ObjectPredictionList) -> torch.tensor:
    """Convert an ObjectPredictionList to a torch prediction matrix.

    Returns:
        torch.tensor of size N x [x1, y1, x2, y2, score, category_id]
    """
    num_predictions = len(object_prediction_list)
    torch_predictions = torch.zeros([num_predictions, 6], dtype=torch.float32)
    for ind, object_prediction in enumerate(object_prediction_list):
        # Unwrap once per prediction instead of calling tolist() three times
        prediction = object_prediction.tolist()
        torch_predictions[ind, :4] = torch.tensor(prediction.bbox.to_xyxy(), dtype=torch.float32)
        torch_predictions[ind, 4] = prediction.score.value
        torch_predictions[ind, 5] = prediction.category.id
    return torch_predictions
repair_multipolygon(shapely_multipolygon)

Fix invalid MultiPolygon objects :param shapely_multipolygon: Imported shapely MultiPolygon object :return:

Source code in sahi/postprocess/utils.py
def repair_multipolygon(shapely_multipolygon: MultiPolygon) -> MultiPolygon:
    """Repair an invalid MultiPolygon via buffer(0), normalizing the result back
    to a MultiPolygon; returns the input unchanged when already valid or when
    repair fails."""
    if shapely_multipolygon.is_valid:
        return shapely_multipolygon

    # buffer(0) is the standard shapely trick for fixing self-intersections
    repaired = shapely_multipolygon.buffer(0)
    if not repaired.is_valid:
        return shapely_multipolygon

    if isinstance(repaired, MultiPolygon):
        return repaired
    if isinstance(repaired, Polygon):
        # Wrap a single polygon so the return type stays MultiPolygon
        return MultiPolygon([repaired])
    if isinstance(repaired, GeometryCollection):
        # Keep only polygonal parts; fall back to the original if none remain
        polygons = [geom for geom in repaired.geoms if isinstance(geom, Polygon)]
        return MultiPolygon(polygons) if polygons else shapely_multipolygon

    return shapely_multipolygon
repair_polygon(shapely_polygon)

Fix polygons :param shapely_polygon: Shapely polygon object :return:

Source code in sahi/postprocess/utils.py
def repair_polygon(shapely_polygon: Polygon) -> Polygon:
    """Repair an invalid Polygon via buffer(0); when the repair splits the shape,
    keep the largest polygonal part. Returns the input unchanged when already
    valid or when repair fails."""
    if shapely_polygon.is_valid:
        return shapely_polygon

    # buffer(0) is the standard shapely trick for fixing self-intersections
    repaired = shapely_polygon.buffer(0)
    if not repaired.is_valid:
        return shapely_polygon

    if isinstance(repaired, Polygon):
        return repaired
    if isinstance(repaired, MultiPolygon):
        # Repair split the shape: keep the dominant (largest-area) part
        return max(repaired.geoms, key=lambda p: p.area)
    if isinstance(repaired, GeometryCollection):
        polygons = [geom for geom in repaired.geoms if isinstance(geom, Polygon)]
        return max(polygons, key=lambda p: p.area) if polygons else shapely_polygon

    return shapely_polygon

predict

Classes
Functions
bbox_sort(a, b, thresh)

a, b - function receives two bounding bboxes

thresh - the threshold takes into account how far two bounding bboxes differ in Y where thresh is the threshold we set for the minimum allowable difference in height between adjacent bboxes and sorts them by the X coordinate

Source code in sahi/predict.py
def bbox_sort(a, b, thresh):
    """Comparator ordering two bounding boxes in reading order.

    Boxes whose top edges (y values) differ by at most `thresh` are treated as
    being on the same row and ordered left-to-right by x; otherwise they are
    ordered top-to-bottom by y.
    """
    y_gap = a[1] - b[1]
    if abs(y_gap) <= thresh:
        # Same row: sort by horizontal position
        return a[0] - b[0]
    # Different rows: sort by vertical position
    return y_gap
get_prediction(image, detection_model, shift_amount=None, full_shape=None, postprocess=None, verbose=0, exclude_classes_by_name=None, exclude_classes_by_id=None)

Function for performing prediction for given image using given detection_model.

Parameters:

Name Type Description Default
image

str or np.ndarray Location of image or numpy image matrix to slice

required
detection_model

model.DetectionMode

required
shift_amount list | None

List To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

None
full_shape

List Size of the full image, should be in the form of [height, width]

None
postprocess PostprocessPredictions | None

sahi.postprocess.combine.PostprocessPredictions

None
verbose int

int 0: no print (default) 1: print prediction duration

0
exclude_classes_by_name list[str] | None

Optional[List[str]] None: if no classes are excluded List[str]: set of classes to exclude using its/their class label name/s

None
exclude_classes_by_id list[int] | None

Optional[List[int]] None: if no classes are excluded List[int]: set of classes to exclude using one or more IDs

None

Returns: A dict with fields: object_prediction_list: a list of ObjectPrediction durations_in_seconds: a dict containing elapsed times for profiling

Source code in sahi/predict.py
def get_prediction(
    image,
    detection_model,
    shift_amount: list | None = None,
    full_shape=None,
    postprocess: PostprocessPredictions | None = None,
    verbose: int = 0,
    exclude_classes_by_name: list[str] | None = None,
    exclude_classes_by_id: list[int] | None = None,
) -> PredictionResult:
    """Run the given detection model on a single image.

    Args:
        image: str or np.ndarray
            Location of image or numpy image matrix to slice
        detection_model: model.DetectionMode
        shift_amount: List
            To shift the box and mask predictions from sliced image to full
            sized image, should be in the form of [shift_x, shift_y]
        full_shape: List
            Size of the full image, should be in the form of [height, width]
        postprocess: sahi.postprocess.combine.PostprocessPredictions
        verbose: int
            0: no print (default)
            1: print prediction duration
        exclude_classes_by_name: Optional[List[str]]
            None: if no classes are excluded
            List[str]: set of classes to exclude using its/their class label name/s
        exclude_classes_by_id: Optional[List[int]]
            None: if no classes are excluded
            List[int]: set of classes to exclude using one or more IDs
    Returns:
        A dict with fields:
            object_prediction_list: a list of ObjectPrediction
            durations_in_seconds: a dict containing elapsed times for profiling
    """
    durations_in_seconds = {}

    image_as_pil = read_image_as_pil(image)

    # Default here rather than in the signature to avoid a mutable default arg
    if shift_amount is None:
        shift_amount = [0, 0]

    # --- inference ---
    start = time.perf_counter()
    detection_model.perform_inference(np.ascontiguousarray(image_as_pil))
    durations_in_seconds["prediction"] = time.perf_counter() - start

    if full_shape is None:
        full_shape = [image_as_pil.height, image_as_pil.width]

    # --- postprocess (single batch only) ---
    start = time.perf_counter()
    detection_model.convert_original_predictions(
        shift_amount=shift_amount,
        full_shape=full_shape,
    )
    object_prediction_list: list[ObjectPrediction] = detection_model.object_prediction_list
    object_prediction_list = filter_predictions(object_prediction_list, exclude_classes_by_name, exclude_classes_by_id)

    # Merge/suppress overlapping predictions when a postprocessor is provided
    if postprocess is not None:
        object_prediction_list = postprocess(object_prediction_list)

    durations_in_seconds["postprocess"] = time.perf_counter() - start

    if verbose == 1:
        print("Prediction performed in", durations_in_seconds["prediction"], "seconds.")

    return PredictionResult(
        image=image, object_prediction_list=object_prediction_list, durations_in_seconds=durations_in_seconds
    )
get_sliced_prediction(image, detection_model=None, slice_height=None, slice_width=None, overlap_height_ratio=0.2, overlap_width_ratio=0.2, perform_standard_pred=True, postprocess_type='GREEDYNMM', postprocess_match_metric='IOS', postprocess_match_threshold=0.5, postprocess_class_agnostic=False, verbose=1, merge_buffer_length=None, auto_slice_resolution=True, slice_export_prefix=None, slice_dir=None, exclude_classes_by_name=None, exclude_classes_by_id=None, progress_bar=False, progress_callback=None)

Function for slice image + get prediction for each slice + combine predictions in full image.

Parameters:

Name Type Description Default
image

str or np.ndarray Location of image or numpy image matrix to slice

required
detection_model

model.DetectionModel

None
slice_height int | None

int Height of each slice. Defaults to None.

None
slice_width int | None

int Width of each slice. Defaults to None.

None
overlap_height_ratio float

float Fractional overlap in height of each window (e.g. an overlap of 0.2 for a window of size 512 yields an overlap of 102 pixels). Default to 0.2.

0.2
overlap_width_ratio float

float Fractional overlap in width of each window (e.g. an overlap of 0.2 for a window of size 512 yields an overlap of 102 pixels). Default to 0.2.

0.2
perform_standard_pred bool

bool Perform a standard prediction on top of sliced predictions to increase large object detection accuracy. Default: True.

True
postprocess_type str

str Type of the postprocess to be used after sliced inference while merging/eliminating predictions. Options are 'NMM', 'GREEDYNMM' or 'NMS'. Default is 'GREEDYNMM'.

'GREEDYNMM'
postprocess_match_metric str

str Metric to be used during object prediction matching after sliced prediction. 'IOU' for intersection over union, 'IOS' for intersection over smaller area.

'IOS'
postprocess_match_threshold float

float Sliced predictions having higher iou than postprocess_match_threshold will be postprocessed after sliced prediction.

0.5
postprocess_class_agnostic bool

bool If True, postprocess will ignore category ids.

False
verbose int

int 0: no print 1: print number of slices (default) 2: print number of slices and slice/prediction durations

1
merge_buffer_length int | None

int The length of buffer for slices to be used during sliced prediction, which is suitable for low memory. It may affect the AP if it is specified. The higher the amount, the closer results to the non-buffered. scenario. See the discussion.

None
auto_slice_resolution bool

bool if slice parameters (slice_height, slice_width) are not given, it enables automatically calculate these params from image resolution and orientation.

True
slice_export_prefix str | None

str Prefix for the exported slices. Defaults to None.

None
slice_dir str | None

str Directory to save the slices. Defaults to None.

None
exclude_classes_by_name list[str] | None

Optional[List[str]] None: if no classes are excluded List[str]: set of classes to exclude using its/their class label name/s

None
exclude_classes_by_id list[int] | None

Optional[List[int]] None: if no classes are excluded List[int]: set of classes to exclude using one or more IDs

None
progress_bar bool

bool Whether to show progress bar for slice processing. Default: False.

False
progress_callback

callable A callback function that will be called after each slice is processed. The function should accept two arguments: (current_slice, total_slices)

None

Returns: A Dict with fields: object_prediction_list: a list of sahi.prediction.ObjectPrediction durations_in_seconds: a dict containing elapsed times for profiling

Source code in sahi/predict.py
def get_sliced_prediction(
    image,
    detection_model=None,
    slice_height: int | None = None,
    slice_width: int | None = None,
    overlap_height_ratio: float = 0.2,
    overlap_width_ratio: float = 0.2,
    perform_standard_pred: bool = True,
    postprocess_type: str = "GREEDYNMM",
    postprocess_match_metric: str = "IOS",
    postprocess_match_threshold: float = 0.5,
    postprocess_class_agnostic: bool = False,
    verbose: int = 1,
    merge_buffer_length: int | None = None,
    auto_slice_resolution: bool = True,
    slice_export_prefix: str | None = None,
    slice_dir: str | None = None,
    exclude_classes_by_name: list[str] | None = None,
    exclude_classes_by_id: list[int] | None = None,
    progress_bar: bool = False,
    progress_callback=None,
) -> PredictionResult:
    """Function for slice image + get prediction for each slice + combine predictions in full image.

    Args:
        image: str or np.ndarray
            Location of image or numpy image matrix to slice
        detection_model: model.DetectionModel
        slice_height: int
            Height of each slice.  Defaults to ``None``.
        slice_width: int
            Width of each slice.  Defaults to ``None``.
        overlap_height_ratio: float
            Fractional overlap in height of each window (e.g. an overlap of 0.2 for a window
            of size 512 yields an overlap of 102 pixels).
            Default to ``0.2``.
        overlap_width_ratio: float
            Fractional overlap in width of each window (e.g. an overlap of 0.2 for a window
            of size 512 yields an overlap of 102 pixels).
            Default to ``0.2``.
        perform_standard_pred: bool
            Perform a standard prediction on top of sliced predictions to increase large object
            detection accuracy. Default: True.
        postprocess_type: str
            Type of the postprocess to be used after sliced inference while merging/eliminating predictions.
            Options are 'NMM', 'GREEDYNMM' or 'NMS'. Default is 'GREEDYNMM'.
        postprocess_match_metric: str
            Metric to be used during object prediction matching after sliced prediction.
            'IOU' for intersection over union, 'IOS' for intersection over smaller area.
        postprocess_match_threshold: float
            Sliced predictions having higher iou than postprocess_match_threshold will be
            postprocessed after sliced prediction.
        postprocess_class_agnostic: bool
            If True, postprocess will ignore category ids.
        verbose: int
            0: no print
            1: print number of slices (default)
            2: print number of slices and slice/prediction durations
        merge_buffer_length: int
            The length of buffer for slices to be used during sliced prediction, which is suitable for low memory.
            It may affect the AP if it is specified. The higher the amount, the closer the results are to the
            non-buffered scenario. See [the discussion](https://github.com/obss/sahi/pull/445).
        auto_slice_resolution: bool
            if slice parameters (slice_height, slice_width) are not given,
            it enables automatically calculate these params from image resolution and orientation.
        slice_export_prefix: str
            Prefix for the exported slices. Defaults to None.
        slice_dir: str
            Directory to save the slices. Defaults to None.
        exclude_classes_by_name: Optional[List[str]]
            None: if no classes are excluded
            List[str]: set of classes to exclude using its/their class label name/s
        exclude_classes_by_id: Optional[List[int]]
            None: if no classes are excluded
            List[int]: set of classes to exclude using one or more IDs
        progress_bar: bool
            Whether to show progress bar for slice processing. Default: False.
        progress_callback: callable
            A callback function that will be called after each slice is processed.
            The function should accept two arguments: (current_slice, total_slices)

    Returns:
        A PredictionResult with fields:
            object_prediction_list: a list of sahi.prediction.ObjectPrediction
            durations_in_seconds: a dict containing elapsed times for profiling
    """
    # lazy import to avoid a hard dependency on ultralytics; done before the
    # slice timer starts so a first-call import is not billed to "slice"
    from sahi.models.ultralytics import UltralyticsDetectionModel

    # for profiling
    durations_in_seconds = dict()

    # currently only 1 batch supported
    num_batch = 1

    # create slices from full image
    time_start = time.perf_counter()
    slice_image_result = slice_image(
        image=image,
        output_file_name=slice_export_prefix,
        output_dir=slice_dir,
        slice_height=slice_height,
        slice_width=slice_width,
        overlap_height_ratio=overlap_height_ratio,
        overlap_width_ratio=overlap_width_ratio,
        auto_slice_resolution=auto_slice_resolution,
    )
    num_slices = len(slice_image_result)
    durations_in_seconds["slice"] = time.perf_counter() - time_start

    if isinstance(detection_model, UltralyticsDetectionModel) and detection_model.is_obb:
        # Only NMS is supported for OBB model outputs
        postprocess_type = "NMS"

    # init match postprocess instance
    if postprocess_type not in POSTPROCESS_NAME_TO_CLASS.keys():
        raise ValueError(
            f"postprocess_type should be one of {list(POSTPROCESS_NAME_TO_CLASS.keys())} "
            f"but given as {postprocess_type}"
        )
    postprocess_constructor = POSTPROCESS_NAME_TO_CLASS[postprocess_type]
    postprocess = postprocess_constructor(
        match_threshold=postprocess_match_threshold,
        match_metric=postprocess_match_metric,
        class_agnostic=postprocess_class_agnostic,
    )

    postprocess_time = 0
    time_start = time.perf_counter()
    # create prediction input
    num_group = num_slices // num_batch
    if verbose in (1, 2):
        tqdm.write(f"Performing prediction on {num_slices} slices.")

    if progress_bar:
        slice_iterator = tqdm(range(num_group), desc="Processing slices", total=num_group)
    else:
        slice_iterator = range(num_group)

    object_prediction_list = []
    # perform sliced prediction
    for group_ind in slice_iterator:
        # prepare batch (currently supports only 1 batch)
        image_list = []
        shift_amount_list = []
        for image_ind in range(num_batch):
            image_list.append(slice_image_result.images[group_ind * num_batch + image_ind])
            shift_amount_list.append(slice_image_result.starting_pixels[group_ind * num_batch + image_ind])
        # perform batch prediction
        prediction_result = get_prediction(
            image=image_list[0],
            detection_model=detection_model,
            shift_amount=shift_amount_list[0],
            full_shape=[
                slice_image_result.original_image_height,
                slice_image_result.original_image_width,
            ],
            exclude_classes_by_name=exclude_classes_by_name,
            exclude_classes_by_id=exclude_classes_by_id,
        )
        # convert sliced predictions to full predictions
        for object_prediction in prediction_result.object_prediction_list:
            if object_prediction:  # if not empty
                object_prediction_list.append(object_prediction.get_shifted_object_prediction())

        # merge matching predictions during sliced prediction
        if merge_buffer_length is not None and len(object_prediction_list) > merge_buffer_length:
            # use the same monotonic clock as the other profiling timers
            postprocess_time_start = time.perf_counter()
            object_prediction_list = postprocess(object_prediction_list)
            postprocess_time += time.perf_counter() - postprocess_time_start

        # Call progress callback if provided
        if progress_callback is not None:
            progress_callback(group_ind + 1, num_group)

    # perform standard prediction
    if num_slices > 1 and perform_standard_pred:
        prediction_result = get_prediction(
            image=image,
            detection_model=detection_model,
            shift_amount=[0, 0],
            full_shape=[
                slice_image_result.original_image_height,
                slice_image_result.original_image_width,
            ],
            postprocess=None,
            exclude_classes_by_name=exclude_classes_by_name,
            exclude_classes_by_id=exclude_classes_by_id,
        )
        object_prediction_list.extend(prediction_result.object_prediction_list)

    # merge matching predictions
    if len(object_prediction_list) > 1:
        postprocess_time_start = time.perf_counter()
        object_prediction_list = postprocess(object_prediction_list)
        postprocess_time += time.perf_counter() - postprocess_time_start

    time_end = time.perf_counter() - time_start
    durations_in_seconds["prediction"] = time_end - postprocess_time
    durations_in_seconds["postprocess"] = postprocess_time

    if verbose == 2:
        print(
            "Slicing performed in",
            durations_in_seconds["slice"],
            "seconds.",
        )
        print(
            "Prediction performed in",
            durations_in_seconds["prediction"],
            "seconds.",
        )
        print(
            "Postprocessing performed in",
            durations_in_seconds["postprocess"],
            "seconds.",
        )

    return PredictionResult(
        image=image, object_prediction_list=object_prediction_list, durations_in_seconds=durations_in_seconds
    )
predict(detection_model=None, model_type='ultralytics', model_path=None, model_config_path=None, model_confidence_threshold=0.25, model_device=None, model_category_mapping=None, model_category_remapping=None, source=None, no_standard_prediction=False, no_sliced_prediction=False, image_size=None, slice_height=512, slice_width=512, overlap_height_ratio=0.2, overlap_width_ratio=0.2, postprocess_type='GREEDYNMM', postprocess_match_metric='IOS', postprocess_match_threshold=0.5, postprocess_class_agnostic=False, novisual=False, view_video=False, frame_skip_interval=0, export_pickle=False, export_crop=False, dataset_json_path=None, project='runs/predict', name='exp', visual_bbox_thickness=None, visual_text_size=None, visual_text_thickness=None, visual_hide_labels=False, visual_hide_conf=False, visual_export_format='png', verbose=1, return_dict=False, force_postprocess_type=False, exclude_classes_by_name=None, exclude_classes_by_id=None, progress_bar=False, **kwargs)

Performs prediction for all present images in given folder.

Parameters:

Name Type Description Default
detection_model DetectionModel | None

sahi.model.DetectionModel Optionally provide custom DetectionModel to be used for inference. When provided, model_type, model_path, config_path, model_device, model_category_mapping, image_size params will be ignored

None
model_type str

str 'mmdet' for 'MmdetDetectionModel', 'yolov5' for 'Yolov5DetectionModel'.

'ultralytics'
model_path str | None

str Path for the model weight

None
model_config_path str | None

str Path for the detection model config file

None
model_confidence_threshold float

float All predictions with score < model_confidence_threshold will be discarded.

0.25
model_device str | None

str Torch device, "cpu" or "cuda"

None
model_category_mapping dict | None

dict Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}

None
model_category_remapping dict | None

dict: str to int Remap category ids after performing inference

None
source str | None

str Folder directory that contains images or path of the image to be predicted. Also video to be predicted.

None
no_standard_prediction bool

bool Dont perform standard prediction. Default: False.

False
no_sliced_prediction bool

bool Dont perform sliced prediction. Default: False.

False
image_size int | None

int Input image size for each inference (image is scaled by preserving asp. rat.).

None
slice_height int

int Height of each slice. Defaults to 512.

512
slice_width int

int Width of each slice. Defaults to 512.

512
overlap_height_ratio float

float Fractional overlap in height of each window (e.g. an overlap of 0.2 for a window of size 512 yields an overlap of 102 pixels). Default to 0.2.

0.2
overlap_width_ratio float

float Fractional overlap in width of each window (e.g. an overlap of 0.2 for a window of size 512 yields an overlap of 102 pixels). Default to 0.2.

0.2
postprocess_type str

str Type of the postprocess to be used after sliced inference while merging/eliminating predictions. Options are 'NMM', 'GREEDYNMM', 'LSNMS' or 'NMS'. Default is 'GREEDYNMM'.

'GREEDYNMM'
postprocess_match_metric str

str Metric to be used during object prediction matching after sliced prediction. 'IOU' for intersection over union, 'IOS' for intersection over smaller area.

'IOS'
postprocess_match_threshold float

float Sliced predictions having higher iou than postprocess_match_threshold will be postprocessed after sliced prediction.

0.5
postprocess_class_agnostic bool

bool If True, postprocess will ignore category ids.

False
novisual bool

bool Dont export predicted video/image visuals.

False
view_video bool

bool View result of prediction during video inference.

False
frame_skip_interval int

int If view_video or export_visual is slow, you can process one frame out of every N (for example: --frame_skip_interval=3).

0
export_pickle bool

bool Export predictions as .pickle

False
export_crop bool

bool Export predictions as cropped images.

False
dataset_json_path str | None

str If coco file path is provided, detection results will be exported in coco json format.

None
project str

str Save results to project/name.

'runs/predict'
name str

str Save results to project/name.

'exp'
visual_bbox_thickness int | None

int, optional Line thickness (in pixels) for bounding boxes in exported visualizations. If None, a default thickness is chosen based on image size.

None
visual_text_size float | None

float, optional Font scale/size for label text in exported visualizations. If None, a sensible default is used.

None
visual_text_thickness int | None

int, optional Thickness of text labels. If None, a sensible default is used.

None
visual_hide_labels bool

bool, optional If True, class label names won't be shown on the exported visuals.

False
visual_hide_conf bool

bool, optional If True, confidence scores won't be shown on the exported visuals.

False
visual_export_format str

str, optional Output image format to use when exporting visuals. Supported values are 'png' (default) and 'jpg'. Note that 'jpg' uses lossy compression and may produce smaller files. This parameter is ignored when novisual is True. Exported visuals are written under the run directory: project/name/visuals (and project/name/visuals_with_gt when ground-truth overlays are created).

'png'
verbose int

int 0: no print 1: print slice/prediction durations, number of slices 2: print model loading/file exporting durations

1
return_dict bool

bool If True, returns a dict with 'export_dir' field.

False
force_postprocess_type bool

bool If True, the automatic postprocess type check will be disabled

False
exclude_classes_by_name list[str] | None

Optional[List[str]] None: if no classes are excluded List[str]: set of classes to exclude using its/their class label name/s

None
exclude_classes_by_id list[int] | None

Optional[List[int]] None: if no classes are excluded List[int]: set of classes to exclude using one or more IDs

None
progress_bar bool

bool Whether to show a progress bar. Default is False.

False
Source code in sahi/predict.py
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
def predict(
    detection_model: DetectionModel | None = None,
    model_type: str = "ultralytics",
    model_path: str | None = None,
    model_config_path: str | None = None,
    model_confidence_threshold: float = 0.25,
    model_device: str | None = None,
    model_category_mapping: dict | None = None,
    model_category_remapping: dict | None = None,
    source: str | None = None,
    no_standard_prediction: bool = False,
    no_sliced_prediction: bool = False,
    image_size: int | None = None,
    slice_height: int = 512,
    slice_width: int = 512,
    overlap_height_ratio: float = 0.2,
    overlap_width_ratio: float = 0.2,
    postprocess_type: str = "GREEDYNMM",
    postprocess_match_metric: str = "IOS",
    postprocess_match_threshold: float = 0.5,
    postprocess_class_agnostic: bool = False,
    novisual: bool = False,
    view_video: bool = False,
    frame_skip_interval: int = 0,
    export_pickle: bool = False,
    export_crop: bool = False,
    dataset_json_path: str | None = None,
    project: str = "runs/predict",
    name: str = "exp",
    visual_bbox_thickness: int | None = None,
    visual_text_size: float | None = None,
    visual_text_thickness: int | None = None,
    visual_hide_labels: bool = False,
    visual_hide_conf: bool = False,
    visual_export_format: str = "png",
    verbose: int = 1,
    return_dict: bool = False,
    force_postprocess_type: bool = False,
    exclude_classes_by_name: list[str] | None = None,
    exclude_classes_by_id: list[int] | None = None,
    progress_bar: bool = False,
    **kwargs,
):
    """Performs prediction for all present images in given folder.

    Args:
        detection_model: sahi.model.DetectionModel
            Optionally provide custom DetectionModel to be used for inference. When provided,
            model_type, model_path, config_path, model_device, model_category_mapping, image_size
            params will be ignored
        model_type: str
            mmdet for 'MmdetDetectionModel', 'yolov5' for 'Yolov5DetectionModel'.
        model_path: str
            Path for the model weight
        model_config_path: str
            Path for the detection model config file
        model_confidence_threshold: float
            All predictions with score < model_confidence_threshold will be discarded.
        model_device: str
            Torch device, "cpu" or "cuda"
        model_category_mapping: dict
            Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}
        model_category_remapping: dict: str to int
            Remap category ids after performing inference
        source: str
            Folder directory that contains images or path of the image to be predicted. Also video to be predicted.
        no_standard_prediction: bool
            Dont perform standard prediction. Default: False.
        no_sliced_prediction: bool
            Dont perform sliced prediction. Default: False.
        image_size: int
            Input image size for each inference (image is scaled by preserving asp. rat.).
        slice_height: int
            Height of each slice.  Defaults to ``512``.
        slice_width: int
            Width of each slice.  Defaults to ``512``.
        overlap_height_ratio: float
            Fractional overlap in height of each window (e.g. an overlap of 0.2 for a window
            of size 512 yields an overlap of 102 pixels).
            Default to ``0.2``.
        overlap_width_ratio: float
            Fractional overlap in width of each window (e.g. an overlap of 0.2 for a window
            of size 512 yields an overlap of 102 pixels).
            Default to ``0.2``.
        postprocess_type: str
            Type of the postprocess to be used after sliced inference while merging/eliminating predictions.
            Options are 'NMM', 'GREEDYNMM', 'LSNMS' or 'NMS'. Default is 'GREEDYNMM'.
        postprocess_match_metric: str
            Metric to be used during object prediction matching after sliced prediction.
            'IOU' for intersection over union, 'IOS' for intersection over smaller area.
        postprocess_match_threshold: float
            Sliced predictions having higher iou than postprocess_match_threshold will be
            postprocessed after sliced prediction.
        postprocess_class_agnostic: bool
            If True, postprocess will ignore category ids.
        novisual: bool
            Dont export predicted video/image visuals.
        view_video: bool
            View result of prediction during video inference.
        frame_skip_interval: int
            If view_video or export_visual is slow, you can process one frames of 3(for exp: --frame_skip_interval=3).
        export_pickle: bool
            Export predictions as .pickle
        export_crop: bool
            Export predictions as cropped images.
        dataset_json_path: str
            If coco file path is provided, detection results will be exported in coco json format.
        project: str
            Save results to project/name.
        name: str
            Save results to project/name.
        visual_bbox_thickness: int, optional
            Line thickness (in pixels) for bounding boxes in exported visualizations.
            If None, a default thickness is chosen based on image size.
        visual_text_size: float, optional
            Font scale/size for label text in exported visualizations. If None, a
            sensible default is used.
        visual_text_thickness: int, optional
            Thickness of text labels. If None, a sensible default is used.
        visual_hide_labels: bool, optional
            If True, class label names won't be shown on the exported visuals.
        visual_hide_conf: bool, optional
            If True, confidence scores won't be shown on the exported visuals.
        visual_export_format: str, optional
            Output image format to use when exporting visuals. Supported values are
            'png' (default) and 'jpg'. Note that 'jpg' uses lossy compression and may
            produce smaller files. This parameter is ignored when `novisual` is True.
            Exported visuals are written under the run directory: `project/name/visuals`
            (and `project/name/visuals_with_gt` when ground-truth overlays are created).
        verbose: int
            0: no print
            1: print slice/prediction durations, number of slices
            2: print model loading/file exporting durations
        return_dict: bool
            If True, returns a dict with 'export_dir' field.
        force_postprocess_type: bool
            If True, auto postprocess check will e disabled
        exclude_classes_by_name: Optional[List[str]]
            None: if no classes are excluded
            List[str]: set of classes to exclude using its/their class label name/s
        exclude_classes_by_id: Optional[List[int]]
            None: if no classes are excluded
            List[int]: set of classes to exclude using one or more IDs
        progress_bar: bool
            Whether to show a progress bar. Default is False.
    """
    # assert prediction type
    if no_standard_prediction and no_sliced_prediction:
        raise ValueError("'no_standard_prediction' and 'no_sliced_prediction' cannot be True at the same time.")

    # auto postprocess type
    if not force_postprocess_type and model_confidence_threshold < LOW_MODEL_CONFIDENCE and postprocess_type != "NMS":
        logger.warning(
            f"Switching postprocess type/metric to NMS/IOU since confidence "
            f"threshold is low ({model_confidence_threshold})."
        )
        postprocess_type = "NMS"
        postprocess_match_metric = "IOU"

    # for profiling
    durations_in_seconds = dict()

    # Init export directories
    save_dir = Path(increment_path(Path(project) / name, exist_ok=False))  # increment run
    crop_dir = save_dir / "crops"
    visual_dir = save_dir / "visuals"
    visual_with_gt_dir = save_dir / "visuals_with_gt"
    pickle_dir = save_dir / "pickles"
    if not novisual or export_pickle or export_crop or dataset_json_path is not None:
        save_dir.mkdir(parents=True, exist_ok=True)  # make dir

    # Init image iterator
    # TODO: rewrite this as iterator class as in https://github.com/ultralytics/yolov5/blob/d059d1da03aee9a3c0059895aa4c7c14b7f25a9e/utils/datasets.py#L178
    source_is_video = False
    num_frames = None
    image_iterator: list[str] | Generator[Image.Image]
    if dataset_json_path and source:
        coco: Coco = Coco.from_coco_dict_or_path(dataset_json_path)
        image_iterator = [str(Path(source) / Path(coco_image.file_name)) for coco_image in coco.images]
        coco_json = []
    elif source and os.path.isdir(source):
        image_iterator = list_files(directory=source, contains=IMAGE_EXTENSIONS, verbose=verbose)
    elif source and Path(source).suffix in VIDEO_EXTENSIONS:
        source_is_video = True
        read_video_frame, output_video_writer, video_file_name, num_frames = get_video_reader(
            source, str(save_dir), frame_skip_interval, not novisual, view_video
        )
        image_iterator = read_video_frame
    elif source:
        image_iterator = [source]
    else:
        logger.error("No valid input given to predict function")
        return

    # init model instance
    time_start = time.time()
    if detection_model is None:
        detection_model = AutoDetectionModel.from_pretrained(
            model_type=model_type,
            model_path=model_path,
            config_path=model_config_path,
            confidence_threshold=model_confidence_threshold,
            device=model_device,
            category_mapping=model_category_mapping,
            category_remapping=model_category_remapping,
            load_at_init=False,
            image_size=image_size,
            **kwargs,
        )
        detection_model.load_model()
    time_end = time.time() - time_start
    durations_in_seconds["model_load"] = time_end

    # iterate over source images
    durations_in_seconds["prediction"] = 0
    durations_in_seconds["slice"] = 0

    input_type_str = "video frames" if source_is_video else "images"
    for ind, image_path in enumerate(
        tqdm(image_iterator, f"Performing inference on {input_type_str}", total=num_frames)
    ):
        # Source is an image: Iterating over Image objects
        if source and source_is_video:
            video_name = Path(source).stem
            relative_filepath = video_name + "_frame_" + str(ind)
        elif isinstance(image_path, Image.Image):
            raise RuntimeError("Source is not a video, but image is still an Image object ")
        # preserve source folder structure in export
        elif source and os.path.isdir(source):
            relative_filepath = str(Path(image_path)).split(str(Path(source)))[-1]
            relative_filepath = relative_filepath[1:] if relative_filepath[0] == os.sep else relative_filepath
        else:  # no process if source is single file
            relative_filepath = Path(image_path).name

        filename_without_extension = Path(relative_filepath).stem

        # load image
        image_as_pil = read_image_as_pil(image_path)

        # perform prediction
        if not no_sliced_prediction:
            # get sliced prediction
            prediction_result = get_sliced_prediction(
                image=image_as_pil,
                detection_model=detection_model,
                slice_height=slice_height,
                slice_width=slice_width,
                overlap_height_ratio=overlap_height_ratio,
                overlap_width_ratio=overlap_width_ratio,
                perform_standard_pred=not no_standard_prediction,
                postprocess_type=postprocess_type,
                postprocess_match_metric=postprocess_match_metric,
                postprocess_match_threshold=postprocess_match_threshold,
                postprocess_class_agnostic=postprocess_class_agnostic,
                verbose=1 if verbose else 0,
                exclude_classes_by_name=exclude_classes_by_name,
                exclude_classes_by_id=exclude_classes_by_id,
                progress_bar=progress_bar,
            )
            object_prediction_list = prediction_result.object_prediction_list
            if prediction_result.durations_in_seconds:
                durations_in_seconds["slice"] += prediction_result.durations_in_seconds["slice"]
        else:
            # get standard prediction
            prediction_result = get_prediction(
                image=image_as_pil,
                detection_model=detection_model,
                shift_amount=[0, 0],
                full_shape=None,
                postprocess=None,
                verbose=0,
                exclude_classes_by_name=exclude_classes_by_name,
                exclude_classes_by_id=exclude_classes_by_id,
            )
            object_prediction_list = prediction_result.object_prediction_list

        durations_in_seconds["prediction"] += prediction_result.durations_in_seconds["prediction"]
        # Show prediction time
        if verbose:
            tqdm.write(
                "Prediction time is: {:.2f} ms".format(prediction_result.durations_in_seconds["prediction"] * 1000)
            )

        if dataset_json_path:
            if source_is_video is True:
                raise NotImplementedError("Video input type not supported with coco formatted dataset json")

            # append predictions in coco format
            for object_prediction in object_prediction_list:
                coco_prediction = object_prediction.to_coco_prediction()
                coco_prediction.image_id = coco.images[ind].id
                coco_prediction_json = coco_prediction.json
                if coco_prediction_json["bbox"]:
                    coco_json.append(coco_prediction_json)
            if not novisual:
                # convert ground truth annotations to object_prediction_list
                coco_image: CocoImage = coco.images[ind]
                object_prediction_gt_list: list[ObjectPrediction] = []
                for coco_annotation in coco_image.annotations:
                    coco_annotation_dict = coco_annotation.json
                    category_name = coco_annotation.category_name
                    full_shape = [coco_image.height, coco_image.width]
                    object_prediction_gt = ObjectPrediction.from_coco_annotation_dict(
                        annotation_dict=coco_annotation_dict, category_name=category_name, full_shape=full_shape
                    )
                    object_prediction_gt_list.append(object_prediction_gt)
                # export visualizations with ground truths
                output_dir = str(visual_with_gt_dir / Path(relative_filepath).parent)
                color = (0, 255, 0)  # original annotations in green
                result = visualize_object_predictions(
                    np.ascontiguousarray(image_as_pil),
                    object_prediction_list=object_prediction_gt_list,
                    rect_th=visual_bbox_thickness,
                    text_size=visual_text_size,
                    text_th=visual_text_thickness,
                    color=color,
                    hide_labels=visual_hide_labels,
                    hide_conf=visual_hide_conf,
                    output_dir=None,
                    file_name=None,
                    export_format=None,
                )
                color = (255, 0, 0)  # model predictions in red
                _ = visualize_object_predictions(
                    result["image"],
                    object_prediction_list=object_prediction_list,
                    rect_th=visual_bbox_thickness,
                    text_size=visual_text_size,
                    text_th=visual_text_thickness,
                    color=color,
                    hide_labels=visual_hide_labels,
                    hide_conf=visual_hide_conf,
                    output_dir=output_dir,
                    file_name=filename_without_extension,
                    export_format=visual_export_format,
                )

        time_start = time.time()
        # export prediction boxes
        if export_crop:
            output_dir = str(crop_dir / Path(relative_filepath).parent)
            crop_object_predictions(
                image=np.ascontiguousarray(image_as_pil),
                object_prediction_list=object_prediction_list,
                output_dir=output_dir,
                file_name=filename_without_extension,
                export_format=visual_export_format,
            )
        # export prediction list as pickle
        if export_pickle:
            save_path = str(pickle_dir / Path(relative_filepath).parent / (filename_without_extension + ".pickle"))
            save_pickle(data=object_prediction_list, save_path=save_path)

        # export visualization
        if not novisual or view_video:
            output_dir = str(visual_dir / Path(relative_filepath).parent)
            result = visualize_object_predictions(
                np.ascontiguousarray(image_as_pil),
                object_prediction_list=object_prediction_list,
                rect_th=visual_bbox_thickness,
                text_size=visual_text_size,
                text_th=visual_text_thickness,
                hide_labels=visual_hide_labels,
                hide_conf=visual_hide_conf,
                output_dir=output_dir if not source_is_video else None,
                file_name=filename_without_extension,
                export_format=visual_export_format,
            )
            if not novisual and source_is_video:  # export video
                if output_video_writer is None:
                    raise RuntimeError("Output video writer could not be created")
                output_video_writer.write(cv2.cvtColor(result["image"], cv2.COLOR_RGB2BGR))

        # render video inference
        if view_video:
            cv2.imshow(f"Prediction of {video_file_name!s}", result["image"])
            cv2.waitKey(1)

        time_end = time.time() - time_start
        durations_in_seconds["export_files"] = time_end

    # export coco results
    if dataset_json_path:
        save_path = str(save_dir / "result.json")
        save_json(coco_json, save_path)

    if not novisual or export_pickle or export_crop or dataset_json_path is not None:
        print(f"Prediction results are successfully exported to {save_dir}")

    # print prediction duration
    if verbose == 2:
        print(
            "Model loaded in",
            durations_in_seconds["model_load"],
            "seconds.",
        )
        print(
            "Slicing performed in",
            durations_in_seconds["slice"],
            "seconds.",
        )
        print(
            "Prediction performed in",
            durations_in_seconds["prediction"],
            "seconds.",
        )
        if not novisual:
            print(
                "Exporting performed in",
                durations_in_seconds["export_files"],
                "seconds.",
            )

    if return_dict:
        return {"export_dir": save_dir}
predict_fiftyone(model_type='mmdet', model_path=None, model_config_path=None, model_confidence_threshold=0.25, model_device=None, model_category_mapping=None, model_category_remapping=None, dataset_json_path='', image_dir='', no_standard_prediction=False, no_sliced_prediction=False, image_size=None, slice_height=256, slice_width=256, overlap_height_ratio=0.2, overlap_width_ratio=0.2, postprocess_type='GREEDYNMM', postprocess_match_metric='IOS', postprocess_match_threshold=0.5, postprocess_class_agnostic=False, verbose=1, exclude_classes_by_name=None, exclude_classes_by_id=None, progress_bar=False)

Performs prediction for all present images in given folder.

Parameters:

Name Type Description Default
model_type str

str mmdet for 'MmdetDetectionModel', 'yolov5' for 'Yolov5DetectionModel'.

'mmdet'
model_path str | None

str Path for the model weight

None
model_config_path str | None

str Path for the detection model config file

None
model_confidence_threshold float

float All predictions with score < model_confidence_threshold will be discarded.

0.25
model_device str | None

str Torch device, "cpu" or "cuda"

None
model_category_mapping dict | None

dict Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}

None
model_category_remapping dict | None

dict: str to int Remap category ids after performing inference

None
dataset_json_path str

str If coco file path is provided, detection results will be exported in coco json format.

''
image_dir str

str Folder directory that contains images or path of the image to be predicted.

''
no_standard_prediction bool

bool Don't perform standard prediction. Default: False.

False
no_sliced_prediction bool

bool Don't perform sliced prediction. Default: False.

False
image_size int | None

int Input image size for each inference (image is scaled by preserving asp. rat.).

None
slice_height int

int Height of each slice. Defaults to 256.

256
slice_width int

int Width of each slice. Defaults to 256.

256
overlap_height_ratio float

float Fractional overlap in height of each window (e.g. an overlap of 0.2 for a window of size 256 yields an overlap of 51 pixels). Default to 0.2.

0.2
overlap_width_ratio float

float Fractional overlap in width of each window (e.g. an overlap of 0.2 for a window of size 256 yields an overlap of 51 pixels). Default to 0.2.

0.2
postprocess_type str

str Type of the postprocess to be used after sliced inference while merging/eliminating predictions. Options are 'NMM', 'GREEDYNMM' or 'NMS'. Default is 'GREEDYNMM'.

'GREEDYNMM'
postprocess_match_metric str

str Metric to be used during object prediction matching after sliced prediction. 'IOU' for intersection over union, 'IOS' for intersection over smaller area.

'IOS'
postprocess_match_threshold float

float Sliced predictions having higher iou than postprocess_match_threshold will be postprocessed after sliced prediction.

0.5
postprocess_class_agnostic bool

bool If True, postprocess will ignore category ids.

False
verbose int

int 0: no print 1: print slice/prediction durations, number of slices, model loading/file exporting durations

1
exclude_classes_by_name list[str] | None

Optional[List[str]] None: if no classes are excluded List[str]: set of classes to exclude using its/their class label name/s

None
exclude_classes_by_id list[int] | None

Optional[List[int]] None: if no classes are excluded List[int]: set of classes to exclude using one or more IDs

None
progress_bar bool

bool Whether to show progress bar for slice processing. Default: False.

False
Source code in sahi/predict.py
def predict_fiftyone(
    model_type: str = "mmdet",
    model_path: str | None = None,
    model_config_path: str | None = None,
    model_confidence_threshold: float = 0.25,
    model_device: str | None = None,
    model_category_mapping: dict | None = None,
    model_category_remapping: dict | None = None,
    dataset_json_path: str = "",
    image_dir: str = "",
    no_standard_prediction: bool = False,
    no_sliced_prediction: bool = False,
    image_size: int | None = None,
    slice_height: int = 256,
    slice_width: int = 256,
    overlap_height_ratio: float = 0.2,
    overlap_width_ratio: float = 0.2,
    postprocess_type: str = "GREEDYNMM",
    postprocess_match_metric: str = "IOS",
    postprocess_match_threshold: float = 0.5,
    postprocess_class_agnostic: bool = False,
    verbose: int = 1,
    exclude_classes_by_name: list[str] | None = None,
    exclude_classes_by_id: list[int] | None = None,
    progress_bar: bool = False,
):
    """Performs prediction for all present images in given folder.

    Runs (sliced or standard) inference over every sample of a FiftyOne dataset
    built from a COCO file, attaches the predictions to each sample, prints an
    evaluation report, then launches the FiftyOne app and blocks forever to keep
    the session alive — this function never returns. Intended as a standalone
    tool entry point, not for use from library code.

    Args:
        model_type: str
            mmdet for 'MmdetDetectionModel', 'yolov5' for 'Yolov5DetectionModel'.
        model_path: str
            Path for the model weight
        model_config_path: str
            Path for the detection model config file
        model_confidence_threshold: float
            All predictions with score < model_confidence_threshold will be discarded.
        model_device: str
            Torch device, "cpu" or "cuda"
        model_category_mapping: dict
            Mapping from category id (str) to category name (str) e.g. {"1": "pedestrian"}
        model_category_remapping: dict: str to int
            Remap category ids after performing inference
        dataset_json_path: str
            If coco file path is provided, detection results will be exported in coco json format.
        image_dir: str
            Folder directory that contains images or path of the image to be predicted.
        no_standard_prediction: bool
            Don't perform standard prediction. Default: False.
        no_sliced_prediction: bool
            Don't perform sliced prediction. Default: False.
        image_size: int
            Input image size for each inference (image is scaled by preserving asp. rat.).
        slice_height: int
            Height of each slice.  Defaults to ``256``.
        slice_width: int
            Width of each slice.  Defaults to ``256``.
        overlap_height_ratio: float
            Fractional overlap in height of each window (e.g. an overlap of 0.2 for a window
            of size 256 yields an overlap of 51 pixels).
            Default to ``0.2``.
        overlap_width_ratio: float
            Fractional overlap in width of each window (e.g. an overlap of 0.2 for a window
            of size 256 yields an overlap of 51 pixels).
            Default to ``0.2``.
        postprocess_type: str
            Type of the postprocess to be used after sliced inference while merging/eliminating predictions.
            Options are 'NMM', 'GREEDYNMM' or 'NMS'. Default is 'GREEDYNMM'.
        postprocess_match_metric: str
            Metric to be used during object prediction matching after sliced prediction.
            'IOU' for intersection over union, 'IOS' for intersection over smaller area.
        postprocess_match_threshold: float
            Sliced predictions having higher iou than postprocess_match_threshold will be
            postprocessed after sliced prediction.
        postprocess_class_agnostic: bool
            If True, postprocess will ignore category ids.
        verbose: int
            0: no print
            1: print slice/prediction durations, number of slices, model loading/file exporting durations
        exclude_classes_by_name: Optional[List[str]]
            None: if no classes are excluded
            List[str]: set of classes to exclude using its/their class label name/s
        exclude_classes_by_id: Optional[List[int]]
            None: if no classes are excluded
            List[int]: set of classes to exclude using one or more IDs
        progress_bar: bool
            Whether to show progress bar for slice processing. Default: False.

    Raises:
        ValueError: If both ``no_standard_prediction`` and ``no_sliced_prediction`` are True.
    """
    check_requirements(["fiftyone"])

    # fiftyone is an optional dependency, so it is imported lazily after the
    # requirement check above
    from sahi.utils.fiftyone import create_fiftyone_dataset_from_coco_file, fo

    # assert prediction type
    # NOTE(review): the message says 'no_standard_pred' but the parameter is
    # named 'no_standard_prediction' — looks like a stale name; confirm before
    # changing the user-facing string.
    if no_standard_prediction and no_sliced_prediction:
        raise ValueError("'no_standard_pred' and 'no_sliced_prediction' cannot be True at the same time.")
    # for profiling
    durations_in_seconds = dict()

    dataset = create_fiftyone_dataset_from_coco_file(image_dir, dataset_json_path)

    # init model instance
    time_start = time.time()
    detection_model = AutoDetectionModel.from_pretrained(
        model_type=model_type,
        model_path=model_path,
        config_path=model_config_path,
        confidence_threshold=model_confidence_threshold,
        device=model_device,
        category_mapping=model_category_mapping,
        category_remapping=model_category_remapping,
        load_at_init=False,
        image_size=image_size,
    )
    detection_model.load_model()
    time_end = time.time() - time_start
    durations_in_seconds["model_load"] = time_end

    # iterate over source images
    durations_in_seconds["prediction"] = 0
    durations_in_seconds["slice"] = 0
    # Add predictions to samples
    with fo.ProgressBar() as pb:
        for sample in pb(dataset):
            # perform prediction
            if not no_sliced_prediction:
                # get sliced prediction
                prediction_result = get_sliced_prediction(
                    image=sample.filepath,
                    detection_model=detection_model,
                    slice_height=slice_height,
                    slice_width=slice_width,
                    overlap_height_ratio=overlap_height_ratio,
                    overlap_width_ratio=overlap_width_ratio,
                    perform_standard_pred=not no_standard_prediction,
                    postprocess_type=postprocess_type,
                    postprocess_match_threshold=postprocess_match_threshold,
                    postprocess_match_metric=postprocess_match_metric,
                    postprocess_class_agnostic=postprocess_class_agnostic,
                    verbose=verbose,
                    exclude_classes_by_name=exclude_classes_by_name,
                    exclude_classes_by_id=exclude_classes_by_id,
                    progress_bar=progress_bar,
                )
                durations_in_seconds["slice"] += prediction_result.durations_in_seconds["slice"]
            else:
                # get standard prediction
                prediction_result = get_prediction(
                    image=sample.filepath,
                    detection_model=detection_model,
                    shift_amount=[0, 0],
                    full_shape=None,
                    postprocess=None,
                    verbose=0,
                    exclude_classes_by_name=exclude_classes_by_name,
                    exclude_classes_by_id=exclude_classes_by_id,
                )
                durations_in_seconds["prediction"] += prediction_result.durations_in_seconds["prediction"]

            # Save predictions to dataset, under a field named after the model type
            sample[model_type] = fo.Detections(detections=prediction_result.to_fiftyone_detections())
            sample.save()

    # print prediction duration
    if verbose == 1:
        print(
            "Model loaded in",
            durations_in_seconds["model_load"],
            "seconds.",
        )
        print(
            "Slicing performed in",
            durations_in_seconds["slice"],
            "seconds.",
        )
        print(
            "Prediction performed in",
            durations_in_seconds["prediction"],
            "seconds.",
        )

    # visualize results
    session = fo.launch_app()  # pyright: ignore[reportArgumentType]
    session.dataset = dataset
    # Evaluate the predictions (reuses postprocess_match_threshold as the eval IoU)
    results = dataset.evaluate_detections(
        model_type,
        gt_field="ground_truth",
        eval_key="eval",
        iou=postprocess_match_threshold,
        compute_mAP=True,
    )
    # Get the 10 most common classes in the dataset
    counts = dataset.count_values("ground_truth.detections.label")
    classes_top10 = sorted(counts, key=counts.get, reverse=True)[:10]
    # Print a classification report for the top-10 classes
    results.print_report(classes=classes_top10)
    # Load the view on which we ran the `eval` evaluation
    eval_view = dataset.load_evaluation_view("eval")
    # Show samples with most false positives
    session.view = eval_view.sort_by("eval_fp", reverse=True)
    # Keep the process (and thus the FiftyOne app session) alive indefinitely;
    # this loop intentionally never exits.
    while 1:
        time.sleep(3)

prediction

Classes
ObjectPrediction

Bases: ObjectAnnotation

Class for handling detection model predictions.

Source code in sahi/prediction.py
class ObjectPrediction(ObjectAnnotation):
    """Class for handling detection model predictions."""

    def __init__(
        self,
        bbox: list[int] | None = None,
        category_id: int | None = None,
        category_name: str | None = None,
        segmentation: list[list[float]] | None = None,
        score: float = 0.0,
        shift_amount: list[int] | None = None,
        full_shape: list[int] | None = None,
    ):
        """Creates ObjectPrediction from bbox, score, category_id, category_name, segmentation.

        Args:
            bbox: list
                [minx, miny, maxx, maxy]
            score: float
                Prediction score between 0 and 1
            category_id: int
                ID of the object category
            category_name: str
                Name of the object category
            segmentation: List[List]
                [
                    [x1, y1, x2, y2, x3, y3, ...],
                    [x1, y1, x2, y2, x3, y3, ...],
                    ...
                ]
            shift_amount: list
                To shift the box and mask predictions from sliced image
                to full sized image, should be in the form of [shift_x, shift_y].
                Defaults to [0, 0] when omitted.
            full_shape: list
                Size of the full image after shifting, should be in
                the form of [height, width]
        """
        # A literal `[0, 0]` default would be a shared mutable default argument
        # (one list object reused across every call); build a fresh list instead.
        if shift_amount is None:
            shift_amount = [0, 0]
        self.score = PredictionScore(score)
        super().__init__(
            bbox=bbox,
            category_id=category_id,
            segmentation=segmentation,
            category_name=category_name,
            shift_amount=shift_amount,
            full_shape=full_shape,
        )

    def get_shifted_object_prediction(self):
        """Returns shifted version ObjectPrediction.

        Shifts bbox and mask coords. Used for mapping sliced predictions over full image.
        """
        # When a mask is present, its segmentation/full_shape are carried over
        # from the shifted mask; otherwise both stay None.
        shifted_mask = self.mask.get_shifted_mask() if self.mask else None
        return ObjectPrediction(
            bbox=self.bbox.get_shifted_box().to_xyxy(),
            category_id=self.category.id,
            score=self.score.value,
            segmentation=shifted_mask.segmentation if shifted_mask else None,
            category_name=self.category.name,
            shift_amount=[0, 0],
            full_shape=shifted_mask.full_shape if shifted_mask else None,
        )

    def to_coco_prediction(self, image_id=None):
        """Returns sahi.utils.coco.CocoPrediction representation of ObjectAnnotation."""
        # Prefer the segmentation-based representation when a mask exists,
        # otherwise fall back to the xywh bbox representation.
        if self.mask:
            return CocoPrediction.from_coco_segmentation(
                segmentation=self.mask.segmentation,
                category_id=self.category.id,
                category_name=self.category.name,
                score=self.score.value,
                image_id=image_id,
            )
        return CocoPrediction.from_coco_bbox(
            bbox=self.bbox.to_xywh(),
            category_id=self.category.id,
            category_name=self.category.name,
            score=self.score.value,
            image_id=image_id,
        )

    def to_fiftyone_detection(self, image_height: int, image_width: int):
        """Returns fiftyone.Detection representation of ObjectPrediction.

        Raises:
            ImportError: If fiftyone is not installed.
        """
        try:
            import fiftyone as fo
        except ImportError:
            raise ImportError('Please run "pip install -U fiftyone" to install fiftyone first for fiftyone conversion.')

        # fiftyone expects [x, y, w, h] normalized to the image dimensions
        x1, y1, x2, y2 = self.bbox.to_xyxy()
        rel_box = [x1 / image_width, y1 / image_height, (x2 - x1) / image_width, (y2 - y1) / image_height]
        fiftyone_detection = fo.Detection(label=self.category.name, bounding_box=rel_box, confidence=self.score.value)
        return fiftyone_detection

    def __repr__(self):
        return f"""ObjectPrediction<
    bbox: {self.bbox},
    mask: {self.mask},
    score: {self.score},
    category: {self.category}>"""
Functions
__init__(bbox=None, category_id=None, category_name=None, segmentation=None, score=0.0, shift_amount=[0, 0], full_shape=None)

Creates ObjectPrediction from bbox, score, category_id, category_name, segmentation.

Parameters:

Name Type Description Default
bbox list[int] | None

list [minx, miny, maxx, maxy]

None
score float

float Prediction score between 0 and 1

0.0
category_id int | None

int ID of the object category

None
category_name str | None

str Name of the object category

None
segmentation list[list[float]] | None

List[List] [ [x1, y1, x2, y2, x3, y3, ...], [x1, y1, x2, y2, x3, y3, ...], ... ]

None
shift_amount list[int] | None

list To shift the box and mask predictions from sliced image to full sized image, should be in the form of [shift_x, shift_y]

[0, 0]
full_shape list[int] | None

list Size of the full image after shifting, should be in the form of [height, width]

None
Source code in sahi/prediction.py
def __init__(
    self,
    bbox: list[int] | None = None,
    category_id: int | None = None,
    category_name: str | None = None,
    segmentation: list[list[float]] | None = None,
    score: float = 0.0,
    shift_amount: list[int] | None = None,
    full_shape: list[int] | None = None,
):
    """Creates ObjectPrediction from bbox, score, category_id, category_name, segmentation.

    Args:
        bbox: list
            [minx, miny, maxx, maxy]
        score: float
            Prediction score between 0 and 1
        category_id: int
            ID of the object category
        category_name: str
            Name of the object category
        segmentation: List[List]
            [
                [x1, y1, x2, y2, x3, y3, ...],
                [x1, y1, x2, y2, x3, y3, ...],
                ...
            ]
        shift_amount: list
            To shift the box and mask predictions from sliced image
            to full sized image, should be in the form of [shift_x, shift_y].
            Defaults to [0, 0] when omitted.
        full_shape: list
            Size of the full image after shifting, should be in
            the form of [height, width]
    """
    # A literal `[0, 0]` default would be a shared mutable default argument
    # (one list object reused across every call); build a fresh list instead.
    if shift_amount is None:
        shift_amount = [0, 0]
    self.score = PredictionScore(score)
    super().__init__(
        bbox=bbox,
        category_id=category_id,
        segmentation=segmentation,
        category_name=category_name,
        shift_amount=shift_amount,
        full_shape=full_shape,
    )
get_shifted_object_prediction()

Returns shifted version ObjectPrediction.

Shifts bbox and mask coords. Used for mapping sliced predictions over full image.

Source code in sahi/prediction.py
def get_shifted_object_prediction(self):
    """Return a copy of this prediction shifted into full-image coordinates.

    Shifts bbox and mask coords; used for mapping sliced predictions over the
    full image. The returned prediction carries shift_amount=[0, 0].
    """
    # The mask (if any) is shifted first; its segmentation and full_shape are
    # propagated, otherwise both remain None.
    moved_mask = self.mask.get_shifted_mask() if self.mask else None
    return ObjectPrediction(
        bbox=self.bbox.get_shifted_box().to_xyxy(),
        category_id=self.category.id,
        score=self.score.value,
        segmentation=moved_mask.segmentation if moved_mask else None,
        category_name=self.category.name,
        shift_amount=[0, 0],
        full_shape=moved_mask.full_shape if moved_mask else None,
    )
to_coco_prediction(image_id=None)

Returns sahi.utils.coco.CocoPrediction representation of ObjectAnnotation.

Source code in sahi/prediction.py
def to_coco_prediction(self, image_id=None):
    """Returns sahi.utils.coco.CocoPrediction representation of ObjectAnnotation."""
    if self.mask:
        coco_prediction = CocoPrediction.from_coco_segmentation(
            segmentation=self.mask.segmentation,
            category_id=self.category.id,
            category_name=self.category.name,
            score=self.score.value,
            image_id=image_id,
        )
    else:
        coco_prediction = CocoPrediction.from_coco_bbox(
            bbox=self.bbox.to_xywh(),
            category_id=self.category.id,
            category_name=self.category.name,
            score=self.score.value,
            image_id=image_id,
        )
    return coco_prediction
to_fiftyone_detection(image_height, image_width)

Returns fiftyone.Detection representation of ObjectPrediction.

Source code in sahi/prediction.py
def to_fiftyone_detection(self, image_height: int, image_width: int):
    """Return the fiftyone.Detection representation of this prediction.

    Raises:
        ImportError: If fiftyone is not installed.
    """
    try:
        import fiftyone as fo
    except ImportError:
        raise ImportError('Please run "pip install -U fiftyone" to install fiftyone first for fiftyone conversion.')

    # fiftyone expects a [x, y, w, h] box normalized to image dimensions
    xmin, ymin, xmax, ymax = self.bbox.to_xyxy()
    norm_w = (xmax - xmin) / image_width
    norm_h = (ymax - ymin) / image_height
    rel_box = [xmin / image_width, ymin / image_height, norm_w, norm_h]
    return fo.Detection(label=self.category.name, bounding_box=rel_box, confidence=self.score.value)
PredictionResult
Source code in sahi/prediction.py
class PredictionResult:
    """Holds the object predictions for a single image together with the
    PIL-converted image itself and per-stage timing information."""

    def __init__(
        self,
        object_prediction_list: list[ObjectPrediction],
        image: Image.Image | str | np.ndarray,
        durations_in_seconds: dict[str, Any] | None = None,
    ):
        """
        Args:
            object_prediction_list: predictions belonging to `image`
            image: PIL image, image path or numpy array; converted to PIL internally
            durations_in_seconds: optional per-stage timing info; a fresh empty
                dict is used when omitted
        """
        self.image: Image.Image = read_image_as_pil(image)
        self.image_width, self.image_height = self.image.size
        self.object_prediction_list: list[ObjectPrediction] = object_prediction_list
        # `= dict()` in the signature would be a shared mutable default argument
        # (the same dict object reused by every instance); create per-instance.
        self.durations_in_seconds = durations_in_seconds if durations_in_seconds is not None else {}

    def export_visuals(
        self,
        export_dir: str,
        text_size: float | None = None,
        rect_th: int | None = None,
        hide_labels: bool = False,
        hide_conf: bool = False,
        file_name: str = "prediction_visual",
    ):
        """Export a visualization of the predictions over the image as a PNG file.

        Args:
            export_dir: directory for resulting visualization to be exported
            text_size: size of the category name over box
            rect_th: rectangle thickness
            hide_labels: hide labels
            hide_conf: hide confidence
            file_name: saving name
        """
        Path(export_dir).mkdir(parents=True, exist_ok=True)
        visualize_object_predictions(
            image=np.ascontiguousarray(self.image),
            object_prediction_list=self.object_prediction_list,
            rect_th=rect_th,
            text_size=text_size,
            text_th=None,
            color=None,
            hide_labels=hide_labels,
            hide_conf=hide_conf,
            output_dir=export_dir,
            file_name=file_name,
            export_format="png",
        )

    def to_coco_annotations(self):
        """Return predictions as a list of COCO-format dicts (no image_id)."""
        return [object_prediction.to_coco_prediction().json for object_prediction in self.object_prediction_list]

    def to_coco_predictions(self, image_id: int | None = None):
        """Return predictions as a list of COCO-format dicts tagged with `image_id`."""
        return [
            object_prediction.to_coco_prediction(image_id=image_id).json
            for object_prediction in self.object_prediction_list
        ]

    def to_imantics_annotations(self):
        """Return predictions as a list of imantics annotations."""
        return [object_prediction.to_imantics_annotation() for object_prediction in self.object_prediction_list]

    def to_fiftyone_detections(self):
        """Return predictions as a list of fiftyone.Detection objects.

        Raises:
            ImportError: If fiftyone is not installed.
        """
        try:
            import fiftyone as fo
        except ImportError:
            raise ImportError('Please run "uv pip install -U fiftyone" to install fiftyone for conversion.')

        # relative boxes require the image dimensions captured at construction
        fiftyone_detection_list: list[fo.Detection] = [
            object_prediction.to_fiftyone_detection(image_height=self.image_height, image_width=self.image_width)
            for object_prediction in self.object_prediction_list
        ]
        return fiftyone_detection_list
Functions
export_visuals(export_dir, text_size=None, rect_th=None, hide_labels=False, hide_conf=False, file_name='prediction_visual')

Parameters:

Name Type Description Default
export_dir str

directory for resulting visualization to be exported

required
text_size float | None

size of the category name over box

None
rect_th int | None

rectangle thickness

None
hide_labels bool

hide labels

False
hide_conf bool

hide confidence

False
file_name str

saving name

'prediction_visual'

Returns:

Source code in sahi/prediction.py
def export_visuals(
    self,
    export_dir: str,
    text_size: float | None = None,
    rect_th: int | None = None,
    hide_labels: bool = False,
    hide_conf: bool = False,
    file_name: str = "prediction_visual",
):
    """Export a PNG visualization of the predictions over the image.

    Args:
        export_dir: directory for resulting visualization to be exported
        text_size: size of the category name over box
        rect_th: rectangle thickness
        hide_labels: hide labels
        hide_conf: hide confidence
        file_name: saving name
    """
    # make sure the target directory exists before exporting
    target_dir = Path(export_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # visualize_object_predictions needs a contiguous array copy of the image
    image_array = np.ascontiguousarray(self.image)
    visualize_object_predictions(
        image=image_array,
        object_prediction_list=self.object_prediction_list,
        rect_th=rect_th,
        text_size=text_size,
        text_th=None,
        color=None,
        hide_labels=hide_labels,
        hide_conf=hide_conf,
        output_dir=export_dir,
        file_name=file_name,
        export_format="png",
    )
PredictionScore
Source code in sahi/prediction.py
class PredictionScore:
    def __init__(self, value: float | np.ndarray):
        """
        Args:
            score: prediction score between 0 and 1
        """
        # if score is a numpy object, convert it to python variable
        if type(value).__module__ == "numpy":
            value = copy.deepcopy(value).tolist()
        # set score
        self.value = value

    def is_greater_than_threshold(self, threshold):
        """Check if score is greater than threshold."""
        return self.value > threshold

    def __eq__(self, threshold):
        return self.value == threshold

    def __gt__(self, threshold):
        return self.value > threshold

    def __lt__(self, threshold):
        return self.value < threshold

    def __repr__(self):
        return f"PredictionScore: <value: {self.value}>"
Functions
__init__(value)

Parameters:

Name Type Description Default
score

prediction score between 0 and 1

required
Source code in sahi/prediction.py
def __init__(self, value: float | np.ndarray):
    """
    Args:
        value: prediction score between 0 and 1
    """
    # if value is a numpy scalar/array, convert it to a plain python object
    if type(value).__module__ == "numpy":
        value = copy.deepcopy(value).tolist()
    # set score
    self.value = value
is_greater_than_threshold(threshold)

Check if score is greater than threshold.

Source code in sahi/prediction.py
def is_greater_than_threshold(self, threshold):
    """Return True when the stored score exceeds the given threshold."""
    return threshold < self.value
Functions

scripts

Modules
coco2fiftyone
Functions
main(image_dir, dataset_json_path, *result_json_paths, iou_thresh=0.5)

Parameters:

Name Type Description Default
image_dir str

directory for coco images

required
dataset_json_path str

file path for the coco dataset json file

required
result_json_paths str

one or more paths for the coco result json file

()
iou_thresh float

iou threshold for coco evaluation

0.5
Source code in sahi/scripts/coco2fiftyone.py
def main(
    image_dir: str,
    dataset_json_path: str,
    *result_json_paths,
    iou_thresh: float = 0.5,
):
    """Launch a FiftyOne app visualizing a COCO dataset and optional results.

    Args:
        image_dir (str): directory for coco images
        dataset_json_path (str): file path for the coco dataset json file
        result_json_paths (str): one or more paths for the coco result json file
        iou_thresh (float): iou threshold for coco evaluation
    """

    from fiftyone.utils.coco import add_coco_labels

    from sahi.utils.fiftyone import create_fiftyone_dataset_from_coco_file, fo

    coco_result_list = []
    result_name_list = []
    for result_json_path in result_json_paths:
        coco_result_list.append(load_json(result_json_path))

        # derive a unique fiftyone name from the file stem,
        # suffixing _2, _3, ... on duplicates
        base_name = Path(result_json_path).stem
        candidate_name = base_name
        suffix = 2
        while candidate_name in result_name_list:
            candidate_name = base_name + "_" + str(suffix)
            suffix += 1
        result_name_list.append(candidate_name)

    dataset = create_fiftyone_dataset_from_coco_file(image_dir, dataset_json_path)

    # submit detections for every provided coco result
    for result_name, coco_result in zip(result_name_list, coco_result_list):
        add_coco_labels(dataset, result_name, coco_result, coco_id_field="gt_coco_id")

    # visualize results
    session = fo.launch_app()  # pyright: ignore[reportArgumentType]
    session.dataset = dataset

    # order by false positives if any coco result is given
    if result_json_paths:
        # evaluate the predictions of the first result set
        first_coco_result_name = result_name_list[0]
        _ = dataset.evaluate_detections(
            first_coco_result_name,
            gt_field="gt_detections",
            eval_key=f"{first_coco_result_name}_eval",
            iou=iou_thresh,
            compute_mAP=False,
        )
        # load the evaluated view and show samples with most false positives first
        eval_view = dataset.load_evaluation_view(f"{first_coco_result_name}_eval")
        session.view = eval_view.sort_by(f"{first_coco_result_name}_eval_fp", reverse=True)

        print(f"SAHI has successfully launched a Fiftyone app at http://localhost:{fo.config.default_app_port}")

    # keep the process alive so the app stays reachable
    while True:
        time.sleep(3)
coco2yolo
Classes Functions
main(image_dir, dataset_json_path, train_split=0.9, project='runs/coco2yolo', name='exp', seed=1, disable_symlink=False)

Parameters:

Name Type Description Default
image_dir str

directory for coco images

required
dataset_json_path str

file path for the coco json file to be converted

required
train_split float or int

set the training split ratio

0.9
project str

save results to project/name

'runs/coco2yolo'
name str

save results to project/name

'exp'
seed int

fix the seed for reproducibility

1
disable_symlink bool

required in google colab env

False
Source code in sahi/scripts/coco2yolo.py
def main(
    image_dir: str,
    dataset_json_path: str,
    train_split: int | float = 0.9,
    project: str = "runs/coco2yolo",
    name: str = "exp",
    seed: int = 1,
    disable_symlink=False,
):
    """Convert a COCO dataset into YOLO format under project/name.

    Args:
        image_dir (str): directory for coco images
        dataset_json_path (str): file path for the coco json file to be converted
        train_split (float or int): set the training split ratio
        project (str): save results to project/name
        name (str): save results to project/name
        seed (int): fix the seed for reproducibility
        disable_symlink (bool): required in google colab env
    """
    # pick a fresh run directory (suffix incremented if project/name exists)
    save_dir = Path(increment_path(Path(project) / name, exist_ok=False))

    # load the coco dataset, then export it in YOLO layout
    coco = Coco.from_coco_dict_or_path(coco_dict_or_path=dataset_json_path, image_dir=image_dir)
    coco.export_as_yolo(
        output_dir=str(save_dir),
        train_split_rate=train_split,
        numpy_seed=seed,
        disable_symlink=disable_symlink,
    )

    print(f"COCO to YOLO conversion results are successfully exported to {save_dir}")
coco_error_analysis
Functions
analyse(dataset_json_path, result_json_path, out_dir=None, type='bbox', no_extraplots=False, areas=[1024, 9216, 10000000000], max_detections=500, return_dict=False)

Parameters:

Name Type Description Default
dataset_json_path str

file path for the coco dataset json file

required
result_json_path str

file path for the coco result json file

required
out_dir str

dir to save analyse result images

None
no_extraplots bool

don't export extra bar/stat plots

False
type str

'bbox' or 'mask'

'bbox'
areas List[int]

area regions for coco evaluation calculations

[1024, 9216, 10000000000]
max_detections int

Maximum number of detections to consider for AP calculation. Default: 500

500
return_dict bool

If True, returns a dict export paths.

False
Source code in sahi/scripts/coco_error_analysis.py
def analyse(
    dataset_json_path: str,
    result_json_path: str,
    out_dir: str | None = None,
    type: str = "bbox",
    no_extraplots: bool = False,
    areas: list[int] | None = None,
    max_detections: int = 500,
    return_dict: bool = False,
) -> dict | None:
    """Run COCO error analysis and export the resulting plots.

    Args:
        dataset_json_path (str): file path for the coco dataset json file
        result_json_path (str): file path for the coco result json file
        out_dir (str): dir to save analyse result images
        no_extraplots (bool): don't export extra bar/stat plots
        type (str): 'bbox' or 'mask'
        areas (List[int]): area regions for coco evaluation calculations.
            Defaults to [1024, 9216, 10000000000].
        max_detections (int): Maximum number of detections to consider for AP calculation. Default: 500
        return_dict (bool): If True, returns a dict of export paths.

    Raises:
        ModuleNotFoundError: if matplotlib or pycocotools is not installed.
    """
    # avoid the shared-mutable-default-argument pitfall; the effective
    # default is unchanged
    if areas is None:
        areas = [1024, 9216, 10000000000]

    if not has_matplotlib:
        logger.error("Please run 'uv pip install -U matplotlib' first for visualization.")
        raise ModuleNotFoundError("matplotlib not installed")
    if not has_pycocotools:
        logger.error("Please run 'uv pip install -U pycocotools' first for Coco analysis.")
        raise ModuleNotFoundError("pycocotools not installed")

    result = _analyse_results(
        result_json_path,
        dataset_json_path,
        res_types=[type],
        out_dir=out_dir,
        extraplots=not no_extraplots,
        areas=areas,
        max_detections=max_detections,
    )
    # implicit None return when the caller does not ask for the dict
    if return_dict:
        return result
coco_evaluation
Functions
evaluate(dataset_json_path, result_json_path, out_dir=None, type='bbox', classwise=False, max_detections=500, iou_thrs=None, areas=[1024, 9216, 10000000000], return_dict=False)

Parameters:

Name Type Description Default
dataset_json_path str

file path for the coco dataset json file

required
result_json_path str

file path for the coco result json file

required
out_dir str

dir to save eval result

None
type str

'bbox' or 'segm'

'bbox'
classwise bool

whether to evaluate the AP for each class

False
max_detections int

Maximum number of detections to consider for AP calculation. Default: 500

500
iou_thrs float

IoU threshold used for evaluating recalls/mAPs

None
areas List[int]

area regions for coco evaluation calculations

[1024, 9216, 10000000000]
return_dict bool

If True, returns a dict with 'eval_results' 'export_path' fields.

False
Source code in sahi/scripts/coco_evaluation.py
def evaluate(
    dataset_json_path: str,
    result_json_path: str,
    out_dir: str | None = None,
    type: Literal["bbox", "segm"] = "bbox",
    classwise: bool = False,
    max_detections: int = 500,
    iou_thrs: list[float] | float | None = None,
    areas: list[int] | None = None,
    return_dict: bool = False,
):
    """Run COCO-protocol evaluation of a result file against a dataset file.

    Args:
        dataset_json_path (str): file path for the coco dataset json file
        result_json_path (str): file path for the coco result json file
        out_dir (str): dir to save eval result
        type (str): 'bbox' or 'segm'
        classwise (bool): whether to evaluate the AP for each class
        max_detections (int): Maximum number of detections to consider for AP calculation. Default: 500
        iou_thrs (float): IoU threshold used for evaluating recalls/mAPs
        areas (List[int]): area regions for coco evaluation calculations.
            Defaults to [1024, 9216, 10000000000].
        return_dict (bool): If True, returns a dict with 'eval_results' 'export_path' fields.

    Raises:
        ModuleNotFoundError: if pycocotools is not installed.
    """
    # avoid the shared-mutable-default-argument pitfall; the effective
    # default is unchanged
    if areas is None:
        areas = [1024, 9216, 10000000000]

    try:
        from pycocotools.coco import COCO
        from pycocotools.cocoeval import COCOeval
    except ModuleNotFoundError:
        raise ModuleNotFoundError(
            'Please run "pip install -U pycocotools" to install pycocotools first for coco evaluation.'
        )

    # perform coco eval
    result = evaluate_core(
        dataset_path=dataset_json_path,
        result_path=result_json_path,
        metric=type,
        classwise=classwise,
        max_detections=max_detections,
        iou_thrs=iou_thrs,
        out_dir=out_dir,
        areas=areas,
        COCO=COCO,
        COCOeval=COCOeval,
    )
    if return_dict:
        return result
evaluate_core(dataset_path, result_path, COCO, COCOeval, metric='bbox', classwise=False, max_detections=500, iou_thrs=None, metric_items=None, out_dir=None, areas=[1024, 9216, 10000000000])

Evaluation in COCO protocol.

Parameters:

Name Type Description Default
dataset_path str

COCO dataset json path.

required
result_path str

COCO result json path.

required
COCO, COCOeval

Pass COCO and COCOeval class after safely imported

required
metric str | list[str]

Metrics to be evaluated. Options are 'bbox', 'segm', 'proposal'.

'bbox'
classwise bool

Whether to evaluate the AP for each class.

False
max_detections int

Maximum number of detections to consider for AP calculation. Default: 500

500
iou_thrs List[float]

IoU threshold used for evaluating recalls/mAPs. If set to a list, the average of all IoUs will also be computed. If not specified, [0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95] will be used. Default: None.

None
metric_items list[str] | str

Metric items that will be returned. If not specified, ['AR@10', 'AR@100', 'AR@500', 'AR_s@500', 'AR_m@500', 'AR_l@500' ] will be used when metric=='proposal', ['mAP', 'mAP50', 'mAP75', 'mAP_s', 'mAP_m', 'mAP_l', 'mAP50_s', 'mAP50_m', 'mAP50_l'] will be used when metric=='bbox' or metric=='segm'.

None
out_dir str

Directory to save evaluation result json.

None
areas List[int]

area regions for coco evaluation calculations

[1024, 9216, 10000000000]

Returns: dict: eval_results (dict[str, float]): COCO style evaluation metric. export_path (str): Path for the exported eval result json.

Source code in sahi/scripts/coco_evaluation.py
def evaluate_core(
    dataset_path: str,
    result_path: str,
    COCO: type,
    COCOeval: type,
    metric: str | list[str] = "bbox",
    classwise: bool = False,
    max_detections: int = 500,
    iou_thrs: list[float] | float | None = None,
    metric_items: list[str] | str | None = None,
    out_dir: str | Path | None = None,
    areas: list[int] = [1024, 9216, 10000000000],
):
    """Evaluation in COCO protocol.

    Args:
        dataset_path (str): COCO dataset json path.
        result_path (str): COCO result json path.
        COCO, COCOeval: Pass COCO and COCOeval class after safely imported
        metric (str | list[str]): Metrics to be evaluated. Options are
            'bbox', 'segm' (note: 'proposal' is rejected by the allowed-metric
            check below).
        classwise (bool): Whether to evaluate the AP for each class.
        max_detections (int): Maximum number of detections to consider for AP
            calculation.
            Default: 500
        iou_thrs (List[float], optional): IoU threshold used for
            evaluating recalls/mAPs. If set to a list, the average of all
            IoUs will also be computed. If not specified, [0.50, 0.55,
            0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95] will be used.
            Default: None.
        metric_items (list[str] | str, optional): Metric items that will
            be returned. If not specified, ``['mAP', 'mAP50', 'mAP75',
            'mAP_s', 'mAP_m', 'mAP_l', 'mAP50_s', 'mAP50_m', 'mAP50_l']``
            will be used when ``metric=='bbox' or metric=='segm'``.
        out_dir (str): Directory to save evaluation result json.
        areas (List[int]): area regions for coco evaluation calculations
    Returns:
        dict:
            eval_results (dict[str, float]): COCO style evaluation metric.
            export_path (str): Path for the exported eval result json.
    Raises:
        KeyError: for an unsupported metric or metric item.
        ValueError: when `areas` does not contain exactly 3 values.
    """

    # NOTE: `metric` is reused as the loop variable below, shadowing the parameter.
    metrics = metric if isinstance(metric, list) else [metric]
    allowed_metrics = ["bbox", "segm"]
    for metric in metrics:
        if metric not in allowed_metrics:
            raise KeyError(f"metric {metric} is not supported")
    if iou_thrs is None:
        # default IoU sweep 0.50:0.05:0.95 (10 thresholds)
        iou_thrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
    if metric_items is not None:
        if not isinstance(metric_items, list):
            metric_items = [metric_items]
    if areas is not None:
        if len(areas) != 3:
            raise ValueError("3 integers should be specified as areas, representing 3 area regions")
    eval_results = OrderedDict()

    # Load dataset json and add empty 'info' field if missing
    with open(dataset_path) as f:
        dataset_dict = json.load(f)
    if "info" not in dataset_dict:
        dataset_dict["info"] = {}

    # Create temporary file with updated dataset
    # (so the original dataset file on disk is never modified)
    import tempfile

    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp_file:
        json.dump(dataset_dict, tmp_file)
        temp_dataset_path = tmp_file.name

    try:
        cocoGt = COCO(temp_dataset_path)
        cat_ids = list(cocoGt.cats.keys())
        for metric in metrics:
            msg = f"Evaluating {metric}..."
            msg = "\n" + msg
            print(msg)

            iou_type = metric
            with open(result_path) as json_file:
                results = json.load(json_file)
            try:
                cocoDt = cocoGt.loadRes(results)
            except IndexError:
                # loadRes raises IndexError on an empty result list
                print("The testing results of the whole dataset is empty.")
                break

            cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
            if areas is not None:
                # area ranges [all, small, medium, large] derived from the 3 thresholds
                cocoEval.params.areaRng = [
                    [0**2, areas[2]],
                    [0**2, areas[0]],
                    [areas[0], areas[1]],
                    [areas[1], areas[2]],
                ]
            cocoEval.params.catIds = cat_ids
            cocoEval.params.maxDets = [max_detections]
            cocoEval.params.iouThrs = (
                [iou_thrs] if not isinstance(iou_thrs, list) and not isinstance(iou_thrs, np.ndarray) else iou_thrs
            )
            # mapping of cocoEval.stats
            # (matches the custom stats vector assembled below, NOT the default
            # pycocotools stats ordering)
            coco_metric_names = {
                "mAP": 0,
                "mAP75": 1,
                "mAP50": 2,
                "mAP_s": 3,
                "mAP_m": 4,
                "mAP_l": 5,
                "mAP50_s": 6,
                "mAP50_m": 7,
                "mAP50_l": 8,
                "AR_s": 9,
                "AR_m": 10,
                "AR_l": 11,
            }
            if metric_items is not None:
                for metric_item in metric_items:
                    if metric_item not in coco_metric_names:
                        raise KeyError(f"metric item {metric_item} is not supported")

            cocoEval.evaluate()
            cocoEval.accumulate()
            # calculate mAP50_s/m/l
            mAP = _cocoeval_summarize(cocoEval, ap=1, iouThr=None, areaRng="all", maxDets=max_detections)
            mAP50 = _cocoeval_summarize(cocoEval, ap=1, iouThr=0.5, areaRng="all", maxDets=max_detections)
            mAP75 = _cocoeval_summarize(cocoEval, ap=1, iouThr=0.75, areaRng="all", maxDets=max_detections)
            mAP50_s = _cocoeval_summarize(cocoEval, ap=1, iouThr=0.5, areaRng="small", maxDets=max_detections)
            mAP50_m = _cocoeval_summarize(cocoEval, ap=1, iouThr=0.5, areaRng="medium", maxDets=max_detections)
            mAP50_l = _cocoeval_summarize(cocoEval, ap=1, iouThr=0.5, areaRng="large", maxDets=max_detections)
            mAP_s = _cocoeval_summarize(cocoEval, ap=1, iouThr=None, areaRng="small", maxDets=max_detections)
            mAP_m = _cocoeval_summarize(cocoEval, ap=1, iouThr=None, areaRng="medium", maxDets=max_detections)
            mAP_l = _cocoeval_summarize(cocoEval, ap=1, iouThr=None, areaRng="large", maxDets=max_detections)
            AR_s = _cocoeval_summarize(cocoEval, ap=0, iouThr=None, areaRng="small", maxDets=max_detections)
            AR_m = _cocoeval_summarize(cocoEval, ap=0, iouThr=None, areaRng="medium", maxDets=max_detections)
            AR_l = _cocoeval_summarize(cocoEval, ap=0, iouThr=None, areaRng="large", maxDets=max_detections)
            # overwrite cocoEval.stats so indices match coco_metric_names above
            cocoEval.stats = np.append(
                [mAP, mAP75, mAP50, mAP_s, mAP_m, mAP_l, mAP50_s, mAP50_m, mAP50_l, AR_s, AR_m, AR_l], 0
            )

            if classwise:  # Compute per-category AP
                # Compute per-category AP
                # from https://github.com/facebookresearch/detectron2/
                precisions = cocoEval.eval["precision"]
                # precision: (iou, recall, cls, area range, max dets)
                if len(cat_ids) != precisions.shape[2]:
                    raise ValueError(
                        f"The number of categories {len(cat_ids)} is not equal "
                        f"to the number of precisions {precisions.shape[2]}"
                    )
                # longest category name, used to pad the printed table
                max_cat_name_len = 0
                for idx, catId in enumerate(cat_ids):
                    nm = cocoGt.loadCats(catId)[0]
                    cat_name_len = len(nm["name"])
                    max_cat_name_len = cat_name_len if cat_name_len > max_cat_name_len else max_cat_name_len

                results_per_category = []
                for idx, catId in enumerate(cat_ids):
                    # skip if no image with this category
                    image_ids = cocoGt.getImgIds(catIds=[catId])
                    if len(image_ids) == 0:
                        continue
                    # area range index 0: all area ranges
                    # max dets index -1: typically 100 per image
                    nm = cocoGt.loadCats(catId)[0]
                    ap = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        catIdx=idx,
                        areaRng="all",
                        maxDets=max_detections,
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap_s = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        catIdx=idx,
                        areaRng="small",
                        maxDets=max_detections,
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap_m = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        catIdx=idx,
                        areaRng="medium",
                        maxDets=max_detections,
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap_l = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        catIdx=idx,
                        areaRng="large",
                        maxDets=max_detections,
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap50 = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        iouThr=0.5,
                        catIdx=idx,
                        areaRng="all",
                        maxDets=max_detections,
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap50_s = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        iouThr=0.5,
                        catIdx=idx,
                        areaRng="small",
                        maxDets=max_detections,
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap50_m = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        iouThr=0.5,
                        catIdx=idx,
                        areaRng="medium",
                        maxDets=max_detections,
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap50_l = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        iouThr=0.5,
                        catIdx=idx,
                        areaRng="large",
                        maxDets=max_detections,
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    results_per_category.append((f"{metric}_{nm['name']}_mAP", f"{float(ap):0.3f}"))
                    results_per_category.append((f"{metric}_{nm['name']}_mAP_s", f"{float(ap_s):0.3f}"))
                    results_per_category.append((f"{metric}_{nm['name']}_mAP_m", f"{float(ap_m):0.3f}"))
                    results_per_category.append((f"{metric}_{nm['name']}_mAP_l", f"{float(ap_l):0.3f}"))
                    results_per_category.append((f"{metric}_{nm['name']}_mAP50", f"{float(ap50):0.3f}"))
                    results_per_category.append((f"{metric}_{nm['name']}_mAP50_s", f"{float(ap50_s):0.3f}"))
                    results_per_category.append((f"{metric}_{nm['name']}_mAP50_m", f"{float(ap50_m):0.3f}"))
                    results_per_category.append((f"{metric}_{nm['name']}_mAP50_l", f"{float(ap50_l):0.3f}"))

                # print the per-category numbers as a (category, AP) ascii table
                num_columns = min(6, len(results_per_category) * 2)
                results_flatten = list(itertools.chain(*results_per_category))
                headers = ["category", "AP"] * (num_columns // 2)
                results_2d = itertools.zip_longest(*[results_flatten[i::num_columns] for i in range(num_columns)])
                table_data = [headers]
                table_data += [result for result in results_2d]
                table = AsciiTable(table_data)
                print("\n" + table.table)

            if metric_items is None:
                metric_items = ["mAP", "mAP50", "mAP75", "mAP_s", "mAP_m", "mAP_l", "mAP50_s", "mAP50_m", "mAP50_l"]

            for metric_item in metric_items:
                key = f"{metric}_{metric_item}"
                val = float(f"{cocoEval.stats[coco_metric_names[metric_item]]:.3f}")
                eval_results[key] = val
            ap = cocoEval.stats
            eval_results[f"{metric}_mAP_copypaste"] = (
                f"{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} "
                f"{ap[4]:.3f} {ap[5]:.3f} {ap[6]:.3f} {ap[7]:.3f} "
                f"{ap[8]:.3f}"
            )
            if classwise:
                eval_results["results_per_category"] = {key: value for key, value in results_per_category}
    finally:
        # Clean up temporary file
        os.unlink(temp_dataset_path)

    # set save path
    if not out_dir:
        out_dir = Path(result_path).parent
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    export_path = str(Path(out_dir) / "eval.json")
    # export as json
    with open(export_path, "w", encoding="utf-8") as outfile:
        json.dump(eval_results, outfile, indent=4, separators=(",", ":"))
    print(f"COCO evaluation results are successfully exported to {export_path}")
    return {"eval_results": eval_results, "export_path": export_path}
slice_coco
Functions
slicer(image_dir, dataset_json_path, slice_size=512, overlap_ratio=0.2, ignore_negative_samples=False, output_dir='runs/slice_coco', min_area_ratio=0.1)

Parameters:

Name Type Description Default
image_dir str

directory for coco images

required
dataset_json_path str

file path for the coco dataset json file

required
overlap_ratio float

slice overlap ratio

0.2
ignore_negative_samples bool

ignore images without annotation

False
output_dir str

output export dir

'runs/slice_coco'
min_area_ratio float

If the cropped annotation area to original annotation ratio is smaller than this value, the annotation is filtered out. Default 0.1.

0.1
Source code in sahi/scripts/slice_coco.py
def slicer(
    image_dir: str,
    dataset_json_path: str,
    slice_size: int = 512,
    overlap_ratio: float = 0.2,
    ignore_negative_samples: bool = False,
    output_dir: str = "runs/slice_coco",
    min_area_ratio: float = 0.1,
):
    """Slice the images and annotations of a COCO dataset into smaller tiles.

    Args:
        image_dir (str): directory for coco images
        dataset_json_path (str): file path for the coco dataset json file
        slice_size (int): slice size; a single number or a list of numbers
        overlap_ratio (float): slice overlap ratio
        ignore_negative_samples (bool): ignore images without annotation
        output_dir (str): output export dir
        min_area_ratio (float): If the cropped annotation area to original
            annotation ratio is smaller than this value, the annotation
            is filtered out. Default 0.1.
    """
    # a bare number becomes a one-element list so a single loop handles both cases
    slice_sizes = [slice_size] if isinstance(slice_size, (int, float)) else slice_size

    # slice coco dataset images and annotations
    print("Slicing step is starting...")
    for size in slice_sizes:
        overlap_tag = str(overlap_ratio).replace(".", "")
        # output image folder in format: train_images_512_01
        output_images_folder_name = Path(dataset_json_path).stem + f"_images_{size!s}_{overlap_tag}"
        output_images_dir = str(Path(output_dir) / output_images_folder_name)
        sliced_coco_name = Path(dataset_json_path).name.replace(".json", f"_{size!s}_{overlap_tag}")
        coco_dict, _ = slice_coco(
            coco_annotation_file_path=dataset_json_path,
            image_dir=image_dir,
            output_coco_annotation_file_name="",
            output_dir=output_images_dir,
            ignore_negative_samples=ignore_negative_samples,
            slice_height=size,
            slice_width=size,
            min_area_ratio=min_area_ratio,
            overlap_height_ratio=overlap_ratio,
            overlap_width_ratio=overlap_ratio,
            out_ext=".jpg",
            verbose=False,
        )
        # the sliced annotation json goes next to the sliced image folders
        save_json(coco_dict, os.path.join(output_dir, sliced_coco_name + ".json"))
        print(f"Sliced dataset for 'slice_size: {size}' is exported to {output_dir}")

slicing

Classes
SliceImageResult
Source code in sahi/slicing.py
class SliceImageResult:
    """Collects the SlicedImage objects produced by slicing one source image."""

    def __init__(self, original_image_size: list[int], image_dir: str | None = None):
        """
        Args:
            original_image_size: Size of the unsliced original image in [height, width].
            image_dir: Directory of the sliced image exports.
        """
        self.original_image_height = original_image_size[0]
        self.original_image_width = original_image_size[1]
        self.image_dir = image_dir

        # populated via add_sliced_image
        self._sliced_image_list: list[SlicedImage] = []

    def add_sliced_image(self, sliced_image: SlicedImage):
        """Append a slice; raises TypeError for anything but a SlicedImage."""
        if not isinstance(sliced_image, SlicedImage):
            raise TypeError("sliced_image must be a SlicedImage instance")

        self._sliced_image_list.append(sliced_image)

    @property
    def sliced_image_list(self):
        return self._sliced_image_list

    @property
    def images(self):
        """Returns sliced images.

        Returns:
            images: a list of np.array
        """
        return [sliced_image.image for sliced_image in self._sliced_image_list]

    @property
    def coco_images(self) -> list[CocoImage]:
        """Returns CocoImage representation of SliceImageResult.

        Returns:
            coco_images: a list of CocoImage
        """
        return [sliced_image.coco_image for sliced_image in self._sliced_image_list]

    @property
    def starting_pixels(self) -> list[int]:
        """Returns a list of starting pixels for each slice.

        Returns:
            starting_pixels: a list of starting pixel coords [x,y]
        """
        return [sliced_image.starting_pixel for sliced_image in self._sliced_image_list]

    @property
    def filenames(self) -> list[str]:
        """Returns a list of filenames for each slice.

        Returns:
            filenames: a list of filenames as str
        """
        # NOTE: return annotation fixed from list[int] to list[str];
        # coco_image.file_name values are strings
        return [sliced_image.coco_image.file_name for sliced_image in self._sliced_image_list]

    def __getitem__(self, i):
        """Index by int, slice, list/tuple or np.ndarray; returns dict(s) of slice data."""

        def _prepare_ith_dict(idx):
            return {
                "image": self.images[idx],
                "coco_image": self.coco_images[idx],
                "starting_pixel": self.starting_pixels[idx],
                "filename": self.filenames[idx],
            }

        if isinstance(i, np.ndarray):
            i = i.tolist()

        if isinstance(i, int):
            return _prepare_ith_dict(i)
        elif isinstance(i, slice):
            start, stop, step = i.indices(len(self))
            return [_prepare_ith_dict(idx) for idx in range(start, stop, step)]
        elif isinstance(i, (tuple, list)):
            return list(map(_prepare_ith_dict, i))
        else:
            raise NotImplementedError(f"{type(i)}")

    def __len__(self):
        return len(self._sliced_image_list)
Attributes
coco_images property

Returns CocoImage representation of SliceImageResult.

Returns:

Name Type Description
coco_images list[CocoImage]

a list of CocoImage

filenames property

Returns a list of filenames for each slice.

Returns:

Name Type Description
filenames list[str]

a list of filenames as str

images property

Returns sliced images.

Returns:

Name Type Description
images

a list of np.array

starting_pixels property

Returns a list of starting pixels for each slice.

Returns:

Name Type Description
starting_pixels list[int]

a list of starting pixel coords [x,y]

Functions
__init__(original_image_size, image_dir=None)
image_dir: str

Directory of the sliced image exports.

original_image_size: list of int — Size of the unsliced original image in [height, width]

Source code in sahi/slicing.py
def __init__(self, original_image_size: list[int], image_dir: str | None = None):
    """Create an empty slice-result container.

    Args:
        original_image_size: size of the unsliced original image as
            [height, width].
        image_dir: directory of the sliced image exports, if any.
    """
    self.original_image_height = original_image_size[0]
    self.original_image_width = original_image_size[1]
    self.image_dir = image_dir

    # Filled elsewhere with one SlicedImage per generated slice.
    self._sliced_image_list: list[SlicedImage] = []
SlicedImage
Source code in sahi/slicing.py
class SlicedImage:
    """Container pairing one image slice with its COCO metadata."""

    def __init__(self, image, coco_image, starting_pixel):
        """
        image: np.array
            Sliced image pixels.
        coco_image: CocoImage
            Coco styled image object that belongs to the sliced image.
        starting_pixel:
            Starting pixel coordinates of the sliced image
            (presumably an [x, y] pair — original doc said
            "list of list of int"; confirm against callers).
        """
        self.image, self.coco_image, self.starting_pixel = image, coco_image, starting_pixel
Functions
__init__(image, coco_image, starting_pixel)
np.array

Sliced image.

coco_image: CocoImage Coco styled image object that belong to sliced image. starting_pixel: list of list of int Starting pixel coordinates of the sliced image.

Source code in sahi/slicing.py
def __init__(self, image, coco_image, starting_pixel):
    """Store one image slice together with its COCO metadata.

    image: np.array
        Sliced image.
    coco_image: CocoImage
        Coco styled image object that belong to sliced image.
    starting_pixel: list of list of int
        Starting pixel coordinates of the sliced image.
        NOTE(review): usage elsewhere suggests a single [x, y] pair
        rather than a list of lists — confirm.
    """
    self.image = image
    self.coco_image = coco_image
    self.starting_pixel = starting_pixel
Functions
annotation_inside_slice(annotation, slice_bbox)

Check whether annotation coordinates lie inside slice coordinates.

Parameters:

Name Type Description Default
annotation dict

Single annotation entry in COCO format.

required
slice_bbox List[int]

Generated from get_slice_bboxes. Format for each slice bbox: [x_min, y_min, x_max, y_max].

required

Returns:

Type Description
bool

True if any annotation coordinate lies inside slice.

Source code in sahi/slicing.py
def annotation_inside_slice(annotation: dict, slice_bbox: list[int]) -> bool:
    """Check whether annotation coordinates lie inside slice coordinates.

    Args:
        annotation (dict): Single annotation entry in COCO format
            (bbox stored as [left, top, width, height]).
        slice_bbox (List[int]): Generated from `get_slice_bboxes`.
            Format for each slice bbox: [x_min, y_min, x_max, y_max].

    Returns:
        (bool): True if any annotation coordinate lies inside slice.
    """
    box_left, box_top, box_width, box_height = annotation["bbox"]
    box_right = box_left + box_width
    box_bottom = box_top + box_height

    # Two axis-aligned boxes intersect iff neither lies entirely on one
    # side of the other along either axis.
    return (
        box_left < slice_bbox[2]
        and box_top < slice_bbox[3]
        and box_right > slice_bbox[0]
        and box_bottom > slice_bbox[1]
    )
calc_aspect_ratio_orientation(width, height)

Parameters:

Name Type Description Default
width int
required
height int
required

Returns:

Type Description
str

image capture orientation

Source code in sahi/slicing.py
def calc_aspect_ratio_orientation(width: int, height: int) -> str:
    """Classify the image capture orientation from its dimensions.

    Args:
        width: image width in pixels.
        height: image height in pixels.

    Returns:
        "vertical", "horizontal" or "square".
    """
    if width == height:
        return "square"
    return "vertical" if width < height else "horizontal"
calc_ratio_and_slice(orientation, slide=1, ratio=0.1)

According to image resolution calculation overlap params Args: orientation: image capture angle slide: sliding window ratio: buffer value

Returns:

Type Description

overlap params

Source code in sahi/slicing.py
def calc_ratio_and_slice(orientation: Literal["vertical", "horizontal", "square"], slide: int = 1, ratio: float = 0.1):
    """Derive slicing/overlap parameters for a given capture orientation.

    Args:
        orientation: image capture angle ("vertical", "horizontal" or "square").
        slide: sliding-window base count.
        ratio: overlap buffer value.

    Returns:
        (slice_row, slice_col, overlap_height_ratio, overlap_width_ratio)

    Raises:
        ValueError: for an unrecognized orientation.
    """
    # Row/column multipliers applied to `slide` per orientation.
    multipliers = {"vertical": (1, 2), "horizontal": (2, 1), "square": (1, 1)}
    if orientation not in multipliers:
        raise ValueError(f"Invalid orientation: {orientation}. Must be one of 'vertical', 'horizontal', or 'square'.")
    row_mult, col_mult = multipliers[orientation]
    return slide * row_mult, slide * col_mult, ratio, ratio
calc_resolution_factor(resolution)

According to image resolution calculate power(2,n) and return the closest smaller n. Args: resolution: the width and height of the image multiplied. such as 1024x720 = 737280

Returns:

Source code in sahi/slicing.py
def calc_resolution_factor(resolution: int) -> int:
    """
    According to image resolution calculate power(2,n) and return the closest smaller `n`.

    Equivalent to: one less than the smallest `n` with 2**n >= resolution.

    Args:
        resolution: the width and height of the image multiplied. such as 1024x720 = 737280

    Returns:
        Largest exponent n such that 2**n < resolution; -1 when resolution <= 1.
    """
    # The previous loop computed np.power(2, expo), which uses fixed-width
    # int64 arithmetic: for resolutions above 2**63 it silently overflows
    # (wrapping negative), so the loop could run far past the right answer.
    # int.bit_length() gives the identical result with exact Python ints.
    if resolution <= 1:
        # Mirrors the loop's behavior for degenerate inputs (expo stays 0).
        return -1
    return (resolution - 1).bit_length() - 1
calc_slice_and_overlap_params(resolution, height, width, orientation)

This function calculates slice and overlap parameters according to the image resolution. Args: resolution: str height: int width: int orientation: str

Returns:

Type Description
tuple[int, int, int, int]

x_overlap, y_overlap, slice_width, slice_height

Source code in sahi/slicing.py
def calc_slice_and_overlap_params(
    resolution: str, height: int, width: int, orientation: str
) -> tuple[int, int, int, int]:
    """Compute slice sizes and pixel overlaps for a resolution class.

    Args:
        resolution: one of "low", "medium", "high", "ultra-high".
        height: image height in pixels.
        width: image width in pixels.
        orientation: capture orientation ("vertical", "horizontal", "square").

    Returns:
        x_overlap, y_overlap, slice_width, slice_height
    """
    # (slide, ratio) presets per resolution class; anything else falls
    # back to a single full-frame slice (the "low" behavior).
    presets = {"medium": (1, 0.8), "high": (2, 0.4), "ultra-high": (4, 0.4)}

    if resolution in presets:
        slide, ratio = presets[resolution]
        split_row, split_col, overlap_height_ratio, overlap_width_ratio = calc_ratio_and_slice(
            orientation, slide=slide, ratio=ratio
        )
    else:  # low resolution: one slice covering the whole image
        split_row = split_col = 1
        overlap_height_ratio = overlap_width_ratio = 1

    # NOTE(review): height is divided by split_col and width by split_row,
    # mirroring the original implementation exactly.
    slice_height = height // split_col
    slice_width = width // split_row

    x_overlap = int(slice_width * overlap_width_ratio)
    y_overlap = int(slice_height * overlap_height_ratio)

    return x_overlap, y_overlap, slice_width, slice_height
get_auto_slice_params(height, width)

According to Image HxW calculate overlap sliding window and buffer params factor is the power value of 2 closest to the image resolution. factor <= 18: low resolution image such as 300x300, 640x640 18 < factor <= 21: medium resolution image such as 1024x1024, 1336x960 21 < factor <= 24: high resolution image such as 2048x2048, 2048x4096, 4096x4096 factor > 24: ultra-high resolution image such as 6380x6380, 4096x8192 Args: height: width:

Returns:

Type Description
tuple[int, int, int, int]

slicing overlap params x_overlap, y_overlap, slice_width, slice_height

Source code in sahi/slicing.py
def get_auto_slice_params(height: int, width: int) -> tuple[int, int, int, int]:
    """Pick slicing overlap params automatically from the image size.

    `factor` is the power of 2 closest (from below) to height*width:
        factor <= 18:      low resolution image such as 300x300, 640x640
        18 < factor <= 21: medium resolution image such as 1024x1024, 1336x960
        21 < factor <= 24: high resolution image such as 2048x2048, 2048x4096, 4096x4096
        factor > 24:       ultra-high resolution image such as 6380x6380, 4096x8192

    Args:
        height: image height in pixels.
        width: image width in pixels.

    Returns:
        slicing overlap params x_overlap, y_overlap, slice_width, slice_height
    """
    factor = calc_resolution_factor(height * width)
    if factor <= 18:
        resolution_class = "low"
    elif factor < 21:
        resolution_class = "medium"
    elif factor < 24:
        resolution_class = "high"
    else:
        resolution_class = "ultra-high"
    return get_resolution_selector(resolution_class, height=height, width=width)
get_resolution_selector(res, height, width)

Parameters:

Name Type Description Default
res str

resolution of image such as low, medium

required
height int
required
width int
required

Returns:

Type Description
tuple[int, int, int, int]

trigger slicing params function and return overlap params

Source code in sahi/slicing.py
def get_resolution_selector(res: str, height: int, width: int) -> tuple[int, int, int, int]:
    """Resolve overlap/slice params for a named resolution class.

    Args:
        res: resolution class of the image such as "low", "medium".
        height: image height in pixels.
        width: image width in pixels.

    Returns:
        overlap params (x_overlap, y_overlap, slice_width, slice_height)
    """
    # Orientation depends only on the aspect ratio of the image.
    orientation = calc_aspect_ratio_orientation(width=width, height=height)
    return calc_slice_and_overlap_params(resolution=res, height=height, width=width, orientation=orientation)
get_slice_bboxes(image_height, image_width, slice_height=None, slice_width=None, auto_slice_resolution=True, overlap_height_ratio=0.2, overlap_width_ratio=0.2)

Generate bounding boxes for slicing an image into crops.

The function calculates the coordinates for each slice based on the provided image dimensions, slice size, and overlap ratios. If slice size is not provided and auto_slice_resolution is True, the function will automatically determine appropriate slice parameters.

Parameters:

Name Type Description Default
image_height int

Height of the original image.

required
image_width int

Width of the original image.

required
slice_height int

Height of each slice. Default None.

None
slice_width int

Width of each slice. Default None.

None
overlap_height_ratio float

Fractional overlap in height of each slice (e.g. an overlap of 0.2 for a slice of size 100 yields an overlap of 20 pixels). Default 0.2.

0.2
overlap_width_ratio float

Fractional overlap in width of each slice (e.g. an overlap of 0.2 for a slice of size 100 yields an overlap of 20 pixels). Default 0.2.

0.2
auto_slice_resolution bool

if not set slice parameters such as slice_height and slice_width, it enables automatically calculate these parameters from image resolution and orientation.

True

Returns:

Type Description
list[list[int]]

List[List[int]]: List of 4 corner coordinates for each N slices. [ [slice_0_left, slice_0_top, slice_0_right, slice_0_bottom], ... [slice_N_left, slice_N_top, slice_N_right, slice_N_bottom] ]

Source code in sahi/slicing.py
def get_slice_bboxes(
    image_height: int,
    image_width: int,
    slice_height: int | None = None,
    slice_width: int | None = None,
    auto_slice_resolution: bool | None = True,
    overlap_height_ratio: float | None = 0.2,
    overlap_width_ratio: float | None = 0.2,
) -> list[list[int]]:
    """Generate bounding boxes for slicing an image into crops.

    The function calculates the coordinates for each slice based on the provided
    image dimensions, slice size, and overlap ratios. If slice size is not provided
    and auto_slice_resolution is True, the function will automatically determine
    appropriate slice parameters.

    Args:
        image_height (int): Height of the original image.
        image_width (int): Width of the original image.
        slice_height (int, optional): Height of each slice. Default None.
        slice_width (int, optional): Width of each slice. Default None.
        overlap_height_ratio (float, optional): Fractional overlap in height of each
            slice (e.g. an overlap of 0.2 for a slice of size 100 yields an
            overlap of 20 pixels). Default 0.2.
        overlap_width_ratio(float, optional): Fractional overlap in width of each
            slice (e.g. an overlap of 0.2 for a slice of size 100 yields an
            overlap of 20 pixels). Default 0.2.
        auto_slice_resolution (bool, optional): if not set slice parameters such as slice_height and slice_width,
            it enables automatically calculate these parameters from image resolution and orientation.

    Returns:
        List[List[int]]: List of 4 corner coordinates for each N slices.
            [
                [slice_0_left, slice_0_top, slice_0_right, slice_0_bottom],
                ...
                [slice_N_left, slice_N_top, slice_N_right, slice_N_bottom]
            ]
    """
    slice_bboxes = []
    y_max = y_min = 0

    if slice_height and slice_width:
        if overlap_height_ratio is not None and overlap_height_ratio >= 1.0:
            raise ValueError("Overlap ratio must be less than 1.0")
        if overlap_width_ratio is not None and overlap_width_ratio >= 1.0:
            raise ValueError("Overlap ratio must be less than 1.0")
        y_overlap = int((overlap_height_ratio if overlap_height_ratio is not None else 0.2) * slice_height)
        x_overlap = int((overlap_width_ratio if overlap_width_ratio is not None else 0.2) * slice_width)
    elif auto_slice_resolution:
        x_overlap, y_overlap, slice_width, slice_height = get_auto_slice_params(height=image_height, width=image_width)
    else:
        raise ValueError("Compute type is not auto and slice width and height are not provided.")

    while y_max < image_height:
        x_min = x_max = 0
        y_max = y_min + slice_height
        while x_max < image_width:
            x_max = x_min + slice_width
            if y_max > image_height or x_max > image_width:
                xmax = min(image_width, x_max)
                ymax = min(image_height, y_max)
                xmin = max(0, xmax - slice_width)
                ymin = max(0, ymax - slice_height)
                slice_bboxes.append([xmin, ymin, xmax, ymax])
            else:
                slice_bboxes.append([x_min, y_min, x_max, y_max])
            x_min = x_max - x_overlap
        y_min = y_max - y_overlap
    return slice_bboxes
process_coco_annotations(coco_annotation_list, slice_bbox, min_area_ratio)

Slices and filters given list of CocoAnnotation objects with given 'slice_bbox' and 'min_area_ratio'.

Parameters:

Name Type Description Default
slice_bbox List[int]

Generated from get_slice_bboxes. Format for each slice bbox: [x_min, y_min, x_max, y_max].

required
min_area_ratio float

If the cropped annotation area to original annotation ratio is smaller than this value, the annotation is filtered out. Default 0.1.

required

Returns:

Type Description
List[CocoAnnotation]

Sliced annotations.

Source code in sahi/slicing.py
def process_coco_annotations(
    coco_annotation_list: list[CocoAnnotation], slice_bbox: list[int], min_area_ratio
) -> list[CocoAnnotation]:
    """Slices and filters given list of CocoAnnotation objects with given 'slice_bbox' and 'min_area_ratio'.

    Args:
        coco_annotation_list (List[CocoAnnotation]): annotations of the full image.
        slice_bbox (List[int]): Generated from `get_slice_bboxes`.
            Format for each slice bbox: [x_min, y_min, x_max, y_max].
        min_area_ratio (float): If the cropped annotation area to original
            annotation ratio is smaller than this value, the annotation is
            filtered out. Default 0.1.

    Returns:
        (List[CocoAnnotation]): Sliced annotations.
    """
    result: list[CocoAnnotation] = []
    for annotation in coco_annotation_list:
        # Skip annotations that do not intersect the slice window at all.
        if not annotation_inside_slice(annotation.json, slice_bbox):
            continue
        sliced = annotation.get_sliced_coco_annotation(slice_bbox)
        # Keep only crops retaining a meaningful fraction of the original area.
        if sliced.area / annotation.area >= min_area_ratio:
            result.append(sliced)
    return result
shift_bboxes(bboxes, offset)

Shift bboxes w.r.t offset.

Supports Tensor, np.ndarray and list inputs.

Parameters:

Name Type Description Default
bboxes (Tensor, ndarray, list)

The bboxes need to be translated. Its shape can be (n, 4), which means (x, y, x, y).

required
offset Sequence[int]

The translation offsets with shape of (2, ).

required

Returns: Tensor, np.ndarray, list: Shifted bboxes.

Source code in sahi/slicing.py
def shift_bboxes(bboxes, offset: Sequence[int]):
    """Shift bboxes w.r.t offset.

    Args:
        bboxes (Tensor, np.ndarray, list): The bboxes need to be translated. Its shape can
            be (n, 4), which means (x, y, x, y).
        offset (Sequence[int]): The translation offsets with shape of (2, ).
    Returns:
        Tensor, np.ndarray, list: Shifted bboxes, in the same container kind
        as the input.
    """
    # Detect torch tensors via module name to avoid importing torch here.
    is_torch_tensor = type(bboxes).__module__ == "torch"

    shifted = []
    for box in bboxes:
        # Tensor / ndarray rows must become plain lists for BoundingBox.
        coords = box.tolist() if is_torch_tensor or isinstance(box, np.ndarray) else box
        shifted.append(BoundingBox(coords, shift_amount=offset).get_shifted_box().to_xyxy())

    if isinstance(bboxes, np.ndarray):
        return np.stack(shifted, axis=0)
    if is_torch_tensor:
        return bboxes.new_tensor(shifted)
    return shifted
shift_masks(masks, offset, full_shape)

Shift masks to the original image.

Parameters:

Name Type Description Default
masks ndarray

masks that need to be shifted.

required
offset Sequence[int]

The offset to translate with shape of (2, ).

required
full_shape Sequence[int]

A (height, width) tuple of the huge image's shape.

required

Returns: np.ndarray: Shifted masks.

Source code in sahi/slicing.py
def shift_masks(masks: np.ndarray, offset: Sequence[int], full_shape: Sequence[int]) -> np.ndarray:
    """Shift masks to the original image.

    Args:
        masks (np.ndarray): masks that need to be shifted.
        offset (Sequence[int]): The offset to translate with shape of (2, ).
        full_shape (Sequence[int]): A (height, width) tuple of the huge image's shape.
    Returns:
        np.ndarray: Shifted masks (or the input unchanged when it is None).
    """
    if masks is None:
        # Nothing to shift (e.g. detection-only predictions).
        return masks

    shifted = [
        Mask(segmentation=m, shift_amount=offset, full_shape=full_shape).get_shifted_mask().bool_mask
        for m in masks
    ]
    return np.stack(shifted, axis=0)
slice_coco(coco_annotation_file_path, image_dir, output_coco_annotation_file_name, output_dir=None, ignore_negative_samples=False, slice_height=512, slice_width=512, overlap_height_ratio=0.2, overlap_width_ratio=0.2, min_area_ratio=0.1, out_ext=None, verbose=False, exif_fix=True)

Slice large images given in a directory, into smaller windows. If output_dir is given, export sliced images and coco file.

Parameters:

Name Type Description Default
coco_annotation_file_path str

Location of the coco annotation file

required
image_dir str

Base directory for the images

required
output_coco_annotation_file_name str

File name of the exported coco dataset json.

required
output_dir str

Output directory

None
ignore_negative_samples bool

If True, images without annotations are ignored. Defaults to False.

False
slice_height int

Height of each slice. Default 512.

512
slice_width int

Width of each slice. Default 512.

512
overlap_height_ratio float

Fractional overlap in height of each slice (e.g. an overlap of 0.2 for a slice of size 100 yields an overlap of 20 pixels). Default 0.2.

0.2
overlap_width_ratio float

Fractional overlap in width of each slice (e.g. an overlap of 0.2 for a slice of size 100 yields an overlap of 20 pixels). Default 0.2.

0.2
min_area_ratio float

If the cropped annotation area to original annotation ratio is smaller than this value, the annotation is filtered out. Default 0.1.

0.1
out_ext str

Extension of saved images. Default is the original suffix.

None
verbose bool

Switch to print relevant values to screen.

False
exif_fix bool

Whether to apply an EXIF fix to the image.

True

Returns:

Name Type Description
coco_dict list[dict | str]

dict COCO dict for sliced images and annotations

save_path list[dict | str]

str Path to the saved coco file

Source code in sahi/slicing.py
def slice_coco(
    coco_annotation_file_path: str,
    image_dir: str,
    output_coco_annotation_file_name: str,
    output_dir: str | None = None,
    ignore_negative_samples: bool | None = False,
    slice_height: int | None = 512,
    slice_width: int | None = 512,
    overlap_height_ratio: float | None = 0.2,
    overlap_width_ratio: float | None = 0.2,
    min_area_ratio: float | None = 0.1,
    out_ext: str | None = None,
    verbose: bool | None = False,
    exif_fix: bool = True,
) -> tuple[dict, str | Path]:
    """Slice large images given in a directory, into smaller windows. If output_dir is given, export sliced images and
    coco file.

    Args:
        coco_annotation_file_path (str): Location of the coco annotation file
        image_dir (str): Base directory for the images
        output_coco_annotation_file_name (str): File name of the exported coco
            dataset json.
        output_dir (str, optional): Output directory
        ignore_negative_samples (bool, optional): If True, images without annotations
            are ignored. Defaults to False.
        slice_height (int, optional): Height of each slice. Default 512.
        slice_width (int, optional): Width of each slice. Default 512.
        overlap_height_ratio (float, optional): Fractional overlap in height of each
            slice (e.g. an overlap of 0.2 for a slice of size 100 yields an
            overlap of 20 pixels). Default 0.2.
        overlap_width_ratio (float, optional): Fractional overlap in width of each
            slice (e.g. an overlap of 0.2 for a slice of size 100 yields an
            overlap of 20 pixels). Default 0.2.
        min_area_ratio (float): If the cropped annotation area to original annotation
            ratio is smaller than this value, the annotation is filtered out. Default 0.1.
        out_ext (str, optional): Extension of saved images. Default is the
            original suffix.
        verbose (bool, optional): Switch to print relevant values to screen.
        exif_fix (bool, optional): Whether to apply an EXIF fix to the image.

    Returns:
        coco_dict: dict
            COCO dict for sliced images and annotations
        save_path: str
            Path to the saved coco file; empty string when no export was requested.
            NOTE(review): when a file is written this is actually a pathlib.Path,
            not a str — confirm whether callers rely on str.
    """

    # read coco file
    coco_dict: dict = load_json(coco_annotation_file_path)
    # create image_id_to_annotation_list mapping
    coco = Coco.from_coco_dict_or_path(coco_dict)
    # init sliced coco_utils.CocoImage list
    sliced_coco_images: list = []

    # iterate over images and slice
    for idx, coco_image in enumerate(tqdm(coco.images)):
        # get image path
        image_path: str = os.path.join(image_dir, coco_image.file_name)
        # get annotation json list corresponding to selected coco image
        # slice image
        try:
            slice_image_result = slice_image(
                image=image_path,
                coco_annotation_list=coco_image.annotations,
                # idx suffix keeps slice names unique across same-named files
                output_file_name=f"{Path(coco_image.file_name).stem}_{idx}",
                output_dir=output_dir,
                slice_height=slice_height,
                slice_width=slice_width,
                overlap_height_ratio=overlap_height_ratio,
                overlap_width_ratio=overlap_width_ratio,
                min_area_ratio=min_area_ratio,
                out_ext=out_ext,
                verbose=verbose,
                exif_fix=exif_fix,
            )
            # append slice outputs
            sliced_coco_images.extend(slice_image_result.coco_images)
        except TopologicalError:
            # Degenerate annotation geometry: best-effort skip of the image.
            logger.warning(f"Invalid annotation found, skipping this image: {image_path}")

    # create and save coco dict
    coco_dict = create_coco_dict(
        sliced_coco_images, coco_dict["categories"], ignore_negative_samples=ignore_negative_samples
    )
    save_path = ""
    # Export only when both a file name and an output directory are given.
    if output_coco_annotation_file_name and output_dir:
        save_path = Path(output_dir) / (output_coco_annotation_file_name + "_coco.json")
        save_json(coco_dict, save_path)

    return coco_dict, save_path
slice_image(image, coco_annotation_list=None, output_file_name=None, output_dir=None, slice_height=None, slice_width=None, overlap_height_ratio=0.2, overlap_width_ratio=0.2, auto_slice_resolution=True, min_area_ratio=0.1, out_ext=None, verbose=False, exif_fix=True)

Slice a large image into smaller windows. If output_file_name and output_dir is given, export sliced images.

Parameters:

Name Type Description Default
image str or Image

File path of image or Pillow Image to be sliced.

required
coco_annotation_list List[CocoAnnotation]

List of CocoAnnotation objects.

None
output_file_name str

Root name of output files (coordinates will be appended to this)

None
output_dir str

Output directory

None
slice_height int

Height of each slice. Default None.

None
slice_width int

Width of each slice. Default None.

None
overlap_height_ratio float

Fractional overlap in height of each slice (e.g. an overlap of 0.2 for a slice of size 100 yields an overlap of 20 pixels). Default 0.2.

0.2
overlap_width_ratio float

Fractional overlap in width of each slice (e.g. an overlap of 0.2 for a slice of size 100 yields an overlap of 20 pixels). Default 0.2.

0.2
auto_slice_resolution bool

if not set slice parameters such as slice_height and slice_width, it enables automatically calculate these params from image resolution and orientation.

True
min_area_ratio float

If the cropped annotation area to original annotation ratio is smaller than this value, the annotation is filtered out. Default 0.1.

0.1
out_ext str

Extension of saved images. Default is the original suffix for lossless image formats and png for lossy formats ('.jpg','.jpeg').

None
verbose bool

Switch to print relevant values to screen. Default 'False'.

False
exif_fix bool

Whether to apply an EXIF fix to the image.

True

Returns:

Name Type Description
sliced_image_result SliceImageResult

SliceImageResult: sliced_image_list: list of SlicedImage image_dir: str Directory of the sliced image exports. original_image_size: list of int Size of the unsliced original image in [height, width]

Source code in sahi/slicing.py
def slice_image(
    image: str | Image.Image,
    coco_annotation_list: list[CocoAnnotation] | None = None,
    output_file_name: str | None = None,
    output_dir: str | None = None,
    slice_height: int | None = None,
    slice_width: int | None = None,
    overlap_height_ratio: float | None = 0.2,
    overlap_width_ratio: float | None = 0.2,
    auto_slice_resolution: bool | None = True,
    min_area_ratio: float | None = 0.1,
    out_ext: str | None = None,
    verbose: bool | None = False,
    exif_fix: bool = True,
) -> SliceImageResult:
    """Slice a large image into smaller windows. If output_file_name and output_dir are given, export sliced images.

    Args:
        image (str or PIL.Image): File path of image or Pillow Image to be sliced.
        coco_annotation_list (List[CocoAnnotation], optional): List of CocoAnnotation objects.
        output_file_name (str, optional): Root name of output files (coordinates will
            be appended to this)
        output_dir (str, optional): Output directory
        slice_height (int, optional): Height of each slice. Default None.
        slice_width (int, optional): Width of each slice. Default None.
        overlap_height_ratio (float, optional): Fractional overlap in height of each
            slice (e.g. an overlap of 0.2 for a slice of size 100 yields an
            overlap of 20 pixels). Default 0.2.
        overlap_width_ratio (float, optional): Fractional overlap in width of each
            slice (e.g. an overlap of 0.2 for a slice of size 100 yields an
            overlap of 20 pixels). Default 0.2.
        auto_slice_resolution (bool, optional): if slice parameters such as slice_height and
            slice_width are not set, they are automatically calculated from image resolution
            and orientation.
        min_area_ratio (float, optional): If the cropped annotation area to original annotation
            ratio is smaller than this value, the annotation is filtered out. Default 0.1.
        out_ext (str, optional): Extension of saved images. Default is the
            original suffix for lossless image formats and png for lossy formats ('.jpg','.jpeg').
        verbose (bool, optional): Switch to print relevant values to screen.
            Default 'False'.
        exif_fix (bool): Whether to apply an EXIF fix to the image.

    Returns:
        sliced_image_result: SliceImageResult:
                                sliced_image_list: list of SlicedImage
                                image_dir: str
                                    Directory of the sliced image exports.
                                original_image_size: list of int
                                    Size of the unsliced original image in [height, width]
    """

    # define verboseprint: no-op when verbose is disabled
    verboselog = logger.info if verbose else lambda *a, **k: None

    def _export_single_slice(image: np.ndarray, output_dir: str, slice_file_name: str):
        # save one slice array to disk as an image file
        image_pil = read_image_as_pil(image, exif_fix=exif_fix)
        slice_file_path = str(Path(output_dir) / slice_file_name)
        # export sliced image
        image_pil.save(slice_file_path)
        image_pil.close()  # to fix https://github.com/obss/sahi/issues/565
        verboselog("sliced image path: " + slice_file_path)

    # create outdir if not present
    if output_dir is not None:
        Path(output_dir).mkdir(parents=True, exist_ok=True)

    # read image
    image_pil = read_image_as_pil(image, exif_fix=exif_fix)
    verboselog("image.shape: " + str(image_pil.size))

    image_width, image_height = image_pil.size
    if not (image_width != 0 and image_height != 0):
        raise RuntimeError(f"invalid image size: {image_pil.size} for 'slice_image'.")
    slice_bboxes = get_slice_bboxes(
        image_height=image_height,
        image_width=image_width,
        auto_slice_resolution=auto_slice_resolution,
        slice_height=slice_height,
        slice_width=slice_width,
        overlap_height_ratio=overlap_height_ratio,
        overlap_width_ratio=overlap_width_ratio,
    )

    n_ims = 0

    # init images and annotations lists
    sliced_image_result = SliceImageResult(original_image_size=[image_height, image_width], image_dir=output_dir)

    # determine the output suffix once: it depends only on out_ext / the source
    # image, not on the individual slice, so hoist it out of the loop
    if out_ext:
        suffix = out_ext
    elif hasattr(image_pil, "filename"):
        suffix = Path(image_pil.filename).suffix
        # re-encode lossy formats as png so repeated slicing does not degrade quality;
        # lossless (and unknown) formats keep their original suffix
        if suffix in IMAGE_EXTENSIONS_LOSSY:
            suffix = ".png"
    else:
        suffix = ".png"

    image_pil_arr = np.asarray(image_pil)
    # track the dims of the last processed slice for the summary log without
    # clobbering the slice_height/slice_width parameters
    last_slice_height, last_slice_width = slice_height, slice_width
    # iterate over slices
    for slice_bbox in slice_bboxes:
        n_ims += 1

        # extract image region [tly:bry, tlx:brx]
        tlx, tly, brx, bry = slice_bbox
        image_pil_slice = image_pil_arr[tly:bry, tlx:brx]

        # set image file name and path; coordinates are encoded in the name
        slice_suffixes = "_".join(map(str, slice_bbox))
        slice_file_name = f"{output_file_name}_{slice_suffixes}{suffix}"

        # create coco image
        last_slice_width = brx - tlx
        last_slice_height = bry - tly
        coco_image = CocoImage(file_name=slice_file_name, height=last_slice_height, width=last_slice_width)

        # append coco annotations (if present) to coco image
        if coco_annotation_list is not None:
            for sliced_coco_annotation in process_coco_annotations(coco_annotation_list, slice_bbox, min_area_ratio):
                coco_image.add_annotation(sliced_coco_annotation)

        # create sliced image and append to sliced_image_result
        sliced_image = SlicedImage(image=image_pil_slice, coco_image=coco_image, starting_pixel=[tlx, tly])
        sliced_image_result.add_sliced_image(sliced_image)

    # export slices if output directory is provided
    if output_file_name and output_dir:
        # Use a context-managed ThreadPoolExecutor for clean shutdown and
        # limit workers based on CPU count to avoid oversubscription.
        max_workers = max(1, min(MAX_WORKERS, len(sliced_image_result)))
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # map will schedule tasks and wait for completion when the context exits
            list(
                executor.map(
                    _export_single_slice,
                    sliced_image_result.images,
                    [output_dir] * len(sliced_image_result),
                    sliced_image_result.filenames,
                )
            )

    verboselog(
        "Num slices: "
        + str(n_ims)
        + " slice_height: "
        + str(last_slice_height)
        + " slice_width: "
        + str(last_slice_width)
    )

    return sliced_image_result

utils

Modules
coco
Classes
Coco
Source code in sahi/utils/coco.py
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
class Coco:
    def __init__(
        self,
        name: str | None = None,
        image_dir: str | None = None,
        remapping_dict: dict[int, int] | None = None,
        ignore_negative_samples: bool = False,
        clip_bboxes_to_img_dims: bool = False,
        image_id_setting: Literal["auto", "manual"] = "auto",
    ):
        """Creates Coco object.

        Args:
            name: str
                Name of the Coco dataset, it determines exported json name.
            image_dir: str
                Base file directory that contains dataset images. Required for dataset merging.
            remapping_dict: dict
                {1:0, 2:1} maps category id 1 to 0 and category id 2 to 1
            ignore_negative_samples: bool
                If True ignores images without annotations in all operations.
            image_id_setting: str
                how to assign image ids while exporting can be
                auto -> will assign id from scratch (<CocoImage>.id will be ignored)
                manual -> you will need to provide image ids in <CocoImage> instances (<CocoImage>.id can not be None)
        """
        if image_id_setting not in ["auto", "manual"]:
            raise ValueError("image_id_setting must be either 'auto' or 'manual'")
        self.name: str | None = name
        self.image_dir: str | None = image_dir
        self.remapping_dict: dict[int, int] | None = remapping_dict
        self.ignore_negative_samples = ignore_negative_samples
        self.categories: list[CocoCategory] = []
        self.images = []
        self._stats = None
        self.clip_bboxes_to_img_dims = clip_bboxes_to_img_dims
        self.image_id_setting = image_id_setting

    def add_categories_from_coco_category_list(self, coco_category_list):
        """Creates and adds CocoCategory objects from a COCO-format category list.

        If `self.remapping_dict` is set, each category id is remapped before the
        category is added.

        Args:
            coco_category_list: List[Dict]
                [
                    {"supercategory": "person", "id": 1, "name": "person"},
                    {"supercategory": "vehicle", "id": 2, "name": "bicycle"}
                ]
        """

        for coco_category in coco_category_list:
            # Remap each id with a single dict lookup. The previous nested-loop
            # version compared against the already-remapped id, so a remapping
            # whose target equaled a later source id could be applied twice.
            if self.remapping_dict is not None and coco_category["id"] in self.remapping_dict:
                coco_category["id"] = self.remapping_dict[coco_category["id"]]

            self.add_category(CocoCategory.from_coco_category(coco_category))

    def add_category(self, category):
        """Append a category to this dataset.

        Args:
            category: CocoCategory

        Raises:
            TypeError: if `category` is not a CocoCategory instance.
        """
        if isinstance(category, CocoCategory):
            self.categories.append(category)
        else:
            raise TypeError("category must be a CocoCategory instance")

    def add_image(self, image):
        """Adds image to this Coco instance.

        Args:
            image: CocoImage
        """

        if self.image_id_setting == "manual" and image.id is None:
            raise ValueError("image id should be manually set for image_id_setting='manual'")
        self.images.append(image)

    def update_categories(self, desired_name2id: dict[str, int], update_image_filenames: bool = False):
        """Rearranges category mapping of given COCO object based on given desired_name2id. Can also be used to filter
        some of the categories.

        Categories absent from `desired_name2id` are dropped together with their
        annotations. The instance is updated in place (its __dict__ is replaced).

        Args:
            desired_name2id: dict
                {"big_vehicle": 1, "car": 2, "human": 3}
            update_image_filenames: bool
                If True, updates coco image file_names with absolute file paths.
        """
        # init vars
        currentid2desiredid_mapping: dict[int, int | None] = {}
        updated_coco = Coco(
            name=self.name,
            image_dir=self.image_dir,
            remapping_dict=self.remapping_dict,
            ignore_negative_samples=self.ignore_negative_samples,
            # fix: propagate these settings; they were previously reset to their
            # defaults when self.__dict__ was overwritten at the end of this method
            clip_bboxes_to_img_dims=self.clip_bboxes_to_img_dims,
            image_id_setting=self.image_id_setting,
        )
        # create category id mapping (currentid2desiredid_mapping);
        # ids not present in desired_name2id map to None (filtered out below)
        for coco_category in self.categories:
            current_category_id = coco_category.id
            current_category_name = coco_category.name
            if not current_category_name:
                logger.warning("no category name provided to update categories")
                continue
            if current_category_name in desired_name2id.keys():
                currentid2desiredid_mapping[current_category_id] = desired_name2id[current_category_name]
            else:
                # ignore categories that are not included in desired_name2id
                currentid2desiredid_mapping[current_category_id] = None

        # add updated categories
        for name in desired_name2id.keys():
            updated_coco_category = CocoCategory(id=desired_name2id[name], name=name, supercategory=name)
            updated_coco.add_category(updated_coco_category)

        # add updated images & annotations
        for coco_image in copy.deepcopy(self.images):
            updated_coco_image = CocoImage.from_coco_image_dict(coco_image.json)
            # update filename to abspath
            file_name_is_abspath = True if os.path.abspath(coco_image.file_name) == coco_image.file_name else False
            if update_image_filenames and not file_name_is_abspath:
                if not self.image_dir:
                    logger.error("image directory not set")
                else:
                    updated_coco_image.file_name = str(Path(os.path.abspath(self.image_dir)) / coco_image.file_name)
            # update annotations
            for coco_annotation in coco_image.annotations:
                current_category_id = coco_annotation.category_id
                desired_category_id = currentid2desiredid_mapping[current_category_id]
                # append annotations with category id present in desired_name2id
                if desired_category_id is not None:
                    # update category id
                    coco_annotation.category_id = desired_category_id
                    # append updated annotation to target coco dict
                    updated_coco_image.add_annotation(coco_annotation)
            updated_coco.add_image(updated_coco_image)

        # overwrite instance state with the rebuilt dataset
        self.__dict__ = updated_coco.__dict__

    def merge(self, coco, desired_name2id=None, verbose=1):
        """Combines the images/annotations/categories of given coco object with current one.

        Args:
            coco : sahi.utils.coco.Coco instance
                A COCO dataset object
            desired_name2id : dict
                {"human": 1, "car": 2, "big_vehicle": 3}
            verbose: bool
                If True, merging info is printed
        """
        if self.image_dir is None or coco.image_dir is None:
            raise ValueError("image_dir should be provided for merging.")
        if verbose:
            if not desired_name2id:
                print("'desired_name2id' is not specified, combining all categories.")

        # create desired_name2id by combining all categories, if desired_name2id is not specified
        coco1 = self
        coco2 = coco
        category_ind = 0
        if desired_name2id is None:
            desired_name2id = {}
            for coco in [coco1, coco2]:
                temp_categories = copy.deepcopy(coco.json_categories)
                for temp_category in temp_categories:
                    if temp_category["name"] not in desired_name2id:
                        desired_name2id[temp_category["name"]] = category_ind
                        category_ind += 1
                    else:
                        continue

        # update categories and image paths
        for coco in [coco1, coco2]:
            coco.update_categories(desired_name2id=desired_name2id, update_image_filenames=True)

        # combine images and categories
        coco1.images.extend(coco2.images)
        self.images: list[CocoImage] = coco1.images
        self.categories = coco1.categories

        # print categories
        if verbose:
            print(
                "Categories are formed as:\n",
                self.json_categories,
            )

    @classmethod
    def from_coco_dict_or_path(
        cls,
        coco_dict_or_path: dict | str,
        image_dir: str | None = None,
        remapping_dict: dict | None = None,
        ignore_negative_samples: bool = False,
        clip_bboxes_to_img_dims: bool = False,
        use_threads: bool = False,
        num_threads: int = 10,
    ):
        """Creates coco object from COCO formatted dict or COCO dataset file path.

        Args:
            coco_dict_or_path: dict/str
                COCO formatted dict or COCO dataset file path
            image_dir: str
                Base file directory that contains dataset images. Required for merging and yolov5 conversion.
            remapping_dict: dict
                {1:0, 2:1} maps category id 1 to 0 and category id 2 to 1
            ignore_negative_samples: bool
                If True ignores images without annotations in all operations.
            clip_bboxes_to_img_dims: bool = False
                Limits bounding boxes to image dimensions.
            use_threads: bool = False
                Use threads when processing the json image list, defaults to False
            num_threads: int = 10
                Slice the image list to given number of chunks, defaults to 10

        Properties:
            images: list of CocoImage
            category_mapping: dict

        Raises:
            TypeError: if coco_dict_or_path is neither a dict nor a str.
        """
        # init coco object
        coco = cls(
            image_dir=image_dir,
            remapping_dict=remapping_dict,
            ignore_negative_samples=ignore_negative_samples,
            clip_bboxes_to_img_dims=clip_bboxes_to_img_dims,
        )

        if not isinstance(coco_dict_or_path, (str, dict)):
            raise TypeError("coco_dict_or_path should be a dict or str")

        # load coco dict if path is given
        if isinstance(coco_dict_or_path, str):
            coco_dict = load_json(coco_dict_or_path)
        else:
            coco_dict = coco_dict_or_path

        dict_size = len(coco_dict["images"])

        # arrange image id to annotation id mapping
        coco.add_categories_from_coco_category_list(coco_dict["categories"])
        image_id_to_annotation_list = get_imageid2annotationlist_mapping(coco_dict)
        category_mapping = coco.category_mapping

        # https://github.com/obss/sahi/issues/98
        image_id_set: set = set()

        lock = Lock()

        def fill_image_id_set(start, finish, image_list, _image_id_set, _image_id_to_annotation_list, _coco, lock):
            # worker: load image_list[start:finish] (and their annotations) into _coco
            for coco_image_dict in tqdm(
                image_list[start:finish], f"Loading coco annotations between {start} and {finish}"
            ):
                coco_image = CocoImage.from_coco_image_dict(coco_image_dict)
                image_id = coco_image_dict["id"]
                # https://github.com/obss/sahi/issues/98
                # the check-and-add must be atomic, otherwise two workers can both
                # pass the membership test for the same duplicate id
                with lock:
                    if image_id in _image_id_set:
                        print(f"duplicate image_id: {image_id}, will be ignored.")
                        continue
                    _image_id_set.add(image_id)

                # select annotations of the image
                annotation_list = _image_id_to_annotation_list[image_id]
                for coco_annotation_dict in annotation_list:
                    # apply category remapping if remapping_dict is provided
                    if _coco.remapping_dict is not None:
                        # apply category remapping (id:id)
                        category_id = _coco.remapping_dict[coco_annotation_dict["category_id"]]
                        # update category id
                        coco_annotation_dict["category_id"] = category_id
                    else:
                        category_id = coco_annotation_dict["category_id"]
                    # get category name (id:name)
                    category_name = category_mapping[category_id]
                    coco_annotation = CocoAnnotation.from_coco_annotation_dict(
                        category_name=category_name, annotation_dict=coco_annotation_dict
                    )
                    coco_image.add_annotation(coco_annotation)
                _coco.add_image(coco_image)

        if use_threads is True:
            # fix: the former float chunk size (dict_size / num_threads) produced
            # float slice indices, which raise TypeError; use integer ceil-division
            # so all images are covered by at most num_threads chunks
            chunk_size = max(1, -(-dict_size // num_threads))
            threads = []
            for i in range(num_threads):
                start = i * chunk_size
                finish = min(start + chunk_size, dict_size)
                if start >= finish:
                    break  # all images already assigned to earlier chunks
                t = Thread(
                    target=fill_image_id_set,
                    args=(start, finish, coco_dict["images"], image_id_set, image_id_to_annotation_list, coco, lock),
                )
                t.start()
                threads.append(t)

            # join only the threads we started (threading.currentThread() was
            # removed in Python 3.12; scanning threading.enumerate() is unnecessary)
            for t in threads:
                t.join()

        else:
            for coco_image_dict in tqdm(coco_dict["images"], "Loading coco annotations"):
                coco_image = CocoImage.from_coco_image_dict(coco_image_dict)
                image_id = coco_image_dict["id"]
                # https://github.com/obss/sahi/issues/98
                if image_id in image_id_set:
                    print(f"duplicate image_id: {image_id}, will be ignored.")
                    continue
                else:
                    image_id_set.add(image_id)
                # select annotations of the image
                annotation_list = image_id_to_annotation_list[image_id]
                # TODO: coco_annotation_dict is of type CocoAnnotation according to how image_id_to_annotation_list
                # was created. Either image_id_to_annotation_list is not defined correctly or the following
                # loop is wrong as it expects a dict.
                for coco_annotation_dict in annotation_list:
                    # apply category remapping if remapping_dict is provided
                    if coco.remapping_dict is not None:
                        # apply category remapping (id:id)
                        category_id = coco.remapping_dict[coco_annotation_dict["category_id"]]
                        # update category id
                        coco_annotation_dict["category_id"] = category_id
                    else:
                        category_id = coco_annotation_dict["category_id"]
                    # get category name (id:name)
                    category_name = category_mapping[category_id]
                    coco_annotation = CocoAnnotation.from_coco_annotation_dict(
                        category_name=category_name, annotation_dict=coco_annotation_dict
                    )
                    coco_image.add_annotation(coco_annotation)
                coco.add_image(coco_image)

        if clip_bboxes_to_img_dims:
            coco = coco.get_coco_with_clipped_bboxes()
        return coco

    @property
    def json_categories(self):
        categories = []
        for category in self.categories:
            categories.append(category.json)
        return categories

    @property
    def category_mapping(self):
        category_mapping = {}
        for category in self.categories:
            category_mapping[category.id] = category.name
        return category_mapping

    @property
    def json(self):
        """Export the whole dataset as a COCO-format dict."""
        coco_dict = create_coco_dict(
            images=self.images,
            categories=self.json_categories,
            ignore_negative_samples=self.ignore_negative_samples,
            image_id_setting=self.image_id_setting,
        )
        return coco_dict

    @property
    def prediction_array(self):
        """Export annotations as a COCO prediction (results) array."""
        predictions = create_coco_prediction_array(
            images=self.images,
            ignore_negative_samples=self.ignore_negative_samples,
            image_id_setting=self.image_id_setting,
        )
        return predictions

    @property
    def stats(self):
        if not self._stats:
            self.calculate_stats()
        return self._stats

    def calculate_stats(self):
        """Iterates over all annotations and calculates total number of."""
        # init all stats
        num_annotations = 0
        num_images = len(self.images)
        num_negative_images = 0
        num_categories = len(self.json_categories)
        category_name_to_zero = {category["name"]: 0 for category in self.json_categories}
        category_name_to_inf = {category["name"]: float("inf") for category in self.json_categories}
        num_images_per_category = copy.deepcopy(category_name_to_zero)
        num_annotations_per_category = copy.deepcopy(category_name_to_zero)
        min_annotation_area_per_category = copy.deepcopy(category_name_to_inf)
        max_annotation_area_per_category = copy.deepcopy(category_name_to_zero)
        min_num_annotations_in_image = float("inf")
        max_num_annotations_in_image = 0
        total_annotation_area = 0
        min_annotation_area = 1e10
        max_annotation_area = 0
        for image in self.images:
            image_contains_category = {}
            for annotation in image.annotations:
                annotation_area = annotation.area
                total_annotation_area += annotation_area
                num_annotations_per_category[annotation.category_name] += 1
                image_contains_category[annotation.category_name] = 1
                # update min&max annotation area
                if annotation_area > max_annotation_area:
                    max_annotation_area = annotation_area
                if annotation_area < min_annotation_area:
                    min_annotation_area = annotation_area
                if annotation_area > max_annotation_area_per_category[annotation.category_name]:
                    max_annotation_area_per_category[annotation.category_name] = annotation_area
                if annotation_area < min_annotation_area_per_category[annotation.category_name]:
                    min_annotation_area_per_category[annotation.category_name] = annotation_area
            # update num_negative_images
            if len(image.annotations) == 0:
                num_negative_images += 1
            # update num_annotations
            num_annotations += len(image.annotations)
            # update num_images_per_category
            num_images_per_category = dict(Counter(num_images_per_category) + Counter(image_contains_category))
            # update min&max_num_annotations_in_image
            num_annotations_in_image = len(image.annotations)
            if num_annotations_in_image > max_num_annotations_in_image:
                max_num_annotations_in_image = num_annotations_in_image
            if num_annotations_in_image < min_num_annotations_in_image:
                min_num_annotations_in_image = num_annotations_in_image
        if (num_images - num_negative_images) > 0:
            avg_num_annotations_in_image = num_annotations / (num_images - num_negative_images)
            avg_annotation_area = total_annotation_area / num_annotations
        else:
            avg_num_annotations_in_image = 0
            avg_annotation_area = 0

        self._stats = {
            "num_images": num_images,
            "num_annotations": num_annotations,
            "num_categories": num_categories,
            "num_negative_images": num_negative_images,
            "num_images_per_category": num_images_per_category,
            "num_annotations_per_category": num_annotations_per_category,
            "min_num_annotations_in_image": min_num_annotations_in_image,
            "max_num_annotations_in_image": max_num_annotations_in_image,
            "avg_num_annotations_in_image": avg_num_annotations_in_image,
            "min_annotation_area": min_annotation_area,
            "max_annotation_area": max_annotation_area,
            "avg_annotation_area": avg_annotation_area,
            "min_annotation_area_per_category": min_annotation_area_per_category,
            "max_annotation_area_per_category": max_annotation_area_per_category,
        }

    def split_coco_as_train_val(self, train_split_rate=0.9, numpy_seed=0):
        """Split images into train-val and returns them as sahi.utils.coco.Coco objects.

        Args:
            train_split_rate: float
                Fraction (0-1) of images assigned to the train split.
            numpy_seed: int
                random seed. Actually, this doesn't use numpy, but the random package
                from the standard library, but it is called numpy for compatibility.

        Returns:
            result : dict
                {
                    "train_coco": "",
                    "val_coco": "",
                }
        """
        # shuffle a deep copy so self.images keeps its original order
        num_images = len(self.images)
        shuffled_images = copy.deepcopy(self.images)
        random.seed(numpy_seed)
        random.shuffle(shuffled_images)
        num_train = int(num_images * train_split_rate)
        train_images = shuffled_images[:num_train]
        val_images = shuffled_images[num_train:]

        # form train val coco objects
        # BUGFIX: the old `self.name if self.name else "split" + "_train"` bound
        # the `+` before the conditional, so the "_train"/"_val" suffix was
        # dropped whenever self.name was set.
        base_name = self.name if self.name else "split"
        train_coco = Coco(
            name=base_name + "_train",
            image_dir=self.image_dir,
        )
        train_coco.images = train_images
        train_coco.categories = self.categories

        val_coco = Coco(name=base_name + "_val", image_dir=self.image_dir)
        val_coco.images = val_images
        val_coco.categories = self.categories

        # return result
        return {
            "train_coco": train_coco,
            "val_coco": val_coco,
        }

    def export_as_yolov5(
        self,
        output_dir: str | Path,
        train_split_rate: float = 1.0,
        numpy_seed: int = 0,
        mp: bool = False,
        disable_symlink: bool = False,
    ):
        """Deprecated.

        Please use export_as_yolo instead. Calls export_as_yolo with the same arguments.
        """
        # stacklevel=2 makes the warning point at the caller instead of this wrapper
        warnings.warn(
            "export_as_yolov5 is deprecated. Please use export_as_yolo instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        self.export_as_yolo(
            output_dir=output_dir,
            train_split_rate=train_split_rate,
            numpy_seed=numpy_seed,
            mp=mp,
            disable_symlink=disable_symlink,
        )

    def export_as_yolo(
        self,
        output_dir: str | Path,
        train_split_rate: float = 1.0,
        numpy_seed: int = 0,
        mp: bool = False,
        disable_symlink: bool = False,
    ):
        """Exports current COCO dataset in ultralytics/yolo format. Creates train val folders with image symlinks and
        txt files and a data yaml file.

        Args:
            output_dir: str
                Export directory.
            train_split_rate: float
                If given 1, will be exported as train split.
                If given 0, will be exported as val split.
                If in between 0-1, both train/val splits will be calculated and exported.
            numpy_seed: int
                To fix the numpy seed.
            mp: bool
                If True, multiprocess mode is on.
                Should be called in 'if __name__ == __main__:' block.
            disable_symlink: bool
                If True, symlinks will not be created. Instead, images will be copied.

        Raises:
            ImportError: If pyyaml is not installed.
            ValueError: If train_split_rate is outside the [0, 1] range.
        """
        try:
            import yaml
        except ImportError:
            raise ImportError('Please run "pip install -U pyyaml" to install yaml first for yolo formatted exporting.')

        # set split_mode (chained comparison replaces the old
        # `0 < train_split_rate and train_split_rate < 1`)
        if 0 < train_split_rate < 1:
            split_mode = "TRAINVAL"
        elif train_split_rate == 0:
            split_mode = "VAL"
        elif train_split_rate == 1:
            split_mode = "TRAIN"
        else:
            raise ValueError("train_split_rate cannot be <0 or >1")

        # split dataset
        if split_mode == "TRAINVAL":
            result = self.split_coco_as_train_val(
                train_split_rate=train_split_rate,
                numpy_seed=numpy_seed,
            )
            train_coco = result["train_coco"]
            val_coco = result["val_coco"]
        elif split_mode == "TRAIN":
            train_coco = self
            val_coco = None
        elif split_mode == "VAL":
            train_coco = None
            val_coco = self

        # create train val image dirs; an unused split keeps "" and is written
        # as an empty path into the data yaml
        train_dir = ""
        val_dir = ""
        if split_mode in ["TRAINVAL", "TRAIN"]:
            train_dir = Path(os.path.abspath(output_dir)) / "train/"
            train_dir.mkdir(parents=True, exist_ok=True)  # create dir
        if split_mode in ["TRAINVAL", "VAL"]:
            val_dir = Path(os.path.abspath(output_dir)) / "val/"
            val_dir.mkdir(parents=True, exist_ok=True)  # create dir

        # create image symlinks and annotation txts
        if split_mode in ["TRAINVAL", "TRAIN"]:
            export_yolo_images_and_txts_from_coco_object(
                output_dir=train_dir,
                coco=train_coco,
                ignore_negative_samples=self.ignore_negative_samples,
                mp=mp,
                disable_symlink=disable_symlink,
            )
        if split_mode in ["TRAINVAL", "VAL"]:
            export_yolo_images_and_txts_from_coco_object(
                output_dir=val_dir,
                coco=val_coco,
                ignore_negative_samples=self.ignore_negative_samples,
                mp=mp,
                disable_symlink=disable_symlink,
            )

        # create yolov5 data yaml
        data = {
            "train": str(train_dir),
            "val": str(val_dir),
            "nc": len(self.category_mapping),
            "names": list(self.category_mapping.values()),
        }
        yaml_path = str(Path(output_dir) / "data.yml")
        with open(yaml_path, "w") as outfile:
            yaml.dump(data, outfile, default_flow_style=None)

    def get_subsampled_coco(self, subsample_ratio: int = 2, category_id: int | None = None):
        """Subsamples images with subsample_ratio and returns as sahi.utils.coco.Coco object.

        Args:
            subsample_ratio: int
                10 means take every 10th image with its annotations
            category_id: int
                subsample only images containing given category_id, if -1 then subsamples negative samples
        Returns:
            subsampled_coco: sahi.utils.coco.Coco
        """
        subsampled_coco = Coco(
            name=self.name,
            image_dir=self.image_dir,
            remapping_dict=self.remapping_dict,
            ignore_negative_samples=self.ignore_negative_samples,
        )
        subsampled_coco.add_categories_from_coco_category_list(self.json_categories)

        # BUGFIX: the old code gated the filtered path with `if category_id:`,
        # so a valid category_id of 0 computed the filtered lists and then
        # silently ignored them. Test explicitly against None instead.
        if category_id is not None:

            def _matches(image) -> bool:
                # image matches when it contains the requested category, or when
                # category_id == -1 and the image is a negative (annotation-free) sample
                if any(annotation.category_id == category_id for annotation in image.annotations):
                    return True
                return category_id == -1 and len(image.annotations) == 0

            selected_images = [image for image in self.images if _matches(image)]
            # non-matching images are kept without subsampling (same order as before)
            for image in self.images:
                if not _matches(image):
                    subsampled_coco.add_image(image)
        else:
            selected_images = self.images

        # take every subsample_ratio'th image from the selected list
        for image_ind in range(0, len(selected_images), subsample_ratio):
            subsampled_coco.add_image(selected_images[image_ind])

        return subsampled_coco

    def get_upsampled_coco(self, upsample_ratio: int = 2, category_id: int | None = None):
        """Upsamples images with upsample_ratio and returns as sahi.utils.coco.Coco object.

        Args:
            upsample_ratio: int
                10 means copy each sample 10 times
            category_id: int
                upsample only images containing given category_id, if -1 then upsamples negative samples
        Returns:
            upsampled_coco: sahi.utils.coco.Coco
        """
        upsampled_coco = Coco(
            name=self.name,
            image_dir=self.image_dir,
            remapping_dict=self.remapping_dict,
            ignore_negative_samples=self.ignore_negative_samples,
        )
        upsampled_coco.add_categories_from_coco_category_list(self.json_categories)

        for pass_ind in range(upsample_ratio):
            for image in self.images:
                if category_id is None:
                    # no filter: every image is duplicated on every pass
                    should_add = True
                elif any(annotation.category_id == category_id for annotation in image.annotations):
                    # image contains the requested category -> duplicate it
                    should_add = True
                elif category_id == -1 and len(image.annotations) == 0:
                    # category_id -1 selects negative (annotation-free) samples
                    should_add = True
                else:
                    # non-matching images are still included exactly once (first pass)
                    should_add = pass_ind == 0

                if should_add:
                    upsampled_coco.add_image(image)

        return upsampled_coco

    def get_area_filtered_coco(self, min=0, max_val=float("inf"), intervals_per_category=None):
        """Filters annotation areas with given min and max values and returns remaining images as sahi.utils.coco.Coco
        object.

        Args:
            min: int
                minimum allowed area
            max_val: int
                maximum allowed area
            intervals_per_category: dict of dicts
                {
                    "human": {"min": 20, "max": 10000},
                    "vehicle": {"min": 50, "max": 15000},
                }
        Returns:
            area_filtered_coco: sahi.utils.coco.Coco
        """
        area_filtered_coco = Coco(
            name=self.name,
            image_dir=self.image_dir,
            remapping_dict=self.remapping_dict,
            ignore_negative_samples=self.ignore_negative_samples,
        )
        area_filtered_coco.add_categories_from_coco_category_list(self.json_categories)

        def _annotation_in_bounds(annotation) -> bool:
            # a category-specific interval applies in addition to the global [min, max_val]
            if intervals_per_category is not None and annotation.category_name in intervals_per_category:
                bounds = intervals_per_category[annotation.category_name]
                if not (bounds["min"] <= annotation.area <= bounds["max"]):
                    return False
            return min <= annotation.area <= max_val

        for image in self.images:
            # an image survives only if every one of its annotations is within bounds
            if all(_annotation_in_bounds(annotation) for annotation in image.annotations):
                area_filtered_coco.add_image(image)

        return area_filtered_coco

    def get_coco_with_clipped_bboxes(self):
        """Limits overflowing bounding boxes to image dimensions."""
        from sahi.slicing import annotation_inside_slice

        clipped_coco = Coco(
            name=self.name,
            image_dir=self.image_dir,
            remapping_dict=self.remapping_dict,
            ignore_negative_samples=self.ignore_negative_samples,
        )
        clipped_coco.add_categories_from_coco_category_list(self.json_categories)

        for source_image in self.images:
            # full-image bounds used as the clipping slice
            # (assumes [xmin, ymin, xmax, ymax] order — matches sahi slice_bbox usage)
            image_bounds = [0, 0, source_image.width, source_image.height]
            target_image = CocoImage(
                file_name=source_image.file_name,
                height=source_image.height,
                width=source_image.width,
                id=source_image.id,
            )
            for source_ann in source_image.annotations:
                # annotations entirely outside the image are dropped
                if not annotation_inside_slice(annotation=source_ann.json, slice_bbox=image_bounds):
                    continue
                clipped_ann = source_ann.get_sliced_coco_annotation(image_bounds)
                clipped_bbox = ShapelyAnnotation.to_xywh(clipped_ann._shapely_annotation)
                target_image.add_annotation(
                    CocoAnnotation(
                        bbox=clipped_bbox,
                        category_id=source_ann.category_id,
                        category_name=source_ann.category_name,
                        image_id=source_ann.image_id,
                    )
                )
            clipped_coco.add_image(target_image)
        return clipped_coco
Functions
__init__(name=None, image_dir=None, remapping_dict=None, ignore_negative_samples=False, clip_bboxes_to_img_dims=False, image_id_setting='auto')

Creates Coco object.

Parameters:

Name Type Description Default
name str | None

str Name of the Coco dataset, it determines exported json name.

None
image_dir str | None

str Base file directory that contains dataset images. Required for dataset merging.

None
remapping_dict dict[int, int] | None

dict {1:0, 2:1} maps category id 1 to 0 and category id 2 to 1

None
ignore_negative_samples bool

bool If True ignores images without annotations in all operations.

False
image_id_setting Literal['auto', 'manual']

str how to assign image ids while exporting can be auto -> will assign id from scratch (.id will be ignored) manual -> you will need to provide image ids in instances (.id can not be None)

'auto'
Source code in sahi/utils/coco.py
def __init__(
    self,
    name: str | None = None,
    image_dir: str | None = None,
    remapping_dict: dict[int, int] | None = None,
    ignore_negative_samples: bool = False,
    clip_bboxes_to_img_dims: bool = False,
    image_id_setting: Literal["auto", "manual"] = "auto",
):
    """Creates Coco object.

    Args:
        name: str
            Name of the Coco dataset, it determines exported json name.
        image_dir: str
            Base file directory that contains dataset images. Required for dataset merging.
        remapping_dict: dict
            {1:0, 2:1} maps category id 1 to 0 and category id 2 to 1
        ignore_negative_samples: bool
            If True ignores images without annotations in all operations.
        image_id_setting: str
            how to assign image ids while exporting can be
            auto -> will assign id from scratch (<CocoImage>.id will be ignored)
            manual -> you will need to provide image ids in <CocoImage> instances (<CocoImage>.id can not be None)
    """
    # guard clause: only the two documented id-assignment modes are accepted
    if image_id_setting not in ("auto", "manual"):
        raise ValueError("image_id_setting must be either 'auto' or 'manual'")

    # dataset identity / location
    self.name: str | None = name
    self.image_dir: str | None = image_dir
    # behavior flags
    self.remapping_dict: dict[int, int] | None = remapping_dict
    self.ignore_negative_samples = ignore_negative_samples
    self.clip_bboxes_to_img_dims = clip_bboxes_to_img_dims
    self.image_id_setting = image_id_setting
    # containers, filled later via add_category / add_image
    self.categories: list[CocoCategory] = []
    self.images = []
    # lazily-computed statistics cache
    self._stats = None
add_categories_from_coco_category_list(coco_category_list)

Creates CocoCategory object using coco category list.

Parameters:

Name Type Description Default
coco_category_list

List[Dict] [ {"supercategory": "person", "id": 1, "name": "person"}, {"supercategory": "vehicle", "id": 2, "name": "bicycle"} ]

required
Source code in sahi/utils/coco.py
def add_categories_from_coco_category_list(self, coco_category_list):
    """Creates CocoCategory object using coco category list.

    Args:
        coco_category_list: List[Dict]
            [
                {"supercategory": "person", "id": 1, "name": "person"},
                {"supercategory": "vehicle", "id": 2, "name": "bicycle"}
            ]
    """
    for coco_category in coco_category_list:
        # Single dict lookup instead of scanning every remapping key.
        # Also fixes chained remapping: with the old key-by-key scan and a
        # remapping like {1: 2, 2: 3}, an id remapped to 2 early in the scan
        # could be remapped again to 3 later in the same pass.
        # NOTE: mutates the caller's dict in place (same as the original code).
        if self.remapping_dict is not None and coco_category["id"] in self.remapping_dict:
            coco_category["id"] = self.remapping_dict[coco_category["id"]]

        self.add_category(CocoCategory.from_coco_category(coco_category))
add_category(category)

Adds category to this Coco instance.

Parameters:

Name Type Description Default
category

CocoCategory

required
Source code in sahi/utils/coco.py
def add_category(self, category):
    """Adds category to this Coco instance.

    Args:
        category: CocoCategory

    Raises:
        TypeError: If `category` is not a CocoCategory instance.
    """
    # fail fast on anything that is not a CocoCategory
    if not isinstance(category, CocoCategory):
        raise TypeError("category must be a CocoCategory instance")
    self.categories.append(category)
add_image(image)

Adds image to this Coco instance.

Parameters:

Name Type Description Default
image

CocoImage

required
Source code in sahi/utils/coco.py
def add_image(self, image):
    """Adds image to this Coco instance.

    Args:
        image: CocoImage

    Raises:
        ValueError: If image_id_setting is 'manual' and the image has no id.
    """
    # in 'manual' mode the caller must have assigned an explicit image id
    manual_mode = self.image_id_setting == "manual"
    if manual_mode and image.id is None:
        raise ValueError("image id should be manually set for image_id_setting='manual'")
    self.images.append(image)
calculate_stats()

Iterates over all images and annotations and calculates dataset statistics such as counts, per-category totals, and min/max/average annotation areas.

Source code in sahi/utils/coco.py
def calculate_stats(self):
    """Iterates over all images and annotations and caches dataset statistics in self._stats.

    Collected values include image/annotation/category counts, per-category
    image and annotation counts, min/max/average annotations per image, and
    min/max/average annotation areas (overall and per category).
    """
    # init all stats
    num_annotations = 0
    num_images = len(self.images)
    num_negative_images = 0
    num_categories = len(self.json_categories)
    # template dicts keyed by category name, deep-copied into each accumulator
    category_name_to_zero = {category["name"]: 0 for category in self.json_categories}
    category_name_to_inf = {category["name"]: float("inf") for category in self.json_categories}
    num_images_per_category = copy.deepcopy(category_name_to_zero)
    num_annotations_per_category = copy.deepcopy(category_name_to_zero)
    min_annotation_area_per_category = copy.deepcopy(category_name_to_inf)
    max_annotation_area_per_category = copy.deepcopy(category_name_to_zero)
    min_num_annotations_in_image = float("inf")
    max_num_annotations_in_image = 0
    total_annotation_area = 0
    # NOTE(review): global minimum starts at 1e10 while the per-category one
    # starts at float("inf") — inconsistent sentinels; confirm intent
    min_annotation_area = 1e10
    max_annotation_area = 0
    for image in self.images:
        # categories present in this image (used for num_images_per_category)
        image_contains_category = {}
        for annotation in image.annotations:
            annotation_area = annotation.area
            total_annotation_area += annotation_area
            num_annotations_per_category[annotation.category_name] += 1
            image_contains_category[annotation.category_name] = 1
            # update min&max annotation area
            if annotation_area > max_annotation_area:
                max_annotation_area = annotation_area
            if annotation_area < min_annotation_area:
                min_annotation_area = annotation_area
            if annotation_area > max_annotation_area_per_category[annotation.category_name]:
                max_annotation_area_per_category[annotation.category_name] = annotation_area
            if annotation_area < min_annotation_area_per_category[annotation.category_name]:
                min_annotation_area_per_category[annotation.category_name] = annotation_area
        # update num_negative_images
        if len(image.annotations) == 0:
            num_negative_images += 1
        # update num_annotations
        num_annotations += len(image.annotations)
        # update num_images_per_category
        # Counter addition merges the per-image presence flags into the running totals
        num_images_per_category = dict(Counter(num_images_per_category) + Counter(image_contains_category))
        # update min&max_num_annotations_in_image
        num_annotations_in_image = len(image.annotations)
        if num_annotations_in_image > max_num_annotations_in_image:
            max_num_annotations_in_image = num_annotations_in_image
        if num_annotations_in_image < min_num_annotations_in_image:
            min_num_annotations_in_image = num_annotations_in_image
    # averages are over positive (annotated) images only; 0 when there are none
    if (num_images - num_negative_images) > 0:
        avg_num_annotations_in_image = num_annotations / (num_images - num_negative_images)
        avg_annotation_area = total_annotation_area / num_annotations
    else:
        avg_num_annotations_in_image = 0
        avg_annotation_area = 0

    self._stats = {
        "num_images": num_images,
        "num_annotations": num_annotations,
        "num_categories": num_categories,
        "num_negative_images": num_negative_images,
        "num_images_per_category": num_images_per_category,
        "num_annotations_per_category": num_annotations_per_category,
        "min_num_annotations_in_image": min_num_annotations_in_image,
        "max_num_annotations_in_image": max_num_annotations_in_image,
        "avg_num_annotations_in_image": avg_num_annotations_in_image,
        "min_annotation_area": min_annotation_area,
        "max_annotation_area": max_annotation_area,
        "avg_annotation_area": avg_annotation_area,
        "min_annotation_area_per_category": min_annotation_area_per_category,
        "max_annotation_area_per_category": max_annotation_area_per_category,
    }
export_as_yolo(output_dir, train_split_rate=1.0, numpy_seed=0, mp=False, disable_symlink=False)

Exports current COCO dataset in ultralytics/yolo format. Creates train val folders with image symlinks and txt files and a data yaml file.

Parameters:

Name Type Description Default
output_dir str | Path

str Export directory.

required
train_split_rate float

float If given 1, will be exported as train split. If given 0, will be exported as val split. If in between 0-1, both train/val splits will be calculated and exported.

1.0
numpy_seed int

int To fix the numpy seed.

0
mp bool

bool If True, multiprocess mode is on. Should be called in an `if __name__ == "__main__":` block.

False
disable_symlink bool

bool If True, symlinks will not be created. Instead, images will be copied.

False
Source code in sahi/utils/coco.py
def export_as_yolo(
    self,
    output_dir: str | Path,
    train_split_rate: float = 1.0,
    numpy_seed: int = 0,
    mp: bool = False,
    disable_symlink: bool = False,
):
    """Exports current COCO dataset in ultralytics/yolo format. Creates train val folders with image symlinks and
    txt files and a data yaml file.

    Args:
        output_dir: str
            Export directory.
        train_split_rate: float
            If given 1, will be exported as train split.
            If given 0, will be exported as val split.
            If in between 0-1, both train/val splits will be calculated and exported.
        numpy_seed: int
            To fix the numpy seed.
        mp: bool
            If True, multiprocess mode is on.
            Should be called in 'if __name__ == __main__:' block.
        disable_symlink: bool
            If True, symlinks will not be created. Instead, images will be copied.

    Raises:
        ImportError: If pyyaml is not installed.
        ValueError: If train_split_rate is outside the [0, 1] range.
    """
    try:
        import yaml
    except ImportError:
        raise ImportError('Please run "pip install -U pyyaml" to install yaml first for yolo formatted exporting.')

    # set split_mode (chained comparison replaces the old
    # `0 < train_split_rate and train_split_rate < 1`)
    if 0 < train_split_rate < 1:
        split_mode = "TRAINVAL"
    elif train_split_rate == 0:
        split_mode = "VAL"
    elif train_split_rate == 1:
        split_mode = "TRAIN"
    else:
        raise ValueError("train_split_rate cannot be <0 or >1")

    # split dataset
    if split_mode == "TRAINVAL":
        result = self.split_coco_as_train_val(
            train_split_rate=train_split_rate,
            numpy_seed=numpy_seed,
        )
        train_coco = result["train_coco"]
        val_coco = result["val_coco"]
    elif split_mode == "TRAIN":
        train_coco = self
        val_coco = None
    elif split_mode == "VAL":
        train_coco = None
        val_coco = self

    # create train val image dirs; an unused split keeps "" and is written
    # as an empty path into the data yaml
    train_dir = ""
    val_dir = ""
    if split_mode in ["TRAINVAL", "TRAIN"]:
        train_dir = Path(os.path.abspath(output_dir)) / "train/"
        train_dir.mkdir(parents=True, exist_ok=True)  # create dir
    if split_mode in ["TRAINVAL", "VAL"]:
        val_dir = Path(os.path.abspath(output_dir)) / "val/"
        val_dir.mkdir(parents=True, exist_ok=True)  # create dir

    # create image symlinks and annotation txts
    if split_mode in ["TRAINVAL", "TRAIN"]:
        export_yolo_images_and_txts_from_coco_object(
            output_dir=train_dir,
            coco=train_coco,
            ignore_negative_samples=self.ignore_negative_samples,
            mp=mp,
            disable_symlink=disable_symlink,
        )
    if split_mode in ["TRAINVAL", "VAL"]:
        export_yolo_images_and_txts_from_coco_object(
            output_dir=val_dir,
            coco=val_coco,
            ignore_negative_samples=self.ignore_negative_samples,
            mp=mp,
            disable_symlink=disable_symlink,
        )

    # create yolov5 data yaml
    data = {
        "train": str(train_dir),
        "val": str(val_dir),
        "nc": len(self.category_mapping),
        "names": list(self.category_mapping.values()),
    }
    yaml_path = str(Path(output_dir) / "data.yml")
    with open(yaml_path, "w") as outfile:
        yaml.dump(data, outfile, default_flow_style=None)
export_as_yolov5(output_dir, train_split_rate=1.0, numpy_seed=0, mp=False, disable_symlink=False)

Deprecated.

Please use export_as_yolo instead. Calls export_as_yolo with the same arguments.

Source code in sahi/utils/coco.py
def export_as_yolov5(
    self,
    output_dir: str | Path,
    train_split_rate: float = 1.0,
    numpy_seed: int = 0,
    mp: bool = False,
    disable_symlink: bool = False,
):
    """Deprecated.

    Please use export_as_yolo instead. Calls export_as_yolo with the same arguments.
    """
    warnings.warn(
        "export_as_yolov5 is deprecated. Please use export_as_yolo instead.",
        DeprecationWarning,
    )
    self.export_as_yolo(
        output_dir=output_dir,
        train_split_rate=train_split_rate,
        numpy_seed=numpy_seed,
        mp=mp,
        disable_symlink=disable_symlink,
    )
from_coco_dict_or_path(coco_dict_or_path, image_dir=None, remapping_dict=None, ignore_negative_samples=False, clip_bboxes_to_img_dims=False, use_threads=False, num_threads=10) classmethod

Creates coco object from COCO formatted dict or COCO dataset file path.

Parameters:

Name Type Description Default
coco_dict_or_path dict | str

dict/str or List[dict/str] COCO formatted dict or COCO dataset file path List of COCO formatted dict or COCO dataset file path

required
image_dir str | None

str Base file directory that contains dataset images. Required for merging and yolov5 conversion.

None
remapping_dict dict | None

dict {1:0, 2:1} maps category id 1 to 0 and category id 2 to 1

None
ignore_negative_samples bool

bool If True ignores images without annotations in all operations.

False
clip_bboxes_to_img_dims bool

bool = False Limits bounding boxes to image dimensions.

False
use_threads bool

bool = False Use threads when processing the json image list, defaults to False

False
num_threads int

int = 10 Slice the image list to given number of chunks, defaults to 10

10
Properties

images: list of CocoImage category_mapping: dict

Source code in sahi/utils/coco.py
@classmethod
def from_coco_dict_or_path(
    cls,
    coco_dict_or_path: dict | str,
    image_dir: str | None = None,
    remapping_dict: dict | None = None,
    ignore_negative_samples: bool = False,
    clip_bboxes_to_img_dims: bool = False,
    use_threads: bool = False,
    num_threads: int = 10,
):
    """Creates coco object from COCO formatted dict or COCO dataset file path.

    Args:
        coco_dict_or_path: dict/str or List[dict/str]
            COCO formatted dict or COCO dataset file path
            List of COCO formatted dict or COCO dataset file path
        image_dir: str
            Base file directory that contains dataset images. Required for merging and yolov5 conversion.
        remapping_dict: dict
            {1:0, 2:1} maps category id 1 to 0 and category id 2 to 1
        ignore_negative_samples: bool
            If True ignores images without annotations in all operations.
        clip_bboxes_to_img_dims: bool = False
            Limits bounding boxes to image dimensions.
        use_threads: bool = False
            Use threads when processing the json image list, defaults to False
        num_threads: int = 10
            Slice the image list to given number of chunks, defaults to 10

    Properties:
        images: list of CocoImage
        category_mapping: dict
    """
    # init coco object
    coco = cls(
        image_dir=image_dir,
        remapping_dict=remapping_dict,
        ignore_negative_samples=ignore_negative_samples,
        clip_bboxes_to_img_dims=clip_bboxes_to_img_dims,
    )

    if type(coco_dict_or_path) not in [str, dict]:
        raise TypeError("coco_dict_or_path should be a dict or str")

    # load coco dict if path is given
    if isinstance(coco_dict_or_path, str):
        coco_dict = load_json(coco_dict_or_path)
    else:
        coco_dict = coco_dict_or_path

    dict_size = len(coco_dict["images"])

    # arrange image id to annotation id mapping
    coco.add_categories_from_coco_category_list(coco_dict["categories"])
    image_id_to_annotation_list = get_imageid2annotationlist_mapping(coco_dict)
    category_mapping = coco.category_mapping

    # https://github.com/obss/sahi/issues/98
    image_id_set: set = set()

    lock = Lock()

    def fill_image_id_set(start, finish, image_list, _image_id_set, _image_id_to_annotation_list, _coco, lock):
        """Worker: loads image_list[start:finish] (with annotations) into _coco."""
        for coco_image_dict in tqdm(
            image_list[start:finish], f"Loading coco annotations between {start} and {finish}"
        ):
            coco_image = CocoImage.from_coco_image_dict(coco_image_dict)
            image_id = coco_image_dict["id"]
            # https://github.com/obss/sahi/issues/98
            # NOTE(review): the membership test runs outside the lock, so two
            # workers could race on the same duplicate id — TODO confirm ids
            # never repeat across chunks
            if image_id in _image_id_set:
                print(f"duplicate image_id: {image_id}, will be ignored.")
                continue
            else:
                lock.acquire()
                _image_id_set.add(image_id)
                lock.release()

            # select annotations of the image
            annotation_list = _image_id_to_annotation_list[image_id]
            for coco_annotation_dict in annotation_list:
                # apply category remapping if remapping_dict is provided
                if _coco.remapping_dict is not None:
                    # apply category remapping (id:id)
                    category_id = _coco.remapping_dict[coco_annotation_dict["category_id"]]
                    # update category id
                    coco_annotation_dict["category_id"] = category_id
                else:
                    category_id = coco_annotation_dict["category_id"]
                # get category name (id:name)
                category_name = category_mapping[category_id]
                coco_annotation = CocoAnnotation.from_coco_annotation_dict(
                    category_name=category_name, annotation_dict=coco_annotation_dict
                )
                coco_image.add_annotation(coco_annotation)
            _coco.add_image(coco_image)

    if use_threads is True:
        # BUGFIX: the old `chunk_size = dict_size / num_threads` produced a float,
        # and float slice bounds raise TypeError in image_list[start:finish].
        # Use integer ceiling division so every image is covered exactly once.
        chunk_size = -(-dict_size // num_threads)
        threads = []
        for i in range(num_threads):
            start = i * chunk_size
            finish = min(start + chunk_size, dict_size)
            t = Thread(
                target=fill_image_id_set,
                args=(start, finish, coco_dict["images"], image_id_set, image_id_to_annotation_list, coco, lock),
            )
            t.start()
            threads.append(t)

        # BUGFIX: join exactly the threads started above; the old code joined every
        # thread returned by threading.enumerate() (possibly unrelated ones) and
        # used threading.currentThread(), which was removed in Python 3.12.
        for t in threads:
            t.join()

    else:
        for coco_image_dict in tqdm(coco_dict["images"], "Loading coco annotations"):
            coco_image = CocoImage.from_coco_image_dict(coco_image_dict)
            image_id = coco_image_dict["id"]
            # https://github.com/obss/sahi/issues/98
            if image_id in image_id_set:
                print(f"duplicate image_id: {image_id}, will be ignored.")
                continue
            else:
                image_id_set.add(image_id)
            # select annotations of the image
            annotation_list = image_id_to_annotation_list[image_id]
            # TODO: coco_annotation_dict is of type CocoAnnotation according to how image_id_to_annotation_list
            # was created. Either image_id_to_annotation_list is not defined correctly or the following
            # loop is wrong as it expects a dict.
            for coco_annotation_dict in annotation_list:
                # apply category remapping if remapping_dict is provided
                if coco.remapping_dict is not None:
                    # apply category remapping (id:id)
                    category_id = coco.remapping_dict[coco_annotation_dict["category_id"]]
                    # update category id
                    coco_annotation_dict["category_id"] = category_id
                else:
                    category_id = coco_annotation_dict["category_id"]
                # get category name (id:name)
                category_name = category_mapping[category_id]
                coco_annotation = CocoAnnotation.from_coco_annotation_dict(
                    category_name=category_name, annotation_dict=coco_annotation_dict
                )
                coco_image.add_annotation(coco_annotation)
            coco.add_image(coco_image)

    if clip_bboxes_to_img_dims:
        coco = coco.get_coco_with_clipped_bboxes()
    return coco
get_area_filtered_coco(min=0, max_val=float('inf'), intervals_per_category=None)

Filters annotation areas with given min and max values and returns remaining images as sahi.utils.coco.Coco object.

Parameters:

Name Type Description Default
min

int minimum allowed area

0
max_val

int maximum allowed area

float('inf')
intervals_per_category

dict of dicts { "human": {"min": 20, "max": 10000}, "vehicle": {"min": 50, "max": 15000}, }

None

Returns: area_filtered_coco: sahi.utils.coco.Coco

Source code in sahi/utils/coco.py
def get_area_filtered_coco(self, min=0, max_val=float("inf"), intervals_per_category=None):
    """Return a new sahi.utils.coco.Coco keeping only images whose annotations all
    fall inside the allowed area range.

    An image is kept only when every one of its annotations satisfies the global
    [min, max_val] interval and, when a per-category interval is defined for that
    annotation's category, that interval as well. Images without annotations pass.

    Args:
        min: int
            minimum allowed annotation area (name shadows the builtin `min`
            inside this function; kept for backward compatibility)
        max_val: int
            maximum allowed annotation area
        intervals_per_category: dict of dicts
            {
                "human": {"min": 20, "max": 10000},
                "vehicle": {"min": 50, "max": 15000},
            }
    Returns:
        area_filtered_coco: sahi.utils.coco.Coco
    """
    area_filtered_coco = Coco(
        name=self.name,
        image_dir=self.image_dir,
        remapping_dict=self.remapping_dict,
        ignore_negative_samples=self.ignore_negative_samples,
    )
    area_filtered_coco.add_categories_from_coco_category_list(self.json_categories)

    def _annotation_within_bounds(annotation) -> bool:
        # per-category interval, when one is defined for this category
        if intervals_per_category is not None and annotation.category_name in intervals_per_category:
            bounds = intervals_per_category[annotation.category_name]
            if annotation.area < bounds["min"] or annotation.area > bounds["max"]:
                return False
        # the global interval always applies
        return min <= annotation.area <= max_val

    for image in self.images:
        if all(_annotation_within_bounds(annotation) for annotation in image.annotations):
            area_filtered_coco.add_image(image)

    return area_filtered_coco
get_coco_with_clipped_bboxes()

Limits overflowing bounding boxes to image dimensions.

Source code in sahi/utils/coco.py
def get_coco_with_clipped_bboxes(self):
    """Limits overflowing bounding boxes to image dimensions."""
    from sahi.slicing import annotation_inside_slice

    coco = Coco(
        name=self.name,
        image_dir=self.image_dir,
        remapping_dict=self.remapping_dict,
        ignore_negative_samples=self.ignore_negative_samples,
    )
    coco.add_categories_from_coco_category_list(self.json_categories)

    for coco_img in self.images:
        img_dims = [0, 0, coco_img.width, coco_img.height]
        coco_image = CocoImage(
            file_name=coco_img.file_name, height=coco_img.height, width=coco_img.width, id=coco_img.id
        )
        for coco_ann in coco_img.annotations:
            ann_dict: dict = coco_ann.json
            if annotation_inside_slice(annotation=ann_dict, slice_bbox=img_dims):
                shapely_ann = coco_ann.get_sliced_coco_annotation(img_dims)
                bbox = ShapelyAnnotation.to_xywh(shapely_ann._shapely_annotation)
                coco_ann_from_shapely = CocoAnnotation(
                    bbox=bbox,
                    category_id=coco_ann.category_id,
                    category_name=coco_ann.category_name,
                    image_id=coco_ann.image_id,
                )
                coco_image.add_annotation(coco_ann_from_shapely)
            else:
                continue
        coco.add_image(coco_image)
    return coco
get_subsampled_coco(subsample_ratio=2, category_id=None)

Subsamples images with subsample_ratio and returns as sahi.utils.coco.Coco object.

Parameters:

Name Type Description Default
subsample_ratio int

int 10 means take every 10th image with its annotations

2
category_id int | None

int subsample only images containing given category_id, if -1 then subsamples negative samples

None

Returns: subsampled_coco: sahi.utils.coco.Coco

Source code in sahi/utils/coco.py
def get_subsampled_coco(self, subsample_ratio: int = 2, category_id: int | None = None):
    """Subsamples images with subsample_ratio and returns as sahi.utils.coco.Coco object.

    When category_id is given, only images containing that category are
    subsampled; all other images are kept unchanged. category_id=-1 targets
    negative samples (images without annotations).

    Args:
        subsample_ratio: int
            10 means take every 10th image with its annotations
        category_id: int
            subsample only images containing given category_id, if -1 then subsamples negative samples
    Returns:
        subsampled_coco: sahi.utils.coco.Coco
    """
    subsampled_coco = Coco(
        name=self.name,
        image_dir=self.image_dir,
        remapping_dict=self.remapping_dict,
        ignore_negative_samples=self.ignore_negative_samples,
    )
    subsampled_coco.add_categories_from_coco_category_list(self.json_categories)

    if category_id is not None:
        # Single pass: partition images into those targeted for subsampling
        # and those kept as-is (the previous version scanned all images twice).
        images_that_contain_category: list[CocoImage] = []
        images_that_doesnt_contain_category: list[CocoImage] = []
        for image in self.images:
            present_category_ids = {annotation.category_id for annotation in image.annotations}
            if category_id in present_category_ids or (category_id == -1 and len(image.annotations) == 0):
                images_that_contain_category.append(image)
            else:
                images_that_doesnt_contain_category.append(image)

        # BUGFIX: previously gated by `if category_id:` which skipped this branch
        # for category_id == 0, ignoring the partition computed above.
        selected_images = images_that_contain_category
        # add images that do not contain the given category without subsampling
        for image in images_that_doesnt_contain_category:
            subsampled_coco.add_image(image)
    else:
        selected_images = self.images

    # keep every subsample_ratio-th image from the targeted set
    for image_ind in range(0, len(selected_images), subsample_ratio):
        subsampled_coco.add_image(selected_images[image_ind])

    return subsampled_coco
get_upsampled_coco(upsample_ratio=2, category_id=None)

Upsamples images with upsample_ratio and returns as sahi.utils.coco.Coco object.

Parameters:

Name Type Description Default
upsample_ratio int

int 10 means copy each sample 10 times

2
category_id int | None

int upsample only images containing given category_id, if -1 then upsamples negative samples

None

Returns: upsampled_coco: sahi.utils.coco.Coco

Source code in sahi/utils/coco.py
def get_upsampled_coco(self, upsample_ratio: int = 2, category_id: int | None = None):
    """Upsamples images with upsample_ratio and returns as sahi.utils.coco.Coco object.

    Args:
        upsample_ratio: int
            10 means copy each sample 10 times
        category_id: int
            upsample only images containing given category_id, if -1 then upsamples negative samples
    Returns:
        upsampled_coco: sahi.utils.coco.Coco
    """
    upsampled_coco = Coco(
        name=self.name,
        image_dir=self.image_dir,
        remapping_dict=self.remapping_dict,
        ignore_negative_samples=self.ignore_negative_samples,
    )
    upsampled_coco.add_categories_from_coco_category_list(self.json_categories)

    for repetition in range(upsample_ratio):
        for image in self.images:
            if category_id is None:
                # no category filter: every image is copied on every repetition
                keep = True
            else:
                present_category_ids = {annotation.category_id for annotation in image.annotations}
                keep = (
                    category_id in present_category_ids
                    # category_id == -1 targets negative samples (no annotations)
                    or (category_id == -1 and len(image.annotations) == 0)
                    # every image appears at least once, regardless of category
                    or repetition == 0
                )
            if keep:
                upsampled_coco.add_image(image)

    return upsampled_coco
merge(coco, desired_name2id=None, verbose=1)

Combines the images/annotations/categories of given coco object with current one.

Parameters:

Name Type Description Default
coco

sahi.utils.coco.Coco instance A COCO dataset object

required
desired_name2id

dict

required
verbose

bool If True, merging info is printed

1
Source code in sahi/utils/coco.py
def merge(self, coco, desired_name2id=None, verbose=1):
    """Combines the images/annotations/categories of given coco object with current one.

    Mutates both self and the given coco object (categories are remapped and
    image file names converted to absolute paths on both).

    Args:
        coco : sahi.utils.coco.Coco instance
            A COCO dataset object
        desired_name2id : dict
            {"human": 1, "car": 2, "big_vehicle": 3}
        verbose: bool
            If True, merging info is printed
    """
    if self.image_dir is None or coco.image_dir is None:
        raise ValueError("image_dir should be provided for merging.")
    if verbose and not desired_name2id:
        print("'desired_name2id' is not specified, combining all categories.")

    base_coco = self
    other_coco = coco

    # when no mapping is given, build one from the union of both category sets
    if desired_name2id is None:
        desired_name2id = {}
        next_category_id = 0
        for dataset in (base_coco, other_coco):
            for category in copy.deepcopy(dataset.json_categories):
                if category["name"] not in desired_name2id:
                    desired_name2id[category["name"]] = next_category_id
                    next_category_id += 1

    # remap categories and convert image file names to absolute paths on both datasets
    for dataset in (base_coco, other_coco):
        dataset.update_categories(desired_name2id=desired_name2id, update_image_filenames=True)

    # merge image lists and adopt the unified category set
    base_coco.images.extend(other_coco.images)
    self.images: list[CocoImage] = base_coco.images
    self.categories = base_coco.categories

    if verbose:
        print(
            "Categories are formed as:\n",
            self.json_categories,
        )
split_coco_as_train_val(train_split_rate=0.9, numpy_seed=0)

Split images into train-val and returns them as sahi.utils.coco.Coco objects.

Parameters:

Name Type Description Default
train_split_rate

float

0.9
numpy_seed

int random seed. Despite the name, this seeds the standard-library random module rather than numpy; the name is kept for backward compatibility.

0

Returns:

Name Type Description
result

dict { "train_coco": "", "val_coco": "", }

Source code in sahi/utils/coco.py
def split_coco_as_train_val(self, train_split_rate=0.9, numpy_seed=0):
    """Split images into train-val and returns them as sahi.utils.coco.Coco objects.

    Args:
        train_split_rate: float
            fraction of images assigned to the train split
        numpy_seed: int
            random seed. Despite the name (kept for backward compatibility),
            this seeds the standard-library random module, not numpy.

    Returns:
        result : dict
            {
                "train_coco": "",
                "val_coco": "",
            }
    """
    # shuffle a deep copy so the original image list is left untouched
    num_images = len(self.images)
    shuffled_images = copy.deepcopy(self.images)
    random.seed(numpy_seed)
    random.shuffle(shuffled_images)
    num_train = int(num_images * train_split_rate)
    train_images = shuffled_images[:num_train]
    val_images = shuffled_images[num_train:]

    # BUGFIX: previously `self.name if self.name else "split" + "_train"` parsed as
    # `self.name if self.name else ("split_train")` due to operator precedence, so
    # when self.name was set both splits shared the same unsuffixed name.
    base_name = self.name if self.name else "split"

    # form train val coco objects
    train_coco = Coco(name=base_name + "_train", image_dir=self.image_dir)
    train_coco.images = train_images
    train_coco.categories = self.categories

    val_coco = Coco(name=base_name + "_val", image_dir=self.image_dir)
    val_coco.images = val_images
    val_coco.categories = self.categories

    # return result
    return {
        "train_coco": train_coco,
        "val_coco": val_coco,
    }
update_categories(desired_name2id, update_image_filenames=False)

Rearranges category mapping of given COCO object based on given desired_name2id. Can also be used to filter some of the categories.

Parameters:

Name Type Description Default
desired_name2id dict[str, int]

dict

required
update_image_filenames bool

bool If True, updates coco image file_names with absolute file paths.

False
Source code in sahi/utils/coco.py
def update_categories(self, desired_name2id: dict[str, int], update_image_filenames: bool = False):
    """Rearranges category mapping of given COCO object based on given desired_name2id.

    Categories absent from desired_name2id are dropped together with their
    annotations, so this can also be used to filter categories. Mutates
    self in place.

    Args:
        desired_name2id: dict
            {"big_vehicle": 1, "car": 2, "human": 3}
        update_image_filenames: bool
            If True, updates coco image file_names with absolute file paths.
    """
    updated_coco = Coco(
        name=self.name,
        image_dir=self.image_dir,
        remapping_dict=self.remapping_dict,
        ignore_negative_samples=self.ignore_negative_samples,
    )

    # map current category id -> desired id (None means: drop this category)
    currentid2desiredid_mapping: dict[int, int | None] = {}
    for coco_category in self.categories:
        if not coco_category.name:
            logger.warning("no category name provided to update categories")
            continue
        currentid2desiredid_mapping[coco_category.id] = desired_name2id.get(coco_category.name)

    # register the desired categories on the new coco object
    for category_name, category_id in desired_name2id.items():
        updated_coco.add_category(
            CocoCategory(id=category_id, name=category_name, supercategory=category_name)
        )

    # rebuild images, keeping only annotations whose category survives the remap
    for coco_image in copy.deepcopy(self.images):
        updated_coco_image = CocoImage.from_coco_image_dict(coco_image.json)
        # convert file name to an absolute path when requested and not already absolute
        if update_image_filenames and os.path.abspath(coco_image.file_name) != coco_image.file_name:
            if not self.image_dir:
                logger.error("image directory not set")
            else:
                updated_coco_image.file_name = str(Path(os.path.abspath(self.image_dir)) / coco_image.file_name)
        for coco_annotation in coco_image.annotations:
            desired_category_id = currentid2desiredid_mapping[coco_annotation.category_id]
            if desired_category_id is not None:
                coco_annotation.category_id = desired_category_id
                updated_coco_image.add_annotation(coco_annotation)
        updated_coco.add_image(updated_coco_image)

    # replace this instance's state with the rebuilt dataset
    self.__dict__ = updated_coco.__dict__
CocoAnnotation

COCO formatted annotation.

Source code in sahi/utils/coco.py
class CocoAnnotation:
    """COCO formatted annotation.

    Wraps a single object annotation (bbox and/or polygon segmentation) and
    keeps an internal ShapelyAnnotation for geometric operations.
    """

    @classmethod
    def from_coco_segmentation(cls, segmentation, category_id, category_name, iscrowd=0):
        """Creates CocoAnnotation object using coco segmentation.

        Args:
            segmentation: List[List]
                [[1, 1, 325, 125, 250, 200, 5, 200]]
            category_id: int
                Category id of the annotation
            category_name: str
                Category name of the annotation
            iscrowd: int
                0 or 1
        """
        return cls(
            segmentation=segmentation,
            category_id=category_id,
            category_name=category_name,
            iscrowd=iscrowd,
        )

    @classmethod
    def from_coco_bbox(cls, bbox, category_id, category_name, iscrowd=0):
        """Creates CocoAnnotation object using coco bbox.

        Args:
            bbox: List
                [xmin, ymin, width, height]
            category_id: int
                Category id of the annotation
            category_name: str
                Category name of the annotation
            iscrowd: int
                0 or 1
        """
        return cls(
            bbox=bbox,
            category_id=category_id,
            category_name=category_name,
            iscrowd=iscrowd,
        )

    @classmethod
    def from_coco_annotation_dict(cls, annotation_dict: dict, category_name: str | None = None):
        """Creates CocoAnnotation object from category name and COCO formatted annotation dict (with fields "bbox",
        "segmentation", "category_id").

        Polygon segmentations take precedence over the bbox when present;
        RLE (dict-typed) segmentations are unsupported and are skipped with a
        warning, falling back to the bbox.

        Args:
            category_name: str
                Category name of the annotation
            annotation_dict: dict
                COCO formatted annotation dict (with fields "bbox", "segmentation", "category_id")
        """
        # RLE segmentations are encoded as dicts, polygon segmentations as list of lists
        has_rle_segmentation = isinstance(annotation_dict.get("segmentation"), dict)
        if has_rle_segmentation:
            # .get so a malformed dict without "id" still warns instead of raising KeyError
            logger.warning(
                f"Segmentation annotation for id {annotation_dict.get('id')} is skipped since "
                "RLE segmentation format is not supported."
            )

        if "segmentation" in annotation_dict and annotation_dict["segmentation"] and not has_rle_segmentation:
            return cls(
                segmentation=annotation_dict["segmentation"],
                category_id=annotation_dict["category_id"],
                category_name=category_name,
            )
        return cls(
            bbox=annotation_dict["bbox"],
            category_id=annotation_dict["category_id"],
            category_name=category_name,
        )

    @classmethod
    def from_shapely_annotation(
        cls,
        shapely_annotation: ShapelyAnnotation,
        category_id: int,
        category_name: str,
        iscrowd: int,
    ):
        """Creates CocoAnnotation object from ShapelyAnnotation object.

        Args:
            shapely_annotation (ShapelyAnnotation)
            category_id (int): Category id of the annotation
            category_name (str): Category name of the annotation
            iscrowd (int): 0 or 1
        """
        # construct with a dummy bbox, then overwrite the geometry in place
        coco_annotation = cls(
            bbox=[0, 0, 0, 0],
            category_id=category_id,
            category_name=category_name,
            iscrowd=iscrowd,
        )
        coco_annotation._segmentation = shapely_annotation.to_coco_segmentation()
        coco_annotation._shapely_annotation = shapely_annotation
        return coco_annotation

    def __init__(
        self,
        category_id: int,
        category_name: str | None = None,
        segmentation=None,
        bbox: list[int] | None = None,
        image_id=None,
        iscrowd=0,
    ):
        """Creates coco annotation object using bbox or segmentation.

        Args:
            segmentation: List[List]
                [[1, 1, 325, 125, 250, 200, 5, 200]]
            bbox: List
                [xmin, ymin, width, height]
            category_id: int
                Category id of the annotation
            category_name: str
                Category name of the annotation
            image_id: int
                Image ID of the annotation
            iscrowd: int
                0 or 1

        Raises:
            ValueError: when neither bbox nor segmentation is given.
            TypeError: when segmentation is falsy and bbox is empty.
        """
        if bbox is None and segmentation is None:
            raise ValueError("you must provide a bbox or polygon")

        self._segmentation = segmentation
        self._category_id = category_id
        self._category_name = category_name
        self._image_id = image_id
        self._iscrowd = iscrowd

        # a non-empty segmentation takes precedence over bbox for the internal geometry
        if self._segmentation:
            shapely_annotation = ShapelyAnnotation.from_coco_segmentation(segmentation=self._segmentation)
        else:
            if not bbox:
                raise TypeError("Coco bounding box not set")
            shapely_annotation = ShapelyAnnotation.from_coco_bbox(bbox=bbox)
        self._shapely_annotation = shapely_annotation

    def get_sliced_coco_annotation(self, slice_bbox: list[int]):
        """Returns a new CocoAnnotation clipped to the given [xmin, ymin, xmax, ymax] window."""
        shapely_polygon = box(slice_bbox[0], slice_bbox[1], slice_bbox[2], slice_bbox[3])
        intersection_shapely_annotation = self._shapely_annotation.get_intersection(shapely_polygon)
        return CocoAnnotation.from_shapely_annotation(
            intersection_shapely_annotation,
            category_id=self.category_id,
            category_name=self.category_name or "",
            iscrowd=self.iscrowd,
        )

    @property
    def area(self):
        """Returns area of annotation polygon (or bbox if no polygon available)"""
        return self._shapely_annotation.area

    @property
    def bbox(self):
        """Returns coco formatted bbox of the annotation as [xmin, ymin, width, height]"""
        return self._shapely_annotation.to_xywh()

    @property
    def segmentation(self):
        """Returns coco formatted segmentation of the annotation as [[1, 1, 325, 125, 250, 200, 5, 200]]"""
        if self._segmentation:
            return self._shapely_annotation.to_coco_segmentation()
        return []

    @property
    def category_id(self):
        """Returns category id of the annotation as int."""
        return self._category_id

    @category_id.setter
    def category_id(self, i):
        if not isinstance(i, int):
            raise Exception("category_id must be an integer")
        self._category_id = i

    @property
    def image_id(self):
        """Returns image id of the annotation as int."""
        return self._image_id

    @image_id.setter
    def image_id(self, i):
        if not isinstance(i, int):
            raise Exception("image_id must be an integer")
        self._image_id = i

    @property
    def category_name(self):
        """Returns category name of the annotation as str."""
        return self._category_name

    @category_name.setter
    def category_name(self, n):
        if not isinstance(n, str):
            raise Exception("category_name must be a string")
        self._category_name = n

    @property
    def iscrowd(self):
        """Returns iscrowd info of the annotation."""
        return self._iscrowd

    @property
    def json(self):
        """Returns the annotation as a COCO formatted dict."""
        return {
            "image_id": self.image_id,
            "bbox": self.bbox,
            "category_id": self.category_id,
            "segmentation": self.segmentation,
            "iscrowd": self.iscrowd,
            "area": self.area,
        }

    def serialize(self):
        """Deprecated: use the `json` property instead."""
        warnings.warn("Use json property instead of serialize method", DeprecationWarning, stacklevel=2)
        return self.json

    def __repr__(self):
        return f"""CocoAnnotation<
    image_id: {self.image_id},
    bbox: {self.bbox},
    segmentation: {self.segmentation},
    category_id: {self.category_id},
    category_name: {self.category_name},
    iscrowd: {self.iscrowd},
    area: {self.area}>"""
Attributes
area property

Returns area of annotation polygon (or bbox if no polygon available)

bbox property

Returns coco formatted bbox of the annotation as [xmin, ymin, width, height]

category_id property writable

Returns category id of the annotation as int.

category_name property writable

Returns category name of the annotation as str.

image_id property writable

Returns image id of the annotation as int.

iscrowd property

Returns iscrowd info of the annotation.

segmentation property

Returns coco formatted segmentation of the annotation as [[1, 1, 325, 125, 250, 200, 5, 200]]

Functions
__init__(category_id, category_name=None, segmentation=None, bbox=None, image_id=None, iscrowd=0)

Creates coco annotation object using bbox or segmentation.

Parameters:

Name Type Description Default
segmentation

List[List], e.g. [[1, 1, 325, 125, 250, 200, 5, 200]]

None
bbox list[int] | None

List [xmin, ymin, width, height]

None
category_id int

int Category id of the annotation

required
category_name str | None

str Category name of the annotation

None
image_id

int Image ID of the annotation

None
iscrowd

int 0 or 1

0
Source code in sahi/utils/coco.py
def __init__(
    self,
    category_id: int,
    category_name: str | None = None,
    segmentation=None,
    bbox: list[int] | None = None,
    image_id=None,
    iscrowd=0,
):
    """Creates coco annotation object using bbox or segmentation.

    A non-empty segmentation takes precedence over the bbox when both
    are supplied.

    Args:
        segmentation: List[List]
            [[1, 1, 325, 125, 250, 200, 5, 200]]
        bbox: List
            [xmin, ymin, width, height]
        category_id: int
            Category id of the annotation
        category_name: str
            Category name of the annotation
        image_id: int
            Image ID of the annotation
        iscrowd: int
            0 or 1
    """
    if bbox is None and segmentation is None:
        raise ValueError("you must provide a bbox or polygon")

    self._segmentation = segmentation
    self._category_id = category_id
    self._category_name = category_name
    self._image_id = image_id
    self._iscrowd = iscrowd

    # build the internal shapely representation from whichever geometry was given
    if self._segmentation:
        self._shapely_annotation = ShapelyAnnotation.from_coco_segmentation(segmentation=self._segmentation)
    elif bbox:
        self._shapely_annotation = ShapelyAnnotation.from_coco_bbox(bbox=bbox)
    else:
        raise TypeError("Coco bounding box not set")
from_coco_annotation_dict(annotation_dict, category_name=None) classmethod

Creates CocoAnnotation object from category name and COCO formatted annotation dict (with fields "bbox", "segmentation", "category_id").

Parameters:

Name Type Description Default
category_name str | None

str Category name of the annotation

None
annotation_dict dict

dict COCO formatted annotation dict (with fields "bbox", "segmentation", "category_id")

required
Source code in sahi/utils/coco.py
@classmethod
def from_coco_annotation_dict(cls, annotation_dict: dict, category_name: str | None = None):
    """Creates CocoAnnotation object from category name and COCO formatted annotation dict (with fields "bbox",
    "segmentation", "category_id").

    Polygon segmentations take precedence over the bbox when present; RLE
    (dict-typed) segmentations are unsupported and are skipped with a
    warning, falling back to the bbox.

    Args:
        category_name: str
            Category name of the annotation
        annotation_dict: dict
            COCO formatted annotation dict (with fields "bbox", "segmentation", "category_id")
    """
    # RLE segmentations are encoded as dicts, polygon segmentations as list of lists
    has_rle_segmentation = isinstance(annotation_dict.get("segmentation"), dict)
    if has_rle_segmentation:
        # .get so a malformed dict without "id" still warns instead of raising KeyError
        logger.warning(
            f"Segmentation annotation for id {annotation_dict.get('id')} is skipped since "
            "RLE segmentation format is not supported."
        )

    if "segmentation" in annotation_dict and annotation_dict["segmentation"] and not has_rle_segmentation:
        return cls(
            segmentation=annotation_dict["segmentation"],
            category_id=annotation_dict["category_id"],
            category_name=category_name,
        )
    return cls(
        bbox=annotation_dict["bbox"],
        category_id=annotation_dict["category_id"],
        category_name=category_name,
    )
from_coco_bbox(bbox, category_id, category_name, iscrowd=0) classmethod

Creates CocoAnnotation object using coco bbox.

Parameters:

Name Type Description Default
bbox

List [xmin, ymin, width, height]

required
category_id

int Category id of the annotation

required
category_name

str Category name of the annotation

required
iscrowd

int 0 or 1

0
Source code in sahi/utils/coco.py
@classmethod
def from_coco_bbox(cls, bbox, category_id, category_name, iscrowd=0):
    """Build a CocoAnnotation from a COCO-style bounding box.

    Args:
        bbox: List
            [xmin, ymin, width, height]
        category_id: int
            Category id of the annotation
        category_name: str
            Category name of the annotation
        iscrowd: int
            0 or 1
    """
    return cls(bbox=bbox, category_id=category_id, category_name=category_name, iscrowd=iscrowd)
from_coco_segmentation(segmentation, category_id, category_name, iscrowd=0) classmethod

Creates CocoAnnotation object using coco segmentation.

Parameters:

Name Type Description Default
segmentation

List[List], e.g. [[1, 1, 325, 125, 250, 200, 5, 200]]

required
category_id

int Category id of the annotation

required
category_name

str Category name of the annotation

required
iscrowd

int 0 or 1

0
Source code in sahi/utils/coco.py
@classmethod
def from_coco_segmentation(cls, segmentation, category_id, category_name, iscrowd=0):
    """Build a CocoAnnotation from a COCO polygon segmentation.

    Args:
        segmentation: List[List]
            [[1, 1, 325, 125, 250, 200, 5, 200]]
        category_id: int
            Category id of the annotation
        category_name: str
            Category name of the annotation
        iscrowd: int
            0 or 1
    """
    return cls(segmentation=segmentation, category_id=category_id, category_name=category_name, iscrowd=iscrowd)
from_shapely_annotation(shapely_annotation, category_id, category_name, iscrowd) classmethod

Creates CocoAnnotation object from ShapelyAnnotation object.

Parameters:

Name Type Description Default
category_id int

Category id of the annotation

required
category_name str

Category name of the annotation

required
iscrowd int

0 or 1

required
Source code in sahi/utils/coco.py
@classmethod
def from_shapely_annotation(
    cls,
    shapely_annotation: ShapelyAnnotation,
    category_id: int,
    category_name: str,
    iscrowd: int,
):
    """Build a CocoAnnotation directly from a ShapelyAnnotation object.

    A placeholder bbox is passed to the constructor; the internal geometry
    is then replaced with the given shapely annotation.

    Args:
        shapely_annotation (ShapelyAnnotation)
        category_id (int): Category id of the annotation
        category_name (str): Category name of the annotation
        iscrowd (int): 0 or 1
    """
    annotation = cls(
        bbox=[0, 0, 0, 0],  # dummy geometry, overwritten below
        category_id=category_id,
        category_name=category_name,
        iscrowd=iscrowd,
    )
    annotation._segmentation = shapely_annotation.to_coco_segmentation()
    annotation._shapely_annotation = shapely_annotation
    return annotation
CocoCategory

COCO formatted category.

Source code in sahi/utils/coco.py
class CocoCategory:
    """COCO formatted category."""

    def __init__(self, id: int = 0, name: str | None = None, supercategory: str | None = None):
        self.id = int(id)
        self.name = name
        self.supercategory = supercategory if supercategory else name

    @classmethod
    def from_coco_category(cls, category):
        """Creates CocoCategory object using coco category.

        Args:
            category: Dict
                {"supercategory": "person", "id": 1, "name": "person"},
        """
        return cls(
            id=category["id"],
            name=category["name"],
            supercategory=category["supercategory"] if "supercategory" in category else category["name"],
        )

    @property
    def json(self):
        return {
            "id": self.id,
            "name": self.name,
            "supercategory": self.supercategory,
        }

    def __repr__(self):
        return f"""CocoCategory<
    id: {self.id},
    name: {self.name},
    supercategory: {self.supercategory}>"""
Functions
from_coco_category(category) classmethod

Creates CocoCategory object using coco category.

Parameters:

Name Type Description Default
category

Dict {"supercategory": "person", "id": 1, "name": "person"},

required
Source code in sahi/utils/coco.py
@classmethod
def from_coco_category(cls, category):
    """Create a CocoCategory from a COCO category dict.

    Args:
        category: Dict
            e.g. {"supercategory": "person", "id": 1, "name": "person"}
    """
    supercategory = category.get("supercategory", category["name"])
    return cls(id=category["id"], name=category["name"], supercategory=supercategory)
CocoImage
Source code in sahi/utils/coco.py
class CocoImage:
    @classmethod
    def from_coco_image_dict(cls, image_dict):
        """Creates CocoImage object from COCO formatted image dict (with fields "id", "file_name", "height" and
        "weight").

        Args:
            image_dict: dict
                COCO formatted image dict (with fields "id", "file_name", "height" and "weight")
        """
        return cls(
            id=image_dict["id"],
            file_name=image_dict["file_name"],
            height=image_dict["height"],
            width=image_dict["width"],
        )

    def __init__(self, file_name: str, height: int, width: int, id: int | None = None):
        """Creates CocoImage object.

        Args:
            id : int
                Image id
            file_name : str
                Image path
            height : int
                Image height in pixels
            width : int
                Image width in pixels
        """
        self.id = int(id) if id else id
        self.file_name = file_name
        self.height = int(height)
        self.width = int(width)
        self.annotations = []  # list of CocoAnnotation that belong to this image
        self.predictions = []  # list of CocoPrediction that belong to this image

    def add_annotation(self, annotation):
        """Adds annotation to this CocoImage instance.

        annotation : CocoAnnotation
        """

        if not isinstance(annotation, CocoAnnotation):
            raise TypeError("annotation must be a CocoAnnotation instance")
        self.annotations.append(annotation)

    def add_prediction(self, prediction):
        """Adds prediction to this CocoImage instance.

        prediction : CocoPrediction
        """

        if not isinstance(prediction, CocoPrediction):
            raise TypeError("prediction must be a CocoPrediction instance")
        self.predictions.append(prediction)

    @property
    def json(self):
        return {
            "id": self.id,
            "file_name": self.file_name,
            "height": self.height,
            "width": self.width,
        }

    def __repr__(self):
        return f"""CocoImage<
    id: {self.id},
    file_name: {self.file_name},
    height: {self.height},
    width: {self.width},
    annotations: List[CocoAnnotation],
    predictions: List[CocoPrediction]>"""
Functions
__init__(file_name, height, width, id=None)

Creates CocoImage object.

Parameters:

Name Type Description Default
id

int Image id

required
file_name

str Image path

required
height

int Image height in pixels

required
width

int Image width in pixels

required
Source code in sahi/utils/coco.py
def __init__(self, file_name: str, height: int, width: int, id: int | None = None):
    """Create a CocoImage.

    Args:
        file_name : str
            Image path
        height : int
            Image height in pixels
        width : int
            Image width in pixels
        id : int
            Image id
    """
    # Falsy ids (e.g. None) are kept as-is; everything else is coerced to int.
    self.id = int(id) if id else id
    self.file_name = file_name
    self.height = int(height)
    self.width = int(width)
    # CocoAnnotation instances that belong to this image.
    self.annotations = []
    # CocoPrediction instances that belong to this image.
    self.predictions = []
add_annotation(annotation)

Adds annotation to this CocoImage instance.

annotation : CocoAnnotation

Source code in sahi/utils/coco.py
def add_annotation(self, annotation):
    """Adds annotation to this CocoImage instance.

    annotation : CocoAnnotation
    """
    if isinstance(annotation, CocoAnnotation):
        self.annotations.append(annotation)
    else:
        raise TypeError("annotation must be a CocoAnnotation instance")
add_prediction(prediction)

Adds prediction to this CocoImage instance.

prediction : CocoPrediction

Source code in sahi/utils/coco.py
def add_prediction(self, prediction):
    """Adds prediction to this CocoImage instance.

    prediction : CocoPrediction
    """
    if isinstance(prediction, CocoPrediction):
        self.predictions.append(prediction)
    else:
        raise TypeError("prediction must be a CocoPrediction instance")
from_coco_image_dict(image_dict) classmethod

Creates CocoImage object from COCO formatted image dict (with fields "id", "file_name", "height" and "width").

Parameters:

Name Type Description Default
image_dict

dict COCO formatted image dict (with fields "id", "file_name", "height" and "width")

required
Source code in sahi/utils/coco.py
@classmethod
def from_coco_image_dict(cls, image_dict):
    """Create a CocoImage from a COCO formatted image dict.

    Args:
        image_dict: dict
            COCO formatted image dict with fields "id", "file_name",
            "height" and "width".
    """
    fields = ("id", "file_name", "height", "width")
    return cls(**{key: image_dict[key] for key in fields})
CocoPrediction

Bases: CocoAnnotation

Class for handling predictions in coco format.

Source code in sahi/utils/coco.py
class CocoPrediction(CocoAnnotation):
    """Class for handling predictions in coco format."""

    @classmethod
    def from_coco_segmentation(cls, segmentation, category_id, category_name, score, iscrowd=0, image_id=None):
        """Creates CocoPrediction object using coco segmentation.

        Args:
            segmentation: List[List]
                [[1, 1, 325, 125, 250, 200, 5, 200]]
            category_id: int
                Category id of the annotation
            category_name: str
                Category name of the annotation
            score: float
                Prediction score between 0 and 1
            iscrowd: int
                0 or 1
            image_id: int
                Image ID of the annotation
        """
        return cls(
            segmentation=segmentation,
            category_id=category_id,
            category_name=category_name,
            score=score,
            iscrowd=iscrowd,
            image_id=image_id,
        )

    @classmethod
    def from_coco_bbox(cls, bbox, category_id, category_name, score, iscrowd=0, image_id=None):
        """Creates CocoPrediction object using coco bbox.

        Args:
            bbox: List
                [xmin, ymin, width, height]
            category_id: int
                Category id of the annotation
            category_name: str
                Category name of the annotation
            score: float
                Prediction score between 0 and 1
            iscrowd: int
                0 or 1
            image_id: int
                Image ID of the annotation
        """
        return cls(
            bbox=bbox,
            category_id=category_id,
            category_name=category_name,
            score=score,
            iscrowd=iscrowd,
            image_id=image_id,
        )

    @classmethod
    def from_coco_annotation_dict(cls, category_name, annotation_dict, score, image_id=None):
        """Creates CocoPrediction object from category name and COCO formatted annotation dict
        (with fields "bbox", "segmentation", "category_id").

        Args:
            category_name: str
                Category name of the annotation
            annotation_dict: dict
                COCO formatted annotation dict (with fields "bbox", "segmentation", "category_id")
            score: float
                Prediction score between 0 and 1
            image_id: int
                Image ID of the annotation
        """
        if annotation_dict["segmentation"]:
            return cls(
                segmentation=annotation_dict["segmentation"],
                category_id=annotation_dict["category_id"],
                category_name=category_name,
                score=score,
                image_id=image_id,
            )
        else:
            return cls(
                bbox=annotation_dict["bbox"],
                category_id=annotation_dict["category_id"],
                category_name=category_name,
                # Bugfix: score was previously dropped in this bbox-only branch,
                # leaving the prediction with score=None.
                score=score,
                image_id=image_id,
            )

    def __init__(
        self,
        segmentation=None,
        bbox=None,
        category_id: int = 0,
        category_name: str = "",
        image_id=None,
        score=None,
        iscrowd=0,
    ):
        """Creates CocoPrediction object.

        Args:
            segmentation: List[List]
                [[1, 1, 325, 125, 250, 200, 5, 200]]
            bbox: List
                [xmin, ymin, width, height]
            category_id: int
                Category id of the annotation
            category_name: str
                Category name of the annotation
            image_id: int
                Image ID of the annotation
            score: float
                Prediction score between 0 and 1
            iscrowd: int
                0 or 1
        """
        # Prediction-specific field; everything else is handled by CocoAnnotation.
        self.score = score
        super().__init__(
            segmentation=segmentation,
            bbox=bbox,
            category_id=category_id,
            category_name=category_name,
            image_id=image_id,
            iscrowd=iscrowd,
        )

    @property
    def json(self):
        # COCO results-style dict representation of this prediction.
        return {
            "image_id": self.image_id,
            "bbox": self.bbox,
            "score": self.score,
            "category_id": self.category_id,
            "category_name": self.category_name,
            "segmentation": self.segmentation,
            "iscrowd": self.iscrowd,
            "area": self.area,
        }

    def serialize(self):
        # Deprecated: kept for backward compatibility; use the `json` property.
        warnings.warn("Use json property instead of serialize method", DeprecationWarning, stacklevel=2)

    def __repr__(self):
        return f"""CocoPrediction<
    image_id: {self.image_id},
    bbox: {self.bbox},
    segmentation: {self.segmentation},
    score: {self.score},
    category_id: {self.category_id},
    category_name: {self.category_name},
    iscrowd: {self.iscrowd},
    area: {self.area}>"""
Functions
__init__(segmentation=None, bbox=None, category_id=0, category_name='', image_id=None, score=None, iscrowd=0)

Parameters:

Name Type Description Default
segmentation

List[List][[1, 1, 325, 125, 250, 200, 5, 200]]

None
bbox

List [xmin, ymin, width, height]

None
category_id int

int Category id of the annotation

0
category_name str

str Category name of the annotation

''
image_id

int Image ID of the annotation

None
score

float Prediction score between 0 and 1

None
iscrowd

int 0 or 1

0
Source code in sahi/utils/coco.py
def __init__(
    self,
    segmentation=None,
    bbox=None,
    category_id: int = 0,
    category_name: str = "",
    image_id=None,
    score=None,
    iscrowd=0,
):
    """Creates CocoPrediction object.

    Args:
        segmentation: List[List]
            [[1, 1, 325, 125, 250, 200, 5, 200]]
        bbox: List
            [xmin, ymin, width, height]
        category_id: int
            Category id of the annotation
        category_name: str
            Category name of the annotation
        image_id: int
            Image ID of the annotation
        score: float
            Prediction score between 0 and 1
        iscrowd: int
            0 or 1
    """
    # Prediction-specific field; everything else is handled by the base class.
    self.score = score
    base_kwargs = {
        "segmentation": segmentation,
        "bbox": bbox,
        "category_id": category_id,
        "category_name": category_name,
        "image_id": image_id,
        "iscrowd": iscrowd,
    }
    super().__init__(**base_kwargs)
from_coco_annotation_dict(category_name, annotation_dict, score, image_id=None) classmethod

Creates CocoAnnotation object from category name and COCO formatted annotation dict (with fields "bbox", "segmentation", "category_id").

Parameters:

Name Type Description Default
category_name

str Category name of the annotation

required
annotation_dict

dict COCO formatted annotation dict (with fields "bbox", "segmentation", "category_id")

required
score

float Prediction score between 0 and 1

required
Source code in sahi/utils/coco.py
@classmethod
def from_coco_annotation_dict(cls, category_name, annotation_dict, score, image_id=None):
    """Creates CocoPrediction object from category name and COCO formatted annotation dict
    (with fields "bbox", "segmentation", "category_id").

    Args:
        category_name: str
            Category name of the annotation
        annotation_dict: dict
            COCO formatted annotation dict (with fields "bbox", "segmentation", "category_id")
        score: float
            Prediction score between 0 and 1
        image_id: int
            Image ID of the annotation
    """
    if annotation_dict["segmentation"]:
        return cls(
            segmentation=annotation_dict["segmentation"],
            category_id=annotation_dict["category_id"],
            category_name=category_name,
            score=score,
            image_id=image_id,
        )
    else:
        return cls(
            bbox=annotation_dict["bbox"],
            category_id=annotation_dict["category_id"],
            category_name=category_name,
            # Bugfix: score was previously dropped in this bbox-only branch,
            # leaving the prediction with score=None.
            score=score,
            image_id=image_id,
        )
from_coco_bbox(bbox, category_id, category_name, score, iscrowd=0, image_id=None) classmethod

Creates CocoAnnotation object using coco bbox.

Parameters:

Name Type Description Default
bbox

List [xmin, ymin, width, height]

required
category_id

int Category id of the annotation

required
category_name

str Category name of the annotation

required
score

float Prediction score between 0 and 1

required
iscrowd

int 0 or 1

0
Source code in sahi/utils/coco.py
@classmethod
def from_coco_bbox(cls, bbox, category_id, category_name, score, iscrowd=0, image_id=None):
    """Creates CocoPrediction object using coco bbox.

    Args:
        bbox: List
            [xmin, ymin, width, height]
        category_id: int
            Category id of the annotation
        category_name: str
            Category name of the annotation
        score: float
            Prediction score between 0 and 1
        iscrowd: int
            0 or 1
        image_id: int
            Image ID of the annotation
    """
    kwargs = {
        "bbox": bbox,
        "category_id": category_id,
        "category_name": category_name,
        "score": score,
        "iscrowd": iscrowd,
        "image_id": image_id,
    }
    return cls(**kwargs)
from_coco_segmentation(segmentation, category_id, category_name, score, iscrowd=0, image_id=None) classmethod

Creates CocoAnnotation object using coco segmentation.

Parameters:

Name Type Description Default
segmentation

List[List][[1, 1, 325, 125, 250, 200, 5, 200]]

required
category_id

int Category id of the annotation

required
category_name

str Category name of the annotation

required
score

float Prediction score between 0 and 1

required
iscrowd

int 0 or 1

0
Source code in sahi/utils/coco.py
@classmethod
def from_coco_segmentation(cls, segmentation, category_id, category_name, score, iscrowd=0, image_id=None):
    """Creates CocoPrediction object using coco segmentation.

    Args:
        segmentation: List[List]
            [[1, 1, 325, 125, 250, 200, 5, 200]]
        category_id: int
            Category id of the annotation
        category_name: str
            Category name of the annotation
        score: float
            Prediction score between 0 and 1
        iscrowd: int
            0 or 1
        image_id: int
            Image ID of the annotation
    """
    kwargs = {
        "segmentation": segmentation,
        "category_id": category_id,
        "category_name": category_name,
        "score": score,
        "iscrowd": iscrowd,
        "image_id": image_id,
    }
    return cls(**kwargs)
CocoVid
Source code in sahi/utils/coco.py
class CocoVid:
    """CocoVid (video COCO) dataset container holding categories and videos."""

    def __init__(self, name=None, remapping_dict=None):
        """Creates CocoVid object.

        Args:
            name: str
                Name of the CocoVid dataset, it determines exported json name.
            remapping_dict: dict
                {1:0, 2:1} maps category id 1 to 0 and category id 2 to 1
        """
        self.name = name
        self.remapping_dict = remapping_dict
        self.categories = []
        self.videos = []

    def add_categories_from_coco_category_list(self, coco_category_list):
        """Creates CocoCategory objects from a coco category list and registers them.

        Args:
            coco_category_list: List[Dict]
                [
                    {"supercategory": "person", "id": 1, "name": "person"},
                    {"supercategory": "vehicle", "id": 2, "name": "bicycle"}
                ]
        """
        for coco_category in coco_category_list:
            if self.remapping_dict is not None and coco_category["id"] in self.remapping_dict:
                # Bugfix: apply the remapping once per category. The previous loop over
                # all remapping keys could cascade (e.g. {1: 2, 2: 3} remapped 1 -> 3,
                # dependent on dict iteration order).
                coco_category["id"] = self.remapping_dict[coco_category["id"]]
            self.add_category(CocoCategory.from_coco_category(coco_category))

    def add_category(self, category: CocoCategory):
        """Adds category to this CocoVid instance.

        Args:
            category: CocoCategory
        """
        if not isinstance(category, CocoCategory):
            raise TypeError("category must be a CocoCategory instance")  # type: ignore
        self.categories.append(category)

    @property
    def json_categories(self):
        # COCO-style list of category dicts.
        return [category.json for category in self.categories]

    @property
    def category_mapping(self):
        # Mapping of category id -> category name.
        return {category.id: category.name for category in self.categories}

    def add_video(self, video: CocoVideo):
        """Adds video to this CocoVid instance.

        Args:
            video: CocoVideo
        """
        if not isinstance(video, CocoVideo):
            raise TypeError("video must be a CocoVideo instance")  # type: ignore
        self.videos.append(video)

    @property
    def json(self):
        """Exports the dataset as a CocoVid-formatted dict, assigning sequential
        video/image/annotation ids and globally unique instance ids.

        Note: mutates the contained videos, images and annotations in place.
        """
        coco_dict = {
            "videos": [],
            "images": [],
            "annotations": [],
            "categories": self.json_categories,
        }
        annotation_id = 1
        image_id = 1
        video_id = 1
        global_instance_id = 1
        for coco_video in self.videos:
            coco_video.id = video_id
            coco_dict["videos"].append(coco_video.json)

            frame_id = 0
            instance_id_set = set()
            for cocovid_image in coco_video.images:
                cocovid_image.id = image_id
                cocovid_image.frame_id = frame_id
                cocovid_image.video_id = coco_video.id
                coco_dict["images"].append(cocovid_image.json)

                for cocovid_annotation in cocovid_image.annotations:
                    instance_id_set.add(cocovid_annotation.instance_id)
                    # Offset per-video instance ids into a globally unique range.
                    cocovid_annotation.instance_id += global_instance_id

                    cocovid_annotation.id = annotation_id
                    cocovid_annotation.image_id = cocovid_image.id
                    coco_dict["annotations"].append(cocovid_annotation.json)

                    # Plain increments; ints are immutable so copy.deepcopy was redundant.
                    annotation_id += 1
                image_id += 1
                frame_id += 1
            video_id += 1
            global_instance_id += len(instance_id_set)

        return coco_dict
Functions
__init__(name=None, remapping_dict=None)

Creates CocoVid object.

Parameters:

Name Type Description Default
name

str Name of the CocoVid dataset, it determines exported json name.

None
remapping_dict

dict {1:0, 2:1} maps category id 1 to 0 and category id 2 to 1

None
Source code in sahi/utils/coco.py
def __init__(self, name=None, remapping_dict=None):
    """Creates CocoVid object.

    Args:
        name: str
            Name of the CocoVid dataset, it determines exported json name.
        remapping_dict: dict
            {1:0, 2:1} maps category id 1 to 0 and category id 2 to 1
    """
    self.name = name
    self.remapping_dict = remapping_dict
    # Registered CocoCategory instances.
    self.categories = []
    # Registered CocoVideo instances.
    self.videos = []
add_categories_from_coco_category_list(coco_category_list)

Creates CocoCategory object using coco category list.

Parameters:

Name Type Description Default
coco_category_list

List[Dict] [ {"supercategory": "person", "id": 1, "name": "person"}, {"supercategory": "vehicle", "id": 2, "name": "bicycle"} ]

required
Source code in sahi/utils/coco.py
def add_categories_from_coco_category_list(self, coco_category_list):
    """Creates CocoCategory objects from a coco category list and registers them.

    Args:
        coco_category_list: List[Dict]
            [
                {"supercategory": "person", "id": 1, "name": "person"},
                {"supercategory": "vehicle", "id": 2, "name": "bicycle"}
            ]
    """
    for coco_category in coco_category_list:
        if self.remapping_dict is not None and coco_category["id"] in self.remapping_dict:
            # Bugfix: apply the remapping once per category. The previous loop over
            # all remapping keys could cascade (e.g. {1: 2, 2: 3} remapped 1 -> 3,
            # dependent on dict iteration order).
            coco_category["id"] = self.remapping_dict[coco_category["id"]]
        self.add_category(CocoCategory.from_coco_category(coco_category))
add_category(category)

Adds category to this CocoVid instance.

Parameters:

Name Type Description Default
category CocoCategory

CocoCategory

required
Source code in sahi/utils/coco.py
def add_category(self, category: CocoCategory):
    """Adds category to this CocoVid instance.

    Args:
        category: CocoCategory
    """
    if isinstance(category, CocoCategory):
        self.categories.append(category)
    else:
        raise TypeError("category must be a CocoCategory instance")
add_video(video)

Adds video to this CocoVid instance.

Parameters:

Name Type Description Default
video CocoVideo

CocoVideo

required
Source code in sahi/utils/coco.py
def add_video(self, video: CocoVideo):
    """Adds video to this CocoVid instance.

    Args:
        video: CocoVideo
    """
    if isinstance(video, CocoVideo):
        self.videos.append(video)
    else:
        raise TypeError("video must be a CocoVideo instance")
CocoVidAnnotation

Bases: CocoAnnotation

COCOVid formatted annotation.

https://github.com/open-mmlab/mmtracking/blob/master/docs/tutorials/customize_dataset.md#the-cocovid-annotation-file

Source code in sahi/utils/coco.py
class CocoVidAnnotation(CocoAnnotation):
    """COCOVid formatted annotation.

    https://github.com/open-mmlab/mmtracking/blob/master/docs/tutorials/customize_dataset.md#the-cocovid-annotation-file
    """

    def __init__(
        self,
        category_id: int,
        category_name: str,
        bbox: list[int],
        image_id=None,
        instance_id=None,
        iscrowd=0,
        id=None,
    ):
        """Creates CocoVidAnnotation object.

        Args:
            bbox: List
                [xmin, ymin, width, height]
            category_id: int
                Category id of the annotation
            category_name: str
                Category name of the annotation
            image_id: int
                Image ID of the annotation
            instance_id: int
                Used for tracking
            iscrowd: int
                0 or 1
            id: int
                Annotation id
        """
        super().__init__(
            bbox=bbox,
            category_id=category_id,
            category_name=category_name,
            image_id=image_id,
            iscrowd=iscrowd,
        )
        # Tracking-specific fields not handled by the base class.
        self.instance_id = instance_id
        self.id = id

    @property
    def json(self):
        # CocoVid-style dict representation, including the tracking instance_id.
        return {
            "id": self.id,
            "image_id": self.image_id,
            "bbox": self.bbox,
            "segmentation": self.segmentation,
            "category_id": self.category_id,
            "category_name": self.category_name,
            "instance_id": self.instance_id,
            "iscrowd": self.iscrowd,
            "area": self.area,
        }

    def __repr__(self):
        # Bugfix: repr previously reported the base-class name "CocoAnnotation".
        return f"""CocoVidAnnotation<
    id: {self.id},
    image_id: {self.image_id},
    bbox: {self.bbox},
    segmentation: {self.segmentation},
    category_id: {self.category_id},
    category_name: {self.category_name},
    instance_id: {self.instance_id},
    iscrowd: {self.iscrowd},
    area: {self.area}>"""
Functions
__init__(category_id, category_name, bbox, image_id=None, instance_id=None, iscrowd=0, id=None)

Parameters:

Name Type Description Default
bbox list[int]

List [xmin, ymin, width, height]

required
category_id int

int Category id of the annotation

required
category_name str

str Category name of the annotation

required
image_id

int Image ID of the annotation

None
instance_id

int Used for tracking

None
iscrowd

int 0 or 1

0
id

int Annotation id

None
Source code in sahi/utils/coco.py
def __init__(
    self,
    category_id: int,
    category_name: str,
    bbox: list[int],
    image_id=None,
    instance_id=None,
    iscrowd=0,
    id=None,
):
    """Creates CocoVidAnnotation object.

    Args:
        bbox: List
            [xmin, ymin, width, height]
        category_id: int
            Category id of the annotation
        category_name: str
            Category name of the annotation
        image_id: int
            Image ID of the annotation
        instance_id: int
            Used for tracking
        iscrowd: int
            0 or 1
        id: int
            Annotation id
    """
    base_kwargs = {
        "bbox": bbox,
        "category_id": category_id,
        "category_name": category_name,
        "image_id": image_id,
        "iscrowd": iscrowd,
    }
    super().__init__(**base_kwargs)
    # Tracking-specific fields not handled by the base class.
    self.instance_id = instance_id
    self.id = id
CocoVidImage

Bases: CocoImage

COCOVid formatted image.

https://github.com/open-mmlab/mmtracking/blob/master/docs/tutorials/customize_dataset.md#the-cocovid-annotation-file

Source code in sahi/utils/coco.py
class CocoVidImage(CocoImage):
    """COCOVid formatted image.

    https://github.com/open-mmlab/mmtracking/blob/master/docs/tutorials/customize_dataset.md#the-cocovid-annotation-file
    """

    def __init__(
        self,
        file_name,
        height,
        width,
        video_id=None,
        frame_id=None,
        id=None,
    ):
        """Creates CocoVidImage object.

        Args:
            file_name: str
                Image path
            height: int
                Image height in pixels
            width: int
                Image width in pixels
            video_id: int
                Video id
            frame_id: int
                0-indexed frame id
            id: int
                Image id
        """
        super().__init__(file_name=file_name, height=height, width=width, id=id)
        # Video linkage fields specific to CocoVid datasets.
        self.frame_id = frame_id
        self.video_id = video_id

    @classmethod
    def from_coco_image(cls, coco_image, video_id=None, frame_id=None):
        """Creates CocoVidImage object using CocoImage object.

        Args:
            coco_image: CocoImage
            video_id: int
                Video id
            frame_id: int
                0-indexed frame id
        """
        attrs = {name: getattr(coco_image, name) for name in ("file_name", "height", "width", "id")}
        return cls(video_id=video_id, frame_id=frame_id, **attrs)

    def add_annotation(self, annotation):
        """Adds annotation to this CocoVidImage instance.

        annotation : CocoVidAnnotation
        """
        if isinstance(annotation, CocoVidAnnotation):
            self.annotations.append(annotation)
        else:
            raise TypeError("annotation must be a CocoVidAnnotation instance")

    @property
    def json(self):
        # CocoVid-style dict representation including video/frame linkage.
        return {
            "file_name": self.file_name,
            "height": self.height,
            "width": self.width,
            "id": self.id,
            "video_id": self.video_id,
            "frame_id": self.frame_id,
        }

    def __repr__(self):
        return f"""CocoVidImage<
    file_name: {self.file_name},
    height: {self.height},
    width: {self.width},
    id: {self.id},
    video_id: {self.video_id},
    frame_id: {self.frame_id},
    annotations: List[CocoVidAnnotation]>"""
Functions
__init__(file_name, height, width, video_id=None, frame_id=None, id=None)

Creates CocoVidImage object.

Parameters:

Name Type Description Default
id

int Image id

None
file_name

str Image path

required
height

int Image height in pixels

required
width

int Image width in pixels

required
frame_id

int 0-indexed frame id

None
video_id

int Video id

None
Source code in sahi/utils/coco.py
def __init__(
    self,
    file_name,
    height,
    width,
    video_id=None,
    frame_id=None,
    id=None,
):
    """Creates CocoVidImage object.

    Args:
        file_name: str
            Image path
        height: int
            Image height in pixels
        width: int
            Image width in pixels
        video_id: int
            Video id
        frame_id: int
            0-indexed frame id
        id: int
            Image id
    """
    super().__init__(file_name=file_name, height=height, width=width, id=id)
    # Video linkage fields specific to CocoVid datasets.
    self.frame_id = frame_id
    self.video_id = video_id
add_annotation(annotation)

Adds annotation to this CocoImage instance annotation : CocoVidAnnotation

Source code in sahi/utils/coco.py
def add_annotation(self, annotation):
    """Adds annotation to this CocoVidImage instance.

    annotation : CocoVidAnnotation
    """
    if isinstance(annotation, CocoVidAnnotation):
        self.annotations.append(annotation)
    else:
        raise TypeError("annotation must be a CocoVidAnnotation instance")
from_coco_image(coco_image, video_id=None, frame_id=None) classmethod

Creates CocoVidImage object using CocoImage object.

Parameters:

Name Type Description Default
coco_image

CocoImage

required
frame_id

int 0-indexed frame id

None
video_id

int Video id

None
Source code in sahi/utils/coco.py
@classmethod
def from_coco_image(cls, coco_image, video_id=None, frame_id=None):
    """Creates CocoVidImage object using CocoImage object.

    Args:
        coco_image: CocoImage
        video_id: int
            Video id
        frame_id: int
            0-indexed frame id
    """
    attrs = {name: getattr(coco_image, name) for name in ("file_name", "height", "width", "id")}
    return cls(video_id=video_id, frame_id=frame_id, **attrs)
CocoVideo

COCO formatted video.

https://github.com/open-mmlab/mmtracking/blob/master/docs/tutorials/customize_dataset.md#the-cocovid-annotation-file

Source code in sahi/utils/coco.py
class CocoVideo:
    """COCO formatted video.

    https://github.com/open-mmlab/mmtracking/blob/master/docs/tutorials/customize_dataset.md#the-cocovid-annotation-file
    """

    def __init__(
        self,
        name: str,
        id: int | None = None,
        fps: float | None = None,
        height: int | None = None,
        width: int | None = None,
    ):
        """Creates CocoVideo object.

        Args:
            name: str
                Video name
            id: int
                Video id
            fps: float
                Video fps
            height: int
                Video height in pixels
            width: int
                Video width in pixels
        """
        self.name = name
        self.id = id
        self.fps = fps
        self.height = height
        self.width = width
        self.images = []  # list of CocoImage that belong to this video

    def add_image(self, image):
        """
        Adds image to this CocoVideo instance
        Args:
            image: CocoImage
        """

        if not isinstance(image, CocoImage):
            raise TypeError("image must be a CocoImage instance")
        self.images.append(CocoVidImage.from_coco_image(image))

    def add_cocovidimage(self, cocovidimage):
        """
        Adds CocoVidImage to this CocoVideo instance
        Args:
            cocovidimage: CocoVidImage
        """

        if not isinstance(cocovidimage, CocoVidImage):
            raise TypeError("cocovidimage must be a CocoVidImage instance")
        self.images.append(cocovidimage)

    @property
    def json(self):
        return {
            "name": self.name,
            "id": self.id,
            "fps": self.fps,
            "height": self.height,
            "width": self.width,
        }

    def __repr__(self):
        return f"""CocoVideo<
    id: {self.id},
    name: {self.name},
    fps: {self.fps},
    height: {self.height},
    width: {self.width},
    images: List[CocoVidImage]>"""
Functions
__init__(name, id=None, fps=None, height=None, width=None)

Creates CocoVideo object.

Parameters:

Name Type Description Default
name str

str Video name

required
id int | None

int Video id

None
fps float | None

float Video fps

None
height int | None

int Video height in pixels

None
width int | None

int Video width in pixels

None
Source code in sahi/utils/coco.py
def __init__(
    self,
    name: str,
    id: int | None = None,
    fps: float | None = None,
    height: int | None = None,
    width: int | None = None,
):
    """Create a CocoVideo.

    Args:
        name: str
            Video name
        id: int
            Video id
        fps: float
            Video fps
        height: int
            Video height in pixels
        width: int
            Video width in pixels
    """
    self.name, self.id, self.fps = name, id, fps
    self.height, self.width = height, width
    # CocoVidImage instances that belong to this video
    self.images = []
add_cocovidimage(cocovidimage)

Adds CocoVidImage to this CocoVideo instance Args: cocovidimage: CocoVidImage

Source code in sahi/utils/coco.py
def add_cocovidimage(self, cocovidimage):
    """Attach an already-built CocoVidImage to this CocoVideo.

    Args:
        cocovidimage: CocoVidImage

    Raises:
        TypeError: if `cocovidimage` is not a CocoVidImage instance.
    """
    if isinstance(cocovidimage, CocoVidImage):
        self.images.append(cocovidimage)
    else:
        raise TypeError("cocovidimage must be a CocoVidImage instance")
add_image(image)

Adds image to this CocoVideo instance Args: image: CocoImage

Source code in sahi/utils/coco.py
def add_image(self, image):
    """Convert a CocoImage into a CocoVidImage and attach it to this CocoVideo.

    Args:
        image: CocoImage

    Raises:
        TypeError: if `image` is not a CocoImage instance.
    """
    if isinstance(image, CocoImage):
        self.images.append(CocoVidImage.from_coco_image(image))
    else:
        raise TypeError("image must be a CocoImage instance")
DatasetClassCounts dataclass

Stores the number of images that include each category in a dataset.

Source code in sahi/utils/coco.py
@dataclass
class DatasetClassCounts:
    """Stores the number of images that include each category in a dataset."""

    # category id -> number of images containing at least one instance of that category
    counts: dict
    # total number of images considered
    total_images: int

    def frequencies(self):
        """Return, per category id, the fraction of images containing that category."""
        total = self.total_images
        return {category_id: n_images / total for category_id, n_images in self.counts.items()}

    def __add__(self, o):
        """Combine two DatasetClassCounts by summing per-category counts and image totals."""
        summed = {cid: n + o.counts.get(cid, 0) for cid, n in self.counts.items()}
        for cid, n in o.counts.items():
            if cid not in summed:
                summed[cid] = n
        return DatasetClassCounts(summed, self.total_images + o.total_images)
Functions
frequencies()

Calculates the frequency of images that contain each category.

Source code in sahi/utils/coco.py
def frequencies(self):
    """Return, per category id, the fraction of images that contain that category."""
    total = self.total_images
    return {category_id: self.counts[category_id] / total for category_id in self.counts}
Functions
add_bbox_and_area_to_coco(source_coco_path='', target_coco_path='', add_bbox=True, add_area=True)

Takes single coco dataset file path, calculates and fills bbox and area fields of the annotations and exports the updated coco dict.

coco_dict : dict Updated coco dict

Source code in sahi/utils/coco.py
def add_bbox_and_area_to_coco(
    source_coco_path: str = "",
    target_coco_path: str = "",
    add_bbox: bool = True,
    add_area: bool = True,
) -> dict:
    """Takes a single coco dataset file path, calculates and fills the bbox and area fields of the
    annotations from their segmentations, and exports the updated coco dict.

    Args:
        source_coco_path: str
            Path of the source coco json file.
        target_coco_path: str
            Path the updated coco json will be written to.
        add_bbox: bool
            If True, (re)computes the xywh "bbox" field from segmentation points.
        add_area: bool
            If True, (re)computes the "area" field from the segmentation polygons.

    Returns:
        coco_dict: dict
            Updated coco dict.
    """
    # deep-copy so the dict returned by load_json is never mutated in place
    coco_dict = copy.deepcopy(load_json(source_coco_path))

    for annotation in coco_dict["annotations"]:
        if add_bbox:
            # flatten [[x1, y1, x2, y2, ...], ...] polygons into one coordinate list
            # (was a list comprehension used purely for its extend() side effect)
            coords = [point for coco_polygon in annotation["segmentation"] for point in coco_polygon]
            xs, ys = coords[0::2], coords[1::2]
            minx, miny = min(xs), min(ys)
            annotation["bbox"] = [minx, miny, max(xs) - minx, max(ys) - miny]

        if add_area:
            shapely_multipolygon = get_shapely_multipolygon(coco_segmentation=annotation["segmentation"])
            annotation["area"] = shapely_multipolygon.area

    save_json(coco_dict, target_coco_path)
    return coco_dict
count_images_with_category(coco_file_path)

Reads a coco dataset file and returns a DatasetClassCounts object that stores the number of images that include each category in the dataset. Returns: DatasetClassCounts object. coco_file_path: str — path to the coco dataset file.

Source code in sahi/utils/coco.py
def count_images_with_category(coco_file_path):
    """Reads a coco dataset file and returns a DatasetClassCounts object that stores
    the number of images that include each category.

    Args:
        coco_file_path: str
            Path to coco dataset file.

    Returns:
        DatasetClassCounts object.
    """
    coco = load_json(coco_file_path)

    # per-image annotation counts, keyed first by image id then by category id
    per_image_counts: dict = defaultdict(lambda: defaultdict(int))
    for annotation in coco["annotations"]:
        per_image_counts[annotation["image_id"]][annotation["category_id"]] += 1

    # number of images containing at least one annotation of each category
    images_per_category = defaultdict(int)
    for category_counts in per_image_counts.values():
        for category_id, num_annotations in category_counts.items():
            if num_annotations > 0:
                images_per_category[category_id] += 1

    # total_images counts only images that appear in at least one annotation
    return DatasetClassCounts(dict(images_per_category), len(per_image_counts))
create_coco_dict(images, categories, ignore_negative_samples=False, image_id_setting='auto')

Creates COCO dict with fields "images", "annotations", "categories".

Args

images : List of CocoImage containing a list of CocoAnnotation
categories : List of Dict
    COCO categories
ignore_negative_samples : Bool
    If True, images without annotations are ignored
image_id_setting: str
    how to assign image ids while exporting can be
        auto --> will assign id from scratch (<CocoImage>.id will be ignored)
        manual --> you will need to provide image ids in <CocoImage> instances (<CocoImage>.id can not be None)

Returns

coco_dict : Dict
    COCO dict with fields "images", "annotations", "categories"
Source code in sahi/utils/coco.py
def create_coco_dict(images, categories, ignore_negative_samples=False, image_id_setting="auto"):
    """Creates a COCO dict with fields "images", "annotations", "categories".

    Args:
        images: list of CocoImage, each containing a list of CocoAnnotation
        categories: list of dict
            COCO categories.
        ignore_negative_samples: bool
            If True, images without annotations are ignored.
        image_id_setting: str
            How to assign image ids while exporting. One of:
                "auto"   -> ids are assigned from scratch (<CocoImage>.id is ignored)
                "manual" -> ids are taken from <CocoImage>.id (must not be None)

    Returns:
        coco_dict: dict
            COCO dict with fields "images", "annotations", "categories".

    Raises:
        ValueError: on an unknown image_id_setting, or a missing manual id.
    """
    if image_id_setting not in ["auto", "manual"]:
        raise ValueError("'image_id_setting' should be one of ['auto', 'manual']")

    # running counters for auto-assigned ids (both are 1-based)
    next_image_id = 1
    next_annotation_id = 1
    coco_dict = {"images": [], "annotations": [], "categories": categories}

    for coco_image in images:
        coco_annotations = coco_image.annotations
        # guard clause: skip negative samples when requested
        if ignore_negative_samples and not coco_annotations:
            continue

        if image_id_setting == "auto":
            image_id = next_image_id
            next_image_id += 1
        else:  # "manual" (validated above)
            if coco_image.id is None:
                raise ValueError("'coco_image.id' should be set manually when image_id_setting == 'manual'")
            image_id = coco_image.id

        coco_dict["images"].append(
            {
                "height": coco_image.height,
                "width": coco_image.width,
                "id": image_id,
                "file_name": coco_image.file_name,
            }
        )

        for coco_annotation in coco_annotations:
            coco_dict["annotations"].append(
                {
                    "iscrowd": 0,
                    "image_id": image_id,
                    "bbox": coco_annotation.bbox,
                    "segmentation": coco_annotation.segmentation,
                    "category_id": coco_annotation.category_id,
                    "id": next_annotation_id,
                    "area": coco_annotation.area,
                }
            )
            next_annotation_id += 1

    return coco_dict
create_coco_prediction_array(images, ignore_negative_samples=False, image_id_setting='auto')

Creates COCO prediction array which is list of predictions.

Args

images : List of CocoImage containing a list of CocoAnnotation
ignore_negative_samples : Bool
    If True, images without predictions are ignored
image_id_setting: str
    how to assign image ids while exporting can be
        auto --> will assign id from scratch (<CocoImage>.id will be ignored)
        manual --> you will need to provide image ids in <CocoImage> instances (<CocoImage>.id can not be None)

Returns

coco_prediction_array : List
    COCO predictions array
Source code in sahi/utils/coco.py
def create_coco_prediction_array(images, ignore_negative_samples=False, image_id_setting="auto"):
    """Creates a COCO prediction array (a flat list of prediction dicts).

    Args:
        images: list of CocoImage, each containing a list of predictions
        ignore_negative_samples: bool
            If True, images without predictions are ignored.
        image_id_setting: str
            How to assign image ids while exporting. One of:
                "auto"   -> ids are assigned from scratch (<CocoImage>.id is ignored)
                "manual" -> ids are taken from <CocoImage>.id (must not be None)

    Returns:
        predictions_array: list
            COCO predictions array.

    Raises:
        ValueError: on an unknown image_id_setting, or a missing manual id.
    """
    if image_id_setting not in ["auto", "manual"]:
        raise ValueError("'image_id_setting' should be one of ['auto', 'manual']")

    # running counters for auto-assigned ids (both are 1-based)
    next_image_id = 1
    next_prediction_id = 1
    predictions_array = []

    for coco_image in images:
        coco_predictions = coco_image.predictions
        # guard clause: skip negative samples when requested
        if ignore_negative_samples and not coco_predictions:
            continue

        if image_id_setting == "auto":
            image_id = next_image_id
            next_image_id += 1
        else:  # "manual" (validated above)
            if coco_image.id is None:
                raise ValueError("'coco_image.id' should be set manually when image_id_setting == 'manual'")
            image_id = coco_image.id

        for coco_prediction in coco_predictions:
            predictions_array.append(
                {
                    "id": next_prediction_id,
                    "image_id": image_id,
                    "bbox": coco_prediction.bbox,
                    "score": coco_prediction.score,
                    "category_id": coco_prediction.category_id,
                    "segmentation": coco_prediction.segmentation,
                    "iscrowd": coco_prediction.iscrowd,
                    "area": coco_prediction.area,
                }
            )
            next_prediction_id += 1

    return predictions_array
export_coco_as_yolo(output_dir, train_coco=None, val_coco=None, train_split_rate=0.9, numpy_seed=0, disable_symlink=False)

Exports current COCO dataset in ultralytics/YOLO format. Creates train val folders with image symlinks and txt files and a data yaml file.

Parameters:

Name Type Description Default
output_dir str

str Export directory.

required
train_coco Coco | None

Coco coco object for training

None
val_coco Coco | None

Coco coco object for val

None
train_split_rate float

float train split rate between 0 and 1. will be used when val_coco is None.

0.9
numpy_seed

int To fix the numpy seed.

0
disable_symlink

bool If True, copy images instead of creating symlinks.

False

Returns:

Name Type Description
yaml_path

str Path for the exported YOLO data.yml

Source code in sahi/utils/coco.py
def export_coco_as_yolo(
    output_dir: str,
    train_coco: Coco | None = None,
    val_coco: Coco | None = None,
    train_split_rate: float = 0.9,
    numpy_seed=0,
    disable_symlink=False,
):
    """Exports a COCO dataset in ultralytics/YOLO format: train/ and val/ folders with
    image symlinks (or copies) plus annotation txt files, and a data yaml file.

    Args:
        output_dir: str
            Export directory.
        train_coco: Coco
            Coco object for training. Required.
        val_coco: Coco
            Coco object for validation. When None, `train_coco` is split instead.
        train_split_rate: float
            Train split rate between 0 and 1; used only when val_coco is None.
        numpy_seed: int
            To fix the numpy seed used for splitting.
        disable_symlink: bool
            If True, copy images instead of creating symlinks.

    Returns:
        yaml_path: str
            Path for the exported YOLO data.yml
    """
    try:
        import yaml
    except ImportError:
        raise ImportError('Please run "pip install -U pyyaml" to install yaml first for YOLO formatted exporting.')

    if not train_coco:
        raise ValueError("'train_coco' have to be provided")
    # when no val set is given, carve one out of the train set
    split_mode = not val_coco

    if split_mode:
        if not (0 < train_split_rate < 1):
            raise ValueError("train_split_rate cannot be <0 or >1")
        split_result = train_coco.split_coco_as_train_val(
            train_split_rate=train_split_rate,
            numpy_seed=numpy_seed,
        )
        train_coco, val_coco = split_result["train_coco"], split_result["val_coco"]

    # create one subdirectory per split
    output_root = Path(os.path.abspath(output_dir))
    train_dir = output_root / "train/"
    val_dir = output_root / "val/"
    for split_dir in (train_dir, val_dir):
        split_dir.mkdir(parents=True, exist_ok=True)

    # create image symlinks (or copies) and annotation txts
    export_yolo_images_and_txts_from_coco_object(
        output_dir=train_dir,
        coco=train_coco,
        ignore_negative_samples=train_coco.ignore_negative_samples,
        mp=False,
        disable_symlink=disable_symlink,
    )
    assert val_coco, "Validation Coco object not set"
    export_yolo_images_and_txts_from_coco_object(
        output_dir=val_dir,
        coco=val_coco,
        ignore_negative_samples=val_coco.ignore_negative_samples,
        mp=False,
        disable_symlink=disable_symlink,
    )

    # write the YOLO data yaml (forward slashes for cross-platform paths)
    data = {
        "train": str(train_dir).replace("\\", "/"),
        "val": str(val_dir).replace("\\", "/"),
        "nc": len(train_coco.category_mapping),
        "names": list(train_coco.category_mapping.values()),
    }
    yaml_path = str(Path(output_dir) / "data.yml")
    with open(yaml_path, "w") as outfile:
        yaml.dump(data, outfile, default_flow_style=False)

    return yaml_path
export_coco_as_yolo_via_yml(yml_path, output_dir, train_split_rate=0.9, numpy_seed=0, disable_symlink=False)

Exports current COCO dataset in ultralytics/YOLO format. Creates train val folders with image symlinks and txt files and a data yaml file. Uses a yml file as input.

Parameters:

Name Type Description Default
yml_path str

str file should contain these fields: train_json_path: str train_image_dir: str val_json_path: str val_image_dir: str

required
output_dir str

str Export directory.

required
train_split_rate float

float train split rate between 0 and 1. will be used when val_json_path is None.

0.9
numpy_seed

int To fix the numpy seed.

0
disable_symlink

bool If True, copy images instead of creating symlinks.

False

Returns:

Name Type Description
yaml_path

str Path for the exported YOLO data.yml

Source code in sahi/utils/coco.py
def export_coco_as_yolo_via_yml(
    yml_path: str, output_dir: str, train_split_rate: float = 0.9, numpy_seed=0, disable_symlink=False
):
    """Exports a COCO dataset in ultralytics/YOLO format, configured via a yml file.

    Args:
        yml_path: str
            File should contain these fields:
                train_json_path: str
                train_image_dir: str
                val_json_path: str
                val_image_dir: str
        output_dir: str
            Export directory.
        train_split_rate: float
            Train split rate between 0 and 1; used only when val_json_path is None.
        numpy_seed: int
            To fix the numpy seed used for splitting.
        disable_symlink: bool
            If True, copy images instead of creating symlinks.

    Returns:
        yaml_path: str
            Path for the exported YOLO data.yml
    """
    try:
        import yaml
    except ImportError:
        raise ImportError('Please run "pip install -U pyyaml" to install yaml first for YOLO formatted exporting.')

    with open(yml_path) as stream:
        config_dict = yaml.safe_load(stream)

    def _load_coco(json_key, image_dir_key):
        # returns None when the split is not configured; otherwise requires its image dir
        if not config_dict[json_key]:
            return None
        if not config_dict[image_dir_key]:
            raise ValueError(f"{yml_path} is missing `{image_dir_key}`")
        return Coco.from_coco_dict_or_path(config_dict[json_key], image_dir=config_dict[image_dir_key])

    train_coco = _load_coco("train_json_path", "train_image_dir")
    val_coco = _load_coco("val_json_path", "val_image_dir")

    return export_coco_as_yolo(
        output_dir=output_dir,
        train_coco=train_coco,
        val_coco=val_coco,
        train_split_rate=train_split_rate,
        numpy_seed=numpy_seed,
        disable_symlink=disable_symlink,
    )
export_coco_as_yolov5(output_dir, train_coco=None, val_coco=None, train_split_rate=0.9, numpy_seed=0, disable_symlink=False)

Deprecated.

Please use export_coco_as_yolo instead. Calls export_coco_as_yolo with the same arguments.

Source code in sahi/utils/coco.py
def export_coco_as_yolov5(
    output_dir: str,
    train_coco: Coco | None = None,
    val_coco: Coco | None = None,
    train_split_rate: float = 0.9,
    numpy_seed=0,
    disable_symlink=False,
):
    """Deprecated.

    Please use export_coco_as_yolo instead. Calls export_coco_as_yolo with the same
    arguments and forwards its result.

    Returns:
        yaml_path: str
            Path for the exported YOLO data.yml
    """
    warnings.warn(
        "export_coco_as_yolov5 is deprecated. Please use export_coco_as_yolo instead.",
        DeprecationWarning,
        stacklevel=2,  # attribute the warning to the caller, not this shim
    )
    # bug fix: the wrapped function's return value (the data.yml path) was
    # previously discarded, so callers always received None
    return export_coco_as_yolo(
        output_dir=output_dir,
        train_coco=train_coco,
        val_coco=val_coco,
        train_split_rate=train_split_rate,
        numpy_seed=numpy_seed,
        disable_symlink=disable_symlink,
    )
export_coco_as_yolov5_via_yml(yml_path, output_dir, train_split_rate=0.9, numpy_seed=0, disable_symlink=False)

Deprecated.

Please use export_coco_as_yolo_via_yml instead. Calls export_coco_as_yolo_via_yml with the same arguments.

Source code in sahi/utils/coco.py
def export_coco_as_yolov5_via_yml(
    yml_path: str, output_dir: str, train_split_rate: float = 0.9, numpy_seed=0, disable_symlink=False
):
    """Deprecated.

    Please use export_coco_as_yolo_via_yml instead. Calls export_coco_as_yolo_via_yml
    with the same arguments and forwards its result.

    Returns:
        yaml_path: str
            Path for the exported YOLO data.yml
    """
    warnings.warn(
        "export_coco_as_yolov5_via_yml is deprecated. Please use export_coco_as_yolo_via_yml instead.",
        DeprecationWarning,
        stacklevel=2,  # attribute the warning to the caller, not this shim
    )
    # bug fix: the wrapped function's return value (the data.yml path) was
    # previously discarded, so callers always received None
    return export_coco_as_yolo_via_yml(
        yml_path=yml_path,
        output_dir=output_dir,
        train_split_rate=train_split_rate,
        numpy_seed=numpy_seed,
        disable_symlink=disable_symlink,
    )
export_single_yolo_image_and_corresponding_txt(coco_image, coco_image_dir, output_dir, ignore_negative_samples=False, disable_symlink=False)

Generates YOLO formatted image symlink and annotation txt file.

Parameters:

Name Type Description Default
coco_image

sahi.utils.coco.CocoImage

required
coco_image_dir

str

required
output_dir

str Export directory.

required
ignore_negative_samples

bool If True ignores images without annotations in all operations.

False
Source code in sahi/utils/coco.py
def export_single_yolo_image_and_corresponding_txt(
    coco_image, coco_image_dir, output_dir, ignore_negative_samples=False, disable_symlink=False
):
    """Generates a YOLO formatted image symlink (or copy) and annotation txt file.

    Args:
        coco_image: sahi.utils.coco.CocoImage
        coco_image_dir: str
            Directory containing the coco images; used when file_name is not an
            existing absolute/relative path on its own.
        output_dir: str
            Export directory.
        ignore_negative_samples: bool
            If True ignores images without annotations in all operations.
        disable_symlink: bool
            If True, copies the image instead of creating a symlink.
    """
    # if coco_image contains any invalid annotations, skip it
    if any(len(coco_annotation.bbox) != 4 for coco_annotation in coco_image.annotations):
        return
    # skip images without annotations
    if len(coco_image.annotations) == 0 and ignore_negative_samples:
        return
    # skip images without suffix
    # https://github.com/obss/sahi/issues/114
    file_suffix = Path(coco_image.file_name).suffix
    if file_suffix == "":
        print(f"image file has no suffix, skipping it: '{coco_image.file_name}'")
        return
    elif file_suffix in [".txt"]:  # TODO: extend this list
        print(f"image file has incorrect suffix, skipping it: '{coco_image.file_name}'")
        return
    # set coco and yolo image paths
    if Path(coco_image.file_name).is_file():
        coco_image_path = os.path.abspath(coco_image.file_name)
    else:
        if coco_image_dir is None:
            raise ValueError("You have to specify image_dir of Coco object for yolo conversion.")
        coco_image_path = os.path.abspath(str(Path(coco_image_dir) / coco_image.file_name))

    yolo_image_path_temp = str(Path(output_dir) / Path(coco_image.file_name).name)
    # increment target file name if already present
    # (bug fix: copy.deepcopy of an immutable str was pointless; plain assignment suffices)
    yolo_image_path = yolo_image_path_temp
    name_increment = 2
    while Path(yolo_image_path).is_file():
        parent_dir = Path(yolo_image_path_temp).parent
        filename = Path(yolo_image_path_temp).stem + "_" + str(name_increment)
        filesuffix = Path(yolo_image_path_temp).suffix
        yolo_image_path = str(parent_dir / (filename + filesuffix))
        name_increment += 1
    # create a symbolic link pointing to coco_image_path named yolo_image_path
    if disable_symlink:
        import shutil

        shutil.copy(coco_image_path, yolo_image_path)
    else:
        os.symlink(coco_image_path, yolo_image_path)
    # calculate annotation normalization ratios
    # NOTE(review): assumes coco_image.width/height are nonzero — verify upstream
    dw = 1.0 / coco_image.width
    dh = 1.0 / coco_image.height
    # set annotation filepath
    # (bug fix: str.replace(suffix, ".txt") replaced EVERY occurrence of the suffix
    # substring in the path — e.g. a directory named "foo.jpg" got mangled;
    # Path.with_suffix swaps only the final extension)
    yolo_annotation_path = str(Path(yolo_image_path).with_suffix(".txt"))
    # create annotation file
    with open(yolo_annotation_path, "w") as outfile:
        for annotation in coco_image.annotations:
            # convert coco xywh bbox to yolo normalized center-x, center-y, width, height
            x_center = (annotation.bbox[0] + annotation.bbox[2] / 2.0) * dw
            y_center = (annotation.bbox[1] + annotation.bbox[3] / 2.0) * dh
            bbox_width = annotation.bbox[2] * dw
            bbox_height = annotation.bbox[3] * dh
            yolo_bbox = (x_center, y_center, bbox_width, bbox_height)
            # save yolo annotation: "<category_id> <cx> <cy> <w> <h>"
            outfile.write(str(annotation.category_id) + " " + " ".join([str(value) for value in yolo_bbox]) + "\n")
export_yolo_images_and_txts_from_coco_object(output_dir, coco, ignore_negative_samples=False, mp=False, disable_symlink=False)

Creates image symlinks and annotation txts in yolo format from coco dataset.

Parameters:

Name Type Description Default
output_dir

str Export directory.

required
coco

sahi.utils.coco.Coco Initialized Coco object that contains images and categories.

required
ignore_negative_samples

bool If True ignores images without annotations in all operations.

False
mp

bool If True, multiprocess mode is on. Should be called inside an `if __name__ == "__main__":` block.

False
disable_symlink

bool If True, symlinks are not created. Instead images are copied.

False
Source code in sahi/utils/coco.py
def export_yolo_images_and_txts_from_coco_object(
    output_dir, coco, ignore_negative_samples=False, mp=False, disable_symlink=False
):
    """Creates image symlinks and annotation txts in yolo format from a coco dataset.

    Args:
        output_dir: str
            Export directory.
        coco: sahi.utils.coco.Coco
            Initialized Coco object that contains images and categories.
        ignore_negative_samples: bool
            If True ignores images without annotations in all operations.
        mp: bool
            If True, multiprocess mode is on.
            Should be called inside an `if __name__ == "__main__":` block.
        disable_symlink: bool
            If True, symlinks are not created. Instead images are copied.
    """
    logger.info("generating image symlinks and annotation files for yolo...")
    # colab's filesystem does not support symlinks; fall back to copying
    if is_colab() and not disable_symlink:
        logger.warning("symlink is not supported in colab, disabling it...")
        disable_symlink = True

    export_args = [
        (coco_image, coco.image_dir, output_dir, ignore_negative_samples, disable_symlink)
        for coco_image in coco.images
    ]
    if mp:
        with Pool(processes=48) as pool:
            pool.starmap(
                export_single_yolo_image_and_corresponding_txt,
                tqdm(export_args, total=len(export_args)),
            )
    else:
        for single_args in tqdm(export_args):
            export_single_yolo_image_and_corresponding_txt(*single_args)
get_imageid2annotationlist_mapping(coco_dict)

Get image_id to annotationlist mapping for faster indexing.

Args

coco_dict : dict
    coco dict with fields "images", "annotations", "categories"

Returns

image_id_to_annotation_list : dict
{
    1: [CocoAnnotation, CocoAnnotation, CocoAnnotation],
    2: [CocoAnnotation]
}

where
CocoAnnotation = {
    'area': 2795520,
    'bbox': [491.0, 1035.0, 153.0, 182.0],
    'category_id': 1,
    'id': 1,
    'image_id': 1,
    'iscrowd': 0,
    'segmentation': [[491.0, 1035.0, 644.0, 1035.0, 644.0, 1217.0, 491.0, 1217.0]]
}
Source code in sahi/utils/coco.py
def get_imageid2annotationlist_mapping(coco_dict: dict) -> dict[int, list[CocoAnnotation]]:
    """Index coco annotations by image id for faster lookup.

    Args:
        coco_dict: dict
            coco dict with fields "images", "annotations", "categories".

    Returns:
        image_id_to_annotation_list: dict
            Maps each image id to the list of its annotation dicts, e.g.
            {
                1: [annotation, annotation, annotation],
                2: [annotation]
            }
            where each annotation is a raw COCO annotation dict with keys such as
            'area', 'bbox', 'category_id', 'id', 'image_id', 'iscrowd', 'segmentation'.
    """
    logger.debug("indexing coco dataset annotations...")
    image_id_to_annotation_list: dict = defaultdict(list)
    for annotation in coco_dict["annotations"]:
        image_id_to_annotation_list[annotation["image_id"]].append(annotation)

    return image_id_to_annotation_list
merge(coco_dict1, coco_dict2, desired_name2id=None)

Combines 2 coco formatted annotations dicts, and returns the combined coco dict.

Parameters:

Name Type Description Default
coco_dict1

dict First coco dictionary.

required
coco_dict2

dict Second coco dictionary.

required
desired_name2id

dict

required

Returns: merged_coco_dict : dict Merged COCO dict.

Source code in sahi/utils/coco.py
def merge(coco_dict1: dict, coco_dict2: dict, desired_name2id: dict | None = None) -> dict:
    """Combines 2 coco formatted annotations dicts, and returns the combined coco dict.

    Args:
        coco_dict1 : dict
            First coco dictionary.
        coco_dict2 : dict
            Second coco dictionary.
        desired_name2id : dict
            {"human": 1, "car": 2, "big_vehicle": 3}
    Returns:
        merged_coco_dict : dict
            Merged COCO dict.
    """

    # copy input dicts so that original dicts are not affected
    temp_coco_dict1 = copy.deepcopy(coco_dict1)
    temp_coco_dict2 = copy.deepcopy(coco_dict2)

    # rearrange categories if any desired_name2id mapping is given
    if desired_name2id is not None:
        temp_coco_dict1 = update_categories(desired_name2id, temp_coco_dict1)
        temp_coco_dict2 = update_categories(desired_name2id, temp_coco_dict2)

    # rearrange categories of the second coco based on first, if their categories are not the same
    if temp_coco_dict1["categories"] != temp_coco_dict2["categories"]:
        desired_name2id = {category["name"]: category["id"] for category in temp_coco_dict1["categories"]}
        temp_coco_dict2 = update_categories(desired_name2id, temp_coco_dict2)

    # calculate first image and annotation index of the second coco file
    max_image_id = np.array([image["id"] for image in coco_dict1["images"]]).max()
    max_annotation_id = np.array([annotation["id"] for annotation in coco_dict1["annotations"]]).max()

    merged_coco_dict = temp_coco_dict1

    for image in temp_coco_dict2["images"]:
        image["id"] += max_image_id + 1
        merged_coco_dict["images"].append(image)

    for annotation in temp_coco_dict2["annotations"]:
        annotation["image_id"] += max_image_id + 1
        annotation["id"] += max_annotation_id + 1
        merged_coco_dict["annotations"].append(annotation)

    return merged_coco_dict
merge_from_file(coco_path1, coco_path2, save_path)

Combines 2 coco formatted annotations files given their paths, and saves the combined file to save_path.

Args:

coco_path1 : str
    Path for the first coco file.
coco_path2 : str
    Path for the second coco file.
save_path : str
    "dirname/coco.json"
Source code in sahi/utils/coco.py
def merge_from_file(coco_path1: str, coco_path2: str, save_path: str):
    """Combines 2 coco formatted annotations files given their paths, and saves the combined file to save_path.

    Args:
        coco_path1 : str
            Path for the first coco file.
        coco_path2 : str
            Path for the second coco file.
        save_path : str
            "dirname/coco.json"
    """
    # load both coco files, merge them, then export the combined dict
    first_coco_dict = load_json(coco_path1)
    second_coco_dict = load_json(coco_path2)
    merged_coco_dict = merge(first_coco_dict, second_coco_dict)
    save_json(merged_coco_dict, save_path)
merge_from_list(coco_dict_list, desired_name2id=None, verbose=1)

Combines a list of coco formatted annotations dicts, and returns the combined coco dict.

Args:

coco_dict_list: list of dict
    A list of coco dicts
desired_name2id: dict
    {"human": 1, "car": 2, "big_vehicle": 3}
verbose: bool
    If True, merging info is printed

Returns:

merged_coco_dict: dict
    Merged COCO dict.
Source code in sahi/utils/coco.py
def merge_from_list(coco_dict_list, desired_name2id=None, verbose=1):
    """Combines a list of coco formatted annotations dicts, and returns the combined coco dict.

    Args:

        coco_dict_list: list of dict
            A list of coco dicts
        desired_name2id: dict
            {"human": 1, "car": 2, "big_vehicle": 3}
        verbose: bool
            If True, merging info is printed
    Returns:

        merged_coco_dict: dict
            Merged COCO dict.
    """
    if verbose:
        if not desired_name2id:
            print("'desired_name2id' is not specified, combining all categories.")

    # create desired_name2id by combinin all categories, if desired_name2id is not specified
    if desired_name2id is None:
        desired_name2id = {}
        ind = 0
        for coco_dict in coco_dict_list:
            temp_categories = copy.deepcopy(coco_dict["categories"])
            for temp_category in temp_categories:
                if temp_category["name"] not in desired_name2id:
                    desired_name2id[temp_category["name"]] = ind
                    ind += 1
                else:
                    continue

    for ind, coco_dict in enumerate(coco_dict_list):
        if ind == 0:
            merged_coco_dict = copy.deepcopy(coco_dict)
        else:
            merged_coco_dict = merge(merged_coco_dict, coco_dict, desired_name2id)

    # print categories
    if verbose:
        print(
            "Categories are formed as:\n",
            merged_coco_dict["categories"],
        )

    return merged_coco_dict
remove_invalid_coco_results(result_list_or_path, dataset_dict_or_path=None)
Removes invalid predictions from coco result such as
  • negative bbox value
  • extreme bbox value

Parameters:

Name Type Description Default
result_list_or_path list | str

path or list for coco result json

required
dataset_dict_or_path optional

path or dict for coco dataset json

None
Source code in sahi/utils/coco.py
def remove_invalid_coco_results(result_list_or_path: list | str, dataset_dict_or_path: dict | str | None = None):
    """
    Removes invalid predictions from coco result such as:
        - negative bbox value
        - extreme bbox value

    Args:
        result_list_or_path: path or list for coco result json
        dataset_dict_or_path (optional): path or dict for coco dataset json
    """

    # prepare coco results
    if isinstance(result_list_or_path, str):
        result_list = load_json(result_list_or_path)
    elif isinstance(result_list_or_path, list):
        result_list = result_list_or_path
    else:
        raise TypeError('incorrect type for "result_list_or_path"')  # type: ignore

    # prepare image info from coco dataset
    if dataset_dict_or_path is not None:
        if isinstance(dataset_dict_or_path, str):
            dataset_dict = load_json(dataset_dict_or_path)
        elif isinstance(dataset_dict_or_path, dict):
            dataset_dict = dataset_dict_or_path
        else:
            raise TypeError('incorrect type for "dataset_dict"')  # type: ignore
        image_id_to_height = {}
        image_id_to_width = {}
        for coco_image in dataset_dict["images"]:
            image_id_to_height[coco_image["id"]] = coco_image["height"]
            image_id_to_width[coco_image["id"]] = coco_image["width"]

    # remove invalid predictions
    fixed_result_list = []
    for coco_result in result_list:
        bbox = coco_result["bbox"]
        # ignore invalid predictions
        if not bbox:
            print("ignoring invalid prediction with empty bbox")
            continue
        if bbox[0] < 0 or bbox[1] < 0 or bbox[2] < 0 or bbox[3] < 0:
            print(f"ignoring invalid prediction with bbox: {bbox}")
            continue
        if dataset_dict_or_path is not None:
            if (
                bbox[1] > image_id_to_height[coco_result["image_id"]]
                or bbox[3] > image_id_to_height[coco_result["image_id"]]
                or bbox[0] > image_id_to_width[coco_result["image_id"]]
                or bbox[2] > image_id_to_width[coco_result["image_id"]]
            ):
                print(f"ignoring invalid prediction with bbox: {bbox}")
                continue
        fixed_result_list.append(coco_result)
    return fixed_result_list
update_categories(desired_name2id, coco_dict)

Rearranges category mapping of given COCO dictionary based on given category_mapping. Can also be used to filter some of the categories.

Args:

desired_name2id : dict
    {"big_vehicle": 1, "car": 2, "human": 3}
coco_dict : dict
    COCO formatted dictionary.

Returns:

Name Type Description
coco_target dict

dict COCO dict with updated/filtered categories.

Source code in sahi/utils/coco.py
def update_categories(desired_name2id: dict, coco_dict: dict) -> dict:
    """Rearranges category mapping of given COCO dictionary based on given category_mapping. Can also be used to filter
    some of the categories.

    Categories missing from desired_name2id are dropped together with all of
    their annotations.

    Args:
        desired_name2id : dict
            {"big_vehicle": 1, "car": 2, "human": 3}
        coco_dict : dict
            COCO formatted dictionary.

    Returns:
        coco_target : dict
            COCO dict with updated/filtered categories.
    """
    # work on a copy so the input dict is left untouched
    coco_source = copy.deepcopy(coco_dict)

    # map each current category id to its desired id; -1 marks "drop"
    currentid2desiredid_mapping = {}
    for category in coco_source["categories"]:
        currentid2desiredid_mapping[category["id"]] = desired_name2id.get(category["name"], -1)

    # keep and remap only the annotations whose category survives
    updated_annotations = []
    for annotation in coco_source["annotations"]:
        desired_category_id = currentid2desiredid_mapping[annotation["category_id"]]
        if desired_category_id != -1:
            annotation["category_id"] = desired_category_id
            updated_annotations.append(annotation)

    # rebuild the category list directly from desired_name2id
    updated_categories = [
        {"name": name, "supercategory": name, "id": category_id} for name, category_id in desired_name2id.items()
    ]

    return {
        "images": coco_source["images"],
        "annotations": updated_annotations,
        "categories": updated_categories,
    }
update_categories_from_file(desired_name2id, coco_path, save_path)

Rearranges category mapping of a COCO dictionary in coco_path based on given category_mapping. Can also be used to filter some of the categories.

Parameters:

Name Type Description Default
desired_name2id

dict

required
coco_path

str "dirname/coco.json"

required
Source code in sahi/utils/coco.py
def update_categories_from_file(desired_name2id: dict, coco_path: str, save_path: str) -> None:
    """Rearranges category mapping of a COCO dictionary in coco_path based on given category_mapping. Can also be used
    to filter some of the categories.

    Args:
        desired_name2id : dict
            {"human": 1, "car": 2, "big_vehicle": 3}
        coco_path : str
            "dirname/coco.json"
        save_path : str
            Output path for the updated coco file.
    """
    # read the source file, remap its categories, then export the result
    source_coco_dict = load_json(coco_path)
    updated_coco_dict = update_categories(desired_name2id, source_coco_dict)
    save_json(updated_coco_dict, save_path)
cv
Classes
Colors
Source code in sahi/utils/cv.py
class Colors:
    """Fixed 20-color palette used for drawing detection visuals."""

    def __init__(self):
        hex_colors = (
            "FF3838 2C99A8 FF701F 6473FF CFD231 48F90A 92CC17 3DDB86 1A9334 00D4BB "
            "FF9D97 00C2FF 344593 FFB21D 0018EC 8438FF 520085 CB38FF FF95C8 FF37C7"
        )

        self.palette = [self.hex_to_rgb(f"#{code}") for code in hex_colors.split()]
        self.n = len(self.palette)

    def __call__(self, ind, bgr: bool = False):
        """Return the palette color for the given index.

        Args:
            ind (int): The index to convert; wraps around the palette size.
            bgr (bool, optional): Whether to return the color in BGR order. Defaults to False.

        Returns:
            tuple: The color code in RGB or BGR format, depending on the value of `bgr`.
        """
        red, green, blue = self.palette[int(ind) % self.n]
        if bgr:
            return (blue, green, red)
        return (red, green, blue)

    @staticmethod
    def hex_to_rgb(hex_code):
        """Convert a '#RRGGBB' hexadecimal color code to an (R, G, B) tuple.

        Args:
            hex_code (str): The hexadecimal color code to convert, with leading '#'.

        Returns:
            tuple: A tuple representing the RGB values in the order (R, G, B).
        """
        # positions 1, 3, 5 are the starts of the RR, GG, BB hex pairs
        return tuple(int(hex_code[pos : pos + 2], 16) for pos in (1, 3, 5))
Functions
__call__(ind, bgr=False)

Convert an index to a color code.

Parameters:

Name Type Description Default
ind int

The index to convert.

required
bgr bool

Whether to return the color code in BGR format. Defaults to False.

False

Returns:

Name Type Description
tuple

The color code in RGB or BGR format, depending on the value of bgr.

Source code in sahi/utils/cv.py
def __call__(self, ind, bgr: bool = False):
    """Convert an index to a color code.

    Args:
        ind (int): The index to convert.
        bgr (bool, optional): Whether to return the color code in BGR format. Defaults to False.

    Returns:
        tuple: The color code in RGB or BGR format, depending on the value of `bgr`.
    """
    # modulo wraps any integer index onto the fixed-size palette
    color_codes = self.palette[int(ind) % self.n]
    return (color_codes[2], color_codes[1], color_codes[0]) if bgr else color_codes
hex_to_rgb(hex_code) staticmethod

Converts a hexadecimal color code to RGB format.

Parameters:

Name Type Description Default
hex_code str

The hexadecimal color code to convert.

required

Returns:

Name Type Description
tuple

A tuple representing the RGB values in the order (R, G, B).

Source code in sahi/utils/cv.py
@staticmethod
def hex_to_rgb(hex_code):
    """Converts a hexadecimal color code to RGB format.

    Args:
        hex_code (str): The hexadecimal color code to convert.

    Returns:
        tuple: A tuple representing the RGB values in the order (R, G, B).
    """
    rgb = []
    # offsets 0, 2, 4 skip past the leading '#' to the RR, GG, BB hex pairs
    for i in (0, 2, 4):
        rgb.append(int(hex_code[1 + i : 1 + i + 2], 16))
    return tuple(rgb)
Functions
apply_color_mask(image, color)

Applies color mask to given input image.

Parameters:

Name Type Description Default
image ndarray

The input image to apply the color mask to.

required
color tuple

The RGB color tuple to use for the mask.

required

Returns:

Type Description

np.ndarray: The resulting image with the applied color mask.

Source code in sahi/utils/cv.py
def apply_color_mask(image: np.ndarray, color: tuple[int, int, int]):
    """Applies color mask to given input image.

    Args:
        image (np.ndarray): The input image to apply the color mask to.
        color (tuple): The RGB color tuple to use for the mask.

    Returns:
        np.ndarray: The resulting image with the applied color mask.
    """
    # build one uint8 channel per color component, painting only pixels where mask == 1
    channels = []
    for channel_value in color:
        channel = np.zeros_like(image).astype(np.uint8)
        channel[image == 1] = channel_value
        channels.append(channel)
    return np.stack(channels, axis=2)
convert_image_to(read_path, extension='jpg', grayscale=False)

Reads an image from the given path and saves it with the specified extension.

Parameters:

Name Type Description Default
read_path str

The path to the image file.

required
extension str

The desired file extension for the saved image. Defaults to "jpg".

'jpg'
grayscale bool

Whether to convert the image to grayscale. Defaults to False.

False
Source code in sahi/utils/cv.py
def convert_image_to(read_path, extension: str = "jpg", grayscale: bool = False):
    """Reads an image from the given path and saves it with the specified extension.

    Args:
        read_path (str): The path to the image file.
        extension (str, optional): The desired file extension for the saved image. Defaults to "jpg".
        grayscale (bool, optional): Whether to convert the image to grayscale. Defaults to False.
    """
    image = cv2.imread(read_path)
    # reuse the original path minus its extension as the output stem
    stem, _ = os.path.splitext(read_path)
    if grayscale:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        stem += "_gray"
    cv2.imwrite(stem + "." + extension, image)
crop_object_predictions(image, object_prediction_list, output_dir='', file_name='prediction_visual', export_format='png')

Crops bounding boxes over the source image and exports it to the output folder.

Parameters:

Name Type Description Default
image ndarray

The source image to crop bounding boxes from.

required
object_prediction_list

A list of object predictions.

required
output_dir str

The directory where the resulting visualizations will be exported. Defaults to an empty string.

''
file_name str

The name of the exported file. The exported file will be saved as output_dir + file_name + ".png". Defaults to "prediction_visual".

'prediction_visual'
export_format str

The format of the exported file. Can be specified as 'jpg' or 'png'. Defaults to "png".

'png'
Source code in sahi/utils/cv.py
def crop_object_predictions(
    image: np.ndarray,
    object_prediction_list,
    output_dir: str = "",
    file_name: str = "prediction_visual",
    export_format: str = "png",
):
    """Crops bounding boxes over the source image and exports it to the output folder.

    Args:
        image (np.ndarray): The source image to crop bounding boxes from.
        object_prediction_list: A list of object predictions.
        output_dir (str): The directory where the resulting visualizations will be exported. Defaults to an empty string.
        file_name (str): The name of the exported file. The exported file will be saved as `output_dir + file_name + ".png"`. Defaults to "prediction_visual".
        export_format (str): The format of the exported file. Can be specified as 'jpg' or 'png'. Defaults to "png".
    """  # noqa

    # make sure the export folder exists
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    for ind, object_prediction in enumerate(object_prediction_list):
        # deepcopy the prediction so the original list entry is untouched
        prediction = object_prediction.deepcopy()
        xmin, ymin, xmax, ymax = prediction.bbox.to_xyxy()
        category_id = prediction.category.id
        # deepcopy the crop so the source image stays unmodified
        cropped_img = copy.deepcopy(image[int(ymin) : int(ymax), int(xmin) : int(xmax), :])
        save_path = os.path.join(
            output_dir,
            file_name + "_box" + str(ind) + "_class" + str(category_id) + "." + export_format,
        )
        cv2.imwrite(save_path, cv2.cvtColor(cropped_img, cv2.COLOR_RGB2BGR))
get_bbox_from_bool_mask(bool_mask)

Generate VOC bounding box [xmin, ymin, xmax, ymax] from given boolean mask.

Parameters:

Name Type Description Default
bool_mask ndarray

2D boolean mask.

required

Returns:

Type Description
list[int] | None

Optional[List[int]]: VOC bounding box [xmin, ymin, xmax, ymax] or None if no bounding box is found.

Source code in sahi/utils/cv.py
def get_bbox_from_bool_mask(bool_mask: np.ndarray) -> list[int] | None:
    """Generate VOC bounding box [xmin, ymin, xmax, ymax] from given boolean mask.

    Args:
        bool_mask (np.ndarray): 2D boolean mask.

    Returns:
        Optional[List[int]]: VOC bounding box [xmin, ymin, xmax, ymax] or None if no bounding box is found.
    """
    rows = np.any(bool_mask, axis=1)
    cols = np.any(bool_mask, axis=0)

    if not np.any(rows) or not np.any(cols):
        return None

    ymin, ymax = np.where(rows)[0][[0, -1]]
    xmin, xmax = np.where(cols)[0][[0, -1]]
    width = xmax - xmin
    height = ymax - ymin

    if width == 0 or height == 0:
        return None

    return [xmin, ymin, xmax, ymax]
get_bbox_from_coco_segmentation(coco_segmentation)

Generate voc box ([xmin, ymin, xmax, ymax]) from given coco segmentation.

Source code in sahi/utils/cv.py
def get_bbox_from_coco_segmentation(coco_segmentation):
    """Generate voc box ([xmin, ymin, xmax, ymax]) from given coco segmentation."""
    # flat polygons alternate x and y coordinates
    all_x = [coord for polygon in coco_segmentation for coord in polygon[::2]]
    all_y = [coord for polygon in coco_segmentation for coord in polygon[1::2]]
    if not all_x or not all_y:
        return None
    return [min(all_x), min(all_y), max(all_x), max(all_y)]
get_bool_mask_from_coco_segmentation(coco_segmentation, width, height)

Convert coco segmentation to 2D boolean mask of given height and width.

Parameters: - coco_segmentation: list of points representing the coco segmentation - width: width of the boolean mask - height: height of the boolean mask

Returns: - bool_mask: 2D boolean mask of size (height, width)

Source code in sahi/utils/cv.py
def get_bool_mask_from_coco_segmentation(coco_segmentation: list[list[float]], width: int, height: int) -> np.ndarray:
    """Convert coco segmentation to 2D boolean mask of given height and width.

    Args:
        coco_segmentation: List of flat point lists representing the coco segmentation polygons.
        width: Width of the boolean mask.
        height: Height of the boolean mask.

    Returns:
        bool_mask: 2D boolean mask of size (height, width).
    """
    size = [height, width]
    points = [np.array(point).reshape(-1, 2).round().astype(int) for point in coco_segmentation]
    bool_mask = np.zeros(size)
    bool_mask = cv2.fillPoly(bool_mask, points, (1.0,))
    # np.ndarray.astype returns a new array; the original code discarded its
    # result and returned a float mask despite the documented bool contract
    return bool_mask.astype(bool)
get_coco_segmentation_from_bool_mask(bool_mask)

Convert boolean mask to coco segmentation format [ [x1, y1, x2, y2, x3, y3, ...], [x1, y1, x2, y2, x3, y3, ...], ... ]

Source code in sahi/utils/cv.py
def get_coco_segmentation_from_bool_mask(bool_mask: np.ndarray) -> list[list[float]]:
    """
    Convert boolean mask to coco segmentation format
    [
        [x1, y1, x2, y2, x3, y3, ...],
        [x1, y1, x2, y2, x3, y3, ...],
        ...
    ]
    """
    # pad the mask by one pixel so contours touching the border are closed,
    # then shift the detected coordinates back with offset=(-1, -1)
    mask = np.squeeze(bool_mask).astype(np.uint8)
    padded_mask = cv2.copyMakeBorder(mask, 1, 1, 1, 1, cv2.BORDER_CONSTANT, value=(0, 0, 0))
    contours = cv2.findContours(padded_mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE, offset=(-1, -1))
    # findContours returns (contours, hierarchy) or (image, contours, hierarchy)
    # depending on the OpenCV version
    contours = contours[0] if len(contours) == 2 else contours[1]
    # keep only polygons with at least 3 points (6 flat coordinates)
    return [contour.flatten().tolist() for contour in contours if contour.size >= 6]
get_coco_segmentation_from_obb_points(obb_points)

Convert OBB (Oriented Bounding Box) points to COCO polygon format.

Parameters:

Name Type Description Default
obb_points ndarray

np.ndarray OBB points tensor from ultralytics.engine.results.OBB Shape: (4, 2) containing 4 points with (x,y) coordinates each

required

Returns:

Type Description
list[list[float]]

List[List[float]]: Polygon points in COCO format [[x1, y1, x2, y2, x3, y3, x4, y4], [...], ...]

Source code in sahi/utils/cv.py
def get_coco_segmentation_from_obb_points(obb_points: np.ndarray) -> list[list[float]]:
    """Convert OBB (Oriented Bounding Box) points to COCO polygon format.

    Args:
        obb_points: np.ndarray
            OBB points tensor from ultralytics.engine.results.OBB
            Shape: (4, 2) containing 4 points with (x,y) coordinates each

    Returns:
        List[List[float]]: Polygon points in COCO format
            [[x1, y1, x2, y2, x3, y3, x4, y4, x1, y1]]
    """
    # flatten from (4, 2) to [x1, y1, x2, y2, x3, y3, x4, y4]
    flat_coords = obb_points.reshape(-1).tolist()
    # close the polygon by repeating the first (x, y) pair at the end
    return [flat_coords + flat_coords[:2]]
get_video_reader(source, save_dir, frame_skip_interval, export_visual=False, view_visual=False)

Creates OpenCV video capture object from given video file path.

Parameters:

Name Type Description Default
source str

Video file path

required
save_dir str

Video export directory

required
frame_skip_interval int

Frame skip interval

required
export_visual bool

Set True if you want to export visuals

False
view_visual bool

Set True if you want to render visual

False

Returns:

Name Type Description
iterator Generator[Image]

Pillow Image

video_writer VideoWriter | None

cv2.VideoWriter

video_file_name str

video name with extension

Source code in sahi/utils/cv.py
def get_video_reader(
    source: str,
    save_dir: str,
    frame_skip_interval: int,
    export_visual: bool = False,
    view_visual: bool = False,
) -> tuple[Generator[Image.Image], cv2.VideoWriter | None, str, int]:
    """Creates OpenCV video capture object from given video file path.

    Args:
        source: Video file path
        save_dir: Video export directory
        frame_skip_interval: Frame skip interval (frames skipped between consecutive reads)
        export_visual: Set True if you want to export visuals
        view_visual: Set True if you want to render visual

    Returns:
        iterator: Generator yielding frames as Pillow Images (RGB)
        video_writer: cv2.VideoWriter if export_visual is True, else None
        video_file_name: video name with extension
        num_frames: total frame count (reduced by the skip interval when view_visual is True)
    """
    # get video name with extension
    video_file_name = os.path.basename(source)
    # get video from video path
    video_capture = cv2.VideoCapture(source)

    num_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
    if view_visual:
        # when viewing, frames are skipped, so fewer frames are actually shown
        num_frames /= frame_skip_interval + 1
        num_frames = int(num_frames)

    def read_video_frame(video_capture, frame_skip_interval) -> Generator[Image.Image]:
        if view_visual:
            window_name = f"Prediction of {video_file_name!s}"
            cv2.namedWindow(window_name, cv2.WINDOW_AUTOSIZE)
            # show a black placeholder frame until the first prediction arrives
            default_image = np.zeros((480, 640, 3), dtype=np.uint8)
            cv2.imshow(window_name, default_image)

            # NOTE(review): isOpened is referenced without parentheses, so this
            # evaluates the (always truthy) bound method; the loop exits only
            # via the break below or the Esc handler
            while video_capture.isOpened:
                frame_num = video_capture.get(cv2.CAP_PROP_POS_FRAMES)
                video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_num + frame_skip_interval)

                # poll the keyboard for interactive seeking within the video
                k = cv2.waitKey(20)
                frame_num = video_capture.get(cv2.CAP_PROP_POS_FRAMES)

                if k == 27:
                    print(
                        "\n===========================Closing==========================="
                    )  # Exit the prediction, Key = Esc
                    exit()
                if k == 100:
                    frame_num += 100  # Skip 100 frames, Key = d
                if k == 97:
                    frame_num -= 100  # Prev 100 frames, Key = a
                if k == 103:
                    frame_num += 20  # Skip 20 frames, Key = g
                if k == 102:
                    frame_num -= 20  # Prev 20 frames, Key = f
                video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_num)

                ret, frame = video_capture.read()
                if not ret:
                    print("\n=========================== Video Ended ===========================")
                    break
                # OpenCV reads BGR; convert to RGB before wrapping in a PIL image
                yield Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        else:
            # see NOTE above: isOpened is not actually called here either
            while video_capture.isOpened:
                frame_num = video_capture.get(cv2.CAP_PROP_POS_FRAMES)
                video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_num + frame_skip_interval)

                ret, frame = video_capture.read()
                if not ret:
                    print("\n=========================== Video Ended ===========================")
                    break
                yield Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    if export_visual:
        # get video properties and create VideoWriter object
        if frame_skip_interval != 0:
            fps = video_capture.get(cv2.CAP_PROP_FPS)  # original fps of video
            # The fps of export video is increasing during view_image because frame is skipped
            fps = (
                fps / frame_skip_interval
            )  # How many time_interval equals to original fps. One time_interval skip x frames.
        else:
            fps = video_capture.get(cv2.CAP_PROP_FPS)

        w = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        size = (w, h)
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # pyright: ignore[reportAttributeAccessIssue]
        video_writer = cv2.VideoWriter(os.path.join(save_dir, video_file_name), fourcc, fps, size)
    else:
        video_writer = None

    return read_video_frame(video_capture, frame_skip_interval), video_writer, video_file_name, num_frames
ipython_display(image)

Displays numpy image in notebook.

If the input image is in range 0..1, please first multiply it by 255. Assumes the image is an ndarray of shape [height, width, channels] where channels can be 1, 3 or 4.

Source code in sahi/utils/cv.py
def ipython_display(image: np.ndarray):
    """Displays numpy image in notebook.

    If input image is in range 0..1, please first multiply img by 255
    Assumes image is ndarray of shape [height, width, channels] where channels can be 1, 3 or 4
    """
    import IPython

    # encode the image as PNG bytes for inline notebook rendering
    bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    _, encoded = cv2.imencode(".png", bgr_image)
    display_image = IPython.display.Image(data=encoded)  # type: ignore
    IPython.display.display(display_image)  # type: ignore
normalize_numpy_image(image)

Normalizes numpy image.

Source code in sahi/utils/cv.py
def normalize_numpy_image(image: np.ndarray):
    """Scale a numpy image by its maximum value so the largest entry becomes 1."""
    return image / image.max()
read_image(image_path)

Loads image as a numpy array from the given path.

Parameters:

Name Type Description Default
image_path str

The path to the image file.

required

Returns:

Type Description
ndarray

numpy.ndarray: The loaded image as a numpy array.

Source code in sahi/utils/cv.py
def read_image(image_path: str) -> np.ndarray:
    """Loads image as a numpy array from the given path.

    Args:
        image_path (str): The path to the image file.

    Returns:
        numpy.ndarray: The loaded image as a numpy array (RGB channel order).
    """
    # cv2 loads in BGR order; convert to RGB before returning
    bgr_image = cv2.imread(image_path)
    return cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
read_image_as_pil(image, exif_fix=True)

Loads an image as PIL.Image.Image.

Parameters:

Name Type Description Default
image Union[Image, str, ndarray]

The image to be loaded. It can be an image path or URL (str), a numpy image (np.ndarray), or a PIL.Image object.

required
exif_fix bool

Whether to apply an EXIF fix to the image. Defaults to False.

True

Returns:

Type Description
Image

PIL.Image.Image: The loaded image as a PIL.Image object.

Source code in sahi/utils/cv.py
def read_image_as_pil(image: Image.Image | str | np.ndarray, exif_fix: bool = True) -> Image.Image:
    """Loads an image as PIL.Image.Image.

    Args:
        image (Union[Image.Image, str, np.ndarray]): The image to be loaded. It can be an image path or URL (str),
            a numpy image (np.ndarray), or a PIL.Image object.
        exif_fix (bool, optional): Whether to apply an EXIF fix to the image. Defaults to True.

    Returns:
        PIL.Image.Image: The loaded image as a PIL.Image object.

    Raises:
        TypeError: If the input type, or the fallback-read array shape, is not supported.
    """
    # https://stackoverflow.com/questions/56174099/how-to-load-images-larger-than-max-image-pixels-with-pil
    Image.MAX_IMAGE_PIXELS = None

    if isinstance(image, Image.Image):
        image_pil = image
    elif isinstance(image, str):
        # read image if str image path is provided
        try:
            image_pil = Image.open(
                BytesIO(requests.get(image, stream=True).content) if str(image).startswith("http") else image
            ).convert("RGB")
            if exif_fix:
                ImageOps.exif_transpose(image_pil, in_place=True)
        except Exception as e:  # handle large/tiff image reading
            logger.error(f"PIL failed reading image with error {e}, trying skimage instead")
            try:
                import skimage.io
            except ImportError:
                raise ImportError("Please run 'pip install -U scikit-image imagecodecs' for large image handling.")
            image_sk = skimage.io.imread(image).astype(np.uint8)
            if len(image_sk.shape) == 2:  # b&w
                image_pil = Image.fromarray(image_sk, mode="1")
            elif image_sk.shape[2] == 4:  # rgba
                image_pil = Image.fromarray(image_sk, mode="RGBA")
            elif image_sk.shape[2] == 3:  # rgb
                image_pil = Image.fromarray(image_sk, mode="RGB")
            else:
                # bugfix: formatting image_sk.shape[3] raised IndexError here
                # (shape has only 3 entries); report the full shape instead
                raise TypeError(f"image with shape: {image_sk.shape} is not supported.")
    elif isinstance(image, np.ndarray):
        # check if image is in CHW format (Channels, Height, Width)
        # heuristic: 3 dimensions, first dim (channels) < 5, last dim (width) > 4
        if image.ndim == 3 and image.shape[0] < 5:  # image in CHW
            if image.shape[2] > 4:
                # convert CHW to HWC (Height, Width, Channels)
                image = np.transpose(image, (1, 2, 0))
        image_pil = Image.fromarray(image)
    else:
        raise TypeError("read image with 'pillow' using 'Image.open()'")
    return image_pil
read_large_image(image_path)

Reads a large image from the specified image path.

Parameters:

Name Type Description Default
image_path str

The path to the image file.

required

Returns:

Name Type Description
tuple

A tuple containing the image data and a flag indicating whether cv2 was used to read the image. The image data is a numpy array representing the image in RGB format. The flag is True if cv2 was used, False otherwise.

Source code in sahi/utils/cv.py
def read_large_image(image_path: str):
    """Reads a large image from the specified image path.

    Args:
        image_path (str): The path to the image file.

    Returns:
        tuple: A tuple containing the image data and a flag indicating whether cv2 was used to read the image.
            The image data is a numpy array representing the image in RGB format.
            The flag is True if cv2 was used, False otherwise.
    """
    use_cv2 = True
    # read image, cv2 fails on large files
    try:
        # convert to rgb (cv2 reads in bgr)
        img_cv2 = cv2.imread(image_path, 1)
        image0 = cv2.cvtColor(img_cv2, cv2.COLOR_BGR2RGB)
    except Exception as e:
        logger.error(f"OpenCV failed reading image with error {e}, trying skimage instead")
        try:
            import skimage.io
        except ImportError:
            raise ImportError(
                'Please run "pip install -U scikit-image" to install scikit-image first for large image handling.'
            )
        # bugfix: the parameter was renamed as_grey -> as_gray in scikit-image
        # 0.15 and the old spelling was removed, so as_grey raised a TypeError
        image0 = skimage.io.imread(image_path, as_gray=False).astype(np.uint8)  # [::-1]
        use_cv2 = False
    return image0, use_cv2
select_random_color()

Selects a random color from a predefined list of colors.

Returns:

Name Type Description
list

A list representing the RGB values of the selected color.

Source code in sahi/utils/cv.py
def select_random_color():
    """Selects a random color from a predefined list of colors.

    Returns:
        list: A list representing the RGB values of the selected color.
    """
    colors = [
        [0, 255, 0],
        [0, 0, 255],
        [255, 0, 0],
        [0, 255, 255],
        [255, 255, 0],
        [255, 0, 255],
        [80, 70, 180],
        [250, 80, 190],
        [245, 145, 50],
        [70, 150, 250],
        [50, 190, 190],
    ]
    # bugfix: randrange(0, 10) could never pick the last palette entry;
    # random.choice draws uniformly over the whole list
    return random.choice(colors)
visualize_object_predictions(image, object_prediction_list, rect_th=None, text_size=None, text_th=None, color=None, hide_labels=False, hide_conf=False, output_dir=None, file_name='prediction_visual', export_format='png')

Visualizes prediction category names, bounding boxes over the source image and exports it to output folder.

Parameters:

Name Type Description Default
object_prediction_list

a list of prediction.ObjectPrediction

required
rect_th int | None

rectangle thickness

None
text_size float | None

size of the category name over box

None
text_th int | None

text thickness

None
color tuple | None

annotation color in the form: (0, 255, 0)

None
hide_labels bool

hide labels

False
hide_conf bool

hide confidence

False
output_dir str | None

directory for resulting visualization to be exported

None
file_name str | None

exported file will be saved as: output_dir+file_name+".png"

'prediction_visual'
export_format str | None

can be specified as 'jpg' or 'png'

'png'
Source code in sahi/utils/cv.py
def visualize_object_predictions(
    image: np.ndarray,
    object_prediction_list,
    rect_th: int | None = None,
    text_size: float | None = None,
    text_th: int | None = None,
    color: tuple | None = None,
    hide_labels: bool = False,
    hide_conf: bool = False,
    output_dir: str | None = None,
    file_name: str | None = "prediction_visual",
    export_format: str | None = "png",
):
    """Visualizes prediction category names, bounding boxes over the source image and exports it to output folder.

    Args:
        image: source image as a numpy array; treated as RGB (it is converted
            to BGR only when writing to disk)
        object_prediction_list: a list of prediction.ObjectPrediction
        rect_th: rectangle thickness; derived from the image size when None
        text_size: size of the category name over box; derived from rect_th when None
        text_th: text thickness; derived from rect_th when None
        color: annotation color in the form: (0, 255, 0); when None a
            per-category color palette is used instead
        hide_labels: hide labels
        hide_conf: hide confidence
        output_dir: directory for resulting visualization to be exported
        file_name: exported file will be saved as: output_dir+file_name+".png"
        export_format: can be specified as 'jpg' or 'png'

    Returns:
        dict: {"image": annotated image as a numpy array,
            "elapsed_time": visualization duration in seconds}
    """
    elapsed_time = time.time()
    # deepcopy image so that original is not altered
    image = copy.deepcopy(image)
    # select predefined classwise color palette if not specified
    if color is None:
        colors = Colors()
    else:
        colors = None
    # set rect_th for boxes (scaled from the image dimensions, minimum 2)
    rect_th = rect_th or max(round(sum(image.shape) / 2 * 0.003), 2)
    # set text_th for category names
    text_th = text_th or max(rect_th - 1, 1)
    # set text_size for category names
    text_size = text_size or rect_th / 3

    # add masks or obb polygons to image if present
    for object_prediction in object_prediction_list:
        # deepcopy object_prediction_list so that original is not altered
        object_prediction = object_prediction.deepcopy()
        # arange label to be displayed
        label = f"{object_prediction.category.name}"
        if not hide_conf:
            label += f" {object_prediction.score.value:.2f}"
        # set color
        if colors is not None:
            color = colors(object_prediction.category.id)
        # visualize masks or obb polygons if present
        has_mask = object_prediction.mask is not None
        is_obb_pred = False
        if has_mask:
            segmentation = object_prediction.mask.segmentation
            # a single 4-point polygon (8 coordinates) is treated as an
            # oriented bounding box rather than a free-form mask
            if len(segmentation) == 1 and len(segmentation[0]) == 8:
                is_obb_pred = True

            if is_obb_pred:
                points = np.array(segmentation).reshape((-1, 1, 2)).astype(np.int32)
                cv2.polylines(image, [points], isClosed=True, color=color or (0, 0, 0), thickness=rect_th)

                if not hide_labels:
                    # anchor the label at the polygon vertex with the largest y
                    lowest_point = points[points[:, :, 1].argmax()][0]
                    box_width, box_height = cv2.getTextSize(label, 0, fontScale=text_size, thickness=text_th)[0]
                    # place label above the anchor when it fits inside the image
                    outside = lowest_point[1] - box_height - 3 >= 0
                    text_bg_point1 = (
                        lowest_point[0],
                        lowest_point[1] - box_height - 3 if outside else lowest_point[1] + 3,
                    )
                    text_bg_point2 = (lowest_point[0] + box_width, lowest_point[1])
                    cv2.rectangle(
                        image, text_bg_point1, text_bg_point2, color or (0, 0, 0), thickness=-1, lineType=cv2.LINE_AA
                    )
                    cv2.putText(
                        image,
                        label,
                        (lowest_point[0], lowest_point[1] - 2 if outside else lowest_point[1] + box_height + 2),
                        0,
                        text_size,
                        (255, 255, 255),
                        thickness=text_th,
                    )
            else:
                # draw mask blended onto the image at 0.6 opacity
                rgb_mask = apply_color_mask(object_prediction.mask.bool_mask, color or (0, 0, 0))
                image = cv2.addWeighted(image, 1, rgb_mask, 0.6, 0)

        # add bboxes to image if is_obb_pred=False
        if not is_obb_pred:
            bbox = object_prediction.bbox.to_xyxy()

            # set bbox points
            point1, point2 = (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3]))
            # visualize boxes
            cv2.rectangle(
                image,
                point1,
                point2,
                color=color or (0, 0, 0),
                thickness=rect_th,
            )

            if not hide_labels:
                box_width, box_height = cv2.getTextSize(label, 0, fontScale=text_size, thickness=text_th)[
                    0
                ]  # label width, height
                outside = point1[1] - box_height - 3 >= 0  # label fits outside box
                point2 = point1[0] + box_width, point1[1] - box_height - 3 if outside else point1[1] + box_height + 3
                # add bounding box text
                cv2.rectangle(image, point1, point2, color or (0, 0, 0), -1, cv2.LINE_AA)  # filled
                cv2.putText(
                    image,
                    label,
                    (point1[0], point1[1] - 2 if outside else point1[1] + box_height + 2),
                    0,
                    text_size,
                    (255, 255, 255),
                    thickness=text_th,
                )

    # export if output_dir is present
    if output_dir is not None:
        # export image with predictions
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        # save inference result (convert back to BGR for cv2.imwrite)
        save_path = str(Path(output_dir) / ((file_name or "") + "." + (export_format or "")))
        cv2.imwrite(save_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))

    elapsed_time = time.time() - elapsed_time
    return {"image": image, "elapsed_time": elapsed_time}
visualize_prediction(image, boxes, classes, masks=None, rect_th=None, text_size=None, text_th=None, color=None, hide_labels=False, output_dir=None, file_name='prediction_visual')

Visualizes prediction classes, bounding boxes over the source image and exports it to output folder.

Parameters:

Name Type Description Default
image ndarray

The source image.

required
boxes List[List]

List of bounding boxes coordinates.

required
classes List[str]

List of class labels corresponding to each bounding box.

required
masks Optional[List[ndarray]]

List of masks corresponding to each bounding box. Defaults to None.

None
rect_th int

Thickness of the bounding box rectangle. Defaults to None.

None
text_size float

Size of the text for class labels. Defaults to None.

None
text_th int

Thickness of the text for class labels. Defaults to None.

None
color tuple

Color of the bounding box and text. Defaults to None.

None
hide_labels bool

Whether to hide the class labels. Defaults to False.

False
output_dir Optional[str]

Output directory to save the visualization. Defaults to None.

None
file_name Optional[str]

File name for the saved visualization. Defaults to "prediction_visual".

'prediction_visual'

Returns:

Name Type Description
dict

A dictionary containing the visualized image and the elapsed time for the visualization process.

Source code in sahi/utils/cv.py
def visualize_prediction(
    image: np.ndarray,
    boxes: list[list],
    classes: list[str],
    masks: list[np.ndarray] | None = None,
    rect_th: int | None = None,
    text_size: float | None = None,
    text_th: int | None = None,
    color: tuple | None = None,
    hide_labels: bool = False,
    output_dir: str | None = None,
    file_name: str | None = "prediction_visual",
):
    """Visualizes prediction classes, bounding boxes over the source image and exports it to output folder.

    Args:
        image (np.ndarray): The source image.
        boxes (List[List]): List of bounding boxes coordinates.
        classes (List[str]): List of class labels corresponding to each bounding box.
        masks (Optional[List[np.ndarray]], optional): List of masks corresponding to each bounding box. Defaults to None.
        rect_th (int, optional): Thickness of the bounding box rectangle. Defaults to None.
        text_size (float, optional): Size of the text for class labels. Defaults to None.
        text_th (int, optional): Thickness of the text for class labels. Defaults to None.
        color (tuple, optional): Color of the bounding box and text. When None, a
            per-class palette is used instead. Defaults to None.
        hide_labels (bool, optional): Whether to hide the class labels. Defaults to False.
        output_dir (Optional[str], optional): Output directory to save the visualization. Defaults to None.
        file_name (Optional[str], optional): File name for the saved visualization. Defaults to "prediction_visual".

    Returns:
        dict: A dictionary containing the visualized image and the elapsed time for the visualization process.
    """  # noqa

    elapsed_time = time.time()
    # deepcopy image so that original is not altered
    image = copy.deepcopy(image)
    # select predefined classwise color palette if not specified
    if color is None:
        colors = Colors()
    else:
        colors = None
    # set rect_th for boxes
    rect_th = rect_th or max(round(sum(image.shape) / 2 * 0.003), 2)
    # set text_th for category names
    text_th = text_th or max(rect_th - 1, 1)
    # set text_size for category names
    text_size = text_size or rect_th / 3

    # add masks to image if present
    if masks is not None and color is None:
        logger.error("Cannot add mask, no color tuple given")
    elif masks is not None and color is not None:
        for mask in masks:
            # deepcopy mask so that original is not altered
            mask = copy.deepcopy(mask)
            # draw mask
            rgb_mask = apply_color_mask(np.squeeze(mask), color)
            image = cv2.addWeighted(image, 1, rgb_mask, 0.6, 0)

    # add bboxes to image if present
    for box_indice in range(len(boxes)):
        # deepcopy boxso that original is not altered
        box = copy.deepcopy(boxes[box_indice])
        class_ = classes[box_indice]

        # set color
        if colors is not None:
            mycolor = colors(class_)
        elif color is not None:
            mycolor = color
        else:
            logger.error("color cannot be defined")
            continue

        # set bbox points
        point1, point2 = [int(box[0]), int(box[1])], [int(box[2]), int(box[3])]
        # visualize boxes
        cv2.rectangle(
            image,
            point1,
            point2,
            color=mycolor,
            thickness=rect_th,
        )

        if not hide_labels:
            # arange bounding box text location
            label = f"{class_}"
            box_width, box_height = cv2.getTextSize(label, 0, fontScale=text_size, thickness=text_th)[
                0
            ]  # label width, height
            outside = point1[1] - box_height - 3 >= 0  # label fits outside box
            point2 = point1[0] + box_width, point1[1] - box_height - 3 if outside else point1[1] + box_height + 3
            # bugfix: use mycolor (per-class color) for the label background so it
            # matches the box color, as done in visualize_object_predictions;
            # previously this fell back to black whenever color was None
            cv2.rectangle(image, point1, point2, mycolor, -1, cv2.LINE_AA)  # filled
            cv2.putText(
                image,
                label,
                (point1[0], point1[1] - 2 if outside else point1[1] + box_height + 2),
                0,
                text_size,
                (255, 255, 255),
                thickness=text_th,
            )
    if output_dir:
        # create output folder if not present
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        # save inference result (convert RGB back to BGR for cv2.imwrite)
        save_path = os.path.join(output_dir, (file_name or "unknown") + ".png")
        cv2.imwrite(save_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))

    elapsed_time = time.time() - elapsed_time
    return {"image": image, "elapsed_time": elapsed_time}
detectron2
Functions
export_cfg_as_yaml(cfg, export_path='config.yaml')

Exports Detectron2 config object in yaml format so that it can be used later.

Parameters:

Name Type Description Default
cfg CfgNode

Detectron2 config object.

required
export_path str

Path to export the Detectron2 config.

'config.yaml'

Related Detectron2 doc: https://detectron2.readthedocs.io/en/stable/modules/config.html#detectron2.config.CfgNode.dump

Source code in sahi/utils/detectron2.py
def export_cfg_as_yaml(cfg, export_path: str = "config.yaml"):
    """Write a Detectron2 config object to disk as yaml for later reuse.

    Args:
        cfg (detectron2.config.CfgNode): Detectron2 config object.
        export_path (str): Destination path for the yaml dump.
    Related Detectron2 doc: https://detectron2.readthedocs.io/en/stable/modules/config.html#detectron2.config.CfgNode.dump
    """
    target = Path(export_path)
    # make sure the destination folder exists before writing
    target.parent.mkdir(exist_ok=True, parents=True)
    target.write_text(cfg.dump())
file
Functions
download_from_url(from_url, to_path)

Downloads a file from the given URL and saves it to the specified path.

Parameters:

Name Type Description Default
from_url str

The URL of the file to download.

required
to_path str

The path where the downloaded file should be saved.

required

Returns:

Type Description

None

Source code in sahi/utils/file.py
def download_from_url(from_url: str, to_path: str):
    """Fetch a file from the given URL and store it at the given local path.

    The parent directory of ``to_path`` is created when missing, and the
    download is skipped entirely when a file already exists at ``to_path``.

    Args:
        from_url (str): The URL of the file to download.
        to_path (str): The path where the downloaded file should be saved.

    Returns:
        None
    """
    destination = Path(to_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # do not re-download an already-present file
    if not os.path.exists(to_path):
        urllib.request.urlretrieve(from_url, to_path)
get_base_filename(path)

Takes a file path, returns (base_filename_with_extension, base_filename_without_extension)

Source code in sahi/utils/file.py
def get_base_filename(path: str):
    """Split a file path into (filename_with_extension, filename_without_extension)."""
    filename = ntpath.basename(path)
    stem, _extension = os.path.splitext(filename)
    return filename, stem
get_file_extension(path)

Get the file extension from a given file path.

Parameters:

Name Type Description Default
path str

The file path.

required

Returns:

Name Type Description
str

The file extension.

Source code in sahi/utils/file.py
def get_file_extension(path: str):
    """Return the extension (including the leading dot) of the given file path.

    Args:
        path (str): The file path.

    Returns:
        str: The file extension, e.g. ".json"; empty string when there is none.
    """
    return os.path.splitext(path)[1]
import_model_class(model_type, class_name)

Imports a predefined detection class by class name.

Parameters:

Name Type Description Default
model_type

str "yolov5", "detectron2", "mmdet", "huggingface" etc

required
class_name

str Name of the detection model class (example: "MmdetDetectionModel")

required

Returns: class_: class with given path

Source code in sahi/utils/file.py
def import_model_class(model_type, class_name):
    """Dynamically resolve a detection model class from the ``sahi.models`` package.

    Args:
        model_type: str
            Framework name such as "yolov5", "detectron2", "mmdet", "huggingface".
        class_name: str
            Name of the detection model class (example: "MmdetDetectionModel")
    Returns:
        class_: class with given path
    """
    module_path = f"sahi.models.{model_type}"
    module = __import__(module_path, fromlist=[class_name])
    return getattr(module, class_name)
increment_path(path, exist_ok=True, sep='')

Increment path, i.e. runs/exp --> runs/exp{sep}0, runs/exp{sep}1 etc.

Parameters:

Name Type Description Default
path str | Path

str The base path to increment.

required
exist_ok bool

bool If True, return the path as is if it already exists. If False, increment the path.

True
sep str

str The separator to use between the base path and the increment number.

''

Returns:

Name Type Description
str str

The incremented path.

Example

increment_path("runs/exp", exist_ok=False, sep="_") returns 'runs/exp_2' when runs/exp already exists; once runs/exp_2 also exists, it returns 'runs/exp_3', and so on.

Source code in sahi/utils/file.py
def increment_path(path: str | Path, exist_ok: bool = True, sep: str = "") -> str:
    """Increment path, i.e. runs/exp --> runs/exp{sep}0, runs/exp{sep}1 etc.

    Args:
        path: str
            The base path to increment.
        exist_ok: bool
            If True, return the path as is if it already exists. If False, increment the path.
        sep: str
            The separator to use between the base path and the increment number.

    Returns:
        str: The incremented path.

    Example:
        >>> increment_path("runs/exp", sep="_")
        'runs/exp_0'
        >>> increment_path("runs/exp_0", sep="_")
        'runs/exp_1'
    """
    path = Path(path)  # os-agnostic
    if (path.exists() and exist_ok) or (not path.exists()):
        return str(path)
    else:
        dirs = glob.glob(f"{path}{sep}*")  # similar paths
        matches = [re.search(rf"%s{sep}(\d+)" % path.stem, d) for d in dirs]
        indices = [int(m.groups()[0]) for m in matches if m]  # indices
        n = max(indices) + 1 if indices else 2  # increment number
        return f"{path}{sep}{n}"  # update path
is_colab()

Check if the current environment is a Google Colab instance.

Returns:

Name Type Description
bool

True if the environment is a Google Colab instance, False otherwise.

Source code in sahi/utils/file.py
def is_colab():
    """Detect whether the code is running inside a Google Colab runtime.

    Returns:
        bool: True if the environment is a Google Colab instance, False otherwise.
    """
    import sys

    # Colab injects the google.colab package into every runtime
    loaded_modules = sys.modules
    return "google.colab" in loaded_modules
list_files(directory, contains=['.json'], verbose=1)

Walk given directory and return a list of file path with desired extension.

Parameters:

Name Type Description Default
directory str

str "data/coco/"

required
contains list

list A list of strings to check if the target file contains them, example: ["coco.png", ".jpg", "jpeg"]

['.json']
verbose int

int 0: no print 1: print number of files

1

Returns:

Name Type Description
filepath_list list[str]

list List of file paths

Source code in sahi/utils/file.py
def list_files(
    directory: str,
    contains: list = [".json"],
    verbose: int = 1,
) -> list[str]:
    """Walk given directory and return a list of file path with desired extension.

    Args:
        directory: str
            "data/coco/"
        contains: list
            A list of strings to check if the target file contains them, example: ["coco.png", ".jpg", "jpeg"]
        verbose: int
            0: no print
            1: print number of files

    Returns:
        filepath_list : list
            List of file paths
    """
    # define verboseprint
    verboseprint = print if verbose else lambda *a, **k: None

    filepath_list: list[str] = []

    for file in os.listdir(directory):
        # check if filename contains any of the terms given in contains list
        if any(strtocheck in file.lower() for strtocheck in contains):
            filepath = str(os.path.join(directory, file))
            filepath_list.append(filepath)

    number_of_files = len(filepath_list)
    folder_name = Path(directory).name

    verboseprint(f"There are {number_of_files!s} listed files in folder: {folder_name}/")

    return filepath_list
list_files_recursively(directory, contains=['.json'], verbose=True)

Walk given directory recursively and return a list of file path with desired extension.

Parameters:

Name Type Description Default
directory

str "data/coco/"

required
contains

list A list of strings to check if the target file contains them, example: ["coco.png", ".jpg", "jpeg"]

required
verbose

bool If true, prints some results

required

Returns:

Name Type Description
relative_filepath_list list

list List of file paths relative to given directory

abs_filepath_list list

list List of absolute file paths

Source code in sahi/utils/file.py
def list_files_recursively(directory: str, contains: list = [".json"], verbose: bool = True) -> tuple[list, list]:
    """Walk given directory recursively and return a list of file path with desired extension.

    Args:
        directory : str
            "data/coco/"
        contains : list
            A list of strings to check if the target file contains them, example: ["coco.png", ".jpg", "jpeg"]
        verbose : bool
            If true, prints some results

    Returns:
        relative_filepath_list : list
            List of file paths relative to given directory
        abs_filepath_list : list
            List of absolute file paths
    """

    # define verboseprint
    verboseprint = print if verbose else lambda *a, **k: None

    # walk directories recursively and find json files
    abs_filepath_list = []
    relative_filepath_list = []

    # r=root, d=directories, f=files
    for r, _, f in os.walk(directory):
        for file in f:
            # check if filename contains any of the terms given in contains list
            if any(strtocheck in file.lower() for strtocheck in contains):
                abs_filepath = os.path.join(r, file)
                abs_filepath_list.append(abs_filepath)
                relative_filepath = abs_filepath.split(directory)[-1]
                relative_filepath_list.append(relative_filepath)

    number_of_files = len(relative_filepath_list)
    folder_name = directory.split(os.sep)[-1]

    verboseprint(f"There are {number_of_files} listed files in folder {folder_name}.")

    return relative_filepath_list, abs_filepath_list
load_json(load_path, encoding='utf-8')

Loads json formatted data (given as "data") from load_path Encoding type can be specified with 'encoding' argument.

Parameters:

Name Type Description Default
load_path str

str "dirname/coco.json"

required
encoding str

str Encoding type, default is 'utf-8'

'utf-8'
Example inputs

load_path: "dirname/coco.json"

Source code in sahi/utils/file.py
def load_json(load_path: str, encoding: str = "utf-8"):
    """Read a json file from disk and return the parsed data.

    Args:
        load_path: str
            "dirname/coco.json"
        encoding: str
            Encoding type, default is 'utf-8'

    Example inputs:
        load_path: "dirname/coco.json"
    """
    with open(load_path, encoding=encoding) as json_file:
        return json.load(json_file)
load_pickle(load_path)

Loads pickle formatted data (given as "data") from load_path

Parameters:

Name Type Description Default
load_path

str "dirname/coco.pickle"

required
Example inputs

load_path: "dirname/coco.pickle"

Source code in sahi/utils/file.py
def load_pickle(load_path):
    """Read pickle-serialized data from disk and return the deserialized object.

    Args:
        load_path: str
            "dirname/coco.pickle"

    Example inputs:
        load_path: "dirname/coco.pickle"
    """
    with open(load_path, "rb") as pickle_file:
        return pickle.load(pickle_file)
save_json(data, save_path, indent=None)

Saves json formatted data (given as "data") as save_path

Parameters:

Name Type Description Default
data

dict Data to be saved as json

required
save_path

str "dirname/coco.json"

required
indent int | None

int or None Indentation level for pretty-printing the JSON data. If None, the most compact representation will be used. If an integer is provided, it specifies the number of spaces to use for indentation. Example: indent=4 will format the JSON data with an indentation of 4 spaces per level.

None
Example inputs

data: {"image_id": 5} save_path: "dirname/coco.json" indent: Train json files with indent=None, val json files with indent=4

Source code in sahi/utils/file.py
def save_json(data, save_path, indent: int | None = None):
    """Serialize ``data`` to disk as json at ``save_path``.

    Args:
        data: dict
            Data to be saved as json
        save_path: str
            "dirname/coco.json"
        indent: int or None
            Pretty-printing indentation width. None produces the most compact
            representation; an integer indents each nesting level by that many
            spaces (e.g. indent=4).

    Example inputs:
        data: {"image_id": 5}
        save_path: "dirname/coco.json"
        indent: Train json files with indent=None, val json files with indent=4
    """
    # make sure the destination folder exists
    destination = Path(save_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # NumpyEncoder handles numpy scalar/array values transparently
    with open(save_path, "w", encoding="utf-8") as outfile:
        json.dump(data, outfile, separators=(",", ":"), cls=NumpyEncoder, indent=indent)
save_pickle(data, save_path)

Saves pickle formatted data (given as "data") as save_path

Parameters:

Name Type Description Default
data

dict Data to be saved as pickle

required
save_path

str "dirname/coco.pickle"

required
Example inputs

data: {"image_id": 5} save_path: "dirname/coco.pickle"

Source code in sahi/utils/file.py
def save_pickle(data, save_path):
    """Serialize ``data`` to disk as a pickle file at ``save_path``.

    Args:
        data: dict
            Data to be saved as pickle
        save_path: str
            "dirname/coco.pickle"

    Example inputs:
        data: {"image_id": 5}
        save_path: "dirname/coco.pickle"
    """
    # make sure the destination folder exists
    destination = Path(save_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # serialize with pickle
    with open(save_path, "wb") as outfile:
        pickle.dump(data, outfile)
unzip(file_path, dest_dir)

Unzips compressed .zip file.

Example inputs

file_path: 'data/01_alb_id.zip' dest_dir: 'data/'

Source code in sahi/utils/file.py
def unzip(file_path: str, dest_dir: str):
    """Extract every member of a compressed .zip archive into ``dest_dir``.

    Example inputs:
        file_path: 'data/01_alb_id.zip'
        dest_dir: 'data/'
    """
    # extractall creates dest_dir (and member subdirectories) as needed
    with zipfile.ZipFile(file_path) as archive:
        archive.extractall(dest_dir)
import_utils
Functions
check_package_minimum_version(package_name, minimum_version, verbose=False)

Return False if the installed module version is lower than the given minimum version; otherwise return True.

Source code in sahi/utils/import_utils.py
def check_package_minimum_version(package_name: str, minimum_version: str, verbose=False):
    """Return False when the installed package is older than ``minimum_version``.

    Returns True when the package is not installed, when its version cannot
    be determined (a warning is logged), or when the installed version
    satisfies the minimum.
    """
    from packaging import version

    package_available, installed_version = get_package_info(package_name, verbose=verbose)
    if not package_available:
        return True
    if installed_version == "unknown":
        logger.warning(
            f"Could not determine version of {package_name}. Assuming version {minimum_version} is compatible."
        )
        return True
    return version.parse(installed_version) >= version.parse(minimum_version)
check_requirements(package_names)

Raise error if module is not installed.

Source code in sahi/utils/import_utils.py
def check_requirements(package_names):
    """Raise ImportError if any of ``package_names`` cannot be found.

    NOTE(review): the trailing ``yield`` makes this a generator function —
    the check only runs once the returned generator is advanced; presumably
    intended for context-manager style usage — confirm against callers.
    """
    missing = [name for name in package_names if importlib.util.find_spec(name) is None]
    if missing:
        raise ImportError(f"The following packages are required to use this module: {missing}")
    yield
ensure_package_minimum_version(package_name, minimum_version, verbose=False)

Raise error if module version is not compatible.

Source code in sahi/utils/import_utils.py
def ensure_package_minimum_version(package_name: str, minimum_version: str, verbose=False):
    """Raise ImportError when the installed package is older than ``minimum_version``.

    Missing packages and undeterminable versions are tolerated (the latter
    logs a warning).

    NOTE(review): the trailing ``yield`` makes this a generator function —
    the check only runs once the returned generator is advanced — confirm
    against callers.
    """
    from packaging import version

    package_available, installed_version = get_package_info(package_name, verbose=verbose)
    if package_available:
        if installed_version == "unknown":
            logger.warning(
                f"Could not determine version of {package_name}. Assuming version {minimum_version} is compatible."
            )
        elif version.parse(installed_version) < version.parse(minimum_version):
            raise ImportError(
                f"Please upgrade {package_name} to version {minimum_version} or higher to use this module."
            )
    yield
get_package_info(package_name, verbose=True)

Returns a tuple of the package availability (bool) and the package version as a string ("N/A" when the package is not installed).

Source code in sahi/utils/import_utils.py
def get_package_info(package_name: str, verbose: bool = True):
    """Return ``(is_available, version_string)`` for ``package_name``.

    ``version_string`` is "N/A" when the package is not installed and
    "unknown" when it is installed but exposes no discoverable version.
    """
    package_available = is_available(package_name)

    if not package_available:
        return package_available, "N/A"

    # prefer installation metadata; fall back to the module's __version__
    try:
        import importlib.metadata as _importlib_metadata

        pkg_version = _importlib_metadata.version(package_name)
    except (ModuleNotFoundError, AttributeError):
        try:
            pkg_version = importlib.import_module(package_name).__version__
        except AttributeError:
            pkg_version = "unknown"
    if verbose:
        logger.pkg_info(f"{package_name} version {pkg_version} is available.")

    return package_available, pkg_version
mmdet
Functions
download_mmdet_config(model_name='cascade_rcnn', config_file_name='cascade_mask_rcnn_r50_fpn_1x_coco.py', verbose=True)

Merges config files starting from given main config file name. Saves as single file.

Parameters:

Name Type Description Default
model_name str

mmdet model name. check https://github.com/open-mmlab/mmdetection/tree/master/configs.

'cascade_rcnn'
config_file_name str

mmdet config file name.

'cascade_mask_rcnn_r50_fpn_1x_coco.py'
verbose bool

if True, print save path.

True

Returns:

Type Description
str

(str) abs path of the downloaded config file.

Source code in sahi/utils/mmdet.py
def download_mmdet_config(
    model_name: str = "cascade_rcnn",
    config_file_name: str = "cascade_mask_rcnn_r50_fpn_1x_coco.py",
    verbose: bool = True,
) -> str:
    """Merges config files starting from given main config file name. Saves as single file.

    Downloads the main config (and up to two levels of its ``_base_``
    dependencies) for the locally installed mmdet version from GitHub,
    then dumps the merged result as a single file under
    ``mmdet_configs/<version>/<model_name>/``. Cached: if the final file
    already exists, no download happens.

    Args:
        model_name (str): mmdet model name. check https://github.com/open-mmlab/mmdetection/tree/master/configs.
        config_file_name (str): mmdet config file name.
        verbose (bool): if True, print save path.

    Returns:
        (str) abs path of the downloaded config file.
    """

    # get mmdet version (configs are fetched from the matching release tag)
    from mmdet import __version__

    mmdet_ver = "v" + __version__

    # set main config url
    base_config_url = (
        "https://raw.githubusercontent.com/open-mmlab/mmdetection/" + mmdet_ver + "/configs/" + model_name + "/"
    )
    main_config_url = base_config_url + config_file_name

    # set final config dirs
    configs_dir = Path("mmdet_configs") / mmdet_ver
    model_config_dir = configs_dir / model_name

    # create final config dir
    configs_dir.mkdir(parents=True, exist_ok=True)
    model_config_dir.mkdir(parents=True, exist_ok=True)

    # get final config file name
    filename = Path(main_config_url).name

    # set final config file path
    final_config_path = str(model_config_dir / filename)

    # only download/merge when the merged config is not already cached
    if not Path(final_config_path).exists():
        # set config dirs
        temp_configs_dir = Path("temp_mmdet_configs")
        main_config_dir = temp_configs_dir / model_name

        # create config dirs
        temp_configs_dir.mkdir(parents=True, exist_ok=True)
        main_config_dir.mkdir(parents=True, exist_ok=True)

        # get main config file name
        filename = Path(main_config_url).name

        # set main config file path
        main_config_path = str(main_config_dir / filename)

        # download main config file
        urllib.request.urlretrieve(
            main_config_url,
            main_config_path,
        )

        # read main config file by importing it as a temporary Python module
        # (mmdet configs are executable .py files); sys.path is restored after
        sys.path.insert(0, str(main_config_dir))
        temp_module_name = path.splitext(filename)[0]
        mod = import_module(temp_module_name)
        sys.path.pop(0)
        config_dict = {name: value for name, value in mod.__dict__.items() if not name.startswith("__")}

        # handle when config_dict["_base_"] is string
        if not isinstance(config_dict["_base_"], list):
            config_dict["_base_"] = [config_dict["_base_"]]

        # iterate over secondary config files referenced via _base_
        for secondary_config_file_path in config_dict["_base_"]:
            # set config url
            config_url = base_config_url + secondary_config_file_path
            config_path = main_config_dir / secondary_config_file_path

            # create secondary config dir
            config_path.parent.mkdir(parents=True, exist_ok=True)

            # download secondary config files
            urllib.request.urlretrieve(
                config_url,
                str(config_path),
            )

            # read secondary config file (same temporary-module trick as above)
            secondary_config_dir = config_path.parent
            sys.path.insert(0, str(secondary_config_dir))
            temp_module_name = path.splitext(Path(config_path).name)[0]
            mod = import_module(temp_module_name)
            sys.path.pop(0)
            secondary_config_dict = {name: value for name, value in mod.__dict__.items() if not name.startswith("__")}

            # go deeper if there are more steps (third level is downloaded but
            # not parsed further; deeper nesting is not followed)
            if secondary_config_dict.get("_base_") is not None:
                # handle when config_dict["_base_"] is string
                if not isinstance(secondary_config_dict["_base_"], list):
                    secondary_config_dict["_base_"] = [secondary_config_dict["_base_"]]

                # iterate over third config files
                for third_config_file_path in secondary_config_dict["_base_"]:
                    # set config url
                    config_url = base_config_url + third_config_file_path
                    config_path = main_config_dir / third_config_file_path

                    # create secondary config dir
                    config_path.parent.mkdir(parents=True, exist_ok=True)
                    # download secondary config files
                    urllib.request.urlretrieve(
                        config_url,
                        str(config_path),
                    )

        from mmengine import Config
        # dump final config as single file (mmengine resolves _base_ inheritance)

        config = Config.fromfile(main_config_path)
        config.dump(final_config_path)

        if verbose:
            print(f"mmdet config file has been downloaded to {path.abspath(final_config_path)}")

        # remove temp config dir
        shutil.rmtree(temp_configs_dir)

    return path.abspath(final_config_path)
shapely
Classes
ShapelyAnnotation

Creates ShapelyAnnotation (as shapely MultiPolygon).

Can convert this instance annotation to various formats.

Source code in sahi/utils/shapely.py
class ShapelyAnnotation:
    """Creates ShapelyAnnotation (as shapely MultiPolygon).

    Can convert this instance annotation to various formats.

    When ``slice_bbox`` is set, every exported coordinate is shifted by
    (-slice_bbox[0], -slice_bbox[1]) so results become slice-relative.
    """

    @classmethod
    def from_coco_segmentation(cls, segmentation, slice_bbox=None):
        """Init ShapelyAnnotation from coco segmentation.

        segmentation : List[List]
            [[1, 1, 325, 125, 250, 200, 5, 200]]
        slice_bbox (List[int]): [xmin, ymin, width, height]
            Should have the same format as the output of the get_bbox_from_shapely function.
            Is used to calculate sliced coco coordinates.
        """
        shapely_multipolygon = get_shapely_multipolygon(segmentation)
        return cls(multipolygon=shapely_multipolygon, slice_bbox=slice_bbox)

    @classmethod
    def from_coco_bbox(cls, bbox: list[int], slice_bbox: list[int] | None = None):
        """Init ShapelyAnnotation from coco bbox.

        bbox (List[int]): [xmin, ymin, width, height]
        slice_bbox (List[int]): [x_min, y_min, x_max, y_max]
            Is used to calculate sliced coco coordinates.
        """
        shapely_polygon = get_shapely_box(x=bbox[0], y=bbox[1], width=bbox[2], height=bbox[3])
        shapely_multipolygon = MultiPolygon([shapely_polygon])
        return cls(multipolygon=shapely_multipolygon, slice_bbox=slice_bbox)

    def __init__(self, multipolygon: MultiPolygon, slice_bbox=None):
        # assignment goes through the multipolygon setter below, which also
        # caches the total area
        self.multipolygon = multipolygon
        self.slice_bbox = slice_bbox

    @property
    def multipolygon(self):
        return self.__multipolygon

    @property
    def area(self):
        # cached by the multipolygon setter; truncated to int
        return int(self.__area)

    @multipolygon.setter
    def multipolygon(self, multipolygon: MultiPolygon):
        self.__multipolygon = multipolygon
        # calculate areas of all polygons
        area = 0
        for shapely_polygon in multipolygon.geoms:
            area += shapely_polygon.area
        # set instance area
        self.__area = area

    def to_list(self):
        """Return one list of (x, y) tuples per polygon.

        [
            [(x1, y1), (x2, y2), (x3, y3), ...],
            [(x1, y1), (x2, y2), (x3, y3), ...],
            ...
        ]

        Zero-area polygons contribute an empty list.
        """
        list_of_list_of_points: list = []
        for shapely_polygon in self.multipolygon.geoms:
            # create list_of_points for selected shapely_polygon
            if shapely_polygon.area != 0:
                x_coords = shapely_polygon.exterior.coords.xy[0]
                y_coords = shapely_polygon.exterior.coords.xy[1]
                # fix coord by slice_bbox
                if self.slice_bbox:
                    minx = self.slice_bbox[0]
                    miny = self.slice_bbox[1]
                    x_coords = [x_coord - minx for x_coord in x_coords]
                    y_coords = [y_coord - miny for y_coord in y_coords]
                list_of_points = list(zip(x_coords, y_coords))
            else:
                list_of_points = []
            # append list_of_points to list_of_list_of_points
            list_of_list_of_points.append(list_of_points)
        # return result
        return list_of_list_of_points

    def to_coco_segmentation(self):
        """Return coco-style segmentation: one flat [x1, y1, x2, y2, ...] list per polygon.

        [
            [x1, y1, x2, y2, x3, y3, ...],
            [x1, y1, x2, y2, x3, y3, ...],
            ...
        ]

        Coordinates are truncated to int; zero-area polygons contribute an
        empty list.
        """
        coco_segmentation: list = []
        for shapely_polygon in self.multipolygon.geoms:
            # create list_of_points for selected shapely_polygon
            if shapely_polygon.area != 0:
                x_coords = shapely_polygon.exterior.coords.xy[0]
                y_coords = shapely_polygon.exterior.coords.xy[1]
                # fix coord by slice_bbox
                if self.slice_bbox:
                    minx = self.slice_bbox[0]
                    miny = self.slice_bbox[1]
                    x_coords = [x_coord - minx for x_coord in x_coords]
                    y_coords = [y_coord - miny for y_coord in y_coords]
                # convert intersection to coco style segmentation annotation
                # (interleave: even indices x, odd indices y)
                coco_polygon: list[None | int] = [None] * (len(x_coords) * 2)
                coco_polygon[0::2] = [int(coord) for coord in x_coords]
                coco_polygon[1::2] = [int(coord) for coord in y_coords]
            else:
                coco_polygon = []
            # remove if first and last points are duplicate
            # (shapely exterior rings are closed; for an empty list this
            # compares [] == [] and the del is a no-op)
            if coco_polygon[:2] == coco_polygon[-2:]:
                del coco_polygon[-2:]
            # append coco_polygon to coco_segmentation
            # NOTE(review): this comprehension only shallow-copies the list;
            # it does not transform values
            coco_polygon = [point for point in coco_polygon] if coco_polygon else coco_polygon
            coco_segmentation.append(coco_polygon)
        return coco_segmentation

    def to_opencv_contours(self):
        """Return OpenCV-style contours, one per polygon, each point wrapped as [[x, y]].

        [ [[[1, 1]], [[325, 125]], [[250, 200]], [[5, 200]]], [[[1, 1]], [[325, 125]], [[250, 200]], [[5, 200]]] ]
        """
        opencv_contours: list = []
        for shapely_polygon in self.multipolygon.geoms:
            # create opencv_contour for selected shapely_polygon
            if shapely_polygon.area != 0:
                x_coords = shapely_polygon.exterior.coords.xy[0]
                y_coords = shapely_polygon.exterior.coords.xy[1]
                # fix coord by slice_bbox
                if self.slice_bbox:
                    minx = self.slice_bbox[0]
                    miny = self.slice_bbox[1]
                    x_coords = [x_coord - minx for x_coord in x_coords]
                    y_coords = [y_coord - miny for y_coord in y_coords]
                opencv_contour = [[[int(x_coords[ind]), int(y_coords[ind])]] for ind in range(len(x_coords))]
            else:
                opencv_contour: list = []
            # append opencv_contour to opencv_contours
            opencv_contours.append(opencv_contour)
        # return result
        return opencv_contours

    def to_xywh(self):
        """Return bounding box as [xmin, ymin, width, height]; empty list for zero-area geometry."""
        if self.multipolygon.area != 0:
            coco_bbox, _ = get_bbox_from_shapely(self.multipolygon)
            # fix coord by slice box
            if self.slice_bbox:
                minx = self.slice_bbox[0]
                miny = self.slice_bbox[1]
                coco_bbox[0] = coco_bbox[0] - minx
                coco_bbox[1] = coco_bbox[1] - miny
        else:
            coco_bbox: list = []
        return coco_bbox

    def to_coco_bbox(self):
        """Alias of to_xywh(): [xmin, ymin, width, height]"""
        return self.to_xywh()

    def to_xyxy(self):
        """Return bounding box as [xmin, ymin, xmax, ymax]; empty list for zero-area geometry."""
        if self.multipolygon.area != 0:
            _, voc_bbox = get_bbox_from_shapely(self.multipolygon)
            # fix coord by slice box
            if self.slice_bbox:
                minx = self.slice_bbox[0]
                miny = self.slice_bbox[1]
                voc_bbox[0] = voc_bbox[0] - minx
                voc_bbox[2] = voc_bbox[2] - minx
                voc_bbox[1] = voc_bbox[1] - miny
                voc_bbox[3] = voc_bbox[3] - miny
        else:
            voc_bbox = []
        return voc_bbox

    def to_voc_bbox(self):
        """Alias of to_xyxy(): [xmin, ymin, xmax, ymax]"""
        return self.to_xyxy()

    def get_convex_hull_shapely_annotation(self):
        """Return a new ShapelyAnnotation wrapping the convex hull of this geometry.

        NOTE: the result does not inherit slice_bbox.
        """
        shapely_multipolygon = MultiPolygon([self.multipolygon.convex_hull])
        shapely_annotation = ShapelyAnnotation(shapely_multipolygon)
        return shapely_annotation

    def get_simplified_shapely_annotation(self, tolerance=1):
        """Return a new ShapelyAnnotation with the geometry simplified by ``tolerance``.

        NOTE: the result does not inherit slice_bbox.
        """
        shapely_multipolygon = MultiPolygon([self.multipolygon.simplify(tolerance)])
        shapely_annotation = ShapelyAnnotation(shapely_multipolygon)
        return shapely_annotation

    def get_buffered_shapely_annotation(
        self,
        distance=3,
        resolution=16,
        quadsegs=None,
        cap_style=CAP_STYLE.round,
        join_style=JOIN_STYLE.round,
        mitre_limit=5.0,
        single_sided=False,
    ):
        """Approximates the present polygon to have a valid polygon shape.

        For more, check: https://shapely.readthedocs.io/en/stable/manual.html#object.buffer

        NOTE: the result does not inherit slice_bbox.
        """
        buffered_polygon = self.multipolygon.buffer(
            distance=distance,
            resolution=resolution,
            quadsegs=quadsegs,
            cap_style=cap_style,
            join_style=join_style,
            mitre_limit=mitre_limit,
            single_sided=single_sided,
        )
        shapely_annotation = ShapelyAnnotation(MultiPolygon([buffered_polygon]))
        return shapely_annotation

    def get_intersection(self, polygon: Polygon):
        """Accepts shapely polygon object and returns the intersection in ShapelyAnnotation format."""
        # convert intersection polygon to list of tuples
        intersection = self.multipolygon.intersection(polygon)
        # if polygon is box then set slice_box property
        # (heuristic: a closed rectangular ring has 5 coords with pairwise
        # equal x values in this order — presumably matching shapely's box()
        # vertex ordering; verify if boxes are built elsewhere)
        if (
            len(polygon.exterior.xy[0]) == 5
            and polygon.exterior.xy[0][0] == polygon.exterior.xy[0][1]
            and polygon.exterior.xy[0][2] == polygon.exterior.xy[0][3]
        ):
            coco_bbox, _ = get_bbox_from_shapely(polygon)
            slice_bbox = coco_bbox
        else:
            slice_bbox = None
        # convert intersection to multipolygon
        if intersection.geom_type == "Polygon":
            intersection_multipolygon = MultiPolygon([intersection])
        elif intersection.geom_type == "MultiPolygon":
            intersection_multipolygon = intersection
        else:
            # points/lines/empty results collapse to an empty multipolygon
            intersection_multipolygon = MultiPolygon([])
        # create shapely annotation from intersection multipolygon
        intersection_shapely_annotation = ShapelyAnnotation(intersection_multipolygon, slice_bbox)

        return intersection_shapely_annotation
Functions
from_coco_bbox(bbox, slice_bbox=None) classmethod

Init ShapelyAnnotation from coco bbox.

bbox (List[int]): [xmin, ymin, width, height] slice_bbox (List[int]): [x_min, y_min, x_max, y_max] Is used to calculate sliced coco coordinates.

Source code in sahi/utils/shapely.py
@classmethod
def from_coco_bbox(cls, bbox: list[int], slice_bbox: list[int] | None = None):
    """Init ShapelyAnnotation from coco bbox.

    bbox (List[int]): [xmin, ymin, width, height] slice_bbox (List[int]): [x_min, y_min, x_max, y_max] Is used
    to calculate sliced coco coordinates.
    """
    shapely_polygon = get_shapely_box(x=bbox[0], y=bbox[1], width=bbox[2], height=bbox[3])
    shapely_multipolygon = MultiPolygon([shapely_polygon])
    return cls(multipolygon=shapely_multipolygon, slice_bbox=slice_bbox)
from_coco_segmentation(segmentation, slice_bbox=None) classmethod

Init ShapelyAnnotation from coco segmentation.

List[List]

[[1, 1, 325, 125, 250, 200, 5, 200]]

slice_bbox (List[int]): [xmin, ymin, width, height] Should have the same format as the output of the get_bbox_from_shapely function. Is used to calculate sliced coco coordinates.

Source code in sahi/utils/shapely.py
@classmethod
def from_coco_segmentation(cls, segmentation, slice_bbox=None):
    """Init ShapelyAnnotation from coco segmentation.

    segmentation : List[List]
        [[1, 1, 325, 125, 250, 200, 5, 200]]
    slice_bbox (List[int]): [xmin, ymin, width, height]
        Should have the same format as the output of the get_bbox_from_shapely function.
        Is used to calculate sliced coco coordinates.
    """
    shapely_multipolygon = get_shapely_multipolygon(segmentation)
    return cls(multipolygon=shapely_multipolygon, slice_bbox=slice_bbox)
get_buffered_shapely_annotation(distance=3, resolution=16, quadsegs=None, cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round, mitre_limit=5.0, single_sided=False)

Approximates the present polygon to have a valid polygon shape.

For more, check: https://shapely.readthedocs.io/en/stable/manual.html#object.buffer

Source code in sahi/utils/shapely.py
def get_buffered_shapely_annotation(
    self,
    distance=3,
    resolution=16,
    quadsegs=None,
    cap_style=CAP_STYLE.round,
    join_style=JOIN_STYLE.round,
    mitre_limit=5.0,
    single_sided=False,
):
    """Approximates the present polygon to have a valid polygon shape.

    For more, check: https://shapely.readthedocs.io/en/stable/manual.html#object.buffer
    """
    buffered_polygon = self.multipolygon.buffer(
        distance=distance,
        resolution=resolution,
        quadsegs=quadsegs,
        cap_style=cap_style,
        join_style=join_style,
        mitre_limit=mitre_limit,
        single_sided=single_sided,
    )
    shapely_annotation = ShapelyAnnotation(MultiPolygon([buffered_polygon]))
    return shapely_annotation
get_intersection(polygon)

Accepts shapely polygon object and returns the intersection in ShapelyAnnotation format.

Source code in sahi/utils/shapely.py
def get_intersection(self, polygon: Polygon):
    """Accepts shapely polygon object and returns the intersection in ShapelyAnnotation format."""
    # convert intersection polygon to list of tuples
    intersection = self.multipolygon.intersection(polygon)
    # if polygon is box then set slice_box property
    if (
        len(polygon.exterior.xy[0]) == 5
        and polygon.exterior.xy[0][0] == polygon.exterior.xy[0][1]
        and polygon.exterior.xy[0][2] == polygon.exterior.xy[0][3]
    ):
        coco_bbox, _ = get_bbox_from_shapely(polygon)
        slice_bbox = coco_bbox
    else:
        slice_bbox = None
    # convert intersection to multipolygon
    if intersection.geom_type == "Polygon":
        intersection_multipolygon = MultiPolygon([intersection])
    elif intersection.geom_type == "MultiPolygon":
        intersection_multipolygon = intersection
    else:
        intersection_multipolygon = MultiPolygon([])
    # create shapely annotation from intersection multipolygon
    intersection_shapely_annotation = ShapelyAnnotation(intersection_multipolygon, slice_bbox)

    return intersection_shapely_annotation
to_coco_bbox()

[xmin, ymin, width, height]

Source code in sahi/utils/shapely.py
def to_coco_bbox(self):
    """[xmin, ymin, width, height]"""
    return self.to_xywh()
to_coco_segmentation()

[ [x1, y1, x2, y2, x3, y3, ...], [x1, y1, x2, y2, x3, y3, ...], ... ]

Source code in sahi/utils/shapely.py
def to_coco_segmentation(self):
    """
    [
        [x1, y1, x2, y2, x3, y3, ...],
        [x1, y1, x2, y2, x3, y3, ...],
        ...
    ]
    """
    coco_segmentation: list = []
    for shapely_polygon in self.multipolygon.geoms:
        # create list_of_points for selected shapely_polygon
        if shapely_polygon.area != 0:
            x_coords = shapely_polygon.exterior.coords.xy[0]
            y_coords = shapely_polygon.exterior.coords.xy[1]
            # fix coord by slice_bbox
            if self.slice_bbox:
                minx = self.slice_bbox[0]
                miny = self.slice_bbox[1]
                x_coords = [x_coord - minx for x_coord in x_coords]
                y_coords = [y_coord - miny for y_coord in y_coords]
            # convert intersection to coco style segmentation annotation
            coco_polygon: list[None | int] = [None] * (len(x_coords) * 2)
            coco_polygon[0::2] = [int(coord) for coord in x_coords]
            coco_polygon[1::2] = [int(coord) for coord in y_coords]
        else:
            coco_polygon = []
        # remove if first and last points are duplicate
        if coco_polygon[:2] == coco_polygon[-2:]:
            del coco_polygon[-2:]
        # append coco_polygon to coco_segmentation
        coco_polygon = [point for point in coco_polygon] if coco_polygon else coco_polygon
        coco_segmentation.append(coco_polygon)
    return coco_segmentation
to_list()

[ [(x1, y1), (x2, y2), (x3, y3), ...], [(x1, y1), (x2, y2), (x3, y3), ...], ... ]

Source code in sahi/utils/shapely.py
def to_list(self):
    """
    [
        [(x1, y1), (x2, y2), (x3, y3), ...],
        [(x1, y1), (x2, y2), (x3, y3), ...],
        ...
    ]
    """
    list_of_list_of_points: list = []
    for shapely_polygon in self.multipolygon.geoms:
        # create list_of_points for selected shapely_polygon
        if shapely_polygon.area != 0:
            x_coords = shapely_polygon.exterior.coords.xy[0]
            y_coords = shapely_polygon.exterior.coords.xy[1]
            # fix coord by slice_bbox
            if self.slice_bbox:
                minx = self.slice_bbox[0]
                miny = self.slice_bbox[1]
                x_coords = [x_coord - minx for x_coord in x_coords]
                y_coords = [y_coord - miny for y_coord in y_coords]
            list_of_points = list(zip(x_coords, y_coords))
        else:
            list_of_points = []
        # append list_of_points to list_of_list_of_points
        list_of_list_of_points.append(list_of_points)
    # return result
    return list_of_list_of_points
to_opencv_contours()

[ [[[1, 1]], [[325, 125]], [[250, 200]], [[5, 200]]], [[[1, 1]], [[325, 125]], [[250, 200]], [[5, 200]]] ]

Source code in sahi/utils/shapely.py
def to_opencv_contours(self):
    """[ [[[1, 1]], [[325, 125]], [[250, 200]], [[5, 200]]], [[[1, 1]], [[325, 125]], [[250, 200]], [[5, 200]]] ]"""
    opencv_contours: list = []
    for shapely_polygon in self.multipolygon.geoms:
        # create opencv_contour for selected shapely_polygon
        if shapely_polygon.area != 0:
            x_coords = shapely_polygon.exterior.coords.xy[0]
            y_coords = shapely_polygon.exterior.coords.xy[1]
            # fix coord by slice_bbox
            if self.slice_bbox:
                minx = self.slice_bbox[0]
                miny = self.slice_bbox[1]
                x_coords = [x_coord - minx for x_coord in x_coords]
                y_coords = [y_coord - miny for y_coord in y_coords]
            opencv_contour = [[[int(x_coords[ind]), int(y_coords[ind])]] for ind in range(len(x_coords))]
        else:
            opencv_contour: list = []
        # append opencv_contour to opencv_contours
        opencv_contours.append(opencv_contour)
    # return result
    return opencv_contours
to_voc_bbox()

[xmin, ymin, xmax, ymax]

Source code in sahi/utils/shapely.py
def to_voc_bbox(self):
    """[xmin, ymin, xmax, ymax]"""
    return self.to_xyxy()
to_xywh()

[xmin, ymin, width, height]

Source code in sahi/utils/shapely.py
def to_xywh(self):
    """[xmin, ymin, width, height]"""
    if self.multipolygon.area != 0:
        coco_bbox, _ = get_bbox_from_shapely(self.multipolygon)
        # fix coord by slice box
        if self.slice_bbox:
            minx = self.slice_bbox[0]
            miny = self.slice_bbox[1]
            coco_bbox[0] = coco_bbox[0] - minx
            coco_bbox[1] = coco_bbox[1] - miny
    else:
        coco_bbox: list = []
    return coco_bbox
to_xyxy()

[xmin, ymin, xmax, ymax]

Source code in sahi/utils/shapely.py
def to_xyxy(self):
    """[xmin, ymin, xmax, ymax]"""
    if self.multipolygon.area != 0:
        _, voc_bbox = get_bbox_from_shapely(self.multipolygon)
        # fix coord by slice box
        if self.slice_bbox:
            minx = self.slice_bbox[0]
            miny = self.slice_bbox[1]
            voc_bbox[0] = voc_bbox[0] - minx
            voc_bbox[2] = voc_bbox[2] - minx
            voc_bbox[1] = voc_bbox[1] - miny
            voc_bbox[3] = voc_bbox[3] - miny
    else:
        voc_bbox = []
    return voc_bbox
Functions
get_bbox_from_shapely(shapely_object)

Accepts shapely box/poly object and returns its bounding box in coco and voc formats.

Source code in sahi/utils/shapely.py
def get_bbox_from_shapely(shapely_object):
    """Accepts shapely box/poly object and returns its bounding box in coco and voc formats.

    Returns:
        tuple: ([xmin, ymin, width, height], [xmin, ymin, xmax, ymax])
    """
    xmin, ymin, xmax, ymax = shapely_object.bounds
    # coco uses width/height, voc uses the raw corner coordinates
    return [xmin, ymin, xmax - xmin, ymax - ymin], [xmin, ymin, xmax, ymax]
get_shapely_box(x, y, width, height)

Accepts coco style bbox coords and converts it to shapely box object.

Source code in sahi/utils/shapely.py
def get_shapely_box(x: int, y: int, width: int, height: int) -> Polygon:
    """Accepts coco style bbox coords and converts it to shapely box object."""
    # box() takes (minx, miny, maxx, maxy)
    return box(x, y, x + width, y + height)
get_shapely_multipolygon(coco_segmentation)

Accepts coco style polygon coords and converts it to valid shapely multipolygon object.

Source code in sahi/utils/shapely.py
def get_shapely_multipolygon(coco_segmentation: list[list]) -> MultiPolygon:
    """Accepts coco style polygon coords and converts it to valid shapely multipolygon object.

    Args:
        coco_segmentation: list of flat coordinate lists, e.g.
            [[x1, y1, x2, y2, ...], ...]

    Returns:
        MultiPolygon: invalid input geometry is repaired via make_valid and
        reduced to its polygonal components.
    """

    def filter_polygons(geometry):
        """Filters out and returns only Polygon or MultiPolygon components of a geometry.

        If geometry is a Polygon, it converts it into a MultiPolygon. If it's a GeometryCollection, it filters to create
        a MultiPolygon from any Polygons in the collection. Returns an empty MultiPolygon if no Polygon or MultiPolygon
        components are found.

        Args:
            geometry: A shapely geometry object (Polygon, MultiPolygon, GeometryCollection, etc.)

        Returns: MultiPolygon
        """
        if isinstance(geometry, Polygon):
            return MultiPolygon([geometry])
        elif isinstance(geometry, MultiPolygon):
            return geometry
        elif isinstance(geometry, GeometryCollection):
            # flatten MultiPolygon members into individual Polygons: the
            # MultiPolygon constructor expects Polygon elements, so passing a
            # raw `.geoms` sequence as a single element (as before) would fail
            polygons = []
            for geom in geometry.geoms:
                if isinstance(geom, Polygon):
                    polygons.append(geom)
                elif isinstance(geom, MultiPolygon):
                    polygons.extend(geom.geoms)
            return MultiPolygon(polygons) if polygons else MultiPolygon()
        return MultiPolygon()

    polygon_list = []
    for coco_polygon in coco_segmentation:
        # pair up the flat [x1, y1, x2, y2, ...] coordinate list
        point_list = list(zip(coco_polygon[0::2], coco_polygon[1::2]))
        shapely_polygon = Polygon(point_list)
        polygon_list.append(shapely_polygon)
    shapely_multipolygon = MultiPolygon(polygon_list)

    if not shapely_multipolygon.is_valid:
        shapely_multipolygon = filter_polygons(make_valid(shapely_multipolygon))

    return shapely_multipolygon
torch_utils
Functions
select_device(device=None)

Selects torch device.

Parameters:

Name Type Description Default
device str | None

"cpu", "mps", "cuda", "cuda:0", "cuda:1", etc. When no device string is given, the order of preference to try is: cuda:0 > mps > cpu

None

Returns:

Type Description
device

torch.device

Inspired by https://github.com/ultralytics/yolov5/blob/6371de8879e7ad7ec5283e8b95cc6dd85d6a5e72/utils/torch_utils.py#L107

Source code in sahi/utils/torch_utils.py
def select_device(device: str | None = None) -> torch.device:
    """Select and return a torch device.

    Args:
        device: "cpu", "mps", "cuda", "cuda:0", "cuda:1", etc.
                When no device string is given, the order of preference
                to try is: cuda:0 > mps > cpu

    Returns:
        torch.device

    Inspired by https://github.com/ultralytics/yolov5/blob/6371de8879e7ad7ec5283e8b95cc6dd85d6a5e72/utils/torch_utils.py#L107
    """
    import torch

    # A bare "cuda" (or no device at all) means "first CUDA device".
    requested = "cuda:0" if device in ("cuda", None) else device
    # Normalize to a bare identifier: 'cuda:0' -> '0', stray 'none' -> ''.
    requested = str(requested).strip().lower().replace("cuda:", "").replace("none", "")

    want_cpu = requested == "cpu"
    want_mps = requested == "mps"  # Apple Metal Performance Shaders (MPS)
    if want_cpu or want_mps:
        environ["CUDA_VISIBLE_DEVICES"] = "-1"  # force torch.cuda.is_available() = False
    elif requested:  # non-cpu device requested
        # Must be set before torch.cuda.is_available() is consulted.
        environ["CUDA_VISIBLE_DEVICES"] = requested

    # A valid CUDA index is a non-negative integer with no leading zeros.
    is_cuda_index = bool(re.fullmatch(r"^(0|[1-9]\d*)$", requested))

    if not (want_cpu or want_mps) and torch.cuda.is_available() and is_cuda_index:  # prefer GPU if available
        chosen = f"cuda:{requested}" if requested else "cuda:0"
    elif want_mps and getattr(torch, "has_mps", False) and torch.backends.mps.is_available():  # then MPS
        chosen = "mps"
    else:  # revert to CPU
        chosen = "cpu"

    return torch.device(chosen)
to_float_tensor(img)

Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W).

Parameters:

img (ndarray | Image, required): PIL.Image or numpy array.
Returns: torch.tensor

Source code in sahi/utils/torch_utils.py
def to_float_tensor(img: np.ndarray | Image) -> torch.Tensor:
    """Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C
    x H x W).

    Args:
        img: PIL.Image or numpy array
    Returns:
        torch.tensor
    """
    import torch

    # PIL images are materialized as arrays; ndarrays pass through as-is.
    arr: np.ndarray = img if isinstance(img, np.ndarray) else np.array(img)
    # HWC -> CHW. The np.array(...) call copies the transposed view so the
    # in-place division below cannot mutate the caller's buffer.
    chw = np.array(arr.transpose((2, 0, 1)))
    tensor = torch.from_numpy(chw).float()
    # Rescale 0-255 pixel values into [0, 1].
    if tensor.max() > 1:
        tensor /= 255
    return tensor