Source code for daxs.filters

"""The module provides functions for filtering data."""

from __future__ import annotations

import logging

import numpy as np
import numpy.typing as npt

logger = logging.getLogger(__name__)



[docs]
def hampel(
    data: npt.NDArray[np.float64],
    window_size: int | None = None,
    threshold: float = 3.5,
    axis: int = 0,
    k: float = 1.4826,
) -> tuple[npt.NDArray[np.bool_], npt.NDArray[np.float64]]:
    """Outliers detection using the Hampel filter.

    More details about the filter can be found here:

    - https://fr.mathworks.com/help/dsp/ref/hampelfilter.html
    - https://dsp.stackexchange.com/questions/26552
    - https://stackoverflow.com/questions/22354094

    Args:
        data: Input data.
        window_size: Size of the sliding window.
        threshold: Threshold for outlier detection expressed in number of standard
          deviations. Iglewicz and Hoaglin [1]_ suggest using a value of 3.5, but
          larger values are often needed to avoid removing data points from a
          noisy signal.
        axis: Axis along which the detection is performed.
        k: Scale factor for the median absolute deviation. The default value is
          1.4826, which is the scale factor for normally distributed data.

    Returns:
        Mask identifying the outliers, and the rolling window median.

    References:
        .. [1] Boris Iglewicz and David Hoaglin (1993) "Volume 16: How to Detect
           and Handle Outliers", The ASQC Basic References in Quality Control:
           Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
    """
    if window_size is None:
        SECTIONS: int = 10
        window_size = data.shape[axis] // SECTIONS

    # Don't allow the window size to be too small or larger than the data.
    MIN_WINDOW_SIZE = 3
    window_size = max(window_size, MIN_WINDOW_SIZE)
    window_size = min(window_size, data.shape[axis])

    # Make the size of the window an odd number.
    if window_size % 2 == 0:
        window_size -= 1

    logger.debug("Hampel filter window size = %d.", window_size)

    # Pad the data along the direction used for the outlier detection.
    pad_size = np.zeros((len(data.shape), 2), dtype=int)
    pad_size[axis] = np.array((window_size // 2, window_size // 2))
    padded_data = np.pad(data, pad_size)

    # Create a sliding window view of the padded data.
    padded_data_views = np.lib.stride_tricks.sliding_window_view(
        padded_data, window_size, axis, subok=True
    )

    # Compute the median of each view. As the views increase the number of dimensions,
    # the axis argument always identifies the axis along which the median is computed.
    medians = np.median(padded_data_views, axis=-1)
    abs_diff = np.abs(data - medians)

    outliers = np.full(data.shape, False, dtype=bool)

    # Put the median in a suitable shape.
    medians_views = np.repeat(medians, window_size).reshape(padded_data_views.shape)

    # Calculate the median absolute deviation for each view.
    mad = np.median(np.abs(padded_data_views - medians_views), axis=-1)

    # Calculate the standard deviation. We assume that the data is normally distributed.
    # https://en.wikipedia.org/wiki/Median_absolute_deviation
    sigma = k * mad

    score = threshold * sigma
    outliers[(abs_diff > score)] = True

    # If the score is zero, the data is not necessarily an outlier because all
    # absolute differences with the exception of zero will be greater than it.
    # For this case we add a condition that excludes from the outliers the points
    # that are smaller than the median.
    outliers[(score == 0) & (np.abs(data) < np.abs(np.median(data)))] = False

    return outliers, medians