Source code for daxs.filters

"""The module provides functions for filtering data."""

from __future__ import annotations

import logging

import numpy as np
import numpy.typing as npt

logger = logging.getLogger(__name__)


[docs] def hampel( data: npt.NDArray[np.float64], window_size: int | None = None, threshold: float = 3.5, axis: int = 0, k: float = 1.4826, ) -> tuple[npt.NDArray[np.bool_], npt.NDArray[np.float64]]: """Outliers detection using the Hampel filter. More details about the filter can be found here: - https://fr.mathworks.com/help/dsp/ref/hampelfilter.html - https://dsp.stackexchange.com/questions/26552 - https://stackoverflow.com/questions/22354094 Args: data: Input data. window_size: Size of the sliding window. threshold: Threshold for outlier detection expressed in number of standard deviations. Iglewicz and Hoaglin [1]_ suggest using a value of 3.5, but larger values are often needed to avoid removing data points from a noisy signal. axis: Axis along which the detection is performed. k: Scale factor for the median absolute deviation. The default value is 1.4826, which is the scale factor for normally distributed data. Returns: Mask identifying the outliers, and the rolling window median. References: .. [1] Boris Iglewicz and David Hoaglin (1993) "Volume 16: How to Detect and Handle Outliers", The ASQC Basic References in Quality Control: Statistical Techniques, Edward F. Mykytka, Ph.D., Editor. """ if window_size is None: SECTIONS: int = 10 window_size = data.shape[axis] // SECTIONS # Don't allow the window size to be too small or larger than the data. MIN_WINDOW_SIZE = 3 window_size = max(window_size, MIN_WINDOW_SIZE) window_size = min(window_size, data.shape[axis]) # Make the size of the window an odd number. if window_size % 2 == 0: window_size -= 1 logger.debug("Hampel filter window size = %d.", window_size) # Pad the data along the direction used for the outlier detection. pad_size = np.zeros((len(data.shape), 2), dtype=int) pad_size[axis] = np.array((window_size // 2, window_size // 2)) padded_data = np.pad(data, pad_size) # Create a sliding window view of the padded data. padded_data_views = np.lib.stride_tricks.sliding_window_view( padded_data, window_size, axis, subok=True ) # Compute the median of each view. As the views increase the number of dimensions, # the axis argument always identifies the axis along which the median is computed. medians = np.median(padded_data_views, axis=-1) abs_diff = np.abs(data - medians) outliers = np.full(data.shape, False, dtype=bool) # Put the median in a suitable shape. medians_views = np.repeat(medians, window_size).reshape(padded_data_views.shape) # Calculate the median absolute deviation for each view. mad = np.median(np.abs(padded_data_views - medians_views), axis=-1) # Calculate the standard deviation. We assume that the data is normally distributed. # https://en.wikipedia.org/wiki/Median_absolute_deviation sigma = k * mad score = threshold * sigma outliers[(abs_diff > score)] = True # If the score is zero, the data is not necessarily an outlier because all # absolute differences with the exception of zero will be greater than it. # For this case we add a condition that excludes from the outliers the points # that are smaller than the median. outliers[(score == 0) & (np.abs(data) < np.abs(np.median(data)))] = False return outliers, medians