# Source code for asaplib.kde.density_estimation

"""
class and methods for performing kernel density estimation
"""
import numpy as np
from scipy.stats import gaussian_kde
from sklearn.neighbors import KernelDensity

class Kernel_Density_Base:
    """
    Base class for performing kernel density estimation

    Defines the common interface of the density estimators: ``fit`` a
    model to data, ``evaluate_density`` on data, or do both in one call
    with ``fit_evaluate_density``.  Here ``fit`` and ``evaluate_density``
    are placeholders that do nothing and return None; concrete estimators
    are expected to override them.
    """

    def __init__(self):
        # short identifier of the estimator, returned by get_acronym()
        self.acronym = 'none'

    def fit(self, X):
        """Fit kernel model to X

        Placeholder in the base class: does nothing and returns None.
        """
        pass

    def evaluate_density(self, X):
        """Given an array of data, computes the local density of every point using kernel density estimation

        Placeholder in the base class: does nothing and returns None.

        Input
        ------
        Data X : array, shape(n_sample,n_feature)

        Return
        ------
        Log of densities for every point: array, shape(n_sample)
        """
        pass

    def fit_evaluate_density(self, X):
        """Fit the model to X, then evaluate the density on the same X

        Input
        ------
        Data X : array, shape(n_sample,n_feature)

        Return
        ------
        Whatever ``evaluate_density(X)`` returns after ``fit(X)`` has run
        """
        self.fit(X)
        return self.evaluate_density(X)

    def get_acronym(self):
        """Return the acronym of this estimator"""
        # we use an acronym for each KDE, so it's easy to find it and refer to it
        return self.acronym

class KDE_scipy(Kernel_Density_Base):
    """
    Kernel Density Estimation with Scipy
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html
    """

    def __init__(self, bw_method=None):
        """
        bw_method: str, scalar or callable, optional

        The method used to calculate the estimator bandwidth.
        This can be 'scott', 'silverman', a scalar constant or a callable.
        If a float, it is divided by the sample standard deviation of the
        fitted data (see ``fit``) and the result is handed to scipy as the
        bandwidth factor.
        Any other value is passed to scipy.stats.gaussian_kde unchanged:
        if a callable, it should take a gaussian_kde instance as only
        parameter and return a scalar; if None (default), 'scott' is used.
        """
        self.bw_method = bw_method
        self.kde = None  # scipy gaussian_kde instance, created by fit()
        self.acronym = 'kde_scipy'
        self._fitted = False

    def fit(self, X):
        """
        X: dataset, array_like

        Datapoints to estimate from. In case of univariate data this is a 1-D array,
        otherwise a 2-D array with shape (# of data, # of dimension)

        Note that scipy.stats.gaussian_kde take X with shape (# of dimension, # of data)
        This is why we transpose the input X.
        """
        # accept any array_like (e.g. nested lists), which has no .T / .std
        X = np.asarray(X)
        bw_method = self.bw_method
        if isinstance(bw_method, float):
            # Note that scipy weights its bandwidth by the covariance of the
            # input data.  To make the results comparable to the other methods,
            # we divide the bandwidth by the sample standard deviation here.
            # NOTE(review): only `float` instances are rescaled; an int such
            # as 1 is passed to scipy as-is -- confirm this is intended.
            bw_method = bw_method / X.std(ddof=1)
        self.kde = gaussian_kde(X.T, bw_method=bw_method)
        self._fitted = True

    def evaluate_density(self, X):
        """Compute the log of the estimated density at every point of X

        Input
        ------
        Data X : array_like, same layout as the X given to ``fit``

        Return
        ------
        Log of densities for every point: array, shape(n_sample)

        Raises ValueError if ``fit`` has not been called.
        """
        if not self._fitted:
            raise ValueError("The KDE model has not been fitted.")
        # logpdf evaluates the log-density directly, so points far from the
        # data do not underflow to density 0 and yield log(0) = -inf
        return self.kde.logpdf(np.asarray(X).T)
        

class KDE_sklearn(Kernel_Density_Base):
    """
    Kernel Density Estimation with Sklearn
    https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html#sklearn.neighbors.KernelDensity
    https://scikit-learn.org/stable/modules/density.html#kernel-density-estimation
    """

    def __init__(self, bandwidth=1.0, algorithm='auto', kernel='gaussian', metric='euclidean'):
        """
        bandwidth: float. The bandwidth of the kernel.
        algorithm: str. The tree algorithm to use. Valid options are ['kd_tree'|'ball_tree'|'auto']. Default is 'auto'.
        kernel:str. The kernel to use.
                    Valid kernels are ['gaussian'|'tophat'|'epanechnikov'|'exponential'|'linear'|'cosine']
                    Default is 'gaussian'.
        metric: str. The distance metric to use.
        """
        self.bandwidth = bandwidth
        self.algorithm = algorithm
        self.kernel = kernel
        self.metric = metric
        self.kde = KernelDensity(bandwidth=bandwidth, algorithm=algorithm, kernel=kernel, metric=metric)
        self.acronym = 'kde_sklearn'
        self._fitted = False

    @staticmethod
    def _as_samples(X):
        """Return X unchanged, except that 1-D input becomes a single-feature column"""
        arr = np.asarray(X)
        # sklearn expects (n_samples, n_features); univariate 1-D data is
        # n_samples values of one feature
        return arr.reshape(-1, 1) if arr.ndim == 1 else X

    def fit(self, X):
        """
        X: dataset, array_like

        Datapoints to estimate from. In case of univariate data this is a 1-D array,
        otherwise a 2-D array with shape (# of data, # of dimension)

        """
        self.kde.fit(self._as_samples(X))
        self._fitted = True

    def evaluate_density(self, X):
        """Compute the log of the estimated density at every point of X

        Input
        ------
        Data X : array_like, same layout as the X given to ``fit``

        Return
        ------
        Log of densities for every point: array, shape(n_sample)

        Raises ValueError if ``fit`` has not been called.
        """
        if not self._fitted:
            raise ValueError("The KDE model has not been fitted.")
        return self.kde.score_samples(self._as_samples(X))