Source code for asaplib.kde.density_estimation_internal

"""
I adapted the code from:
https://github.com/alexandreday/fast_density_clustering.git
Copyright 2017 Alexandre Day
"""

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KernelDensity, NearestNeighbors

from .density_estimation import Kernel_Density_Base

class KDE_internal(Kernel_Density_Base):
    """Kernel density estimation (KDE) for accurate local density estimation.

    This is achieved by maximum-likelihood estimation of the generative
    kernel density model, regularized using cross-validation.

    Parameters
    ----------
    bandwidth : float, optional
        Bandwidth for the kernel density estimation. If not specified, it is
        determined automatically by maximum likelihood on a test set.
    nh_size : int, optional (default = 'auto')
        Number of points in a typical neighborhood. Only relevant for
        evaluating a crude estimate of the bandwidth. 'auto' means that
        nh_size is scaled with the number of samples; for example,
        nh_size = 100 for 10000 samples. The minimum neighborhood size is 4.
    test_ratio_size : float, optional (default = 0.1)
        Ratio of the data held out as a test set when performing maximum-
        likelihood estimation. In order to obtain smooth density estimates
        (and prevent overfitting), it is recommended to use a large
        test_ratio_size (closer to 1.0) rather than a small one.
    xtol : float, optional (default = 0.01)
        Precision parameter for optimizing the bandwidth using maximum
        likelihood on a test set.
    atol : float, optional (default = 0.000005)
        Absolute precision parameter for the kernel density estimate.
        Smaller values lead to slower execution but better precision.
    rtol : float, optional (default = 0.00005)
        Relative precision parameter for the kernel density estimate.
        Smaller values lead to slower execution but better precision.
    extreme_dist : bool, optional (default = False)
        If True, skip the sanity checks that the optimal bandwidth does not
        lie on the boundary of the search interval.
    nn_dist : array, optional (default = None)
        Precomputed nearest-neighbor distances; if None, they are computed
        internally with a k-d tree.
    kernel : str, optional (default = 'gaussian')
        Type of kernel to use for density estimates. Other options are
        {'epanechnikov', 'linear', 'tophat'}.
    """

    def __init__(self, nh_size='auto', bandwidth=None, test_ratio_size=0.1,
                 xtol=0.01, atol=0.000005, rtol=0.00005,
                 extreme_dist=False, nn_dist=None, kernel='gaussian'):
        self.bandwidth = bandwidth
        self.nh_size = nh_size
        self.test_ratio_size = test_ratio_size
        self.xtol = xtol
        self.atol = atol
        self.rtol = rtol
        self.extreme_dist = extreme_dist
        self.nn_dist = nn_dist
        self.kernel = kernel
        self.acronym = 'kde_internal'
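    # Worked example of the 'auto' neighborhood-size rule applied in fit():
    # nh_size = max(int(25 * log10(n_samples)), 4), so 10000 samples give
    # 25 * log10(10000) = 25 * 4 = 100 neighbors, matching the docstring,
    # while the floor of 4 neighbors protects very small data sets.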
    def fit(self, X):
        """Fit the kernel density model to X."""
        if self.nh_size == 'auto':
            self.nh_size = max([int(25 * np.log10(X.shape[0])), 4])

        if X.shape[1] > 8:
            print('Careful, you are trying to do density estimation for data in a D > 8 dimensional space\n'
                  ' ... you are warned!')
        if self.bandwidth is None:
            self.bandwidth = self.find_optimal_bandwidth(X)
            # find_optimal_bandwidth() already constructed self.kde; make sure
            # it carries the optimal bandwidth (not the last value probed by
            # the optimizer) before the final fit below.
            self.kde.bandwidth = self.bandwidth
        else:
            self.kde = KernelDensity(
                bandwidth=self.bandwidth, algorithm='kd_tree', kernel=self.kernel, metric='euclidean',
                atol=self.atol, rtol=self.rtol, breadth_first=True, leaf_size=40
            )
        self.kde.fit(X)
        return self
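    # Typical usage: KDE_internal().fit(X) selects the bandwidth by maximum
    # likelihood on a held-out split (when bandwidth is None) and then refits
    # the KernelDensity estimator on the full data set X.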
    def evaluate_density(self, X):
        """Given an array of data, compute the local density of every point
        using kernel density estimation.

        Parameters
        ----------
        X : array, shape(n_sample, n_feature)

        Returns
        -------
        Log of the density for every point : array, shape(n_sample)
        """
        return self.kde.score_samples(X)
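    # Note that evaluate_density() returns log-densities, as produced by
    # sklearn's score_samples(); taking np.exp() of the result recovers the
    # estimated probability densities themselves.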
    def bandwidth_estimate(self, X_train, X_test):
        """Give a rough estimate of the optimal bandwidth, based on the notion
        of an effective neighborhood.

        Returns
        -------
        bandwidth estimate, minimum possible value, maximum possible value : tuple, shape(3)
        """
        if self.nn_dist is None:
            nn = NearestNeighbors(n_neighbors=self.nh_size, algorithm='kd_tree')
            nn.fit(X_train)
            nn_dist, _ = nn.kneighbors(X_test, n_neighbors=self.nh_size, return_distance=True)
        else:
            nn_dist = self.nn_dist

        dim = X_train.shape[1]
        # Computation of the minimum bound: this can be obtained by taking the
        # limit h -> 0 and making a saddle-point approximation.
        mean_nn2_dist = np.mean(nn_dist[:, 1] * nn_dist[:, 1])
        h_min = np.sqrt(mean_nn2_dist / dim)

        idx_1 = np.random.choice(np.arange(len(X_train)), size=min([1000, len(X_train)]), replace=False)
        idx_2 = np.random.choice(np.arange(len(X_test)), size=min([1000, len(X_test)]), replace=False)
        max_size = min([len(idx_1), len(idx_2)])
        tmp = np.linalg.norm(X_train[idx_1[:max_size]] - X_test[idx_2[:max_size]], axis=1)
        h_max = np.sqrt(np.mean(tmp * tmp) / dim)

        h_est = 10 * h_min
        return h_est, h_min, h_max
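    # The lower bound above is h_min = sqrt(mean(d^2) / dim), where d is the
    # distance of each query point to its second-nearest training neighbor
    # (nn_dist[:, 1]). h_max applies the same formula to distances between
    # randomly paired train/test points, and the rough starting estimate is
    # taken as h_est = 10 * h_min.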
    def find_optimal_bandwidth(self, X):
        """Perform maximum-likelihood estimation on a test set of the density
        model fitted on a training set."""
        from scipy.optimize import fminbound

        X_train, X_test = train_test_split(X, test_size=self.test_ratio_size)
        args = (X_test,)
        hest, hmin, hmax = self.bandwidth_estimate(X_train, X_test)
        print("[kde] Minimum bound = %.4f \t Rough estimate of h = %.4f \t Maximum bound = %.4f" % (hmin, hest, hmax))

        # We are trying to find reasonably tight bounds (hmin, 4.0 * hest) to
        # bracket the minimum of the error function.
        # It would be nice to have some hard, accurate bounds.
        self.xtol = round_float(hmin)
        print('[kde] Bandwidth tolerance (xtol) set to precision of minimum bound : %.5f' % self.xtol)

        self.kde = KernelDensity(algorithm='kd_tree', atol=self.atol, rtol=self.rtol, leaf_size=40, kernel=self.kernel)
        self.kde.fit(X_train)

        # hmax is an upper bound; heuristically, however, it appears to always
        # sit way above the actual bandwidth. hmax * 0.2 seems much better,
        # while still being conservative.
        h_optimal, score_opt, _, niter = fminbound(self.log_likelihood_test_set, hmin, hmax * 0.2, args,
                                                   maxfun=100, xtol=self.xtol, full_output=True)
        print("[kde] Found log-likelihood maximum in %i evaluations, h = %.5f" % (niter, h_optimal))

        if self.extreme_dist is False:
            # These bounds should always be satisfied ...
            assert abs(h_optimal - hmax) > 1e-4, "Upper boundary reached for bandwidth"
            assert abs(h_optimal - hmin) > 1e-4, "Lower boundary reached for bandwidth"
        return h_optimal
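    # scipy.optimize.fminbound performs bounded scalar minimization (Brent's
    # method): the call above searches h in (hmin, 0.2 * hmax) for the
    # bandwidth that minimizes the negative test-set log-likelihood, i.e.
    # maximizes the held-out likelihood of the model fitted on X_train.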
    # @profile
    def log_likelihood_test_set(self, bandwidth, X_test):
        """Fit the kde model on the training set with the given bandwidth and
        evaluate the negative log-likelihood of the test set."""
        self.kde.bandwidth = bandwidth
        # Scoring the first 2000 test points should be accurate enough;
        # a random subsample could be used instead.
        return -self.kde.score(X_test[:2000])
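    # Note: sklearn's KernelDensity.score(X) returns the total log-likelihood
    # sum_i log p(x_i) of the samples, so its negation above is exactly the
    # objective that fminbound minimizes in find_optimal_bandwidth().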
def round_float(x):
    """Round a positive float down to the order of magnitude of its first
    significant digit (e.g. 0.0234 -> 0.01).

    Implemented in closed form: parsing str(x) digit by digit breaks for
    values in scientific notation (e.g. 1e-05) and for values >= 10.
    """
    return 10.0 ** np.floor(np.log10(x))
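if __name__ == '__main__':
    # A minimal usage sketch on hypothetical toy data; run it as
    # `python -m asaplib.kde.density_estimation_internal`, since the relative
    # import above prevents executing this file as a plain script.
    np.random.seed(0)
    # Two well-separated 2D Gaussian blobs.
    X = np.concatenate([np.random.randn(500, 2),
                        np.random.randn(500, 2) + np.array([5.0, 0.0])])
    model = KDE_internal(test_ratio_size=0.5).fit(X)
    log_rho = model.evaluate_density(X)
    print("selected bandwidth: %.4f" % model.bandwidth)
    print("log-density range: [%.3f, %.3f]" % (log_rho.min(), log_rho.max()))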