"""
CUR matrix decomposition is a low-rank matrix decomposition algorithm that is explicitly expressed in a small number of actual columns and/or actual rows of data matrix.
"""
import numpy as np
import scipy.sparse.linalg as spalg
from scipy.linalg import svd, eigh, norm, pinv
from scipy.sparse.linalg import svds


def CUR_deterministic(X, n_col, error_estimate=True, costs=1):
    """
    Given the number of columns to keep, find the best columns and rows such that
    X = C U R

    Parameters
    ----------
    X: np.matrix
        input covariance matrix
    n_col: int
        number of columns to keep
    error_estimate: bool, optional
        compute the remaining error of the CUR decomposition after each selection
    costs: float or np.array, optional
        costs associated with each column; the selection weights are divided by these

    Returns
    -------
    indices : np.array
        indices of the columns to choose
    cur_error : np.array
        the error of the decomposition after each selection step
    """
    RX = X.copy()
    rsel = np.zeros(n_col, dtype=int)
    rerror = np.zeros(n_col, dtype=float)
    for i in range(n_col):
        rval, RX = CUR_deterministic_step(RX, n_col - i, costs)
        rsel[i] = rval
        if error_estimate:
            rerror[i] = np.sum(np.abs(RX))
    return rsel, rerror
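
# --- Illustrative usage sketch (an editor's assumption, not part of the original
# module): run the deterministic CUR selection on a small covariance matrix.
# The helper name `_demo_CUR_deterministic` is hypothetical.
def _demo_CUR_deterministic():
    rng = np.random.default_rng(0)
    A = rng.standard_normal((20, 5))
    cov = np.matrix(A.T @ A)  # np.matrix, since CUR_deterministic_step uses .A1
    indices, errors = CUR_deterministic(cov, n_col=2)
    # the residual error should shrink as more columns are selected
    print("selected columns:", indices)
    print("residual errors: ", errors)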


def CUR_deterministic_step(cov, k, costs=1):
    """ Apply one step of (deterministic) CUR selection on the given covariance
    matrix, using its top k eigenvectors and including an orthogonalization step.
    The selection can be weighted by per-column costs if desired. """
    evc, evec = spalg.eigs(cov, k)
    # compute the weights (for a symmetric covariance matrix the eigenvectors
    # are real) and select the column with the maximum weight
    weights = np.sum(np.square(evec), axis=1) / costs
    sel = np.argmax(weights)
    vsel = cov[sel] / np.linalg.norm(cov[sel])
    rcov = cov.copy()
    print("selected: ", sel)
    for i in range(len(rcov)):
        # deflate: project out the component along the chosen (normalized) row
        rcov[i] -= vsel * np.dot(cov[i].A1, vsel.A1)
    return sel, rcov
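
# --- Minimal sanity-check sketch (an editor's assumption): the deflation in
# CUR_deterministic_step projects out the normalized selected row `v`, so the
# returned residual satisfies rcov @ v ~= 0. The helper name is hypothetical.
def _check_deflation_step(cov):
    sel, rcov = CUR_deterministic_step(cov, k=1)
    v = (cov[sel] / np.linalg.norm(cov[sel])).A1
    assert np.allclose(np.asarray(rcov) @ v, 0.0, atol=1e-8)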


def cur_column_select(array, num, rank=None, deterministic=True, mode="sparse", weights=None, calc_error=False):
    """Select columns from a matrix according to their statistical leverage scores.

    Based on:
    [1] Mahoney MW, Drineas P. CUR matrix decompositions for improved data analysis. PNAS. 2009 Jan 20;106(3):697–702.

    Notes
    -----
    Which mode to use? If the matrix is not square, or possibly not hermitian (or even real symmetric), then `sparse`
    is the only option. For hermitian matrices, the calculation of the eigenvectors with `eigh` may be faster than
    using the sparse SVD method; a benchmark is needed for this, especially for descriptor kernels.

    Parameters
    ----------
    array : np.array
        Array to select columns of, shape (M, N)
    num : int
        number of column indices to be returned
    rank : int, optional
        number of singular vectors used in the calculation of the statistical leverage scores
        default: minimum of (min(N, M) - 1, num/2), but at least one
    deterministic : bool, optional
        Whether to use the deterministic or the probabilistic method.
        Deterministic (True): the top `num` columns by leverage score are returned
        Stochastic (False): the leverage scores are used as probabilities for choosing `num` columns
    mode : {"sparse", "dense", "hermitian"}
        mode of the singular vector calculation.
        - `sparse` (default) uses sparse SVD, which is expected to be robust and to solve the problem
        - `hermitian` offers a speedup for hermitian matrices (i.e. real symmetric kernel matrices as well)
        - `dense` (not recommended) uses full SVD, calculating all the singular vectors and keeping a number of
          them according to the rank parameter
    weights : np.array, optional
        Costs for taking each column, shape (N,).
        The statistical leverage scores are scaled by this array.
    calc_error : bool, optional
        calculate the error of the decomposition, default False.
        There is a significant cost to this calculation, due to the pseudo-inverse needed.
        The error here is the norm of the difference between the original matrix and the approximation obtained
        from the selected columns. It is not necessarily meaningful in every situation.

    Returns
    -------
    indices : np.array
        indices of the columns to choose
    cur_error : float, optional
        the norm of the approximation error; only returned if `calc_error` is True
    """
    if not isinstance(rank, int):
        # this is set heuristically
        rank = min(min(array.shape) - 1, max(1, int(num / 2)))
    # compute the right singular vectors -- the rest of the decomposition is not needed here
    if mode == "sparse":
        # left_singular_vectors, _, right_singular_vectors = svds(array, k)  # for error calculation
        *_, right_singular_vectors = svds(array, rank)
    elif mode == "dense":
        *_, right_singular_vectors = svd(array)
        # svd sorts the singular values in descending order, so the top-`rank`
        # right singular vectors are the first `rank` rows
        right_singular_vectors = right_singular_vectors[:rank]
    elif mode == "hermitian":
        assert array.shape[0] == array.shape[1] and len(array.shape) == 2
        # subset_by_index selects the `rank` largest eigenvalues (eigh sorts them in ascending order)
        _, right_singular_vectors = eigh(array, subset_by_index=(array.shape[0] - rank, array.shape[0] - 1))
        # eigh returns the eigenvectors as columns; transpose to match the
        # (rank, N) row layout of the SVD-based branches
        right_singular_vectors = right_singular_vectors.T
    else:
        raise ValueError(
            'mode {} is not valid for this calculation, use either of: "sparse", "dense", "hermitian"'.format(mode))
    # compute the statistical leverage scores and scale them by the weights if given
    pi_j = np.sum(right_singular_vectors ** 2, axis=0) / rank
    if weights is not None:
        # weights must be a 1D vector with length equal to the number of columns
        assert np.shape(array)[1] == np.shape(weights)[0] and len(np.shape(weights)) == 1
        pi_j *= weights
        pi_j /= np.sum(pi_j)  # normalise again
    if deterministic:
        # take the columns with the `num` largest leverage scores
        indices = np.argsort(pi_j)[-num:]
    else:
        # use the leverage scores as selection probabilities
        indices = np.random.choice(np.arange(array.shape[1]), size=num, replace=False, p=pi_j)
    if calc_error:
        # error of the representation, ||array - C C+ array||
        c = array[:, indices]
        c_pinv = pinv(c)
        cur_error = norm(array - np.matmul(np.matmul(c, c_pinv), array))
        return indices, cur_error
    else:
        return indices
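

# --- Illustrative usage sketch (an editor's assumption, not part of the original
# module): leverage-score column selection, pi_j = (1/k) * sum_i V[i, j]**2 over
# the top-k right singular vectors, followed by the reconstruction error
# ||X - C C+ X||. The helper name `_demo_cur_column_select` is hypothetical.
def _demo_cur_column_select():
    rng = np.random.default_rng(0)
    X = rng.standard_normal((50, 10)) @ rng.standard_normal((10, 30))  # rank <= 10
    indices, err = cur_column_select(X, num=8, calc_error=True)
    # the approximation X ~ C (C+ X) uses only the selected columns C
    print("selected columns:", indices)
    print("relative error:  ", err / norm(X))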