Source code for asaplib.cluster.ml_cluster_tools

"""
Tools to analyze clustering results
"""

from collections import Counter

import numpy as np


def output_cluster(prefix, labels, dicttags, tags):
    """Write the cluster assignment of every sample to clustered-<prefix>.txt."""
    ofile = open("clustered-" + prefix + ".txt", "w")
    ofile.write("#! tag cluster_id \n")
    ndict = len(dicttags)
    ndata = len(tags)
    # the dictionary (landmark) samples come first, followed by the remaining data
    for i in range(ndict):
        ofile.write("%s %d\n" % (dicttags[i], labels[i]))
    for i in range(ndata):
        ofile.write("%s %d\n" % (tags[i], labels[i + ndict]))
    ofile.close()
    return 0
def output_cluster_sort(prefix, labels, dicttags, tags):
    """Write the cluster assignments, sorted by cluster index, to sorted-clustered-<prefix>.txt."""
    ofile = open("sorted-clustered-" + prefix + ".txt", "w")
    ofile.write("#! tag cluster_id \n")
    ndict = len(dicttags)
    ndata = len(tags)
    # pair each sample index with its label and sort by label
    sortlabels = np.stack((range(len(labels)), labels), axis=-1)
    sortlabels = sortlabels[sortlabels[:, 1].argsort()]
    for i, l in sortlabels:
        # noise points (label -1) are skipped
        if l >= 0 and i < ndict:
            ofile.write("%d %s\n" % (l, dicttags[i]))
        elif l >= 0:
            ofile.write("%d %s\n" % (l, tags[i - ndict]))
    ofile.close()
    return 0
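# A minimal usage sketch (not part of the library API): `labels` is assumed to hold the
# cluster index of the dictionary samples first, followed by the remaining data, which is
# the ordering the two writers above expect. All names and values below are made up for
# illustration only:
#
#   labels = [0, 0, 1, -1]
#   output_cluster("demo", labels, dicttags=["d1", "d2"], tags=["a", "b"])
#   output_cluster_sort("demo", labels, dicttags=["d1", "d2"], tags=["a", "b"])
#
# This writes clustered-demo.txt and sorted-clustered-demo.txt, with noise points
# (label -1) omitted from the sorted file.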
def get_cluster_size(labels):
    """Return the set of cluster labels and the number of samples in each cluster."""
    unique_labels = set(labels)
    count = Counter(labels)
    return unique_labels, count
def most_frequent(List):
    """Return the most common element of a list."""
    occurence_count = Counter(List)
    return occurence_count.most_common(1)[0][0]
def array_handling(plist, attribute='mean'):
    """
    Reduce a list of per-sample properties to a single value.

    available attributes: mean, sum, min, max, mode, all
    """
    if attribute == 'mean':
        return np.mean(plist)
    elif attribute == 'sum':
        return np.sum(plist)
    elif attribute == 'min':
        return np.amin(plist)
    elif attribute == 'max':
        return np.amax(plist)
    elif attribute == 'mode':
        return most_frequent(plist)
    elif attribute == 'all':
        return plist
    else:
        raise NameError('Attribute not found.')
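# A quick illustration of the reduction modes above (hypothetical toy values, for
# documentation only): for plist = [1, 1, 2, 4],
#   array_handling(plist, 'mean') -> 2.0
#   array_handling(plist, 'sum')  -> 8
#   array_handling(plist, 'min')  -> 1
#   array_handling(plist, 'max')  -> 4
#   array_handling(plist, 'mode') -> 1
#   array_handling(plist, 'all')  -> [1, 1, 2, 4]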
def get_cluster_properties(labels, properties, attribute='mean'):
    """Reduce the per-sample properties of each cluster to one value per cluster (see array_handling)."""
    unique_labels = set(labels)
    # pair each sample index with its label and sort by label
    sortlabels = np.stack((range(len(labels)), labels), axis=-1)
    sortlabels = sortlabels[sortlabels[:, 1].argsort()]
    propertiesdict = {-1: 'noise'}
    ol = -1
    plist = []
    for i, l in sortlabels:
        # when a new (non-noise) cluster starts, store the reduced property of the previous one
        if l > ol and l >= 0:
            if plist:
                propertiesdict[ol] = array_handling(plist, attribute)
            plist = []
        plist.append(properties[i])
        ol = l
    # reduce the last cluster
    if plist:
        propertiesdict[ol] = array_handling(plist, attribute)
    return unique_labels, propertiesdict
def get_cluster_weighted_avg_properties(labels, properties, weights):
    """Compute the weighted average of the per-sample properties of each cluster."""
    unique_labels = set(labels)
    # pair each sample index with its label and sort by label
    sortlabels = np.stack((range(len(labels)), labels), axis=-1)
    sortlabels = sortlabels[sortlabels[:, 1].argsort()]
    propertiesdict = {-1: 'noise'}
    ol = -1
    plist = []
    wlist = []
    for i, l in sortlabels:
        # when a new (non-noise) cluster starts, store the weighted average of the previous one
        if l > ol and l >= 0:
            if wlist:
                propertiesdict[ol] = np.mean(plist) / np.mean(wlist)
            plist = []
            wlist = []
        plist.append(properties[i] * weights[i])
        wlist.append(weights[i])
        ol = l
    # weighted average of the last cluster
    if wlist:
        propertiesdict[ol] = np.mean(plist) / np.mean(wlist)
    return unique_labels, propertiesdict
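

if __name__ == "__main__":
    # Minimal self-test / usage sketch, not part of the asaplib API: the labels,
    # properties and weights below are made-up toy values chosen only to show how
    # the helpers above fit together.
    toy_labels = np.array([0, 0, 1, 1, 1, -1])
    toy_properties = np.array([1.0, 3.0, 2.0, 2.0, 5.0, 9.0])
    toy_weights = np.array([1.0, 1.0, 2.0, 1.0, 1.0, 1.0])

    unique_labels, counts = get_cluster_size(toy_labels)
    print("cluster sizes:", counts)

    _, mean_props = get_cluster_properties(toy_labels, toy_properties, attribute='mean')
    print("mean property per cluster:", mean_props)

    _, wavg_props = get_cluster_weighted_avg_properties(toy_labels, toy_properties, toy_weights)
    print("weighted-average property per cluster:", wavg_props)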