def normalized_mutual_info_score(labels_true, labels_pred,
                                 average_method='warn'):
    """Normalized Mutual Information between two clusterings.

    Parameters
    ----------
    labels_true : int array, shape = [n_samples]
        A clustering of the data into disjoint subsets.

    labels_pred : array, shape = [n_samples]
        A clustering of the data into disjoint subsets.

    average_method : string, optional (default: 'warn')
        How to compute the normalizer in the denominator. Possible options
        are 'min', 'geometric', 'arithmetic', and 'max'. If 'warn',
        'geometric' will be used. The default will change to 'arithmetic'
        in version 0.22.

    Returns
    -------
    nmi : float
        Score between 0.0 and 1.0. 1.0 stands for a perfectly complete
        labeling.
    """
    if average_method == 'warn':
        warnings.warn("The behavior of NMI will change in version 0.22. "
                      "To match the behavior of 'v_measure_score', NMI will "
                      "use average_method='arithmetic' by default.",
                      FutureWarning)
        average_method = 'geometric'
    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
    classes = np.unique(labels_true)
    clusters = np.unique(labels_pred)
    # Special limit cases: no clustering since the data is not split.
    # This is a perfect match hence return 1.0.
    if (classes.shape[0] == clusters.shape[0] == 1 or
            classes.shape[0] == clusters.shape[0] == 0):
        return 1.0
    contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
    contingency = contingency.astype(np.float64)
    # Calculate the MI for the two clusterings.
    mi = mutual_info_score(labels_true, labels_pred,
                           contingency=contingency)
    # Calculate the entropy of each labeling to build the normalizer.
    h_true, h_pred = entropy(labels_true), entropy(labels_pred)
    normalizer = _generalized_average(h_true, h_pred, average_method)
    # Avoid 0.0 / 0.0 when either entropy is zero.
    normalizer = max(normalizer, np.finfo('float64').eps)
    nmi = mi / normalizer
    return nmi
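
# A minimal usage sketch (assuming this function is importable as
# sklearn.metrics.cluster.normalized_mutual_info_score, as in the released
# library). Labelings that agree up to a permutation of label values score
# 1.0, while a labeling carrying no information about the classes scores 0.0:
#
#   >>> normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0],
#   ...                              average_method='arithmetic')
#   1.0
#   >>> normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3],
#   ...                              average_method='arithmetic')
#   0.0
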
def adjusted_rand_score(labels_true, labels_pred):
    """Rand index adjusted for chance.

    Parameters
    ----------
    labels_true : int array, shape = [n_samples]
        Ground truth class labels to be used as a reference.

    labels_pred : array, shape = [n_samples]
        Cluster labels to evaluate.

    Returns
    -------
    ari : float
        Similarity score between -1.0 and 1.0. Random labelings have an ARI
        close to 0.0. 1.0 stands for a perfect match.
    """
    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
    n_samples = labels_true.shape[0]
    n_classes = np.unique(labels_true).shape[0]
    n_clusters = np.unique(labels_pred).shape[0]
    # Special limit cases: no clustering since the data is not split;
    # or trivial clustering where each document is assigned a unique cluster.
    # These are perfect matches hence return 1.0.
    if (n_classes == n_clusters == 1 or
            n_classes == n_clusters == 0 or
            n_classes == n_clusters == n_samples):
        return 1.0
    # Compute the ARI using the contingency data.
    # The contingency matrix has one row per true class and one column per
    # predicted cluster; entry (i, j) counts the samples of true class i
    # that were assigned to cluster j.
    contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
    sum_comb_c = sum(_comb2(n_c) for n_c in np.ravel(contingency.sum(axis=1)))
    sum_comb_k = sum(_comb2(n_k) for n_k in np.ravel(contingency.sum(axis=0)))
    sum_comb = sum(_comb2(n_ij) for n_ij in contingency.data)
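
    # These pair counts feed the Hubert-Arabie adjustment, stated here for
    # reference (a property of the ARI itself, not extra computation):
    #     ARI = (Index - Expected_Index) / (Max_Index - Expected_Index)
    # with Index = sum_comb, Expected_Index = sum_comb_c * sum_comb_k /
    # C(n_samples, 2), and Max_Index = (sum_comb_c + sum_comb_k) / 2,
    # so a random labeling scores near 0.0 and a perfect match scores 1.0.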