"""
The :mod:`drcme.post_gmm_merging` module contains functions for merging
Gaussian mixture model components together based on an entropy criterion
as in `Baudry et al. (2010)
<https://www.tandfonline.com/doi/abs/10.1198/jcgs.2010.08111>`_.
"""
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import scipy
import os.path
import joblib
from scipy.cluster import hierarchy
import argschema as ags
import logging
class MergeParameters(ags.ArgSchema):
project = ags.fields.String(default="T301")
gmm_types = ags.fields.List(ags.fields.String, default=["diag", "diag", "diag"])
def log_p(x):
return np.log(np.maximum(x, 1e-300))
def entropy_merges(results_dir, project="T301", gmm_type="diag", piecewise_components=2):
data = pd.read_csv(os.path.join("..", "dev", results_dir, "sparse_pca_components_{:s}.csv".format(project)), index_col=0)
gmm = joblib.load(os.path.join("..", "dev", results_dir, "best_gmm_{:s}.pkl".format(project)))
my_gmm = gmm[gmm_type]
combo = data.values
tau = my_gmm.predict_proba(combo)
labels = my_gmm.predict(combo)
K_bic = max(labels) + 1
return entropy_combi(tau_labels, K_bic, piecewise_components=piecewise_components)
[docs]def entropy_specific_merges(tau, labels, K_bic, clusters_to_merge):
"""Merge set of specified clusters by entropy criterion
Parameters
----------
tau : array
Cluster membership probabilities
labels : array
Cluster assignments
K_bic : int
Number of original clusters (before merging)
clusters_to_merge : array
Set of clusters to merge
Returns
-------
merge_info : dict
Contains information about what was merged: entropies at each
merge point ("entropies"), sequence of merges by indices as the
matrix changes size ("merge_sequence"), sequence of merges by
original column name ("merges_by_name"), original cluster number
("K_bic"), sequence of cluster numbers ("Ks"), number of
cumulative points merged ("cumul_merges")
new_labels : array
Labels after merging
tau_merged : array
Cluster membership probabilities after merging
merge_matrix : array
Matrix of relationships between original clusters and merged clusters
"""
entropy = -np.sum(np.multiply(tau, log_p(tau)))
prior_merges = []
merges_by_names = []
entropies = [entropy]
Ks = [K_bic]
n_to_merge = len(clusters_to_merge)
n_merge_tracker = [0]
orig_col_names = np.arange(K_bic)
for K in range(K_bic, K_bic - n_to_merge, -1):
merge_matrix = np.identity(K_bic, dtype=int)
merge_col_names = orig_col_names.copy()
for merger in prior_merges:
i, j = merger
merge_matrix[:, i] = merge_matrix[:, i] | merge_matrix[:, j]
mask = np.ones(merge_matrix.shape[1], dtype=bool)
mask[j] = False
merge_matrix = merge_matrix[:, mask]
merge_col_names = merge_col_names[mask]
labels = np.argmax(np.dot(tau, merge_matrix), axis=1)
ent_current = np.inf
candidate_cols = np.arange(K)[np.isin(merge_col_names, clusters_to_merge)]
logging.debug("candidates left")
logging.debug(merge_col_names[np.isin(merge_col_names, clusters_to_merge)])
remaining_cols = np.arange(K)[~np.isin(merge_col_names, clusters_to_merge)]
for i in remaining_cols:
for j in candidate_cols:
new_merge_matrix = merge_matrix.copy()
new_merge_matrix[:, i] = merge_matrix[:, i] | merge_matrix[:, j]
mask = np.ones(K, dtype=bool)
mask[j] = False
new_merge_matrix = new_merge_matrix[:, mask]
tau_m = np.dot(tau, new_merge_matrix)
ent = -np.sum(np.multiply(tau_m, log_p(tau_m)))
if ent < ent_current:
ent_current = ent
merger = (i, j)
n_merged = np.sum(labels == i) + np.sum(labels == j)
prior_merges.append(merger)
merges_by_names.append([merge_col_names[i].item() for i in merger])
entropies.append(ent_current)
Ks.append(K-1)
n_merge_tracker.append(n_merged)
merge_info = {
"entropies": entropies,
"merge_sequence": prior_merges,
"merges_by_names": merges_by_names,
"K_bic": K_bic,
"Ks": Ks,
"cumul_merges": np.cumsum(n_merge_tracker)
}
merge_matrix = np.identity(K_bic, dtype=int)
for merger in prior_merges:
i, j = merger
merge_matrix[:, i] = merge_matrix[:, i] | merge_matrix[:, j]
mask = np.ones(merge_matrix.shape[1], dtype=bool)
mask[j] = False
merge_matrix = merge_matrix[:, mask]
tau_merged = np.dot(tau, merge_matrix)
new_labels = np.argmax(tau_merged, axis=1)
return merge_info, new_labels, tau_merged, merge_matrix
[docs]def entropy_combi(tau, labels, K_bic, piecewise_components=2):
"""Merge clusters by entropy criterion and piecewise fit
Parameters
----------
tau : array
Cluster membership probabilities
labels : array
Cluster assignments
K_bic : int
Number of original clusters (before merging)
piecewise_components : {2, 3}, optional
Number of components of linear piecewise fit
Returns
-------
merge_info : dict
Contains information about what was merged: entropies at each
merge point ("entropies"), sequence of merges by indices as the
matrix changes size ("merge_sequence"), sequence of merges by
original column name ("merges_by_name"), original cluster number
("K_bic"), sequence of cluster numbers ("Ks"), number of
cumulative points merged ("cumul_merges")
new_labels : array
Labels after merging
tau_merged : array
Cluster membership probabilities after merging
merge_matrix : array
Matrix of relationships between original clusters and merged clusters
"""
entropy = -np.sum(np.multiply(tau, log_p(tau)))
prior_merges = []
entropies = [entropy]
Ks = [K_bic]
n_merge_tracker = [0]
if K_bic <= 3:
print("Too few clusters to assess merging")
merge_info = {
"entropies": entropies,
"cumul_merges": None,
"best_fits": None,
# "fit2": None,
"cp": None,
"merge_sequence": None,
"K_bic": K_bic,
}
return merge_info, labels, tau, None
for K in range(K_bic, 1, -1):
merge_matrix = np.identity(K_bic, dtype=int)
for merger in prior_merges:
i, j = merger
merge_matrix[:, i] = merge_matrix[:, i] | merge_matrix[:, j]
mask = np.ones(merge_matrix.shape[1], dtype=bool)
mask[j] = False
merge_matrix = merge_matrix[:, mask]
labels = np.argmax(np.dot(tau, merge_matrix), axis=1)
ent_current = np.inf
for i in range(K-1):
for j in range(i + 1, K):
new_merge_matrix = merge_matrix.copy()
new_merge_matrix[:, i] = merge_matrix[:, i] | merge_matrix[:, j]
mask = np.ones(K, dtype=bool)
mask[j] = False
new_merge_matrix = new_merge_matrix[:, mask]
tau_m = np.dot(tau, new_merge_matrix)
ent = -np.sum(np.multiply(tau_m, log_p(tau_m)))
if ent < ent_current:
ent_current = ent
merger = (i, j)
n_merged = np.sum(labels == i) + np.sum(labels == j)
prior_merges.append(merger)
entropies.append(ent_current)
Ks.append(K-1)
n_merge_tracker.append(n_merged)
cumul_merges = np.cumsum(n_merge_tracker)
best_fits, cp = fit_piecewise(cumul_merges, entropies, piecewise_components)
merge_info = {
"entropies": entropies,
"cumul_merges": cumul_merges,
"best_fits": best_fits,
# "fit2": fit2,
"cp": cp,
"merge_sequence": prior_merges[:cp[0]],
"K_bic": K_bic,
}
merge_matrix = np.identity(K_bic, dtype=int)
for merger in prior_merges[:cp[0]]:
i, j = merger
merge_matrix[:, i] = merge_matrix[:, i] | merge_matrix[:, j]
mask = np.ones(merge_matrix.shape[1], dtype=bool)
mask[j] = False
merge_matrix = merge_matrix[:, mask]
tau_merged = np.dot(tau, merge_matrix)
new_labels = np.argmax(tau_merged, axis=1)
return merge_info, new_labels, tau_merged, merge_matrix
[docs]def fit_piecewise(cumul_merges, entropies, n_parts):
""" Fit entropy vs cumulative merge number with linear piecewise function
Parameters
----------
cumul_merges : array
Number of cumulative samples merged at each merge step. Length
must be `n_parts` + 2 or more and same length as `entropies`.
entropies : array
Entropy values at each merge step. Length must be
`n_parts` + 2 or more and same length as `cumul_merges`.
n_parts : {2, 3}
Number of components for linear fit
Returns
-------
best_fits : tuple
Fit parameters for each component. Length of tuple equals
`n_parts`
cp : tuple
Locations of change point(s) for linear fits. Length of tuple
equals `n_parts` - 1
"""
total_err = np.inf
if len(entropies) < n_parts + 2:
logging.info("Not enough clusters for piecewise fit")
return (None,), (0,)
if n_parts == 2:
for c in range(1, len(entropies)-1):
x1 = cumul_merges[:c + 1]
x2 = cumul_merges[c:]
y1 = entropies[:c + 1]
y2 = entropies[c:]
A1 = np.vstack([x1, np.ones(len(x1))]).T
A2 = np.vstack([x2, np.ones(len(x2))]).T
fit1 = np.linalg.lstsq(A1, y1, rcond=-1)
fit2 = np.linalg.lstsq(A2, y2, rcond=-1)
err = fit1[1] + fit2[1]
if err < total_err:
total_err = err
best_fits = (fit1[0], fit2[0])
cp = (c,)
elif n_parts == 3:
for c in range(1, len(entropies) - 3):
for d in range(c + 1, len(entropies)-1):
x1 = cumul_merges[:c + 1]
x2 = cumul_merges[c:d + 1]
x3 = cumul_merges[d:]
y1 = entropies[:c + 1]
y2 = entropies[c:d + 1]
y3 = entropies[d:]
A1 = np.vstack([x1, np.ones(len(x1))]).T
A2 = np.vstack([x2, np.ones(len(x2))]).T
A3 = np.vstack([x3, np.ones(len(x3))]).T
fit1 = np.linalg.lstsq(A1, y1, rcond=-1)
fit2 = np.linalg.lstsq(A2, y2, rcond=-1)
fit3 = np.linalg.lstsq(A3, y3, rcond=-1)
err = fit1[1] + fit2[1] + fit3[1]
if err < total_err:
total_err = err
best_fits = (fit1[0], fit2[0], fit3[0])
cp = (c, d)
else:
raise("Wrong value for n_parts")
return best_fits, cp
[docs]def order_new_labels(new_labels, tau_merged, data):
"""Reorder cluster labels by similarity of centroids
Parameters
----------
new_labels : array
Cluster labels for samples
tau_merged : array
Cluster membership probability matrix to be reordered
data : array
Data matrix
Returns
-------
new_labels_reorder : list
Cluster labels for samples after labels have been reordered
tau_merged : array
Cluster membership probability matrix following new cluster order
new_labels_reorder_dict : dict
Mapping from previous labels (keys) to newly orderered labels (values)
leaves : array
Original labels as leaves of hierarchical clustering of centroids
"""
uniq_labels = np.unique(new_labels)
uniq_labels = uniq_labels[~np.isnan(uniq_labels)]
n_cl = len(uniq_labels)
centroids = np.zeros((n_cl, data.shape[1]))
for i in range(n_cl):
centroids[i, :] = np.mean(data.values[new_labels == i, :], axis=0)
Z = hierarchy.linkage(centroids, method="ward")
D = hierarchy.dendrogram(Z, no_plot=True)
leaves = np.array(D["leaves"])
new_labels_reorder_dict = {d: i for i, d in enumerate(leaves)}
tau_merged = tau_merged[:, leaves]
new_labels_reorder = [new_labels_reorder_dict[d] if not np.isnan(d) else d for d in new_labels]
return new_labels_reorder, tau_merged, new_labels_reorder_dict, leaves
def main():
module = ags.ArgSchemaParser(schema_type=MergeParameters)
project = module.args["project"]
gmm_types = module.args["gmm_types"]
sub_dirs = [s.format(project) for s in ["all_{:s}", "exc_{:s}", "inh_{:s}"]]
piecewise_components = [2, 2, 3]
for sub_dir, gmm_type, pw_comp in zip(sub_dirs, gmm_types, piecewise_components):
print("merging for ", sub_dir, "with", gmm_type)
merge_info, new_labels, tau_merged, _ = entropy_merges(sub_dir, project, gmm_type=gmm_type, piecewise_components=pw_comp)
print(merge_info)
data = pd.read_csv(os.path.join(sub_dir, "sparse_pca_components_{:s}.csv".format(project)), index_col=0)
new_labels, tau_merged, _, _ = order_new_labels(new_labels, tau_merged, data)
np.savetxt(os.path.join(sub_dir, "post_merge_proba.txt"), tau_merged)
np.save(os.path.join(sub_dir, "post_merge_cluster_labels.npy"), new_labels)
df = pd.read_csv(os.path.join(sub_dir, "all_tsne_coords_{:s}.csv".format(project)))
df["clustering_3"] = new_labels
df.to_csv(os.path.join(sub_dir, "all_tsne_coords_{:s}_plus.csv".format(project)))
if __name__ == "__main__":
main()