Source code for drcme.ephys_morph_clustering

"""
The :mod:`drcme.ephys_morph_clustering` module contains functions for performing
joint clustering of electrophysiology and morphology data.
"""

import numpy as np
import pandas as pd

import sklearn.metrics as metrics
import sklearn.cluster as cluster
import sklearn.neighbors as neighbors
import sklearn.manifold as manifold
import sklearn.mixture as mixture
import sklearn.model_selection as model_selection

import scipy.cluster.hierarchy as hierarchy
import scipy.spatial.distance as distance

from hashlib import sha1
from multiprocessing import Pool
import logging
import sys


def clustjaccard(y_true, y_pred):
    """Calculate Jaccard coefficient

    Ranges from 0 to 1. Higher values indicate greater similarity.

    Parameters
    ----------
    y_true : list or array
        Boolean indication of cluster membership (1 if belonging,
        0 if not belonging) for actual labels
    y_pred : list or array
        Boolean indication of cluster membership (1 if belonging,
        0 if not belonging) for predicted labels

    Returns
    -------
    float
        Jaccard coefficient value
    """
    if type(y_true) is list:
        y_true = np.array(y_true)
    if type(y_pred) is list:
        y_pred = np.array(y_pred)
    n_shared = np.sum(y_true & y_pred)
    return float(n_shared) / (np.sum(y_true) + np.sum(y_pred) - n_shared)

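# Usage sketch (illustrative, not part of the original module): two Boolean
# membership vectors that share one member out of two total members give a
# coefficient of 0.5.
#
#     >>> clustjaccard([1, 1, 0, 0], [1, 0, 0, 0])
#     0.5
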
def all_cluster_calls(specimen_ids, morph_data, ephys_data, weights=[1, 2, 5],
                      n_cl=[10, 15, 20, 25], n_nn=[4, 7, 10]):
    """Perform several clustering algorithms and variations

    Parameters
    ----------
    specimen_ids : array
        Specimen labels of `morph_data` and `ephys_data`
    morph_data : array
        Specimens by morphology features array
    ephys_data : array
        Specimens by electrophysiology features array
    weights : list, optional
        Set of relative electrophysiology weights
    n_cl : list, optional
        Set of cluster numbers
    n_nn : list, optional
        Set of nearest-neighbor values

    Returns
    -------
    DataFrame
        Combined clustering results. The index is the set of
        `specimen_ids`. Each column contains cluster labels for a
        different clustering variant.
    """
    results_df = pd.DataFrame({"specimen_id": specimen_ids}).set_index("specimen_id")
    results_df = hc_nn_cluster_calls(results_df, morph_data, ephys_data, n_nn=n_nn, n_cl=n_cl)
    results_df = hc_combo_cluster_calls(results_df, morph_data, ephys_data, weights=weights, n_cl=n_cl)
    results_df = gmm_combo_cluster_calls(results_df, morph_data, ephys_data, weights=weights, n_cl=n_cl)
    results_df = spectral_combo_cluster_calls(results_df, morph_data, ephys_data, weights=weights, n_cl=n_cl)
    return results_df

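# Usage sketch (illustrative; the data and shapes are hypothetical): with the
# default grids this yields 48 variant columns, 3 nn x 4 n_cl connectivity
# runs plus 3 weights x 4 n_cl for each of the hc/gmm/spectral combo runs.
#
#     >>> ids = np.arange(100)
#     >>> morph = np.random.randn(100, 10)
#     >>> ephys = np.random.randn(100, 20)
#     >>> results = all_cluster_calls(ids, morph, ephys)
#     >>> results.shape
#     (100, 48)
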
def hc_nn_cluster_calls(results_df, morph_data, ephys_data, n_nn=[4, 7, 10],
                        n_cl=[10, 15, 20, 25]):
    """Add agglomerative clustering results with connectivity constraints

    Morphology data are clustered by Ward agglomerative clustering with a
    connectivity constraint built from nearest neighbors in
    electrophysiology distance space.

    Parameters
    ----------
    results_df : DataFrame
        Existing DataFrame that new clustering results are added to as
        new columns
    morph_data : array
        Specimens by morphology features array
    ephys_data : array
        Specimens by electrophysiology features array
    n_nn : list, optional
        Set of nearest-neighbor values
    n_cl : list, optional
        Set of cluster numbers

    Returns
    -------
    DataFrame
        Combined clustering results. The index is the set of
        `specimen_ids`. Each column contains cluster labels for a
        different clustering variant.
    """
    pw = metrics.pairwise.pairwise_distances(ephys_data)
    for nn in n_nn:
        knn_graph = neighbors.kneighbors_graph(pw, nn, include_self=False)
        for cl in n_cl:
            key = "hc_conn_{:d}_{:d}".format(nn, cl)
            model = cluster.AgglomerativeClustering(linkage="ward",
                                                    connectivity=knn_graph,
                                                    n_clusters=cl)
            model.fit(morph_data)
            results_df[key] = model.labels_
    return results_df

def hc_combo_cluster_calls(results_df, morph_data, ephys_data, weights=[1, 2, 5],
                           n_cl=[10, 15, 20, 25]):
    """Add agglomerative hierarchical clustering results

    Parameters
    ----------
    results_df : DataFrame
        Existing DataFrame that new clustering results are added to as
        new columns
    morph_data : array
        Specimens by morphology features array
    ephys_data : array
        Specimens by electrophysiology features array
    weights : list, optional
        Set of relative electrophysiology weights
    n_cl : list, optional
        Set of cluster numbers

    Returns
    -------
    DataFrame
        Combined clustering results. The index is the set of
        `specimen_ids`. Each column contains cluster labels for a
        different clustering variant.
    """
    for wt in weights:
        EM_data = np.hstack([morph_data, wt * ephys_data])
        for cl in n_cl:
            key = "hc_combo_{:g}_{:d}".format(wt, cl)
            model = cluster.AgglomerativeClustering(linkage="ward", n_clusters=cl)
            model.fit(EM_data)
            results_df[key] = model.labels_
    return results_df

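# Worked sketch of the weighting scheme shared by this function and the
# gmm/spectral variants below: the relative weight scales the
# electrophysiology block before concatenation, so larger weights let the
# ephys features dominate distances in the joint feature space.
#
#     >>> morph_row = np.array([[1.0, 2.0]])
#     >>> ephys_row = np.array([[0.5]])
#     >>> np.hstack([morph_row, 5 * ephys_row])
#     array([[1. , 2. , 2.5]])
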
def gmm_combo_cluster_calls(results_df, morph_data, ephys_data, weights=[1, 2, 5],
                            n_cl=[10, 15, 20, 25]):
    """Add Gaussian mixture model clustering results

    Parameters
    ----------
    results_df : DataFrame
        Existing DataFrame that new clustering results are added to as
        new columns
    morph_data : array
        Specimens by morphology features array
    ephys_data : array
        Specimens by electrophysiology features array
    weights : list, optional
        Set of relative electrophysiology weights
    n_cl : list, optional
        Set of cluster numbers

    Returns
    -------
    DataFrame
        Combined clustering results. The index is the set of
        `specimen_ids`. Each column contains cluster labels for a
        different clustering variant.
    """
    for wt in weights:
        EM_data = np.hstack([morph_data, wt * ephys_data])
        for cl in n_cl:
            key = "gmm_combo_{:g}_{:d}".format(wt, cl)
            model = mixture.GaussianMixture(n_components=cl, covariance_type="diag",
                                            n_init=20, random_state=0)
            results_df[key] = model.fit_predict(EM_data)
    return results_df

def spectral_combo_cluster_calls(results_df, morph_data, ephys_data, weights=[1, 2, 5],
                                 n_cl=[10, 15, 20, 25]):
    """Add spectral clustering results

    Parameters
    ----------
    results_df : DataFrame
        Existing DataFrame that new clustering results are added to as
        new columns
    morph_data : array
        Specimens by morphology features array
    ephys_data : array
        Specimens by electrophysiology features array
    weights : list, optional
        Set of relative electrophysiology weights
    n_cl : list, optional
        Set of cluster numbers

    Returns
    -------
    DataFrame
        Combined clustering results. The index is the set of
        `specimen_ids`. Each column contains cluster labels for a
        different clustering variant.
    """
    for wt in weights:
        EM_data = np.hstack([morph_data, wt * ephys_data])
        for cl in n_cl:
            key = "spec_combo_{:g}_{:d}".format(wt, cl)
            model = cluster.SpectralClustering(cl, gamma=0.01, n_init=20, random_state=0)
            results_df[key] = model.fit_predict(EM_data)
    return results_df

def consensus_clusters(results, min_clust_size=3):
    """Determine consensus clusters from multiple variations

    The method iteratively divides the co-clustering matrix by Ward
    hierarchical clustering, using the co-clustering fractions as the
    distance measure. The iterative division stops when a resultant
    cluster would be smaller than `min_clust_size`. Next, co-clustering
    rates between clusters are evaluated, and clusters are merged if the
    higher of the two within-cluster rates fails to exceed the
    between-cluster rate by 25%. Sample assignments are then refined by
    reassignment to the best-matched cluster (repeated until
    convergence).

    This procedure is based on the one described by `Tasic et al. (2018)
    <https://www.nature.com/articles/s41586-018-0654-5>`_.

    Parameters
    ----------
    results : array
        Results of multiple clustering variants. Each column contains
        labels from a different variant.
    min_clust_size : int, optional
        Minimum size of consensus cluster

    Returns
    -------
    clust_labels : array
        Consensus cluster labels
    shared_norm : array
        Sample-by-sample matrix of the fraction of times samples were
        placed in the same cluster
    cc_rates : array
        Cluster-by-cluster matrix of the average co-clustering rates
        between cells in a pair of consensus clusters
    """
    # Build the co-clustering matrix: counts of how often each pair of
    # samples received the same label across variants
    n_cells = results.shape[0]
    shared = np.zeros((n_cells, n_cells))
    for i in range(shared.shape[0]):
        for j in range(i, shared.shape[0]):
            shared_count = np.sum(results[i, :] == results[j, :])
            shared[i, j] = shared_count
            shared[j, i] = shared_count
    shared_norm = shared / shared.max()

    # Iteratively split clusters in two until the pieces get too small
    clust_labels = np.zeros((shared.shape[0]))
    keep_going = True
    while keep_going:
        uniq_labels = np.unique(clust_labels)
        new_labels = np.zeros_like(clust_labels)
        for l in uniq_labels:
            cl_mask = clust_labels == l
            X = shared_norm[cl_mask, :][:, cl_mask]
            Z = hierarchy.linkage(X, method="ward")
            sub_labels = hierarchy.fcluster(Z, t=2, criterion="maxclust")
            if (np.sum(sub_labels == 1) < min_clust_size) or (np.sum(sub_labels == 2) < min_clust_size):
                # Don't split if it produces clusters that are too small
                sub_labels = 2 * clust_labels[cl_mask]
            else:
                sub_labels += (2 * int(l)) - 1
            new_labels[cl_mask] = sub_labels
        if metrics.adjusted_rand_score(clust_labels, new_labels) == 1:
            keep_going = False
        clust_labels = new_labels
    logging.debug(f"{len(np.unique(clust_labels))} after iterative splitting")

    # Merge pairs of clusters whose between-cluster co-clustering rate is
    # too close to their within-cluster rates
    keep_going = True
    while keep_going:
        # Check within and against
        uniq_labels = np.unique(clust_labels)
        logging.debug(f"examining merges for {len(uniq_labels)} labels")
        cc_rates = coclust_rates(shared_norm, clust_labels, uniq_labels)
        merges = []
        for i, l in enumerate(uniq_labels):
            for j, m in enumerate(uniq_labels[i + 1:]):
                Nll = cc_rates[i, i]
                Nmm = cc_rates[i + j + 1, i + j + 1]
                Nlm = cc_rates[i, i + j + 1]
                if Nlm > np.max([Nll, Nmm]) - 0.25:
                    merges.append((l, m, Nlm))
        if len(merges) == 0:
            keep_going = False
        else:
            # Merge the pair with the highest between-cluster rate first
            best_cross = 0.
            for l, m, nlm in merges:
                if nlm > best_cross:
                    best_cross = nlm
                    merge = (l, m)
            l, m = merge
            clust_labels[clust_labels == m] = l

    clust_labels = refine_assignments(clust_labels, shared_norm)

    # Clean up the labels
    new_map = {v: i for i, v in enumerate(np.sort(np.unique(clust_labels)))}
    clust_labels = np.array([new_map[v] for v in clust_labels])
    uniq_labels = np.unique(clust_labels)
    cc_rates = coclust_rates(shared_norm, clust_labels, uniq_labels)
    return clust_labels, shared_norm, cc_rates

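# Usage sketch (illustrative), continuing the example above: the label matrix
# from all_cluster_calls is passed in with one column per clustering variant.
#
#     >>> labels, shared_norm, cc_rates = consensus_clusters(results.values,
#     ...                                                    min_clust_size=3)
#     >>> len(np.unique(labels))  # number of consensus clusters found
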
def refine_assignments(clust_labels, shared_norm):
    """Reassign samples to the best-matched clusters

    Samples that have a better-matching cluster are reassigned one at a
    time, starting with the strongest mismatch. Since reassignment
    changes the matching rates, the procedure is repeated until
    assignments no longer change or are identical to a previously
    encountered set of assignments (meaning that the algorithm has
    entered a cycle).

    Parameters
    ----------
    clust_labels : array
        Cluster assignments
    shared_norm : array
        Matrix of normalized cell-by-cell co-clustering rates

    Returns
    -------
    array
        New cluster assignments
    """
    # Refine individual cell assignments
    keep_going = True
    uniq_labels = np.sort(np.unique(clust_labels))
    state_hashes = []
    while keep_going:
        # Check within and against: average each sample's co-clustering
        # rate with every cluster, excluding the sample itself
        cl_masks = np.zeros((shared_norm.shape[0], len(uniq_labels)))
        for i, l in enumerate(uniq_labels):
            cl_masks[:, i] = (clust_labels == l).astype(int)
        cl_n = cl_masks.sum(axis=0)
        cl_sums = shared_norm @ cl_masks
        self_vals = np.diag(shared_norm)[:, np.newaxis] * cl_masks
        self_masks = (self_vals > 0).astype(float)
        cl_adj_sums = cl_sums - self_vals
        cl_adj_n = cl_n[np.newaxis, :] - self_masks
        cl_adj_n[cl_adj_n == 0] = 1  # avoid divide by zero
        cl_rates = cl_adj_sums / cl_adj_n

        rate_argmax = np.argmax(cl_rates, axis=1)
        assign_argmax = np.argmax(self_masks, axis=1)
        switch_candidates = np.flatnonzero(rate_argmax != assign_argmax)
        if len(switch_candidates) == 0:
            keep_going = False
        else:
            rate_max = np.max(cl_rates, axis=1)
            switch_ind = switch_candidates[np.argmax(rate_max[switch_candidates])]
            new_assignment = uniq_labels[rate_argmax[switch_ind]]
            logging.debug(f"switching {switch_ind} to {new_assignment}")
            clust_labels[switch_ind] = new_assignment
            state_hash = sha1(clust_labels).hexdigest()
            if state_hash in state_hashes:
                # have we encountered this set of assignments before?
                keep_going = False
            else:
                state_hashes.append(state_hash)
    return clust_labels

def coclust_rates(shared, clust_labels, uniq_labels):
    """Calculate co-clustering rates between clusters

    Parameters
    ----------
    shared : array
        Matrix of normalized cell-by-cell co-clustering rates
    clust_labels : array
        Cluster assignments
    uniq_labels : array
        Set of unique cluster labels

    Returns
    -------
    array
        Cluster-by-cluster matrix of co-clustering rates
    """
    cc_rates = np.zeros((len(uniq_labels), len(uniq_labels)))
    for i, l in enumerate(uniq_labels):
        mask_l = clust_labels == l
        for j, m in enumerate(uniq_labels[i:]):
            mask_m = clust_labels == m
            X = shared[mask_l, :][:, mask_m]
            if l == m:
                # Within a cluster, average only the strictly
                # lower-triangular values (distinct pairs of cells)
                ind1, ind2 = np.tril_indices(X.shape[0], k=-1)
                X = X[ind1, ind2]
            if X.size > 0:
                cc_rates[i, i + j] = cc_rates[i + j, i] = X.mean()
            else:
                cc_rates[i, i + j] = cc_rates[i + j, i] = 0
    return cc_rates

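# Worked sketch (illustrative): with two clusters, each entry averages the
# normalized co-clustering fractions in the corresponding block of `shared`;
# within-cluster blocks exclude self-pairs, so a single-member cluster gets 0.
#
#     >>> shared = np.array([[1.0, 0.9, 0.1],
#     ...                    [0.9, 1.0, 0.2],
#     ...                    [0.1, 0.2, 1.0]])
#     >>> labels = np.array([0, 0, 1])
#     >>> coclust_rates(shared, labels, np.unique(labels))
#     array([[0.9 , 0.15],
#            [0.15, 0.  ]])
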
def subsample_run(original_labels, specimen_ids, morph_data, ephys_data,
                  weights=[1, 2, 5], n_cl=[10, 15, 20, 25], n_nn=[4, 7, 10],
                  n_folds=10, n_iter=1, min_consensus_n=3):
    """Calculate Jaccard coefficients for subsampled clustering runs

    Parameters
    ----------
    original_labels : array
        Cluster assignments from analysis on full data set
    specimen_ids : array
        Specimen labels
    morph_data : array
        Specimen by morphology feature matrix
    ephys_data : array
        Specimen by electrophysiology feature matrix
    weights : list, optional
        Set of relative electrophysiology weights
    n_cl : list, optional
        Set of cluster numbers
    n_nn : list, optional
        Set of nearest-neighbor values
    n_folds : int, optional
        Number of subsample folds
    n_iter : int, optional
        Number of subsampled runs to perform
    min_consensus_n : int, optional
        Minimum size of consensus cluster

    Returns
    -------
    array
        Jaccard coefficients of each cluster (rows) from each run
        (columns). This array will have ``n_iter * n_folds`` columns.
    """
    run_info_list = [{
        "iter_number": i,
        "original_labels": original_labels,
        "specimen_ids": specimen_ids,
        "morph_data": morph_data,
        "ephys_data": ephys_data,
        "weights": weights,
        "n_cl": n_cl,
        "n_nn": n_nn,
        "n_folds": n_folds,
        "min_consensus_n": min_consensus_n,
    } for i in range(n_iter)]

    p = Pool()
    logging.info("Starting multiprocessing")
    results = []
    for i, res in enumerate(p.imap_unordered(_individual_subsample_run, run_info_list, 1)):
        sys.stderr.write('\rdone {0:%}'.format(float(i + 1) / len(run_info_list)))
        results.append(res)
    jaccards = np.hstack(results)
    return jaccards

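# Usage sketch (illustrative), continuing the example above: one Jaccard value
# per consensus cluster per fold; the median across columns summarizes each
# cluster's stability.
#
#     >>> jaccards = subsample_run(labels, ids, morph, ephys,
#     ...                          n_folds=10, n_iter=1)
#     >>> np.median(jaccards, axis=1)
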
def _individual_subsample_run(run_info):
    """Perform an individual subsample run

    Used within :func:`subsample_run`
    """
    i = run_info["iter_number"]
    original_labels = run_info["original_labels"]
    specimen_ids = run_info["specimen_ids"]
    morph_data = run_info["morph_data"]
    ephys_data = run_info["ephys_data"]
    weights = run_info["weights"]
    n_cl = run_info["n_cl"]
    n_nn = run_info["n_nn"]
    n_folds = run_info["n_folds"]
    min_consensus_n = run_info["min_consensus_n"]

    orig_labels_uniq = np.sort(np.unique(original_labels))
    jaccards = np.zeros((len(orig_labels_uniq), n_folds))
    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=i)
    counter = 0
    for train_index, _ in kf.split(original_labels):
        logging.info("starting {:d} {:d}".format(i, counter))
        subsample_results = all_cluster_calls(specimen_ids[train_index],
                                              morph_data[train_index, :],
                                              ephys_data[train_index, :],
                                              weights=weights, n_cl=n_cl, n_nn=n_nn)
        # Note: values[:, 1:] skips the first clustering-variant column
        subsample_labels, _, _ = consensus_clusters(
            subsample_results.values[:, 1:], min_clust_size=min_consensus_n)
        # Score each original cluster by its best-matching subsample cluster
        sub_uniq = np.sort(np.unique(subsample_labels))
        for ii, orig_cl in enumerate(orig_labels_uniq):
            jacc = []
            y_orig = original_labels[train_index] == orig_cl
            for sub_cl in sub_uniq:
                y_sub = subsample_labels == sub_cl
                jacc.append(clustjaccard(y_orig, y_sub))
            jaccards[ii, counter] = np.max(jacc)
        counter += 1
    return jaccards