# Source code for drcme.bin.run_ephys_morph_clustering

"""
Script to cluster on combined electrophysiology and morphology data.

The script runs multiple clustering variants, determines consensus clusters, and
finally evaluates the stability of each cluster.

.. autoclass:: MeClusteringParameters

"""

import numpy as np
import pandas as pd
import drcme.ephys_morph_clustering as emc
import argschema as ags
import logging
import sys


class MeClusteringParameters(ags.ArgSchema):
    """Parameter schema for electrophysiology/morphology clustering.

    The schema declares two input CSV files (electrophysiology and
    morphology values), three tuning parameters (``weights``, ``n_cl`` and
    ``min_consensus_n``) and five output file paths. The list-valued
    parameters are declared with ``cli_as_single_argument=True``.
    """
    # --- Inputs ---
    ephys_file = ags.fields.InputFile(
        description="CSV file path with sparse PCA electrophysiology values")
    morph_file = ags.fields.InputFile(
        description="CSV file path with morphology parameter values")

    # --- Clustering parameters ---
    weights = ags.fields.List(
        ags.fields.Float,
        description="List of relative weights for the electrophysiology values",
        cli_as_single_argument=True,
        default=[1., 2., 4.])
    n_cl = ags.fields.List(
        ags.fields.Integer,
        description="List of number of clusters for initial clustering algorithms",
        cli_as_single_argument=True,
        default=[10, 15, 20, 25])
    min_consensus_n = ags.fields.Integer(
        default=3,
        description="Minimum cluster size for consensus clusters")

    # --- Outputs ---
    cocluster_matrix_file = ags.fields.OutputFile(
        description="Output file path for co-clustering matrix")
    cluster_labels_file = ags.fields.OutputFile(
        description="Output file path for cluster labels")
    specimen_id_file = ags.fields.OutputFile(
        description="Output file path for specimen IDs")
    jaccards_file = ags.fields.OutputFile(
        description="Output file path for Jaccard coefficients")
    ordering_file = ags.fields.OutputFile(
        description="Output file path for new cluster order")


def main(ephys_file, morph_file,
         weights, n_cl, min_consensus_n, cocluster_matrix_file,
         cluster_labels_file, jaccards_file, ordering_file,
         specimen_id_file,
         **kwargs):
    """Main runner function for script.

    Loads the electrophysiology and morphology tables, restricts them to the
    cells present in both, computes cluster calls for every combination
    handled by :func:`drcme.ephys_morph_clustering.all_cluster_calls`,
    derives consensus clusters, writes the results to disk, and finally runs
    a subsampled stability evaluation whose Jaccard coefficients are also
    written to disk.

    See :class:`MeClusteringParameters` for argument descriptions.

    Parameters
    ----------
    ephys_file, morph_file : str
        Paths of the input CSV files; the first column of each is used as
        the index.
    weights : list of float
        Forwarded as ``weights`` to the clustering and subsampling calls.
    n_cl : list of int
        Forwarded as ``n_cl`` to the clustering and subsampling calls.
    min_consensus_n : int
        Minimum consensus cluster size.
    cocluster_matrix_file, cluster_labels_file, jaccards_file, \
ordering_file, specimen_id_file : str
        Output paths.
    **kwargs
        Ignored; accepts any additional entries of the parsed arguments.
    """

    # Load the data
    ephys_data = pd.read_csv(ephys_file, index_col=0)

    # Expect already normalized wide dataframe
    morph_data = pd.read_csv(morph_file, index_col=0)

    # Use cells with both types of data
    ephys_morph_ids = ephys_data.index.intersection(morph_data.index)
    specimen_ids = ephys_morph_ids.values

    logging.info(f"Using {len(ephys_morph_ids)} cells")

    logging.info("Calculating cluster calls")
    logging.info("Ephys weights: " + ", ".join(map(str, weights)))
    logging.info("Cluster numbers: " + ", ".join(map(str, n_cl)))

    results_df = emc.all_cluster_calls(specimen_ids,
                                       morph_data.loc[ephys_morph_ids, :].values,
                                       ephys_data.loc[ephys_morph_ids, :].values,
                                       weights=weights,
                                       n_cl=n_cl)
    # The first column of the results is skipped when building the consensus
    # (presumably it holds the specimen IDs - TODO confirm against
    # emc.all_cluster_calls).
    clust_labels, shared, _cc_rates = emc.consensus_clusters(
        results_df.values[:, 1:], min_clust_size=min_consensus_n)
    # Indirect sort on the labels: indices that order cells by cluster label.
    new_order = np.lexsort((clust_labels,))

    logging.info(f"Identified {len(np.unique(clust_labels))} consensus clusters with full data set")

    np.savetxt(cocluster_matrix_file, shared)
    pd.DataFrame(clust_labels, index=specimen_ids).to_csv(cluster_labels_file)
    np.savetxt(ordering_file, new_order, fmt="%d")
    # "%d" formatting means the specimen IDs are written as integers.
    np.savetxt(specimen_id_file, specimen_ids, fmt="%d")

    logging.info("Evaluating cluster stability")
    # NOTE(review): the data arrays are re-selected here rather than reused
    # from the call above, so this call does not depend on whether the
    # earlier emc call modified its inputs in place - confirm before hoisting.
    jaccards = emc.subsample_run(clust_labels,
                                 specimen_ids,
                                 morph_data.loc[ephys_morph_ids, :].values,
                                 ephys_data.loc[ephys_morph_ids, :].values,
                                 weights=weights,
                                 n_cl=n_cl,
                                 n_folds=10,
                                 n_iter=10,
                                 min_consensus_n=min_consensus_n)
    np.savetxt(jaccards_file, jaccards)

    logging.info("Done")


if __name__ == "__main__":
    # Build the argschema parser for MeClusteringParameters and forward its
    # parsed arguments to main() as keyword arguments.
    parser = ags.ArgSchemaParser(schema_type=MeClusteringParameters)
    main(**parser.args)