Source code for drcme.bin.run_merge_unstable_clusters

"""
Script for merging unstable clusters into stable ones.

.. autoclass:: UnstableClusterMergingParameters

"""

import drcme.post_gmm_merging as pgm
import numpy as np
import pandas as pd
import os
import json
import logging
import argschema as ags

[docs]class UnstableClusterMergingParameters(ags.ArgSchema): """Parameter schema for unstable cluster merging""" components_file = ags.fields.InputFile( description="Path to CSV file with sPCA components") post_merge_tau_file = ags.fields.InputFile( description="Path to file with cluster membership probabilities after entropy-based merging") post_merge_labels_file = ags.fields.InputFile( description="Path to file with cluster labels after entropy-based merging") jaccard_file = ags.fields.InputFile( description="Path to file with Jaccard coefficient values after subset-based stability analysis") post_merge_proba_file = ags.fields.OutputFile( description="Path to file with post-merging cluster membership probabilities") etypes_file = ags.fields.OutputFile( description="Path to file with stable e-type assignments") merge_unstable_info_file = ags.fields.OutputFile( description="Path to file with merge sequence information") stability_threshold = ags.fields.Float( description="Threshold below which clusters are considered unstable", default=0.5) outliers = ags.fields.List(ags.fields.Integer, description="Specimen IDs to exclude from analysis" )
def main(components_file, post_merge_tau_file, post_merge_labels_file, jaccard_file, post_merge_proba_file, etypes_file, merge_unstable_info_file, outliers, stability_threshold, **kwargs): """ Main runner function for script. See :class:`UnstableClusterMergingParameters` for argument descriptions. """ data = pd.read_csv(components_file, index_col=0) tau = pd.read_csv(post_merge_tau_file, index_col=0) labels = pd.read_csv(post_merge_labels_file, index_col=0) jaccard_means = pd.read_csv(jaccard_file, index_col=0) K_bic = labels.values.max() labels -= 1 # shift from R to Python indexing specific_merges = np.flatnonzero(jaccard_means.values < stability_threshold) # definition of instability (merge_info, merge_labels, tau_merged, merge_matrix) = pgm.entropy_specific_merges(tau=tau.values, labels=labels.values, K_bic=K_bic, clusters_to_merge=specific_merges) merge_labels_reorder, tau_merged, _, _ = pgm.order_new_labels(merge_labels, tau_merged, data.loc[~data.index.isin(outliers), :]) indexes = data.loc[~data.index.isin(outliers), :].index.values np.savetxt(post_merge_proba_file, tau_merged) pd.DataFrame(merge_labels_reorder, index=indexes).to_csv(etypes_file) with open(merge_unstable_info_file, "w") as f: json.dump({"merges": merge_info["merges_by_names"]}, f) if __name__ == "__main__": module = ags.ArgSchemaParser(schema_type=UnstableClusterMergingParameters) main(**module.args)