Source code for drcme.bin.run_rf_prediction

"""
Script to predict type labels for new data using a random forest classifier and training data.

The electrophysiology files are split such that the ``reference_ephys_file`` contains
data for training the classifier and ``prediction_ephys_file`` contains data for new
predictions. The ``morph_file``, if used, has morphology data for both sets of cells.

.. autoclass:: RfPredictionParameters
.. autofunction:: construct_datasets
.. autofunction:: intersect_ephys_morph

"""

import numpy as np
import pandas as pd
import drcme.prediction as pred
import argschema as ags
import logging


class RfPredictionParameters(ags.ArgSchema):
    """Input schema for the random-forest label prediction script.

    Each attribute below declares one argument accepted by :func:`main`.
    The optional file arguments (``morph_file``, ``ref_id_file``,
    ``pred_id_file``) and ``class_weight`` default to ``None``.
    """

    # --- required inputs -------------------------------------------------
    reference_ephys_file = ags.fields.InputFile(
        description="Path to electrophysiology data file for reference cells",
    )
    prediction_ephys_file = ags.fields.InputFile(
        description="Path to electrophysiology data file for cells that will have predicted labels",
    )
    reference_label_file = ags.fields.InputFile(
        description="Path to type labels for reference cells",
    )
    label_key = ags.fields.String(
        description="Column name of type label in 'reference_label_file'",
    )

    # --- optional morphology input ---------------------------------------
    morph_file = ags.fields.InputFile(
        default=None,
        allow_none=True,
        description="Path to morphology data file for all cells",
    )

    # --- output ----------------------------------------------------------
    output_file = ags.fields.OutputFile(
        description="Path to output file with predicted labels",
    )

    # --- optional ID subsets ---------------------------------------------
    ref_id_file = ags.fields.InputFile(
        default=None,
        allow_none=True,
        description="Path to file with subset of IDs for reference cells",
    )
    pred_id_file = ags.fields.InputFile(
        default=None,
        allow_none=True,
        description="Path to file with subset of IDs for predicted cells",
    )

    # --- classifier settings ---------------------------------------------
    n_trees = ags.fields.Integer(
        default=500,
        description="Number of trees for random forest classifier",
    )
    class_weight = ags.fields.String(
        default=None,
        allow_none=True,
        description="Class weight parameter for random forest classifier",
    )
def main(reference_ephys_file, prediction_ephys_file, reference_label_file,
         label_key, morph_file, output_file, ref_id_file, pred_id_file,
         n_trees, class_weight, **kwargs):
    """ Main runner function for script.

    See :class:`RfPredictionParameters` for argument descriptions.

    The function reads the reference and prediction electrophysiology CSV
    files, optionally combines each with the morphology data, optionally
    restricts each set to the IDs listed in ``ref_id_file`` /
    ``pred_id_file``, drops reference cells that have no label, passes the
    result to :func:`drcme.prediction.rf_predict`, and writes the returned
    labels to ``output_file`` as a CSV indexed by the prediction cell IDs.

    Raises
    ------
    KeyError
        If ``label_key`` is not a column of the label file, or if an ID from
        ``ref_id_file`` / ``pred_id_file`` is not in the corresponding data set.
    """
    ephys_ref = pd.read_csv(reference_ephys_file, index_col=0)
    ephys_pred = pd.read_csv(prediction_ephys_file, index_col=0)

    logging.debug("Running RF")
    ref_label_df = pd.read_csv(reference_label_file, index_col=0).set_index("specimen_id")

    if morph_file is None:
        ref_df = ephys_ref
        test_df = ephys_pred
    else:
        morph_df = pd.read_csv(morph_file, index_col=0)
        ref_df, test_df = construct_datasets(ephys_ref, ephys_pred, morph_df)

    # ndmin=1 keeps the IDs one-dimensional even when the file lists a single
    # ID; otherwise loadtxt returns a 0-d array, which is not a valid
    # list-like key for .loc.
    if ref_id_file is not None:
        ref_ids = np.loadtxt(ref_id_file, ndmin=1)
        ref_df = ref_df.loc[ref_ids, :]

    if pred_id_file is not None:
        pred_ids = np.loadtxt(pred_id_file, ndmin=1)
        test_df = test_df.loc[pred_ids, :]

    # Use reindex rather than .loc: recent pandas versions raise KeyError
    # from .loc when any requested ID is missing, whereas reindex fills the
    # missing entries with NaN so they can be dropped below.
    label_series = ref_label_df[label_key].reindex(ref_df.index)

    # drop reference values that don't have labels
    has_label = label_series.notnull().values
    labels = label_series.values[has_label]
    ref_df = ref_df.loc[has_label, :]

    pred_labels = pred.rf_predict(ref_df, labels, test_df,
                                  n_trees=n_trees, class_weight=class_weight)

    logging.debug("Saving results")
    pd.DataFrame(pred_labels, index=test_df.index.values).to_csv(output_file)
def construct_datasets(ephys_ref, ephys_pred, morph_df):
    """ Build reference and test data sets

    Each electrophysiology DataFrame is combined with `morph_df` by
    :func:`intersect_ephys_morph`; the same morphology DataFrame is used
    for both.

    Parameters
    ----------
    ephys_ref : DataFrame
        DataFrame with reference electrophysiology data
    ephys_pred : DataFrame
        DataFrame with electrophysiology data for label prediction
    morph_df : DataFrame
        DataFrame with morphology data for all cells

    Returns
    -------
    ref_df : DataFrame
        Combined ephys/morph data set for reference cells
    test_df : DataFrame
        Combined ephys/morph data set for cells that will have labels predicted
    """
    # Reference set first, then prediction set, matching the return order.
    return tuple(
        intersect_ephys_morph(ephys_df, morph_df)
        for ephys_df in (ephys_ref, ephys_pred)
    )
def intersect_ephys_morph(ephys_df, morph_df):
    """ Make combined DataFrame with shared cells from `ephys_df` and `morph_df`

    Only cells whose IDs appear in the index of both inputs are kept. Rows
    follow the order of `morph_df`, and the morphology columns are placed
    before the electrophysiology columns. Column names of the inputs are not
    carried over to the result.

    Parameters
    ----------
    ephys_df : DataFrame
        DataFrame with electrophysiology data
    morph_df : DataFrame
        DataFrame with morphology data

    Returns
    -------
    DataFrame
        Combined ephys/morph data set
    """
    morph_ids = morph_df.index.values

    # One vectorized membership test instead of rebuilding and scanning a
    # Python list of ephys IDs for every morphology ID.
    has_ephys = morph_df.index.isin(ephys_df.index)

    # Get ephys data for cells with morphologies (in morphology order)
    ephys_df_joint = ephys_df.loc[morph_ids[has_ephys], :]

    # Only use morphs that have ephys
    morph_df_joint = morph_df.loc[has_ephys, :]

    elmo_data = np.hstack([morph_df_joint.values, ephys_df_joint.values])
    return pd.DataFrame(elmo_data, index=ephys_df_joint.index.values)
if __name__ == "__main__":
    # Build the argschema parser for this script's schema and forward its
    # parsed arguments to the runner as keyword arguments.
    parser = ags.ArgSchemaParser(schema_type=RfPredictionParameters)
    main(**parser.args)