# Source code for drcme.bin.run_existing_spca_on_new_data

"""
Script to apply an existing set of sPCA loadings to a new data set.

.. autoclass:: DatasetParameters
.. autoclass:: SpcaTransformParameters

"""

import numpy as np
import pandas as pd
import argschema as ags
import joblib
import drcme.load_data as ld
from drcme.spca import orig_mean_and_std_for_zscore, spca_transform_new_data
import logging


class DatasetParameters(ags.schemas.DefaultSchema):
    """Parameter schema for input datasets

    Describes a single dataset: the HDF5 file holding its feature vectors,
    an optional metadata CSV, and the metadata-based filters that decide
    which cells are kept.
    """

    # Source files for the dataset.
    fv_h5_file = ags.fields.InputFile(
        description="HDF5 file with feature vectors")
    metadata_file = ags.fields.InputFile(
        description="Metadata file in CSV format",
        allow_none=True,
        default=None)

    # Cell filters driven by the metadata file.
    dendrite_type = ags.fields.String(
        default="all",
        description="Filter for dendrite type using information in metadata (all, spiny, aspiny)",
        validate=lambda x: x in ["all", "spiny", "aspiny"])
    allow_missing_structure = ags.fields.Boolean(
        required=False,
        description="Whether or not structure value for cell in metadata can be missing",
        default=False)
    allow_missing_dendrite = ags.fields.Boolean(
        required=False,
        description="Whether or not dendrite type value for cell in metadata can be missing",
        default=False)
    need_ramp_spike = ags.fields.Boolean(
        required=False,
        description="Whether or not to exclude cells that did not fire an action potential from the ramp stimulus",
        default=True)
    limit_to_cortical_layers = ags.fields.List(
        ags.fields.String,
        description="List of cortical layers to limit the data set (using the metadata file)",
        default=[],
        cli_as_single_argument=True)

    # Optional explicit allow-list of specimen IDs.
    id_file = ags.fields.InputFile(
        description="Text file with specimen IDs to use. Cells with IDs not in the file will be excluded.",
        required=False,
        allow_none=True,
        default=None)


class SpcaTransformParameters(ags.ArgSchema):
    """Parameter schema for sPCA using existing transform

    Top-level argument schema for the script: the previously saved sPCA
    loadings, the datasets the loadings were originally fit on, the new
    datasets to transform, the sPCA parameter file, and the output CSV path.
    """

    orig_transform_file = ags.fields.InputFile(
        description="sPCA loadings file")
    orig_datasets = ags.fields.Nested(
        DatasetParameters,
        required=True,
        many=True,
        description="schema for loading one or more specific datasets for the analysis")
    new_datasets = ags.fields.Nested(
        DatasetParameters,
        required=True,
        many=True,
        description="schema for loading one or more specific datasets for the analysis")
    params_file = ags.fields.InputFile(
        description="JSON file with sPCA parameters")
    output_file = ags.fields.OutputFile(
        description="CSV with transformed values")


def _load_and_combine_datasets(datasets, params_file):
    """Load each dataset and merge the results into a single collection.

    Parameters
    ----------
    datasets : list of dict
        Dataset specifications using the keys defined by
        :class:`DatasetParameters`.
    params_file : str
        Path of the sPCA parameter file, forwarded to ``ld.load_h5_data``.

    Returns
    -------
    combined : dict
        One entry per key found in the loaded data objects. A key seen in a
        single dataset keeps that dataset's value unchanged; a key seen in
        several datasets holds their values stacked with ``np.vstack`` in
        the order the datasets were given.
    specimen_ids : numpy.ndarray
        Specimen IDs of all datasets joined with ``np.hstack``.
    """
    combined = {}
    specimen_ids_list = []
    for ds in datasets:
        # An empty layer list means "no restriction" and is passed on as None.
        layers = ds["limit_to_cortical_layers"]
        limit_to_cortical_layers = layers if len(layers) > 0 else None

        data_object, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            need_ramp_spike=ds["need_ramp_spike"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        specimen_ids_list.append(specimen_ids)

        # The data object is treated as a mapping (presumably data-set name
        # -> array; confirm against ld.load_h5_data). The accumulator must
        # therefore be a dict: indexing a list with these keys fails.
        for key in data_object:
            if key not in combined:
                combined[key] = data_object[key]
            else:
                combined[key] = np.vstack([combined[key], data_object[key]])

    return combined, np.hstack(specimen_ids_list)


def main(orig_transform_file, orig_datasets, new_datasets, params_file,
         output_file, **kwargs):
    """ Main runner function for script.

    See :class:`SpcaTransformParameters` for argument descriptions.

    Loads the saved sPCA results, recomputes the z-score mean and standard
    deviation from the original datasets, applies the transform to the new
    datasets, and writes the transformed values (indexed by specimen ID) to
    `output_file` as CSV.
    """
    spca_zht_params, _ = ld.define_spca_parameters(params_file)

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only use transform files from a trusted source.
    spca_results = joblib.load(orig_transform_file)

    # Load original data sets
    orig_data_for_spca, orig_specimen_ids = _load_and_combine_datasets(
        orig_datasets, params_file)
    logging.info("Original datasets had {:d} cells".format(len(orig_specimen_ids)))
    orig_mean, orig_std = orig_mean_and_std_for_zscore(
        spca_results, orig_data_for_spca, spca_zht_params)

    # Load new data sets
    data_for_spca, new_ids = _load_and_combine_datasets(
        new_datasets, params_file)
    logging.info("Applying transform to {:d} new cells".format(len(new_ids)))
    new_combo = spca_transform_new_data(
        spca_results, data_for_spca, spca_zht_params, orig_mean, orig_std)

    new_combo_df = pd.DataFrame(new_combo, index=new_ids)
    new_combo_df.to_csv(output_file)


if __name__ == "__main__":
    module = ags.ArgSchemaParser(schema_type=SpcaTransformParameters)
    main(**module.args)