Source code for drcme.bin.run_spca_fit

"""
Script to run sparse principal component analysis on electrophysiology
feature vectors.

The electrophysiology feature vectors used as inputs are in the form
processed by the `IPFX package <https://ipfx.readthedocs.io>`_. An
optional metadata file can be given as an input to filter the cells in
the data set.

The script produces several outputs in the specified ``output_dir``,
named with a specified ``output_code``.

- *sparse_pca_components_[output_code].csv*: sPCA values for each
  cell.
- *spca_components_used_[output_code].json*: List of kept components
  for each data subset.
- *spca_loadings_[output_code].pkl*: sPCA loadings, adjusted explained
  variance, and transformed values for each data subset. Uses the
  ``joblib`` library for saving/loading.
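  The loadings file can be reloaded with, for example,
  ``joblib.load("spca_loadings_[output_code].pkl")``.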

.. autoclass:: DatasetParameters
.. autoclass:: AnalysisParameters

"""

import numpy as np
import pandas as pd
import drcme.spca as spca
import drcme.load_data as ld
import argschema as ags
import joblib
import logging
import os
import json
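
# Example invocation (a sketch; the JSON file name is hypothetical). argschema
# reads the script's parameters from the file passed via --input_json:
#
#   python -m drcme.bin.run_spca_fit --input_json spca_input.json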


class DatasetParameters(ags.schemas.DefaultSchema):
    """Parameter schema for input datasets"""
    fv_h5_file = ags.fields.InputFile(
        description="HDF5 file with feature vectors")
    metadata_file = ags.fields.InputFile(
        description="Metadata file in CSV format",
        allow_none=True,
        default=None)
    dendrite_type = ags.fields.String(
        default="all",
        description="Filter for dendrite type using information in metadata (all, spiny, aspiny)",
        validate=lambda x: x in ["all", "spiny", "aspiny"])
    allow_missing_structure = ags.fields.Boolean(
        required=False,
        description="Whether or not structure value for cell in metadata can be missing",
        default=False)
    allow_missing_dendrite = ags.fields.Boolean(
        required=False,
        description="Whether or not dendrite type value for cell in metadata can be missing",
        default=False)
    need_ramp_spike = ags.fields.Boolean(
        required=False,
        description="Whether or not to exclude cells that did not fire an action potential from the ramp stimulus",
        default=True)
    limit_to_cortical_layers = ags.fields.List(
        ags.fields.String,
        description="List of cortical layers to limit the data set (using the metadata file)",
        default=[],
        cli_as_single_argument=True)
    id_file = ags.fields.InputFile(
        description="Text file with specimen IDs to use. Cells with IDs not in the file will be excluded.",
        required=False,
        allow_none=True,
        default=None)


class AnalysisParameters(ags.ArgSchema):
    """Parameter schema for sPCA analysis"""
    params_file = ags.fields.InputFile(
        description="JSON file with sPCA parameters")
    output_dir = ags.fields.OutputDir(
        description="Directory for output files")
    output_code = ags.fields.String(
        description="Code for naming output files")
    datasets = ags.fields.Nested(DatasetParameters,
        required=True,
        many=True,
        description="Schema for loading one or more specific datasets for the analysis")
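
# A minimal example of an input JSON for this schema (a sketch; all file
# names and values are hypothetical), passed to the script via --input_json.
# Fields omitted from a dataset entry fall back to the schema defaults above:
#
#   {
#       "params_file": "spca_params.json",
#       "output_dir": "./spca_output",
#       "output_code": "run1",
#       "datasets": [
#           {
#               "fv_h5_file": "feature_vectors.h5",
#               "metadata_file": "cell_metadata.csv",
#               "dendrite_type": "spiny"
#           }
#       ]
#   }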


def main(params_file, output_dir, output_code, datasets, **kwargs):
    """Main runner function for script.

    See argschema input parameters for argument descriptions.
    """
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]
        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            need_ramp_spike=ds["need_ramp_spike"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    # Merge the feature arrays across datasets (cells are stacked row-wise)
    data_for_spca = {}
    for do in data_objects:
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    specimen_ids = np.hstack(specimen_ids_list)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error("Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(
            len(specimen_ids), data_for_spca[first_key].shape[0]))
    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = spca.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = spca.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = spca.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results, os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(os.path.join(output_dir, "spca_components_used_{:s}.json".format(output_code)), "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")


if __name__ == "__main__":
    module = ags.ArgSchemaParser(schema_type=AnalysisParameters)
    main(**module.args)
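
# Calling main() directly (a sketch; values are hypothetical). Note that
# schema defaults are only applied by ArgSchemaParser, so each dataset dict
# must supply every key that main() accesses:
#
#   main(params_file="spca_params.json",
#        output_dir="./spca_output",
#        output_code="run1",
#        datasets=[{"fv_h5_file": "feature_vectors.h5",
#                   "metadata_file": None,
#                   "dendrite_type": "all",
#                   "allow_missing_structure": False,
#                   "allow_missing_dendrite": False,
#                   "need_ramp_spike": True,
#                   "limit_to_cortical_layers": [],
#                   "id_file": None}])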