Source code for drcme.bin.run_spca_fit

"""
Script to run sparse principal component analysis on electrophysiology
feature vectors.

The electrophysiology feature vectors used as inputs are in the form
processed by the `IPFX package <https://ipfx.readthedocs.io>`_. An
optional metadata file can be given as an input to filter the cells in
the data set.

The script produces several outputs in the specified ``output_dir``,
named with a specified ``output_code``.

- *sparse_principal_components_[output_code].csv*: sPCA values for each
  cell.
- *spca_components_used_[output_code].json*: List of kept components
  for each data subset.
- *spca_loadings_[output_code].pkl*: sPCA loadings, adjust explained
  variance, and transformed values for each data subset. Uses the
  ``joblib`` library for saving/loading.

.. autoclass:: DatasetParameters
.. autoclass:: AnalysisParameters

"""

import numpy as np
import pandas as pd
import drcme.spca as spca
import drcme.load_data as ld
import argschema as ags
import joblib
import logging
import os
import json


[docs]class DatasetParameters(ags.schemas.DefaultSchema):
    """Parameter schema for input datasets"""
    fv_h5_file = ags.fields.InputFile(
        description="HDF5 file with feature vectors")
    metadata_file = ags.fields.InputFile(
        description="Metadata file in CSV format",
        allow_none=True,
        default=None)
    dendrite_type = ags.fields.String(
        default="all",
        description="Filter for dendrite type using information in metadata (all, spiny, aspiny)",
        validate=lambda x: x in ["all", "spiny", "aspiny"])
    allow_missing_structure = ags.fields.Boolean(
        required=False,
        description="Whether or not structure value for cell in metadata can be missing",
        default=False)
    allow_missing_dendrite = ags.fields.Boolean(
        required=False,
        description="Whether or not dendrite type value for cell in metadata can be missing",
        default=False)
    need_ramp_spike = ags.fields.Boolean(
        required=False,
        description="Whether or not to exclude cells that did not fire an action potential from the ramp stimulus",
        default=True)
    limit_to_cortical_layers = ags.fields.List(
        ags.fields.String,
        description="List of cortical layers to limit the data set (using the metadata file)",
        default=[],
        cli_as_single_argument=True)
    id_file = ags.fields.InputFile(
        description="Text file with specimen IDs to use. Cells with IDs not in the file will be excluded.",
        required=False,
        allow_none=True,
        default=None)


[docs]class AnalysisParameters(ags.ArgSchema):
    """Parameter schema for sPCA analysis"""
    params_file = ags.fields.InputFile(
        description="JSON file with sPCA parameters")
    output_dir = ags.fields.OutputDir(
        description="Directory for output files")
    output_code = ags.fields.String(
        description="Code for naming output files")
    datasets = ags.fields.Nested(DatasetParameters,
                                 required=True,
                                 many=True,
                                 description="Schema for loading one or more specific datasets for the analysis")


def main(params_file, output_dir, output_code, datasets, **kwargs):
    """ Main runner function for script.

    See argschema input parameters for argument descriptions.
    """

    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(h5_fv_file=ds["fv_h5_file"],
                                            metadata_file=ds["metadata_file"],
                                            dendrite_type=ds["dendrite_type"],
                                            need_structure=not ds["allow_missing_structure"],
                                            need_ramp_spike = ds["need_ramp_spike"],
                                            include_dend_type_null=ds["allow_missing_dendrite"],
                                            limit_to_cortical_layers=limit_to_cortical_layers,
                                            id_file=ds["id_file"],
                                            params_file=params_file)
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    specimen_ids = np.hstack(specimen_ids_list)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error("Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(len(specimen_ids), data_for_spca[first_key].shape[0]))

    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = spca.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = spca.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = spca.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results, os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(os.path.join(output_dir, "spca_components_used_{:s}.json".format(output_code)), "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")


if __name__ == "__main__":
    module = ags.ArgSchemaParser(schema_type=AnalysisParameters)
    main(**module.args)