"""
The :mod:`drcme.load_data` module contains functions for loading
electrophysiology feature vectors processed by the `IPFX package
<http://ipfx.readthedocs.io>`_, as well as sPCA parameter files.
In particular, it loads HDF5-format files containing feature vectors
processed by the :mod:`run_feature_vector_extraction script
<ipfx:ipfx.bin.run_feature_vector_extraction>`.
"""
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import logging
import json
import h5py
import os.path
def load_data(project="default", use_noise=False, dendrite_type="all", need_structure=True, include_dend_type_null=False,
limit_to_cortical_layers=None,
params_file="default_params.json",
restriction_file=None,
base_dir=".",
step_num=50):
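    """Load feature vector data and metadata for a project

    Feature vectors are read from ``fv_*_{project}.npy`` files and metadata
    from ``fv_metadata_{project}.csv`` in `base_dir`. Cells are dropped if
    they have known-bad IDs, duplicate IDs, an all-zero ramp AP, missing or
    all-zero noise data (when `use_noise` is True), or metadata that fails
    the requested filters. The noise vectors are used only for filtering and
    are not returned.

    Parameters
    ----------
    project: str, optional
        Project label used in the feature vector file names
    use_noise: bool, optional
        Load the noise feature vectors and drop cells without usable noise data
    dendrite_type: {'all', 'spiny', 'aspiny'}, optional
        Dendrite type for metadata filtering
    need_structure: bool, optional
        Requires that structure is present
    include_dend_type_null: bool, optional
        Also include cells without a dendrite type available regardless of
        what `dendrite_type` is specified
    limit_to_cortical_layers: list, optional
        List of cortical layers that metadata must match for inclusion
    params_file: str, optional
        Path to sPCA parameters JSON file
    restriction_file: str, optional
        Path to text file of specimen IDs that cells must match
    base_dir: str, optional
        Directory containing the feature vector and metadata files
    step_num: int, optional
        Number of points per stimulus step (superseded by the value
        returned by :func:`define_spca_parameters`)

    Returns
    -------
    tuple
        (specimen_ids, first_ap, isi_shape, step_subthresh, subthresh_norm,
        spiking, inst_freq_norm, meta_df)
    """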
# Import data
metadata = pd.read_csv(os.path.join(base_dir, "fv_metadata_{:s}.csv".format(project)), index_col=0)
specimen_ids = np.load(os.path.join(base_dir, "fv_ids_{:s}.npy".format(project)))
step_subthresh = np.load(os.path.join(base_dir, "fv_step_subthresh_{:s}.npy".format(project)))
subthresh_norm = np.load(os.path.join(base_dir, "fv_subthresh_norm_{:s}.npy".format(project)))
# ramp_subthresh = np.load(os.path.join(base_dir, "fv_ramp_subthresh_{:s}.npy".format(project)))
first_ap = np.load(os.path.join(base_dir, "fv_first_ap_{:s}.npy".format(project)))
spiking = np.load(os.path.join(base_dir, "fv_spiking_{:s}.npy".format(project)))
isi_shape = np.load(os.path.join(base_dir, "fv_isi_shape_{:s}.npy".format(project)))
if use_noise:
noise = np.load(os.path.join(base_dir, "fv_noise_{:s}.npy".format(project)))
logging.info("Starting with {:d} cells".format(len(specimen_ids)))
# Deal with weird yet-to-be-debugged values
# bad_ids = [501566512, 593610936]
bad_ids = [611520287]
mask = np.array([i not in bad_ids for i in specimen_ids])
orig_n_ids = len(specimen_ids)
n_f = len(first_ap[0])
lens = np.array([len(f) for f in first_ap])
tmp_mask = lens != n_f
if len(specimen_ids) == orig_n_ids:
specimen_ids = specimen_ids[mask]
if len(step_subthresh) == orig_n_ids:
step_subthresh = step_subthresh[mask]
if len(subthresh_norm) == orig_n_ids:
subthresh_norm = subthresh_norm[mask]
# if len(ramp_subthresh) == orig_n_ids:
# ramp_subthresh = ramp_subthresh[mask]
if len(first_ap) == orig_n_ids:
first_ap = first_ap[mask]
if len(spiking) == orig_n_ids:
spiking = spiking[mask]
if len(isi_shape) == orig_n_ids:
isi_shape = isi_shape[mask]
if use_noise and len(noise) == orig_n_ids:
noise = noise[mask]
if np.any(Series(specimen_ids).value_counts() > 1):
logging.info("Handing duplicate specimen ids")
mask = np.array([True] * len(specimen_ids))
mask[np.flatnonzero(np.diff(specimen_ids) == 0)] = False
specimen_ids = specimen_ids[mask]
step_subthresh = step_subthresh[mask]
subthresh_norm = subthresh_norm[mask]
# ramp_subthresh = ramp_subthresh[mask]
first_ap = first_ap[mask]
spiking = spiking[mask]
isi_shape = isi_shape[mask]
if use_noise:
noise = noise[mask]
problem_ramp_ap = np.all(first_ap[:, 300:450] == 0, axis=1)
logging.info("Cells with problem ramp AP: {:d}".format(int(np.sum(problem_ramp_ap))))
specimen_ids = specimen_ids[~problem_ramp_ap]
step_subthresh = step_subthresh[~problem_ramp_ap, :]
subthresh_norm = subthresh_norm[~problem_ramp_ap, :]
# ramp_subthresh = ramp_subthresh[~problem_ramp_ap, :]
first_ap = first_ap[~problem_ramp_ap, :]
spiking = spiking[~problem_ramp_ap, :]
isi_shape = isi_shape[~problem_ramp_ap, :]
if use_noise:
logging.info("Using noise")
noise = noise[~problem_ramp_ap]
has_noise = np.array([arr is not None for arr in noise])
noise = np.array(noise[has_noise].tolist())
specimen_ids = specimen_ids[has_noise]
step_subthresh = step_subthresh[has_noise, :]
subthresh_norm = subthresh_norm[has_noise, :]
# ramp_subthresh = ramp_subthresh[has_noise, :]
first_ap = first_ap[has_noise, :]
spiking = spiking[has_noise, :]
isi_shape = isi_shape[has_noise, :]
problem_noise = (np.all(noise[:, np.arange(0, 150) + 0 * 1410 + 60 + 7 * 150] == 0, axis=1) |
np.all(noise[:, np.arange(0, 150) + 1 * 1410 + 60 + 7 * 150] == 0, axis=1) |
np.all(noise[:, np.arange(0, 150) + 2 * 1410 + 60 + 7 * 150] == 0, axis=1) |
np.all(noise[:, np.arange(0, 150) + 3 * 1410 + 60 + 7 * 150] == 0, axis=1))
specimen_ids = specimen_ids[~problem_noise]
step_subthresh = step_subthresh[~problem_noise, :]
subthresh_norm = subthresh_norm[~problem_noise, :]
# ramp_subthresh = ramp_subthresh[~problem_noise, :]
first_ap = first_ap[~problem_noise, :]
spiking = spiking[~problem_noise, :]
isi_shape = isi_shape[~problem_noise, :]
noise = noise[~problem_noise, :]
# Import metadata
meta_df = metadata.set_index("specimen_id").loc[specimen_ids, :]
meta_df.loc[meta_df["cre_reporter_status"].isnull(), "cre_reporter_status"] = "none"
meta_df = merge_cre_lines(meta_df)
meta_df["cre_w_status"] = "unlabeled"
positive_ind = meta_df["cre_reporter_status"].str.endswith("positive")
if positive_ind.any():
meta_df.loc[positive_ind, "cre_w_status"] = meta_df.loc[positive_ind, "cre_line"]
indeterminate_ind = meta_df["cre_reporter_status"].str.endswith("indeterminate")
indeterminate_ind.fillna(False, inplace=True)
if indeterminate_ind.any():
meta_df.loc[indeterminate_ind, "cre_w_status"] = "indeterminate"
struct_layer = {"1": "1", "2/3": "2/3", "4": "4", "5": "5", "6a": "6", "6b": "6"}
meta_df["layer"] = "unk"
for sl in struct_layer:
meta_df.loc[[s.endswith(sl) if type(s) == str else False for s in meta_df["structure"]], "layer"] = struct_layer[sl]
meta_df["cre_layer"] = meta_df["cre_w_status"] + " " + meta_df["layer"]
meta_df["dendrite_type"] = [s.replace("dendrite type - ", "") if type(s) is str else np.nan for s in meta_df["dendrite_type"]]
logging.info("Cells with Cre status indeterminate: {:d}".format(np.sum(meta_df["cre_w_status"] == "indeterminate")))
logging.info("Cells with dendrite type NA: {:d}".format(np.sum(meta_df["dendrite_type"] == "NA")))
logging.info("Cells with both indeterminate Cre status and dendrite type NA: {:d}".format(np.sum((meta_df["cre_w_status"] == "indeterminate") & (meta_df["dendrite_type"] == "NA"))))
inclusion_mask = filter_by_dendrite_and_structure(meta_df, need_structure,
dendrite_type, include_dend_type_null)
if limit_to_cortical_layers is not None:
inclusion_mask = inclusion_mask & meta_df["cortex_layer"].isin(limit_to_cortical_layers)
logging.info("Cells in restricted cortical layers: {:d}".format(int(np.sum(inclusion_mask))))
specimen_ids = specimen_ids[inclusion_mask]
step_subthresh = step_subthresh[inclusion_mask, :]
subthresh_norm = subthresh_norm[inclusion_mask, :]
# ramp_subthresh = ramp_subthresh[inclusion_mask, :]
first_ap = first_ap[inclusion_mask, :]
spiking = spiking[inclusion_mask, :]
isi_shape = isi_shape[inclusion_mask, :]
if use_noise:
noise = noise[inclusion_mask, :]
meta_df = meta_df.loc[inclusion_mask, :]
if restriction_file is not None:
# Load file of IDs that the cells must be in
restrict_ids = np.loadtxt(restriction_file)
inclusion_mask = np.array([s in restrict_ids for s in specimen_ids])
specimen_ids = specimen_ids[inclusion_mask]
step_subthresh = step_subthresh[inclusion_mask, :]
subthresh_norm = subthresh_norm[inclusion_mask, :]
# ramp_subthresh = ramp_subthresh[inclusion_mask, :]
first_ap = first_ap[inclusion_mask, :]
spiking = spiking[inclusion_mask, :]
isi_shape = isi_shape[inclusion_mask, :]
if use_noise:
noise = noise[inclusion_mask, :]
meta_df = meta_df.loc[inclusion_mask, :]
spca_zht_params, step_num = define_spca_parameters(filename=params_file)
if "spiking_inst_freq" in spca_zht_params and "inst_freq_norm" in spca_zht_params:
indices = spca_zht_params["spiking_inst_freq"][3]
logging.debug("calculating inst_freq_norm with step_num {:d}".format(step_num))
inst_freq_norm = spiking[:, indices]
n_steps = len(indices) // step_num
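        # Normalize each cell's instantaneous frequency trace within each step by its maximum value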
for i in range(n_steps):
row_max = inst_freq_norm[:, i * step_num:(i + 1) * step_num].max(axis=1)
row_max[row_max == 0] = 1.
inst_freq_norm[:, i * step_num:(i + 1) * step_num] = inst_freq_norm[:, i * step_num:(i + 1) * step_num] / row_max[:, None]
else:
inst_freq_norm = None
return specimen_ids, first_ap, isi_shape, step_subthresh, subthresh_norm, spiking, inst_freq_norm, meta_df
def filter_by_dendrite_and_structure(meta_df, need_structure, dendrite_type, include_dend_type_null):
"""Create mask for cells that pass metadata filters
Parameters
----------
    meta_df: DataFrame
        DataFrame of metadata
    need_structure: bool
        Requires that structure is present
    dendrite_type: str
        Dendrite type for filtering ('all', 'spiny', 'aspiny')
    include_dend_type_null: bool
        Also include cells without a dendrite type available regardless of
        what `dendrite_type` is specified

    Returns
    -------
    inclusion_mask: array of shape (len(meta_df), )
        Boolean mask for filtered cells
"""
# Refine the data set
if need_structure:
logging.info("Requiring structure and dendrite type; excluding dendrite type = NA")
if dendrite_type == "all":
inclusion_mask = np.array((meta_df["cre_w_status"] != "indeterminate") &
(~meta_df["structure"].isnull()) &
(~meta_df["dendrite_type"].isnull()) &
(meta_df["dendrite_type"] != "NA"))
elif dendrite_type == "spiny":
inclusion_mask = np.array((meta_df["cre_w_status"] != "indeterminate") &
(~meta_df["structure"].isnull()) &
(meta_df["dendrite_type"].isin(["spiny"])))
elif dendrite_type == "aspiny":
inclusion_mask = np.array((meta_df["cre_w_status"] != "indeterminate") &
(~meta_df["structure"].isnull()) &
(meta_df["dendrite_type"].isin(["aspiny", "sparsely spiny"])))
else:
raise ValueError("Not allowable value for dendrite type")
logging.info("Cells with dendrite type and structure: {:d}".format(int(np.sum(inclusion_mask))))
elif not include_dend_type_null:
if dendrite_type == "all":
inclusion_mask = np.array((meta_df["cre_w_status"] != "indeterminate") &
(~meta_df["dendrite_type"].isnull()) &
(meta_df["dendrite_type"] != "NA"))
elif dendrite_type == "spiny":
inclusion_mask = np.array((meta_df["cre_w_status"] != "indeterminate") &
(meta_df["dendrite_type"].isin(["spiny"])))
elif dendrite_type == "aspiny":
inclusion_mask = np.array((meta_df["cre_w_status"] != "indeterminate") &
(meta_df["dendrite_type"].isin(["aspiny", "sparsely spiny"])))
else:
raise ValueError("Not allowable value for dendrite type")
logging.info("Requiring dendrite type; excluding dendrite type = NA")
logging.info("Cells with dendrite type: {:d}".format(int(np.sum(inclusion_mask))))
else:
if dendrite_type == "all":
inclusion_mask = np.array((meta_df["cre_w_status"] != "indeterminate") &
(meta_df["dendrite_type"].fillna("") != "NA"))
elif dendrite_type == "spiny":
inclusion_mask = np.array((meta_df["cre_w_status"] != "indeterminate") &
(meta_df["dendrite_type"].isin(["spiny"]) | meta_df["dendrite_type"].isnull()))
elif dendrite_type == "aspiny":
inclusion_mask = np.array((meta_df["cre_w_status"] != "indeterminate") &
(meta_df["dendrite_type"].isin(["aspiny", "sparsely spiny"]) | meta_df["dendrite_type"].isnull()))
else:
raise ValueError("Not allowable value for dendrite type")
logging.info("Excluding dendrite type = NA")
logging.info("Cells with dendrite type specified or missing (does not include NA): {:d}".format(int(np.sum(inclusion_mask))))
return inclusion_mask
def load_organized_data(project, base_dir, params_file, dendrite_type,
use_noise=False, need_structure=False,
include_dend_type_null=True,
limit_to_cortical_layers=None):
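    """Load feature vectors for a project and organize them for sPCA

    Wraps :func:`load_data` and regroups its outputs into a list of dicts,
    each pairing a data array with the sPCA parameter keys ("part_keys")
    that apply to it.

    Returns
    -------
    data_for_spca: list of dict
        Each dict has a "data" array and a list of "part_keys"
    specimen_ids: array
        The specimen IDs for the cells in the data sets
    """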
logging.info("in load_and_organize_data")
(specimen_ids,
first_ap,
isi_shape,
step_subthresh,
subthresh_norm,
spiking,
inst_freq_norm,
meta_df) = load_data(project=project,
base_dir=base_dir,
params_file=params_file,
use_noise=use_noise,
dendrite_type=dendrite_type,
need_structure=need_structure,
include_dend_type_null=include_dend_type_null,
limit_to_cortical_layers=limit_to_cortical_layers)
data_for_spca = [
{"data": first_ap,
"part_keys": ["first_ap_v", "first_ap_dv"],
},
{"data": isi_shape,
"part_keys": ["isi_shape"],
},
{"data": step_subthresh,
"part_keys": ["step_subthresh"],
},
{"data": subthresh_norm,
"part_keys": ["subthresh_norm"],
},
{"data": spiking,
"part_keys": ["spiking_rate", "spiking_inst_freq", "spiking_updown", "spiking_peak_v",
"spiking_fast_trough_v",
"spiking_threshold_v", "spiking_width"],
},
]
if use_noise:
data_for_spca.append(
{"data": noise,
"part_keys": ["noise_rate", "noise_inst_freq", "noise_updown", "noise_peak_v",
"noise_fast_trough_v",
"noise_threshold_v", "noise_width"],
},
)
if inst_freq_norm is not None:
data_for_spca.append({
"data": inst_freq_norm,
"part_keys": ["inst_freq_norm"],
})
return data_for_spca, specimen_ids
def load_h5_data(h5_fv_file, params_file, metadata_file=None, dendrite_type="all",
need_structure=False,
need_ramp_spike=True,
include_dend_type_null=True,
limit_to_cortical_layers=None,
id_file=None):
"""Load dictionary for sPCA processing from HDF5 file
The data can also be filtered by several metadata values.
Parameters
----------
h5_fv_file: str
Path to feature vector HDF5 file
params_file: str
Path to sPCA parameters JSON file
metadata_file: str, optional
Path to metadata CSV file
dendrite_type: {'all', 'spiny', 'aspiny'}, optional
Dendrite type for filtering (only used if metadata file is
supplied)
need_structure: bool, optional
Requires that structure is present (only used
if metadata file is supplied)
need_ramp_spike: bool, optional
        Requires that the ramp AP waveform is non-zero (i.e., not missing)
include_dend_type_null: bool, optional
Also include cells without a dendrite type available regardless of
what `dendrite_type` is specified (only used
if metadata file is supplied)
limit_to_cortical_layers: list, optional
List of cortical layers that metadata must match for inclusion (only used
if metadata file is supplied)
id_file: str, optional
Path to text file with IDs to use
Returns
-------
data_for_spca: dict
Dictionary of data sets for sPCA analysis
specimen_ids: array
The specimen IDs for the cells in the data sets
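
    Examples
    --------
    A minimal call, with illustrative (hypothetical) file names::

        data_for_spca, ids = load_h5_data("fv_project.h5", "spca_params.json",
                                          metadata_file="cell_metadata.csv",
                                          dendrite_type="spiny")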
"""
f = h5py.File(h5_fv_file, "r")
spca_zht_params, step_num = define_spca_parameters(filename=params_file)
specimen_ids = f["ids"][...]
logging.info("Starting with {:d} cells".format(len(specimen_ids)))
if need_ramp_spike:
# Ramp waveform expected to be last
# Identify cells with no ramp spike
first_ap_v = f["first_ap_v"][...]
# Expected to have three equal-length AP waveforms
n_bins = first_ap_v.shape[1] // 3
ramp_mask = ~np.all(first_ap_v[:, -n_bins:] == 0, axis=1)
logging.info("{} cells have no ramp AP".format(np.sum(ramp_mask == False)))
else:
logging.info("Including cells without ramp AP")
ramp_mask = np.ones_like(specimen_ids).astype(bool)
if metadata_file is not None:
logging.debug("Using metadata file {}".format(metadata_file))
metadata = pd.read_csv(metadata_file, index_col=0)
mask = mask_for_metadata(specimen_ids, metadata,
dendrite_type, need_structure,
include_dend_type_null, limit_to_cortical_layers)
mask = mask & ramp_mask
else:
mask = ramp_mask
if id_file is not None:
with open(id_file, "r") as id_f:
include_id_list = [int(line.strip("\n")) for line in id_f]
id_mask = np.array([spec_id in include_id_list for spec_id in specimen_ids])
mask = mask & id_mask
data_for_spca = {}
for k in spca_zht_params:
if k not in f.keys():
logging.debug("{} not found in HDF5 file".format(k))
continue
data = f[k][mask, :]
data_for_spca[k] = data
# Calculate additional data set if requested
if ("inst_freq" in spca_zht_params and "inst_freq_norm" in spca_zht_params
and "inst_freq" in f.keys()):
logging.debug("inst_freq_norm will be calculated from inst_freq")
indices = spca_zht_params["inst_freq"][3]
logging.debug("calculating inst_freq_norm with step_num {:d}".format(step_num))
inst_freq_data = f["inst_freq"][...]
if indices is not None:
inst_freq_norm = inst_freq_data[mask, :][:, indices]
else:
inst_freq_norm = inst_freq_data[mask, :]
        n_steps = inst_freq_norm.shape[1] // step_num
for i in range(n_steps):
row_max = inst_freq_norm[:, i * step_num:(i + 1) * step_num].max(axis=1)
row_max[row_max == 0] = 1. # handle divide-by-zero issues
inst_freq_norm[:, i * step_num:(i + 1) * step_num] = inst_freq_norm[:, i * step_num:(i + 1) * step_num] / row_max[:, None]
data_for_spca["inst_freq_norm"] = inst_freq_norm
f.close()
    specimen_ids = specimen_ids[mask]
logging.info("Loaded data for {} cells".format(len(specimen_ids)))
return data_for_spca, specimen_ids
def mask_for_metadata(specimen_ids, metadata_df, dendrite_type="all",
need_structure=False, include_dend_type_null=True,
limit_to_cortical_layers=None):
"""Create mask for cells that pass metadata filters
Parameters
----------
specimen_ids: array
Specimen IDs for cells to filter
metadata_df: DataFrame
DataFrame of metadata
    dendrite_type: {'all', 'spiny', 'aspiny'}, optional
        Dendrite type for filtering
    need_structure: bool, optional
        Requires that structure is present
    include_dend_type_null: bool, optional
        Also include cells without a dendrite type available regardless
        of what `dendrite_type` is specified
    limit_to_cortical_layers: list, optional
        List of cortical layers that metadata must match for inclusion

    Returns
    -------
    array
        Boolean mask for filtered cells with size ``len(specimen_ids)``
"""
# Limit to specimen_ids
meta_df = metadata_df.set_index("specimen_id").loc[specimen_ids, :]
# Reformat metadata information
meta_df.loc[meta_df["cre_reporter_status"].isnull(), "cre_reporter_status"] = "none"
meta_df = merge_cre_lines(meta_df)
meta_df["cre_w_status"] = "unlabeled"
positive_ind = meta_df["cre_reporter_status"].str.endswith("positive")
if positive_ind.any():
meta_df.loc[positive_ind, "cre_w_status"] = meta_df.loc[positive_ind, "cre_line"]
indeterminate_ind = meta_df["cre_reporter_status"].str.endswith("indeterminate")
indeterminate_ind.fillna(False, inplace=True)
if indeterminate_ind.any():
meta_df.loc[indeterminate_ind, "cre_w_status"] = "indeterminate"
struct_layer = {"1": "1", "2/3": "2/3", "4": "4", "5": "5", "6a": "6", "6b": "6"}
meta_df["layer"] = "unk"
for sl in struct_layer:
meta_df.loc[[s.endswith(sl) if type(s) == str else False for s in meta_df["structure"]], "layer"] = struct_layer[sl]
meta_df["cre_layer"] = meta_df["cre_w_status"] + " " + meta_df["layer"]
meta_df["dendrite_type"] = [s.replace("dendrite type - ", "") if type(s) is str else np.nan for s in meta_df["dendrite_type"]]
logging.info("Cells with Cre status indeterminate: {:d}".format(np.sum(meta_df["cre_w_status"] == "indeterminate")))
logging.info("Cells with dendrite type NA: {:d}".format(np.sum(meta_df["dendrite_type"] == "NA")))
logging.info("Cells with both indeterminate Cre status and dendrite type NA: {:d}".format(np.sum((meta_df["cre_w_status"] == "indeterminate") & (meta_df["dendrite_type"] == "NA"))))
dend_struct_mask = filter_by_dendrite_and_structure(meta_df, need_structure,
dendrite_type, include_dend_type_null)
if limit_to_cortical_layers is not None:
layer_mask = meta_df["cortex_layer"].isin(limit_to_cortical_layers)
logging.info("Cells in restricted cortical layers: {:d}".format(int(np.sum(inclusion_mask))))
else:
layer_mask = np.ones_like(dend_struct_mask, dtype=bool)
return dend_struct_mask & layer_mask
def define_spca_parameters(filename):
"""Load an sPCA parameters file
The parameters file should be a set of keys with dictionaries as
their values. Each dictionary must contain the following keys:
- `n_components`: Number of components
- `nonzero_component_list`: List of the number of non-zero
loadings for each component
- `use_corr`: Whether to scale the data by its standard
deviation (true/false)
    - `range`: If null, all indices are used. If a pair of values, they
      are interpreted as a start index and an exclusive end index (as in
      Python's slicing notation). If more than two values, each
      consecutive pair defines a (start, exclusive end) interval, which
      allows sections of the full feature vector to be excluded.
      The range is expanded into an array of indices by this function.

    The entry for `inst_freq_norm`, if present, may also contain an
    optional `step_num` key giving the number of points per stimulus
    step (default 50).
Parameters
----------
filename : str
Path to JSON file with sPCA parameters
    Returns
    -------
    spca_zht_params : dict
        Contains key/tuple pairs. The tuple has the format
        (`n_components`, `nonzero_component_list`, `use_corr`, `indices`).
    step_num : int
        Number of points per stimulus step for `inst_freq_norm`
        normalization (taken from the "step_num" key of the
        "inst_freq_norm" entry if present, otherwise 50)
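
    Examples
    --------
    A hypothetical parameters file entry (key and values are illustrative)
    could be written from Python like this::

        import json
        entry = {"first_ap_v": {"n_components": 3,
                                "nonzero_component_list": [50, 50, 50],
                                "use_corr": False,
                                "range": [0, 450]}}
        with open("spca_params.json", "w") as f:
            json.dump(entry, f)

    which this function would expand to
    ``{"first_ap_v": (3, [50, 50, 50], False, np.arange(0, 450))}``.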
"""
with open(filename, "r") as f:
json_data = json.load(f)
spca_zht_params = {}
for k in json_data:
d = json_data[k]
if d["range"] is None:
indices = None
elif len(d["range"]) == 2:
indices = np.arange(d["range"][0], d["range"][1])
else:
range_list = []
for a, b in zip(d["range"][:-1:2], d["range"][1::2]):
range_list.append(np.arange(a, b))
indices = np.hstack(range_list)
spca_zht_params[k] = (
d["n_components"],
d["nonzero_component_list"],
d["use_corr"],
indices,
)
if "inst_freq_norm" in json_data and "step_num" in json_data["inst_freq_norm"]:
step_num = json_data["inst_freq_norm"]["step_num"]
else:
step_num = 50 # default value
return spca_zht_params, step_num
def load_data_with_ids(id_list, project="T301", use_noise=False, dendrite_type="all"):
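    """Load feature vectors for `project`, restricted to the IDs in `id_list`

    Calls :func:`load_data` and keeps only the cells whose specimen IDs
    appear in `id_list`. Returns the same tuple of arrays and metadata
    as :func:`load_data`.
    """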
(specimen_ids, first_ap, isi_shape, step_subthresh, subthresh_norm,
spiking, inst_freq_norm, meta_df) = load_data(project=project,
use_noise=use_noise,
dendrite_type=dendrite_type,
need_structure=False,
include_dend_type_null=True)
inclusion_mask = np.array([s in id_list for s in specimen_ids])
specimen_ids = specimen_ids[inclusion_mask]
step_subthresh = step_subthresh[inclusion_mask, :]
subthresh_norm = subthresh_norm[inclusion_mask, :]
first_ap = first_ap[inclusion_mask, :]
spiking = spiking[inclusion_mask, :]
isi_shape = isi_shape[inclusion_mask, :]
meta_df = meta_df.loc[inclusion_mask, :]
    if inst_freq_norm is not None:
        inst_freq_norm = inst_freq_norm[inclusion_mask, :]
return specimen_ids, first_ap, isi_shape, step_subthresh, subthresh_norm, spiking, inst_freq_norm, meta_df
def merge_cre_lines(df):
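    """Collapse duplicated or aliased Cre driver line names into canonical names

    Returns a copy of `df` with the values in its "cre_line" column remapped.
    """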
new_df = df.copy()
lines_to_merge = {
"Ntsr1-Cre": "Ntsr1-Cre_GN220",
"Pvalb-IRES-Cre;Pvalb-IRES-Cre": "Pvalb-IRES-Cre",
"Vip-IRES-Cre;Vip-IRES-Cre": "Vip-IRES-Cre",
"Vipr2-IRES2-Cre;Vipr2-IRES2-Cre": "Vipr2-IRES2-Cre",
"Chat-IRES-Cre-neo;Chat-IRES-Cre-neo": "Chat-IRES-Cre-neo",
"Sst-IRES-FlpO;Nos1-CreERT2": "Nos1-CreERT2;Sst-IRES-FlpO",
}
new_df["cre_line"] = [lines_to_merge[c] if c in lines_to_merge else c
for c in df["cre_line"]]
return new_df