#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# catalogs.py - Waqas Bhatti (wbhatti@astro.princeton.edu) - Feb 2019

'''

This contains functions to generate light curve catalogs from collections of
light curves.

'''

#############
## LOGGING ##
#############

import logging
from astrobase import log_sub, log_fmt, log_date_fmt

DEBUG = False
if DEBUG:
    level = logging.DEBUG
else:
    level = logging.INFO
LOGGER = logging.getLogger(__name__)
logging.basicConfig(
    level=level,
    style=log_sub,
    format=log_fmt,
    datefmt=log_date_fmt,
)

LOGDEBUG = LOGGER.debug
LOGINFO = LOGGER.info
LOGWARNING = LOGGER.warning
LOGERROR = LOGGER.error
LOGEXCEPTION = LOGGER.exception


#############
## IMPORTS ##
#############

import pickle
import os
import os.path
import glob
import shutil
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import numpy.random as npr
npr.seed(0xc0ffee)

import scipy.spatial as sps

import astropy.io.fits as pyfits
from astropy.wcs import WCS
from astropy.visualization import ZScaleInterval, LinearStretch

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

try:
    from tqdm import tqdm
    TQDM = True
except Exception:
    TQDM = False

# to turn a list of keys into a dict address
# from https://stackoverflow.com/a/14692747
from functools import reduce
from operator import getitem


def _dict_get(datadict, keylist):
    return reduce(getitem, keylist, datadict)
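
# Example (values are made up): given a nested lcdict like the ones produced
# by registered LC reader functions, _dict_get walks down one level per key:
#
#   lcdict = {'objectinfo': {'varinfo': {'features': {'stetsonj': 0.45}}}}
#   _dict_get(lcdict, ['objectinfo', 'varinfo', 'features', 'stetsonj'])
#   # -> 0.45
#
# This is what lets column specs like 'objectinfo.varinfo.features.stetsonj'
# address values at arbitrary depth after being split on '.'.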


############
## CONFIG ##
############

NCPUS = mp.cpu_count()

# these translate filter operators given as strings to Python operators
FILTEROPS = {'eq':'==',
             'gt':'>',
             'ge':'>=',
             'lt':'<',
             'le':'<=',
             'ne':'!='}
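
# For example, a filter_lclist spec like 'sdssr|lt|12.0' is split on '|',
# 'lt' is looked up in FILTEROPS, and the filter is then evaluated as:
#
#   np.isfinite(lclist['objects']['sdssr']) & \
#       (lclist['objects']['sdssr'] < 12.0)
#
# (see filter_lclist below). The column name here is illustrative; it must be
# a column present in your generated LC catalog.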


###################
## LOCAL IMPORTS ##
###################

from astrobase.plotbase import fits_finder_chart
from astrobase.cpserver.checkplotlist import checkplot_infokey_worker
from astrobase.lcproc import get_lcformat


#####################################################
## FUNCTIONS TO GENERATE OBJECT CATALOGS (LCLISTS) ##
#####################################################

def _lclist_parallel_worker(task):
    '''This is a parallel worker for makelclist.

    Parameters
    ----------

    task : tuple
        This is a tuple containing the following items:

        task[0] = lcf
        task[1] = columns
        task[2] = lcformat
        task[3] = lcformatdir
        task[4] = lcndetkey

    Returns
    -------

    dict or None
        This contains all of the info for the object processed in this LC
        read operation. If the read fails, None is returned instead.

    '''

    lcf, columns, lcformat, lcformatdir, lcndetkey = task

    # get the bits needed for lcformat handling
    # NOTE: we re-import things in this worker function because sometimes
    # functions can't be pickled correctly for passing them to worker functions
    # in a processing pool
    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # we store the full path of the light curve
    lcobjdict = {'lcfname':os.path.abspath(lcf)}

    try:

        # read the light curve in
        lcdict = readerfunc(lcf)

        # this should handle lists/tuples being returned by readerfunc
        # we assume that the first element is the actual lcdict
        # FIXME: figure out how to not need this assumption
        if ( (isinstance(lcdict, (list, tuple))) and
             (isinstance(lcdict[0], dict)) ):
            lcdict = lcdict[0]

        # insert all of the columns
        for colkey in columns:

            if '.' in colkey:
                getkey = colkey.split('.')
            else:
                getkey = [colkey]

            try:
                thiscolval = _dict_get(lcdict, getkey)
            except Exception:
                LOGWARNING('column %s does not exist for %s' %
                           (colkey, lcf))
                thiscolval = np.nan

            # update the lcobjdict with this value
            lcobjdict[getkey[-1]] = thiscolval

    except Exception:

        LOGEXCEPTION('could not figure out columns for %s' % lcf)

        # insert all of the columns as nans
        for colkey in columns:

            if '.' in colkey:
                getkey = colkey.split('.')
            else:
                getkey = [colkey]

            thiscolval = np.nan

            # update the lcobjdict with this value
            lcobjdict[getkey[-1]] = thiscolval

    # now get the actual ndets; this excludes nans and infs
    for dk in lcndetkey:

        try:

            if '.' in dk:
                getdk = dk.split('.')
            else:
                getdk = [dk]

            ndetcol = _dict_get(lcdict, getdk)
            actualndets = ndetcol[np.isfinite(ndetcol)].size
            lcobjdict['%s.ndet' % getdk[-1]] = actualndets

        except Exception:
            lcobjdict['%s.ndet' % getdk[-1]] = np.nan

    return lcobjdict
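
# A minimal sketch of how this worker is driven; make_lclist (below) builds
# these task tuples automatically. The LC path, format key, and magnitude
# column here are hypothetical:
#
#   task = ('/data/lcs/object-0001-lc.sqlite.gz',              # lcf
#           ('objectid', 'objectinfo.ra', 'objectinfo.decl'),  # columns
#           'hat-sql',                                         # lcformat
#           None,                                              # lcformatdir
#           ['aep_000'])                         # lcndetkey (the magcols)
#   lcobjdict = _lclist_parallel_worker(task)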


def make_lclist(basedir,
                outfile,
                use_list_of_filenames=None,
                lcformat='hat-sql',
                lcformatdir=None,
                fileglob=None,
                recursive=True,
                columns=('objectid',
                         'objectinfo.ra',
                         'objectinfo.decl',
                         'objectinfo.ndet'),
                makecoordindex=('objectinfo.ra','objectinfo.decl'),
                field_fitsfile=None,
                field_wcsfrom=None,
                field_scale=ZScaleInterval(),
                field_stretch=LinearStretch(),
                field_colormap=plt.cm.gray_r,
                field_findersize=None,
                field_pltopts={'marker':'o',
                               'markersize':10.0,
                               'markerfacecolor':'none',
                               'markeredgewidth':2.0,
                               'markeredgecolor':'red'},
                field_grid=False,
                field_gridcolor='k',
                field_zoomcontain=True,
                maxlcs=None,
                nworkers=NCPUS):
    '''This generates a light curve catalog for all light curves in a
    directory.

    Given a base directory where all the files are, and a light curve format,
    this will find all light curves, pull out the keys in each lcdict requested
    in the `columns` kwarg for each object, and write them to the requested
    output pickle file. These keys should be pointers to scalar values
    (i.e. something like `objectinfo.ra` is OK, but something like 'times'
    won't work because it's a vector).

    Generally, this works with light curve reading functions that produce
    lcdicts as detailed in the docstring for `lcproc.register_lcformat`. Once
    you've registered your light curve reader functions using the
    `lcproc.register_lcformat` function, pass in the `formatkey` associated
    with your light curve format, and this function will be able to read all
    light curves in that format as well as the object information stored in
    their `objectinfo` dict.

    Parameters
    ----------

    basedir : str or list of str
        If this is a str, points to a single directory to search for light
        curves. If this is a list of str, it must be a list of directories to
        search for light curves. All of these will be searched to find light
        curve files matching either your light curve format's default fileglob
        (when you registered your LC format), or a specific fileglob that you
        can pass in using the `fileglob` kwarg here. If the `recursive` kwarg
        is set, the provided directories will be searched recursively.

        If `use_list_of_filenames` is not None, it will override this argument
        and the function will take those light curves as the list of files it
        must process instead of whatever is specified in `basedir`.

    outfile : str
        This is the name of the output file to write. This will be a pickle
        file, so a good convention to use for this name is something like
        'my-lightcurve-catalog.pkl'.

    use_list_of_filenames : list of str or None
        Use this kwarg to override whatever is provided in `basedir` and
        directly pass in a list of light curve files to process. This can speed
        up this function by a lot because no searches on disk will be performed
        to find light curve files matching `basedir` and `fileglob`.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory where you've stored
        your lcformat description JSONs, other than the usual directories
        lcproc knows to search for them in. Use this along with `lcformat` to
        specify an LC format JSON file that's not currently registered with
        lcproc.

    fileglob : str or None
        If provided, is a string that is a valid UNIX filename glob. Used to
        override the default fileglob for this LC format when searching for
        light curve files in `basedir`.

    recursive : bool
        If True, the directories specified in `basedir` will be searched
        recursively for all light curve files that match the default fileglob
        for this LC format or a specific one provided in `fileglob`.

    columns : list of str
        This is a list of keys in the lcdict produced by your light curve
        reader function that contain object information, which will be
        extracted and put into the output light curve catalog. It's highly
        recommended that your LC reader function produce an lcdict that
        contains at least the default keys shown here.

        The lcdict keys to extract are specified by using an address scheme:

        - First level dict keys can be specified directly:
          e.g., 'objectid' will extract lcdict['objectid']

        - Keys at other levels can be specified by using a period to indicate
          the level:

          - e.g., 'objectinfo.ra' will extract lcdict['objectinfo']['ra']
          - e.g., 'objectinfo.varinfo.features.stetsonj' will extract
            lcdict['objectinfo']['varinfo']['features']['stetsonj']

    makecoordindex : list of two str or None
        This is used to specify which lcdict keys contain the right ascension
        and declination coordinates for this object. If these are provided, the
        output light curve catalog will have a kdtree built on all object
        coordinates, which enables fast spatial searches and cross-matching to
        external catalogs by `checkplot` and `lcproc` functions.

    field_fitsfile : str or None
        If this is not None, it should be the path to a FITS image containing
        the objects these light curves are for. If this is provided,
        `make_lclist` will use the WCS information in the FITS itself if
        `field_wcsfrom` is None (or from a WCS header file pointed to by
        `field_wcsfrom`) to obtain x and y pixel coordinates for all of the
        objects in the field. A finder chart will also be made using
        `astrobase.plotbase.fits_finder_chart` using the corresponding
        `field_scale`, `_stretch`, `_colormap`, `_findersize`, `_pltopts`,
        `_grid`, and `_gridcolor` kwargs for that function, reproduced here
        to enable customization of the finder chart plot.

    field_wcsfrom : str or None
        If `wcsfrom` is None, the WCS to transform the RA/Dec to pixel x/y
        will be taken from the FITS header of `fitsfile`. If this is not None,
        it must be a FITS or similar file that contains a WCS header in its
        first extension.

    field_scale : astropy.visualization.Interval object
        `scale` sets the normalization for the FITS pixel values. This is an
        astropy.visualization Interval object.
        See http://docs.astropy.org/en/stable/visualization/normalization.html
        for details on `scale` and `stretch` objects.

    field_stretch : astropy.visualization.Stretch object
        `stretch` sets the stretch function for mapping FITS pixel values to
        output pixel values. This is an astropy.visualization Stretch object.
        See http://docs.astropy.org/en/stable/visualization/normalization.html
        for details on `scale` and `stretch` objects.

    field_colormap : matplotlib Colormap object
        `colormap` is a matplotlib color map object to use for the output
        image.

    field_findersize : None or tuple of two ints
        If `findersize` is None, the output image size will be set by the
        NAXIS1 and NAXIS2 keywords in the input `fitsfile` FITS header.
        Otherwise, `findersize` must be a tuple with the intended x and y size
        of the image in inches (all output images will use a DPI = 100).

    field_pltopts : dict
        `field_pltopts` controls how the overlay points will be plotted. This
        is a dict with standard matplotlib marker, etc. kwargs as key-val
        pairs, e.g. 'markersize', 'markerfacecolor', etc. The default options
        make red outline circles at the location of each object in the
        overlay.

    field_grid : bool
        `grid` sets if a grid will be made on the output image.

    field_gridcolor : str
        `gridcolor` sets the color of the grid lines. This is a usual
        matplotlib color spec string.

    field_zoomcontain : bool
        `field_zoomcontain` controls if the finder chart will be zoomed to
        just contain the overlaid points. Everything outside the footprint of
        these points will be discarded.

    maxlcs : int or None
        This sets how many light curves to process in the input LC list
        generated by searching for LCs in `basedir` or in the list provided as
        `use_list_of_filenames`.

    nworkers : int
        This sets the number of parallel workers to launch to collect
        information from the light curves.

    Returns
    -------

    str
        Returns the path to the generated light curve catalog pickle file.

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    if not fileglob:
        fileglob = dfileglob

    # this is to get the actual ndet
    # set to the magnitudes column
    lcndetkey = dmagcols

    if isinstance(use_list_of_filenames, list):

        matching = use_list_of_filenames

    else:

        # handle the case where basedir is a list of directories
        if isinstance(basedir, list):

            matching = []

            for bdir in basedir:

                # now find the files
                LOGINFO('searching for %s light curves in %s ...' %
                        (lcformat, bdir))

                if recursive is False:
                    matching.extend(glob.glob(os.path.join(bdir, fileglob)))
                else:
                    matching.extend(glob.glob(os.path.join(bdir,
                                                           '**',
                                                           fileglob),
                                              recursive=True))

        # otherwise, handle the usual case of one basedir to search in
        else:

            # now find the files
            LOGINFO('searching for %s light curves in %s ...' %
                    (lcformat, basedir))

            if recursive is False:
                matching = glob.glob(os.path.join(basedir, fileglob))
            else:
                matching = glob.glob(os.path.join(basedir,
                                                  '**',
                                                  fileglob),
                                     recursive=True)

    #
    # now that we have all the files, process them
    #
    if matching and len(matching) > 0:

        LOGINFO('found %s light curves' % len(matching))

        # cut down matching to maxlcs
        if maxlcs:
            matching = matching[:maxlcs]

        # prepare the output dict
        lclistdict = {
            'basedir':basedir,
            'lcformat':lcformat,
            'fileglob':fileglob,
            'recursive':recursive,
            'columns':columns,
            'makecoordindex':makecoordindex,
            'nfiles':len(matching),
            'objects': {
            }
        }

        # columns that will always be present in the output lclistdict
        derefcols = ['lcfname']
        derefcols.extend(['%s.ndet' % x.split('.')[-1] for x in lcndetkey])

        for dc in derefcols:
            lclistdict['objects'][dc] = []

        # fill in the rest of the lclist columns from the columns kwarg
        for col in columns:

            # dereference the column
            thiscol = col.split('.')
            thiscol = thiscol[-1]
            lclistdict['objects'][thiscol] = []
            derefcols.append(thiscol)

        # start collecting info
        LOGINFO('collecting light curve info...')

        tasks = [(x, columns, lcformat, lcformatdir, lcndetkey)
                 for x in matching]

        # the context manager shuts down the pool when the map is done, so no
        # explicit executor.shutdown() call is needed
        with ProcessPoolExecutor(max_workers=nworkers) as executor:
            results = list(executor.map(_lclist_parallel_worker, tasks))

        # update the columns in the overall dict from the results of the
        # parallel map
        for result in results:
            for xcol in derefcols:
                lclistdict['objects'][xcol].append(result[xcol])

        # done with collecting info
        # turn all of the lists in the lclistdict into arrays
        for col in lclistdict['objects']:
            lclistdict['objects'][col] = np.array(lclistdict['objects'][col])

        # handle duplicate objectids with different light curves
        uniques, counts = np.unique(lclistdict['objects']['objectid'],
                                    return_counts=True)
        duplicated_objectids = uniques[counts > 1]

        if duplicated_objectids.size > 0:

            # redo the objectid array so it has a bit larger dtype so the
            # extra tag can fit into the field
            dt = lclistdict['objects']['objectid'].dtype.str
            dt = '<U%s' % (
                int(dt.replace('<','').replace('U','').replace('S','')) + 3
            )
            lclistdict['objects']['objectid'] = np.array(
                lclistdict['objects']['objectid'],
                dtype=dt
            )

            for objid in duplicated_objectids:

                objid_inds = np.where(
                    lclistdict['objects']['objectid'] == objid
                )

                # mark the duplicates, assume the first instance is the
                # actual one
                for ncounter, nind in enumerate(objid_inds[0][1:]):
                    lclistdict['objects']['objectid'][nind] = '%s-%s' % (
                        lclistdict['objects']['objectid'][nind],
                        ncounter+2
                    )
                    LOGWARNING(
                        'tagging duplicated instance %s of objectid: '
                        '%s as %s-%s, lightcurve: %s' %
                        (ncounter+2, objid, objid, ncounter+2,
                         lclistdict['objects']['lcfname'][nind])
                    )

        # if we're supposed to make a spatial index, do so
        if (makecoordindex and
            isinstance(makecoordindex, (list, tuple)) and
            len(makecoordindex) == 2):

            try:

                # deref the column names
                racol, declcol = makecoordindex
                racol = racol.split('.')[-1]
                declcol = declcol.split('.')[-1]

                # get the ras and decls
                objra, objdecl = (lclistdict['objects'][racol],
                                  lclistdict['objects'][declcol])

                # get the xyz unit vectors from ra,decl
                # since i had to remind myself:
                # https://en.wikipedia.org/wiki/Equatorial_coordinate_system
                cosdecl = np.cos(np.radians(objdecl))
                sindecl = np.sin(np.radians(objdecl))
                cosra = np.cos(np.radians(objra))
                sinra = np.sin(np.radians(objra))
                xyz = np.column_stack((cosra*cosdecl,
                                       sinra*cosdecl,
                                       sindecl))

                # generate the kdtree
                kdt = sps.cKDTree(xyz,copy_data=True)

                # put the tree into the dict
                lclistdict['kdtree'] = kdt

                LOGINFO('kdtree generated for (ra, decl): (%s, %s)' %
                        (makecoordindex[0], makecoordindex[1]))

            except Exception:
                LOGEXCEPTION('could not make kdtree for (ra, decl): (%s, %s)' %
                             (makecoordindex[0], makecoordindex[1]))
                raise

        # generate the xy pairs if field_fitsfile is not None
        if field_fitsfile and os.path.exists(field_fitsfile):

            # read in the FITS file
            if field_wcsfrom is None:

                hdulist = pyfits.open(field_fitsfile)
                hdr = hdulist[0].header
                hdulist.close()

                w = WCS(hdr)
                wcsok = True

            elif os.path.exists(field_wcsfrom):

                w = WCS(field_wcsfrom)
                wcsok = True

            else:

                LOGERROR('could not determine WCS info for input FITS: %s' %
                         field_fitsfile)
                wcsok = False

            if wcsok:

                # first, transform the ra/decl to x/y and put these in the
                # lclist output dict
                radecl = np.column_stack((objra, objdecl))
                lclistdict['objects']['framexy'] = w.all_world2pix(
                    radecl,
                    1
                )

                # next, we'll make a PNG plot for the finder
                finder_outfile = os.path.join(
                    os.path.dirname(outfile),
                    os.path.splitext(os.path.basename(outfile))[0] + '.png'
                )

                finder_png = fits_finder_chart(
                    field_fitsfile,
                    finder_outfile,
                    wcsfrom=field_wcsfrom,
                    scale=field_scale,
                    stretch=field_stretch,
                    colormap=field_colormap,
                    findersize=field_findersize,
                    overlay_ra=objra,
                    overlay_decl=objdecl,
                    overlay_pltopts=field_pltopts,
                    overlay_zoomcontain=field_zoomcontain,
                    grid=field_grid,
                    gridcolor=field_gridcolor
                )

                if finder_png is not None:
                    LOGINFO('generated a finder PNG '
                            'with an object position overlay '
                            'for this LC list: %s' % finder_png)

        # write the pickle
        with open(outfile,'wb') as outfd:
            pickle.dump(lclistdict, outfd, protocol=pickle.HIGHEST_PROTOCOL)

        LOGINFO('done. LC info -> %s' % outfile)
        return outfile

    else:

        LOGERROR('no files found in %s matching %s' % (basedir, fileglob))
        return None
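
# A usage sketch for make_lclist; the directory and file names are
# hypothetical, and 'hat-sql' is simply this function's default format key:
#
#   lc_catalog = make_lclist(
#       '/data/lightcurves',            # searched recursively by default
#       'my-lightcurve-catalog.pkl',
#       lcformat='hat-sql',
#       columns=('objectid', 'objectinfo.ra',
#                'objectinfo.decl', 'objectinfo.ndet'),
#       makecoordindex=('objectinfo.ra', 'objectinfo.decl'),
#       nworkers=4
#   )
#
# The output pickle's 'objects' key maps column names to numpy arrays, and a
# 'kdtree' key holds the cKDTree built on unit-sphere x,y,z positions when
# makecoordindex is provided.
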
def filter_lclist(lc_catalog,
                  objectidcol='objectid',
                  racol='ra',
                  declcol='decl',
                  xmatchexternal=None,
                  xmatchdistarcsec=3.0,
                  externalcolnums=(0,1,2),
                  externalcolnames=('objectid','ra','decl'),
                  externalcoldtypes='U20,f8,f8',
                  externalcolsep=None,
                  externalcommentchar='#',
                  conesearch=None,
                  conesearchworkers=1,
                  columnfilters=None,
                  field_fitsfile=None,
                  field_wcsfrom=None,
                  field_scale=ZScaleInterval(),
                  field_stretch=LinearStretch(),
                  field_colormap=plt.cm.gray_r,
                  field_findersize=None,
                  field_pltopts={'marker':'o',
                                 'markersize':10.0,
                                 'markerfacecolor':'none',
                                 'markeredgewidth':2.0,
                                 'markeredgecolor':'red'},
                  field_grid=False,
                  field_gridcolor='k',
                  field_zoomcontain=True,
                  copylcsto=None):
    '''This is used to perform cone-search, cross-match, and column-filter
    operations on a light curve catalog generated by `make_lclist`.

    Uses the output of `make_lclist` above. This function returns a list of
    light curves matching various criteria specified by the `xmatchexternal`,
    `conesearch`, and `columnfilters` kwargs. Use this function to generate
    input lists for other lcproc functions,
    e.g. `lcproc.lcvfeatures.parallel_varfeatures`,
    `lcproc.periodfinding.parallel_pf`, and `lcproc.lcbin.parallel_timebin`,
    among others.

    The operations are applied in this order if more than one is specified:
    `xmatchexternal` -> `conesearch` -> `columnfilters`. All results from
    these operations are joined using a logical AND operation.

    Parameters
    ----------

    lc_catalog : str
        This is the path to the light curve catalog pickle file produced by
        `make_lclist`.

    objectidcol : str
        This is the name of the object ID column in the light curve catalog.

    racol : str
        This is the name of the RA column in the light curve catalog.

    declcol : str
        This is the name of the Dec column in the light curve catalog.

    xmatchexternal : str or None
        If provided, this is the filename of a text file containing objectids,
        ras and decs to match the objects in the light curve catalog to by
        their positions.

    xmatchdistarcsec : float
        This is the distance in arcseconds to use when cross-matching to the
        external catalog in `xmatchexternal`.

    externalcolnums : sequence of int
        This is a list of the zero-indexed column numbers of columns to
        extract from the external catalog file.

    externalcolnames : sequence of str
        This is a list of names of columns that will be extracted from the
        external catalog file. This is the same length as
        `externalcolnums`. These must contain the names provided as the
        `objectid`, `ra`, and `decl` column names so this function knows which
        column numbers correspond to those columns and can use them to set up
        the cross-match.

    externalcoldtypes : str
        This is a CSV string containing numpy dtype definitions for all
        columns listed to extract from the external catalog file. The number
        of dtype definitions should be equal to the number of columns to
        extract.

    externalcolsep : str or None
        The column separator to use when extracting columns from the external
        catalog file. If None, any whitespace between columns is used as the
        separator.

    externalcommentchar : str
        The character indicating that a line in the external catalog file is
        to be ignored.

    conesearch : list of float
        This is used to specify cone-search parameters. It should be a three
        element list:

        [center_ra_deg, center_decl_deg, search_radius_deg]

    conesearchworkers : int
        The number of parallel workers to launch for the cone-search
        operation.

    columnfilters : list of str
        This is a list of strings indicating any filters to apply on each
        column in the light curve catalog. All column filters are applied in
        the specified sequence and are combined with a logical AND operator.
        The format of each filter string should be:

        '<lc_catalog column>|<operator>|<operand>'

        where:

        - <lc_catalog column> is a column in the lc_catalog pickle file

        - <operator> is one of: 'lt', 'gt', 'le', 'ge', 'eq', 'ne', which
          correspond to the usual operators: <, >, <=, >=, ==, !=
          respectively.

        - <operand> is a float, int, or string.

    field_fitsfile : str or None
        If this is not None, it should be the path to a FITS image containing
        the objects these light curves are for. If this is provided,
        `filter_lclist` will use the WCS information in the FITS itself if
        `field_wcsfrom` is None (or from a WCS header file pointed to by
        `field_wcsfrom`) to obtain x and y pixel coordinates for all of the
        objects in the field. A finder chart will also be made using
        `astrobase.plotbase.fits_finder_chart` using the corresponding
        `field_scale`, `_stretch`, `_colormap`, `_findersize`, `_pltopts`,
        `_grid`, and `_gridcolor` kwargs for that function, reproduced here
        to enable customization of the finder chart plot.

    field_wcsfrom : str or None
        If `wcsfrom` is None, the WCS to transform the RA/Dec to pixel x/y
        will be taken from the FITS header of `fitsfile`. If this is not None,
        it must be a FITS or similar file that contains a WCS header in its
        first extension.

    field_scale : astropy.visualization.Interval object
        `scale` sets the normalization for the FITS pixel values. This is an
        astropy.visualization Interval object.
        See http://docs.astropy.org/en/stable/visualization/normalization.html
        for details on `scale` and `stretch` objects.

    field_stretch : astropy.visualization.Stretch object
        `stretch` sets the stretch function for mapping FITS pixel values to
        output pixel values. This is an astropy.visualization Stretch object.
        See http://docs.astropy.org/en/stable/visualization/normalization.html
        for details on `scale` and `stretch` objects.

    field_colormap : matplotlib Colormap object
        `colormap` is a matplotlib color map object to use for the output
        image.

    field_findersize : None or tuple of two ints
        If `findersize` is None, the output image size will be set by the
        NAXIS1 and NAXIS2 keywords in the input `fitsfile` FITS header.
        Otherwise, `findersize` must be a tuple with the intended x and y size
        of the image in inches (all output images will use a DPI = 100).

    field_pltopts : dict
        `field_pltopts` controls how the overlay points will be plotted. This
        is a dict with standard matplotlib marker, etc. kwargs as key-val
        pairs, e.g. 'markersize', 'markerfacecolor', etc. The default options
        make red outline circles at the location of each object in the
        overlay.

    field_grid : bool
        `grid` sets if a grid will be made on the output image.

    field_gridcolor : str
        `gridcolor` sets the color of the grid lines. This is a usual
        matplotlib color spec string.

    field_zoomcontain : bool
        `field_zoomcontain` controls if the finder chart will be zoomed to
        just contain the overlaid points. Everything outside the footprint of
        these points will be discarded.

    copylcsto : str or None
        If this is provided, it is interpreted as a directory target to copy
        all the light curves that match the specified conditions.

    Returns
    -------

    tuple
        Returns a two-element tuple: (matching_object_lcfiles,
        matching_objectids) if conesearch and/or column filters are used. If
        `xmatchexternal` is also used, a three-element tuple is returned:
        (matching_object_lcfiles, matching_objectids,
        extcat_matched_objectids).

    '''

    with open(lc_catalog,'rb') as infd:
        lclist = pickle.load(infd)

    # generate numpy arrays of the matching object indexes. we do it this way
    # so we can AND everything at the end, instead of having to look up the
    # objects at these indices and running the columnfilter on them
    xmatch_matching_index = np.full_like(lclist['objects'][objectidcol],
                                         False,
                                         dtype=bool)
    conesearch_matching_index = np.full_like(lclist['objects'][objectidcol],
                                             False,
                                             dtype=bool)

    # do the xmatch first
    ext_matches = []
    ext_matching_objects = []

    if (xmatchexternal and
        isinstance(xmatchexternal, str) and
        os.path.exists(xmatchexternal)):

        try:

            # read in the external file
            extcat = np.genfromtxt(xmatchexternal,
                                   usecols=externalcolnums,
                                   delimiter=externalcolsep,
                                   names=externalcolnames,
                                   dtype=externalcoldtypes,
                                   comments=externalcommentchar)

            ext_cosdecl = np.cos(np.radians(extcat['decl']))
            ext_sindecl = np.sin(np.radians(extcat['decl']))
            ext_cosra = np.cos(np.radians(extcat['ra']))
            ext_sinra = np.sin(np.radians(extcat['ra']))

            ext_xyz = np.column_stack((ext_cosra*ext_cosdecl,
                                       ext_sinra*ext_cosdecl,
                                       ext_sindecl))
            ext_xyzdist = 2.0 * np.sin(
                np.radians(xmatchdistarcsec/3600.0)/2.0
            )

            # get our kdtree
            our_kdt = lclist['kdtree']

            # get the external kdtree
            ext_kdt = sps.cKDTree(ext_xyz)

            # do a query_ball_tree
            extkd_matchinds = ext_kdt.query_ball_tree(our_kdt, ext_xyzdist)

            for extind, mind in enumerate(extkd_matchinds):
                if len(mind) > 0:
                    ext_matches.append(mind[0])
                    # get the whole matching row for the ext objects recarray
                    ext_matching_objects.append(extcat[extind])

            ext_matches = np.array(ext_matches)

            if ext_matches.size > 0:

                # update the xmatch_matching_index
                xmatch_matching_index[ext_matches] = True

                LOGINFO('xmatch: objects matched to %s within %.1f arcsec: %s'
                        % (xmatchexternal,
                           xmatchdistarcsec,
                           ext_matches.size))

            else:
                LOGERROR("xmatch: no objects were cross-matched to external "
                         "catalog spec: %s, can't continue" % xmatchexternal)
                return None, None, None

        except Exception:

            LOGEXCEPTION('could not match to external catalog spec: %s' %
                         repr(xmatchexternal))
            raise

    # do the cone search next
    if (conesearch and
        isinstance(conesearch, (list, tuple)) and
        len(conesearch) == 3):

        try:

            racenter, declcenter, searchradius = conesearch
            cosdecl = np.cos(np.radians(declcenter))
            sindecl = np.sin(np.radians(declcenter))
            cosra = np.cos(np.radians(racenter))
            sinra = np.sin(np.radians(racenter))

            # this is the search distance in xyz unit vectors
            xyzdist = 2.0 * np.sin(np.radians(searchradius)/2.0)

            # get the kdtree
            our_kdt = lclist['kdtree']

            # look up the coordinates
            kdtindices = our_kdt.query_ball_point([cosra*cosdecl,
                                                   sinra*cosdecl,
                                                   sindecl],
                                                  xyzdist,
                                                  n_jobs=conesearchworkers)

            if kdtindices and len(kdtindices) > 0:

                LOGINFO('cone search: objects within %.4f deg '
                        'of (%.3f, %.3f): %s' %
                        (searchradius, racenter, declcenter, len(kdtindices)))

                # update the conesearch_matching_index
                matchingind = kdtindices
                conesearch_matching_index[np.array(matchingind)] = True

            # we fail immediately if we found nothing. this assumes the user
            # cares more about the cone-search than the regular column filters
            else:

                LOGERROR("cone-search: no objects were found within "
                         "%.4f deg of (%.3f, %.3f): %s, can't continue" %
                         (searchradius, racenter,
                          declcenter, len(kdtindices)))
                return None, None

        except Exception:

            LOGEXCEPTION('cone-search: could not run a cone-search, '
                         'is there a kdtree present in %s?' % lc_catalog)
            raise

    # now that we're done with cone-search, do the column filtering
    allfilterinds = []
    if columnfilters and isinstance(columnfilters, list):

        # go through each filter
        for cfilt in columnfilters:

            try:

                fcol, foperator, foperand = cfilt.split('|')
                foperator = FILTEROPS[foperator]

                # generate the evalstring
                filterstr = (
                    "np.isfinite(lclist['objects']['%s']) & "
                    "(lclist['objects']['%s'] %s %s)"
                ) % (fcol, fcol, foperator, foperand)
                filterind = eval(filterstr)

                ngood = lclist['objects'][objectidcol][filterind].size
                LOGINFO('filter: %s -> objects matching: %s ' %
                        (cfilt, ngood))

                allfilterinds.append(filterind)

            except Exception:

                LOGEXCEPTION('filter: could not understand filter spec: %s'
                             % cfilt)
                LOGWARNING('filter: not applying this broken filter')

    # now that we have all the filter indices good to go
    # logical-AND all the things

    # make sure we only do filtering if we were told to do so
    if (xmatchexternal or conesearch or columnfilters):

        filterstack = []
        if xmatchexternal:
            filterstack.append(xmatch_matching_index)
        if conesearch:
            filterstack.append(conesearch_matching_index)
        if columnfilters:
            filterstack.extend(allfilterinds)

        finalfilterind = np.column_stack(filterstack)
        finalfilterind = np.all(finalfilterind, axis=1)

        # get the filtered object light curves and object names
        filteredobjectids = lclist['objects'][objectidcol][finalfilterind]
        filteredlcfnames = lclist['objects']['lcfname'][finalfilterind]

    else:

        # no filters were applied, so everything matches; this also keeps
        # `finalfilterind` defined for the finder chart step below
        finalfilterind = np.full(lclist['objects'][objectidcol].size,
                                 True, dtype=bool)
        filteredobjectids = lclist['objects'][objectidcol]
        filteredlcfnames = lclist['objects']['lcfname']

    # if we're told to make a finder chart with the selected objects
    if field_fitsfile is not None and os.path.exists(field_fitsfile):

        # get the RA and DEC of the matching objects
        matching_ra = lclist['objects'][racol][finalfilterind]
        matching_decl = lclist['objects'][declcol][finalfilterind]

        matching_postfix = []

        if xmatchexternal is not None:
            matching_postfix.append(
                'xmatch_%s' %
                os.path.splitext(os.path.basename(xmatchexternal))[0]
            )
        if conesearch is not None:
            matching_postfix.append('conesearch_RA%.3f_DEC%.3f_RAD%.5f' %
                                    tuple(conesearch))

        if columnfilters is not None:
            for cfi, cf in enumerate(columnfilters):
                if cfi == 0:
                    matching_postfix.append('filter_%s_%s_%s' %
                                            tuple(cf.split('|')))
                else:
                    matching_postfix.append('_and_%s_%s_%s' %
                                            tuple(cf.split('|')))

        if len(matching_postfix) > 0:
            matching_postfix = '-%s' % '_'.join(matching_postfix)
        else:
            matching_postfix = ''

        # next, we'll make a PNG plot for the finder
        finder_outfile = os.path.join(
            os.path.dirname(lc_catalog),
            '%s%s.png' %
            (os.path.splitext(os.path.basename(lc_catalog))[0],
             matching_postfix)
        )

        finder_png = fits_finder_chart(
            field_fitsfile,
            finder_outfile,
            wcsfrom=field_wcsfrom,
            scale=field_scale,
            stretch=field_stretch,
            colormap=field_colormap,
            findersize=field_findersize,
            overlay_ra=matching_ra,
            overlay_decl=matching_decl,
            overlay_pltopts=field_pltopts,
            overlay_zoomcontain=field_zoomcontain,
            grid=field_grid,
            gridcolor=field_gridcolor
        )

        if finder_png is not None:
            LOGINFO('generated a finder PNG '
                    'with an object position overlay '
                    'for this filtered LC list: %s' % finder_png)

    # if copylcsto is not None, copy LCs over to it
    if copylcsto is not None:

        if not os.path.exists(copylcsto):
            os.mkdir(copylcsto)

        if TQDM:
            lciter = tqdm(filteredlcfnames)
        else:
            lciter = filteredlcfnames

        LOGINFO('copying matching light curves to %s' % copylcsto)

        for lc in lciter:
            shutil.copy(lc, copylcsto)

    LOGINFO('done. objects matching all filters: %s' %
            filteredobjectids.size)

    if xmatchexternal and len(ext_matching_objects) > 0:
        return filteredlcfnames, filteredobjectids, ext_matching_objects
    else:
        return filteredlcfnames, filteredobjectids
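
# A usage sketch for filter_lclist; the catalog path, cone-search center, and
# column filter are hypothetical. Internally, angular search radii are
# converted to 3D chord lengths on the unit sphere, chord = 2*sin(radius/2),
# which is the distance the kdtree queries operate on.
#
#   lcfnames, objectids = filter_lclist(
#       'my-lightcurve-catalog.pkl',
#       conesearch=[270.0, -30.0, 0.5],    # center RA, Dec, radius (deg)
#       columnfilters=['ndet|gt|100']      # keep objects with ndet > 100
#   )
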

############################################################
## ADDING CHECKPLOT INFO BACK TO THE LIGHT CURVE CATALOGS ##
############################################################

def _cpinfo_key_worker(task):
    '''This wraps `checkplotlist.checkplot_infokey_worker`.

    This is used to get the correct dtype for each element in retrieved
    results.

    Parameters
    ----------

    task : tuple
        task[0] = cpfile
        task[1] = keyspeclist (infokeys kwarg from `add_cpinfo_to_lclist`)

    Returns
    -------

    list
        All of the requested key values from the checkplot are returned as a
        list, with the checkplot's objectid prepended as the first element.

    '''

    cpfile, keyspeclist = task

    keystoget = [x[0] for x in keyspeclist]
    nonesubs = [x[-2] for x in keyspeclist]
    nansubs = [x[-1] for x in keyspeclist]

    # reform the keystoget into a list of lists
    for i, k in enumerate(keystoget):

        thisk = k.split('.')
        thisk = [(int(x) if x.isdecimal() else x) for x in thisk]
        keystoget[i] = thisk

    # add in the objectid as well to match to the object catalog later
    keystoget.insert(0,['objectid'])
    nonesubs.insert(0, '')
    nansubs.insert(0,'')

    # get all the keys we need
    vals = checkplot_infokey_worker((cpfile, keystoget))

    # if they have some Nones, nans, etc., reform them as expected
    for val, nonesub, nansub, valind in zip(vals, nonesubs,
                                            nansubs, range(len(vals))):

        if val is None:
            outval = nonesub
        elif isinstance(val, float) and not np.isfinite(val):
            outval = nansub
        elif isinstance(val, (list, tuple)):
            outval = ', '.join(val)
        else:
            outval = val

        vals[valind] = outval

    return vals


CPINFO_DEFAULTKEYS = [
    # key, dtype, first level, overwrite=T|append=F, None sub, nan sub
    ('comments', np.unicode_, False, True, '', ''),
    ('objectinfo.objecttags', np.unicode_, True, True, '', ''),
    ('objectinfo.twomassid', np.unicode_, True, True, '', ''),
    ('objectinfo.bmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.vmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.rmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.imag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.jmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.hmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.kmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.sdssu', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.sdssg', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.sdssr', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.sdssi', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.sdssz', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_bmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_vmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_rmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_imag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_jmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_hmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_kmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_sdssu', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_sdssg', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_sdssr', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_sdssi', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_sdssz', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_bmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_vmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_rmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_imag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_jmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_hmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_kmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_sdssu', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_sdssg', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_sdssr', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_sdssi', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_sdssz', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.color_classes', np.unicode_, True, True, '', ''),
    ('objectinfo.pmra', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.pmdecl', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.propermotion', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.rpmj', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.gl', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.gb', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.gaia_status', np.unicode_, True, True, '', ''),
    ('objectinfo.gaia_ids.0', np.unicode_, True, True, '', ''),
    ('objectinfo.gaiamag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.gaia_parallax', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.gaia_parallax_err', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.gaia_absmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.simbad_best_mainid', np.unicode_, True, True, '', ''),
    ('objectinfo.simbad_best_objtype', np.unicode_, True, True, '', ''),
    ('objectinfo.simbad_best_allids', np.unicode_, True, True, '', ''),
    ('objectinfo.simbad_best_distarcsec', np.float_, True, True,
     np.nan, np.nan),
    #
    # TIC info
    #
    ('objectinfo.ticid', np.unicode_, True, True, '', ''),
    ('objectinfo.tic_version', np.unicode_, True, True, '', ''),
    ('objectinfo.tessmag', np.float_, True, True, np.nan, np.nan),
    #
    # variability info
    #
    ('varinfo.vartags', np.unicode_, False, True, '', ''),
    ('varinfo.varperiod', np.float_, False, True, np.nan, np.nan),
    ('varinfo.varepoch', np.float_, False, True, np.nan, np.nan),
    ('varinfo.varisperiodic', np.int_, False, True, 0, 0),
    ('varinfo.objectisvar', np.int_, False, True, 0, 0),
    ('varinfo.features.median', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.mad', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.stdev', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.mag_iqr', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.skew', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.kurtosis', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.stetsonj', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.stetsonk', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.eta_normal', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.linear_fit_slope', np.float_, False, True,
     np.nan, np.nan),
    ('varinfo.features.magnitude_ratio', np.float_, False, True,
     np.nan, np.nan),
    ('varinfo.features.beyond1std', np.float_, False, True, np.nan, np.nan)
]
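
# Each entry above is a six-element keyspec tuple:
# (key, dtype, is-first-level, overwrite, None-substitute, nan-substitute).
# A hypothetical custom key can be appended before calling
# add_cpinfo_to_lclist below:
#
#   my_infokeys = list(CPINFO_DEFAULTKEYS)
#   my_infokeys.append(
#       ('varinfo.features.amplitude', np.float_, False, True,
#        np.nan, np.nan)
#   )
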

def add_cpinfo_to_lclist(
        checkplots,  # list or a directory path
        initial_lc_catalog,
        magcol,  # to indicate checkplot magcol
        outfile,
        checkplotglob='checkplot*.pkl*',
        infokeys=CPINFO_DEFAULTKEYS,
        nworkers=NCPUS
):
    '''This adds checkplot info to the initial light curve catalogs generated
    by `make_lclist`.

    This is used to incorporate all the extra info checkplots can have for
    objects back into columns in the light curve catalog produced by
    `make_lclist`. Objects are matched between the checkplots and the light
    curve catalog using their `objectid`. This then allows one to search this
    'augmented' light curve catalog by these extra columns. The 'augmented'
    light curve catalog also forms the basis for the search interface provided
    by the LCC-Server.

    The default list of keys that will be extracted from a checkplot and
    added as columns in the initial light curve catalog is listed above in
    the `CPINFO_DEFAULTKEYS` list.

    Parameters
    ----------

    checkplots : str or list
        If this is a str, is interpreted as a directory which will be searched
        for checkplot pickle files using `checkplotglob`. If this is a list,
        it will be interpreted as a list of checkplot pickle files to process.

    initial_lc_catalog : str
        This is the path to the light curve catalog pickle made by
        `make_lclist`.

    magcol : str
        This is used to indicate the light curve magnitude column to extract
        magnitude column specific information. For example, Stetson
        variability indices can be generated using magnitude measurements in
        separate photometric apertures, which appear in separate `magcols` in
        the checkplot. To associate each such feature of the object with its
        specific `magcol`, pass that `magcol` in here. This `magcol` will then
        be added as a prefix to the resulting column in the 'augmented' LC
        catalog, e.g. Stetson J will appear as `magcol1_stetsonj` and
        `magcol2_stetsonj` for two separate magcols.

    outfile : str
        This is the file name of the output 'augmented' light curve catalog
        pickle file that will be written.

    checkplotglob : str
        If `checkplots` is a directory, this UNIX fileglob is used to search
        for checkplot pickle files in that directory.

    infokeys : list of tuples
        This is a list of keys to extract from the checkplot and some info on
        how this extraction is to be done. Each key entry is a six-element
        tuple of the following form:

        - key name in the checkplot

        - numpy dtype of the value of this key

        - False if key is associated with a magcol or True otherwise

        - False if subsequent updates to the same column name will append to
          existing key values in the output augmented light curve catalog or
          True if these will overwrite the existing key value

        - character to use to substitute a None value of the key in the
          checkplot in the output light curve catalog column

        - character to use to substitute a nan value of the key in the
          checkplot in the output light curve catalog column

        See the `CPINFO_DEFAULTKEYS` list above for examples.

    nworkers : int
        The number of parallel workers to launch to extract checkplot
        information.

    Returns
    -------

    str
        Returns the path to the generated 'augmented' light curve catalog
        pickle file.

    '''

    # get the checkplots from the directory if one is provided
    if not isinstance(checkplots, list) and os.path.exists(checkplots):
        checkplots = sorted(glob.glob(os.path.join(checkplots,
                                                   checkplotglob)))

    tasklist = [(cpf, infokeys) for cpf in checkplots]

    # the context manager shuts down the pool when the map is done, so no
    # explicit executor.shutdown() call is needed
    with ProcessPoolExecutor(max_workers=nworkers) as executor:
        results = list(executor.map(_cpinfo_key_worker, tasklist))

    # now that we have all the checkplot info, we need to match to the
    # objectlist in the lclist

    # open the lclist
    with open(initial_lc_catalog,'rb') as infd:
        lc_catalog = pickle.load(infd)

    # convert the lc_catalog['columns'] item to a list if it's not
    # this is so we can append columns to it later
    lc_catalog['columns'] = list(lc_catalog['columns'])

    catalog_objectids = np.array(lc_catalog['objects']['objectid'])
    checkplot_objectids = np.array([x[0] for x in results])

    # add the extra key arrays in the lclist dict
    extrainfokeys = []
    actualkeys = []

    # set up the extrainfokeys list
    for keyspec in infokeys:

        key, dtype, firstlevel, overwrite_append, nonesub, nansub = keyspec

        if firstlevel:
            eik = key
        else:
            eik = '%s.%s' % (magcol, key)

        extrainfokeys.append(eik)

        # now handle the output dicts and column list
        eactual = eik.split('.')

        # this handles dereferenced list indices
        if not eactual[-1].isdigit():

            if not firstlevel:
                eactual = '.'.join([eactual[0], eactual[-1]])
            else:
                eactual = eactual[-1]

        else:

            elastkey = eactual[-2]

            # for list columns, this converts stuff like errs -> err,
            # and parallaxes -> parallax
            if elastkey.endswith('es'):
                elastkey = elastkey[:-2]
            elif elastkey.endswith('s'):
                elastkey = elastkey[:-1]

            if not firstlevel:
                eactual = '.'.join([eactual[0], elastkey])
            else:
                eactual = elastkey

        actualkeys.append(eactual)

        # add a new column only if required
        if eactual not in lc_catalog['columns']:
            lc_catalog['columns'].append(eactual)

        # we'll overwrite earlier existing columns in any case
        lc_catalog['objects'][eactual] = []

    # now go through each objectid in the catalog and add the extra keys to
    # their respective arrays; fall back to a plain iterator if tqdm is not
    # installed
    if TQDM:
        cobjiter = tqdm(catalog_objectids)
    else:
        cobjiter = catalog_objectids

    for catobj in cobjiter:

        cp_objind = np.where(checkplot_objectids == catobj)

        if len(cp_objind[0]) > 0:

            # get the info line for this checkplot
            thiscpinfo = results[cp_objind[0][0]]

            # the first element is the objectid which we remove
            thiscpinfo = thiscpinfo[1:]

            # update the object catalog entries for this object
            for ekind, ek in enumerate(actualkeys):

                # add the actual thing to the output list
                lc_catalog['objects'][ek].append(
                    thiscpinfo[ekind]
                )

        else:

            # update the object catalog entries for this object
            for ekind, ek in enumerate(actualkeys):

                thiskeyspec = infokeys[ekind]
                nonesub = thiskeyspec[-2]

                lc_catalog['objects'][ek].append(
                    nonesub
                )

    # now we should have all the new keys in the object catalog
    # turn them into arrays
    for ek in actualkeys:

        lc_catalog['objects'][ek] = np.array(
            lc_catalog['objects'][ek]
        )

    # add the magcol to the lc_catalog
    if 'magcols' in lc_catalog:
        if magcol not in lc_catalog['magcols']:
            lc_catalog['magcols'].append(magcol)
    else:
        lc_catalog['magcols'] = [magcol]

    # write back the new object catalog
    with open(outfile, 'wb') as outfd:
        pickle.dump(lc_catalog, outfd, protocol=pickle.HIGHEST_PROTOCOL)

    return outfile
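
# A usage sketch for add_cpinfo_to_lclist; the paths and the magcol name are
# hypothetical (the magcol must match one used when the checkplots were
# made):
#
#   augmented_catalog = add_cpinfo_to_lclist(
#       '/data/checkplots',              # searched using checkplotglob
#       'my-lightcurve-catalog.pkl',
#       'aep_000',                       # checkplot magcol
#       'my-augmented-catalog.pkl',
#       nworkers=4
#   )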