#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# catalogs.py - Waqas Bhatti (wbhatti@astro.princeton.edu) - Feb 2019

'''

This contains functions to generate light curve catalogs from collections of
light curves.

'''

#############
## LOGGING ##
#############

import logging
from astrobase import log_sub, log_fmt, log_date_fmt

DEBUG = False
if DEBUG:
    level = logging.DEBUG
else:
    level = logging.INFO
LOGGER = logging.getLogger(__name__)
logging.basicConfig(
    level=level,
    style=log_sub,
    format=log_fmt,
    datefmt=log_date_fmt,
)

LOGDEBUG = LOGGER.debug
LOGINFO = LOGGER.info
LOGWARNING = LOGGER.warning
LOGERROR = LOGGER.error
LOGEXCEPTION = LOGGER.exception


#############
## IMPORTS ##
#############

import pickle
import os
import os.path
import glob
import shutil
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import numpy.random as npr
npr.seed(0xc0ffee)

import scipy.spatial as sps

import astropy.io.fits as pyfits
from astropy.wcs import WCS
from astropy.visualization import ZScaleInterval, LinearStretch

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

try:
    from tqdm import tqdm
    TQDM = True
except Exception:
    TQDM = False

# to turn a list of keys into a dict address
# from https://stackoverflow.com/a/14692747
from functools import reduce
from operator import getitem


def _dict_get(datadict, keylist):
    return reduce(getitem, keylist, datadict)
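
# Example (values are made up): given a nested lcdict like the ones produced
# by registered LC reader functions, _dict_get walks down one level per key:
#
#   lcdict = {'objectinfo': {'varinfo': {'features': {'stetsonj': 0.45}}}}
#   _dict_get(lcdict, ['objectinfo', 'varinfo', 'features', 'stetsonj'])
#   # -> 0.45
#
# This is what lets column specs like 'objectinfo.varinfo.features.stetsonj'
# address values at arbitrary depth after being split on '.'.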


############
## CONFIG ##
############

NCPUS = mp.cpu_count()

# these translate filter operators given as strings to Python operators
FILTEROPS = {'eq':'==',
             'gt':'>',
             'ge':'>=',
             'lt':'<',
             'le':'<=',
             'ne':'!='}
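
# For example, a filter_lclist spec like 'sdssr|lt|12.0' is split on '|',
# 'lt' is looked up in FILTEROPS, and the filter is then evaluated as:
#
#   np.isfinite(lclist['objects']['sdssr']) & \
#       (lclist['objects']['sdssr'] < 12.0)
#
# (see filter_lclist below). The column name here is illustrative; it must be
# a column present in your generated LC catalog.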


###################
## LOCAL IMPORTS ##
###################

from astrobase.plotbase import fits_finder_chart
from astrobase.cpserver.checkplotlist import checkplot_infokey_worker
from astrobase.lcproc import get_lcformat


#####################################################
## FUNCTIONS TO GENERATE OBJECT CATALOGS (LCLISTS) ##
#####################################################

def _lclist_parallel_worker(task):
    '''This is a parallel worker for makelclist.

    Parameters
    ----------

    task : tuple
        This is a tuple containing the following items:

        task[0] = lcf
        task[1] = columns
        task[2] = lcformat
        task[3] = lcformatdir
        task[4] = lcndetkey

    Returns
    -------

    dict or None
        This contains all of the info for the object processed in this LC
        read operation. If the read fails, None is returned instead.

    '''

    lcf, columns, lcformat, lcformatdir, lcndetkey = task

    # get the bits needed for lcformat handling
    # NOTE: we re-import things in this worker function because sometimes
    # functions can't be pickled correctly for passing them to worker functions
    # in a processing pool
    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # we store the full path of the light curve
    lcobjdict = {'lcfname':os.path.abspath(lcf)}

    try:

        # read the light curve in
        lcdict = readerfunc(lcf)

        # this should handle lists/tuples being returned by readerfunc
        # we assume that the first element is the actual lcdict
        # FIXME: figure out how to not need this assumption
        if ( (isinstance(lcdict, (list, tuple))) and
             (isinstance(lcdict[0], dict)) ):
            lcdict = lcdict[0]

        # insert all of the columns
        for colkey in columns:

            if '.' in colkey:
                getkey = colkey.split('.')
            else:
                getkey = [colkey]

            try:
                thiscolval = _dict_get(lcdict, getkey)
            except Exception:
                LOGWARNING('column %s does not exist for %s' %
                           (colkey, lcf))
                thiscolval = np.nan

            # update the lcobjdict with this value
            lcobjdict[getkey[-1]] = thiscolval

    except Exception:

        LOGEXCEPTION('could not figure out columns for %s' % lcf)

        # insert all of the columns as nans
        for colkey in columns:

            if '.' in colkey:
                getkey = colkey.split('.')
            else:
                getkey = [colkey]

            thiscolval = np.nan

            # update the lcobjdict with this value
            lcobjdict[getkey[-1]] = thiscolval

    # now get the actual ndets; this excludes nans and infs
    for dk in lcndetkey:

        try:

            if '.' in dk:
                getdk = dk.split('.')
            else:
                getdk = [dk]

            ndetcol = _dict_get(lcdict, getdk)
            actualndets = ndetcol[np.isfinite(ndetcol)].size
            lcobjdict['%s.ndet' % getdk[-1]] = actualndets

        except Exception:
            lcobjdict['%s.ndet' % getdk[-1]] = np.nan

    return lcobjdict
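
# A minimal sketch of how this worker is driven; make_lclist (below) builds
# these task tuples automatically. The LC path, format key, and magnitude
# column here are hypothetical:
#
#   task = ('/data/lcs/object-0001-lc.sqlite.gz',              # lcf
#           ('objectid', 'objectinfo.ra', 'objectinfo.decl'),  # columns
#           'hat-sql',                                         # lcformat
#           None,                                              # lcformatdir
#           ['aep_000'])                         # lcndetkey (the magcols)
#   lcobjdict = _lclist_parallel_worker(task)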


def make_lclist(basedir,
                outfile,
                use_list_of_filenames=None,
                lcformat='hat-sql',
                lcformatdir=None,
                fileglob=None,
                recursive=True,
                columns=('objectid',
                         'objectinfo.ra',
                         'objectinfo.decl',
                         'objectinfo.ndet'),
                makecoordindex=('objectinfo.ra','objectinfo.decl'),
                field_fitsfile=None,
                field_wcsfrom=None,
                field_scale=ZScaleInterval(),
                field_stretch=LinearStretch(),
                field_colormap=plt.cm.gray_r,
                field_findersize=None,
                field_pltopts={'marker':'o',
                               'markersize':10.0,
                               'markerfacecolor':'none',
                               'markeredgewidth':2.0,
                               'markeredgecolor':'red'},
                field_grid=False,
                field_gridcolor='k',
                field_zoomcontain=True,
                maxlcs=None,
                nworkers=NCPUS):
    '''This generates a light curve catalog for all light curves in a
    directory.

    Given a base directory where all the files are, and a light curve format,
    this will find all light curves, pull out the keys in each lcdict requested
    in the `columns` kwarg for each object, and write them to the requested
    output pickle file. These keys should be pointers to scalar values
    (i.e. something like `objectinfo.ra` is OK, but something like 'times'
    won't work because it's a vector).

    Generally, this works with light curve reading functions that produce
    lcdicts as detailed in the docstring for `lcproc.register_lcformat`. Once
    you've registered your light curve reader functions using the
    `lcproc.register_lcformat` function, pass in the `formatkey` associated
    with your light curve format, and this function will be able to read all
    light curves in that format as well as the object information stored in
    their `objectinfo` dict.

    Parameters
    ----------

    basedir : str or list of str
        If this is a str, points to a single directory to search for light
        curves. If this is a list of str, it must be a list of directories to
        search for light curves. All of these will be searched to find light
        curve files matching either your light curve format's default fileglob
        (when you registered your LC format), or a specific fileglob that you
        can pass in using the `fileglob` kwarg here. If the `recursive` kwarg
        is set, the provided directories will be searched recursively.

        If `use_list_of_filenames` is not None, it will override this argument
        and the function will take those light curves as the list of files it
        must process instead of whatever is specified in `basedir`.

    outfile : str
        This is the name of the output file to write. This will be a pickle
        file, so a good convention to use for this name is something like
        'my-lightcurve-catalog.pkl'.

    use_list_of_filenames : list of str or None
        Use this kwarg to override whatever is provided in `basedir` and
        directly pass in a list of light curve files to process. This can speed
        up this function by a lot because no searches on disk will be performed
        to find light curve files matching `basedir` and `fileglob`.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory where you've stored
        your lcformat description JSONs, other than the usual directories
        lcproc knows to search for them in. Use this along with `lcformat` to
        specify an LC format JSON file that's not currently registered with
        lcproc.

    fileglob : str or None
        If provided, is a string that is a valid UNIX filename glob. Used to
        override the default fileglob for this LC format when searching for
        light curve files in `basedir`.

    recursive : bool
        If True, the directories specified in `basedir` will be searched
        recursively for all light curve files that match the default fileglob
        for this LC format or a specific one provided in `fileglob`.

    columns : list of str
        This is a list of keys in the lcdict produced by your light curve
        reader function that contain object information, which will be
        extracted and put into the output light curve catalog. It's highly
        recommended that your LC reader function produce an lcdict that
        contains at least the default keys shown here.

        The lcdict keys to extract are specified by using an address scheme:

        - First level dict keys can be specified directly:
          e.g., 'objectid' will extract lcdict['objectid']

        - Keys at other levels can be specified by using a period to indicate
          the level:

          - e.g., 'objectinfo.ra' will extract lcdict['objectinfo']['ra']
          - e.g., 'objectinfo.varinfo.features.stetsonj' will extract
            lcdict['objectinfo']['varinfo']['features']['stetsonj']

    makecoordindex : list of two str or None
        This is used to specify which lcdict keys contain the right ascension
        and declination coordinates for this object. If these are provided, the
        output light curve catalog will have a kdtree built on all object
        coordinates, which enables fast spatial searches and cross-matching to
        external catalogs by `checkplot` and `lcproc` functions.

    field_fitsfile : str or None
        If this is not None, it should be the path to a FITS image containing
        the objects these light curves are for. If this is provided,
        `make_lclist` will use the WCS information in the FITS itself if
        `field_wcsfrom` is None (or from a WCS header file pointed to by
        `field_wcsfrom`) to obtain x and y pixel coordinates for all of the
        objects in the field. A finder chart will also be made using
        `astrobase.plotbase.fits_finder_chart` using the corresponding
        `field_scale`, `_stretch`, `_colormap`, `_findersize`, `_pltopts`,
        `_grid`, and `_gridcolor` kwargs for that function, reproduced here
        to enable customization of the finder chart plot.

    field_wcsfrom : str or None
        If `wcsfrom` is None, the WCS to transform the RA/Dec to pixel x/y
        will be taken from the FITS header of `fitsfile`. If this is not None,
        it must be a FITS or similar file that contains a WCS header in its
        first extension.

    field_scale : astropy.visualization.Interval object
        `scale` sets the normalization for the FITS pixel values. This is an
        astropy.visualization Interval object.
        See http://docs.astropy.org/en/stable/visualization/normalization.html
        for details on `scale` and `stretch` objects.

    field_stretch : astropy.visualization.Stretch object
        `stretch` sets the stretch function for mapping FITS pixel values to
        output pixel values. This is an astropy.visualization Stretch object.
        See http://docs.astropy.org/en/stable/visualization/normalization.html
        for details on `scale` and `stretch` objects.

    field_colormap : matplotlib Colormap object
        `colormap` is a matplotlib color map object to use for the output
        image.

    field_findersize : None or tuple of two ints
        If `findersize` is None, the output image size will be set by the
        NAXIS1 and NAXIS2 keywords in the input `fitsfile` FITS header.
        Otherwise, `findersize` must be a tuple with the intended x and y size
        of the image in inches (all output images will use a DPI = 100).

    field_pltopts : dict
        `field_pltopts` controls how the overlay points will be plotted. This
        is a dict with standard matplotlib marker, etc. kwargs as key-val
        pairs, e.g. 'markersize', 'markerfacecolor', etc. The default options
        make red outline circles at the location of each object in the
        overlay.

    field_grid : bool
        `grid` sets if a grid will be made on the output image.

    field_gridcolor : str
        `gridcolor` sets the color of the grid lines. This is a usual
        matplotlib color spec string.

    field_zoomcontain : bool
        `field_zoomcontain` controls if the finder chart will be zoomed to
        just contain the overlaid points. Everything outside the footprint of
        these points will be discarded.

    maxlcs : int or None
        This sets how many light curves to process in the input LC list
        generated by searching for LCs in `basedir` or in the list provided as
        `use_list_of_filenames`.

    nworkers : int
        This sets the number of parallel workers to launch to collect
        information from the light curves.

    Returns
    -------

    str
        Returns the path to the generated light curve catalog pickle file.

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    if not fileglob:
        fileglob = dfileglob

    # this is to get the actual ndet
    # set to the magnitudes column
    lcndetkey = dmagcols

    if isinstance(use_list_of_filenames, list):

        matching = use_list_of_filenames

    else:

        # handle the case where basedir is a list of directories
        if isinstance(basedir, list):

            matching = []

            for bdir in basedir:

                # now find the files
                LOGINFO('searching for %s light curves in %s ...' %
                        (lcformat, bdir))

                if recursive is False:
                    matching.extend(glob.glob(os.path.join(bdir, fileglob)))
                else:
                    matching.extend(glob.glob(os.path.join(bdir,
                                                           '**',
                                                           fileglob),
                                              recursive=True))

        # otherwise, handle the usual case of one basedir to search in
        else:

            # now find the files
            LOGINFO('searching for %s light curves in %s ...' %
                    (lcformat, basedir))

            if recursive is False:
                matching = glob.glob(os.path.join(basedir, fileglob))
            else:
                matching = glob.glob(os.path.join(basedir,
                                                  '**',
                                                  fileglob),
                                     recursive=True)

    #
    # now that we have all the files, process them
    #
    if matching and len(matching) > 0:

        LOGINFO('found %s light curves' % len(matching))

        # cut down matching to maxlcs
        if maxlcs:
            matching = matching[:maxlcs]

        # prepare the output dict
        lclistdict = {
            'basedir':basedir,
            'lcformat':lcformat,
            'fileglob':fileglob,
            'recursive':recursive,
            'columns':columns,
            'makecoordindex':makecoordindex,
            'nfiles':len(matching),
            'objects': {
            }
        }

        # columns that will always be present in the output lclistdict
        derefcols = ['lcfname']
        derefcols.extend(['%s.ndet' % x.split('.')[-1] for x in lcndetkey])

        for dc in derefcols:
            lclistdict['objects'][dc] = []

        # fill in the rest of the lclist columns from the columns kwarg
        for col in columns:

            # dereference the column
            thiscol = col.split('.')
            thiscol = thiscol[-1]
            lclistdict['objects'][thiscol] = []
            derefcols.append(thiscol)

        # start collecting info
        LOGINFO('collecting light curve info...')

        tasks = [(x, columns, lcformat, lcformatdir, lcndetkey)
                 for x in matching]

        # the context manager shuts down the pool when the map is done, so no
        # explicit executor.shutdown() call is needed
        with ProcessPoolExecutor(max_workers=nworkers) as executor:
            results = list(executor.map(_lclist_parallel_worker, tasks))

        # update the columns in the overall dict from the results of the
        # parallel map
        for result in results:
            for xcol in derefcols:
                lclistdict['objects'][xcol].append(result[xcol])

        # done with collecting info
        # turn all of the lists in the lclistdict into arrays
        for col in lclistdict['objects']:
            lclistdict['objects'][col] = np.array(lclistdict['objects'][col])

        # handle duplicate objectids with different light curves
        uniques, counts = np.unique(lclistdict['objects']['objectid'],
                                    return_counts=True)
        duplicated_objectids = uniques[counts > 1]

        if duplicated_objectids.size > 0:

            # redo the objectid array so it has a bit larger dtype so the
            # extra tag can fit into the field
            dt = lclistdict['objects']['objectid'].dtype.str
            dt = '<U%s' % (
                int(dt.replace('<','').replace('U','').replace('S','')) + 3
            )
            lclistdict['objects']['objectid'] = np.array(
                lclistdict['objects']['objectid'],
                dtype=dt
            )

            for objid in duplicated_objectids:

                objid_inds = np.where(
                    lclistdict['objects']['objectid'] == objid
                )

                # mark the duplicates, assume the first instance is the
                # actual one
                for ncounter, nind in enumerate(objid_inds[0][1:]):
                    lclistdict['objects']['objectid'][nind] = '%s-%s' % (
                        lclistdict['objects']['objectid'][nind],
                        ncounter+2
                    )
                    LOGWARNING(
                        'tagging duplicated instance %s of objectid: '
                        '%s as %s-%s, lightcurve: %s' %
                        (ncounter+2, objid, objid, ncounter+2,
                         lclistdict['objects']['lcfname'][nind])
                    )

        # if we're supposed to make a spatial index, do so
        if (makecoordindex and
            isinstance(makecoordindex, (list, tuple)) and
            len(makecoordindex) == 2):

            try:

                # deref the column names
                racol, declcol = makecoordindex
                racol = racol.split('.')[-1]
                declcol = declcol.split('.')[-1]

                # get the ras and decls
                objra, objdecl = (lclistdict['objects'][racol],
                                  lclistdict['objects'][declcol])

                # get the xyz unit vectors from ra,decl
                # since i had to remind myself:
                # https://en.wikipedia.org/wiki/Equatorial_coordinate_system
                cosdecl = np.cos(np.radians(objdecl))
                sindecl = np.sin(np.radians(objdecl))
                cosra = np.cos(np.radians(objra))
                sinra = np.sin(np.radians(objra))
                xyz = np.column_stack((cosra*cosdecl,
                                       sinra*cosdecl,
                                       sindecl))

                # generate the kdtree
                kdt = sps.cKDTree(xyz,copy_data=True)

                # put the tree into the dict
                lclistdict['kdtree'] = kdt

                LOGINFO('kdtree generated for (ra, decl): (%s, %s)' %
                        (makecoordindex[0], makecoordindex[1]))

            except Exception:
                LOGEXCEPTION('could not make kdtree for (ra, decl): (%s, %s)' %
                             (makecoordindex[0], makecoordindex[1]))
                raise

        # generate the xy pairs if field_fitsfile is not None
        if field_fitsfile and os.path.exists(field_fitsfile):

            # read in the FITS file
            if field_wcsfrom is None:

                hdulist = pyfits.open(field_fitsfile)
                hdr = hdulist[0].header
                hdulist.close()

                w = WCS(hdr)
                wcsok = True

            elif os.path.exists(field_wcsfrom):

                w = WCS(field_wcsfrom)
                wcsok = True

            else:

                LOGERROR('could not determine WCS info for input FITS: %s' %
                         field_fitsfile)
                wcsok = False

            if wcsok:

                # first, transform the ra/decl to x/y and put these in the
                # lclist output dict
                radecl = np.column_stack((objra, objdecl))
                lclistdict['objects']['framexy'] = w.all_world2pix(
                    radecl,
                    1
                )

                # next, we'll make a PNG plot for the finder
                finder_outfile = os.path.join(
                    os.path.dirname(outfile),
                    os.path.splitext(os.path.basename(outfile))[0] + '.png'
                )

                finder_png = fits_finder_chart(
                    field_fitsfile,
                    finder_outfile,
                    wcsfrom=field_wcsfrom,
                    scale=field_scale,
                    stretch=field_stretch,
                    colormap=field_colormap,
                    findersize=field_findersize,
                    overlay_ra=objra,
                    overlay_decl=objdecl,
                    overlay_pltopts=field_pltopts,
                    overlay_zoomcontain=field_zoomcontain,
                    grid=field_grid,
                    gridcolor=field_gridcolor
                )

                if finder_png is not None:
                    LOGINFO('generated a finder PNG '
                            'with an object position overlay '
                            'for this LC list: %s' % finder_png)

        # write the pickle
        with open(outfile,'wb') as outfd:
            pickle.dump(lclistdict, outfd, protocol=pickle.HIGHEST_PROTOCOL)

        LOGINFO('done. LC info -> %s' % outfile)
        return outfile

    else:

        LOGERROR('no files found in %s matching %s' % (basedir, fileglob))
        return None
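
# A usage sketch for make_lclist; the directory and file names are
# hypothetical, and 'hat-sql' is simply this function's default format key:
#
#   lc_catalog = make_lclist(
#       '/data/lightcurves',            # searched recursively by default
#       'my-lightcurve-catalog.pkl',
#       lcformat='hat-sql',
#       columns=('objectid', 'objectinfo.ra',
#                'objectinfo.decl', 'objectinfo.ndet'),
#       makecoordindex=('objectinfo.ra', 'objectinfo.decl'),
#       nworkers=4
#   )
#
# The output pickle's 'objects' key maps column names to numpy arrays, and a
# 'kdtree' key holds the cKDTree built on unit-sphere x,y,z positions when
# makecoordindex is provided.
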
def filter_lclist(lc_catalog,
                  objectidcol='objectid',
                  racol='ra',
                  declcol='decl',
                  xmatchexternal=None,
                  xmatchdistarcsec=3.0,
                  externalcolnums=(0,1,2),
                  externalcolnames=('objectid','ra','decl'),
                  externalcoldtypes='U20,f8,f8',
                  externalcolsep=None,
                  externalcommentchar='#',
                  conesearch=None,
                  conesearchworkers=1,
                  columnfilters=None,
                  field_fitsfile=None,
                  field_wcsfrom=None,
                  field_scale=ZScaleInterval(),
                  field_stretch=LinearStretch(),
                  field_colormap=plt.cm.gray_r,
                  field_findersize=None,
                  field_pltopts={'marker':'o',
                                 'markersize':10.0,
                                 'markerfacecolor':'none',
                                 'markeredgewidth':2.0,
                                 'markeredgecolor':'red'},
                  field_grid=False,
                  field_gridcolor='k',
                  field_zoomcontain=True,
                  copylcsto=None):
    '''This is used to perform cone-search, cross-match, and column-filter
    operations on a light curve catalog generated by `make_lclist`.

    Uses the output of `make_lclist` above. This function returns a list of
    light curves matching various criteria specified by the `xmatchexternal`,
    `conesearch`, and `columnfilters` kwargs. Use this function to generate
    input lists for other lcproc functions,
    e.g. `lcproc.lcvfeatures.parallel_varfeatures`,
    `lcproc.periodfinding.parallel_pf`, and `lcproc.lcbin.parallel_timebin`,
    among others.

    The operations are applied in this order if more than one is specified:
    `xmatchexternal` -> `conesearch` -> `columnfilters`. All results from
    these operations are joined using a logical AND operation.

    Parameters
    ----------

    lc_catalog : str
        This is the path to the light curve catalog pickle file produced by
        `make_lclist`.

    objectidcol : str
        This is the name of the object ID column in the light curve catalog.

    racol : str
        This is the name of the RA column in the light curve catalog.

    declcol : str
        This is the name of the Dec column in the light curve catalog.

    xmatchexternal : str or None
        If provided, this is the filename of a text file containing objectids,
        ras and decs to match the objects in the light curve catalog to by
        their positions.

    xmatchdistarcsec : float
        This is the distance in arcseconds to use when cross-matching to the
        external catalog in `xmatchexternal`.

    externalcolnums : sequence of int
        This is a list of the zero-indexed column numbers of columns to
        extract from the external catalog file.

    externalcolnames : sequence of str
        This is a list of names of columns that will be extracted from the
        external catalog file. This is the same length as
        `externalcolnums`. These must contain the names provided as the
        `objectid`, `ra`, and `decl` column names so this function knows which
        column numbers correspond to those columns and can use them to set up
        the cross-match.

    externalcoldtypes : str
        This is a CSV string containing numpy dtype definitions for all
        columns listed to extract from the external catalog file. The number
        of dtype definitions should be equal to the number of columns to
        extract.

    externalcolsep : str or None
        The column separator to use when extracting columns from the external
        catalog file. If None, any whitespace between columns is used as the
        separator.

    externalcommentchar : str
        The character indicating that a line in the external catalog file is
        to be ignored.

    conesearch : list of float
        This is used to specify cone-search parameters. It should be a three
        element list:

        [center_ra_deg, center_decl_deg, search_radius_deg]

    conesearchworkers : int
        The number of parallel workers to launch for the cone-search
        operation.

    columnfilters : list of str
        This is a list of strings indicating any filters to apply on each
        column in the light curve catalog. All column filters are applied in
        the specified sequence and are combined with a logical AND operator.
        The format of each filter string should be:

        '<lc_catalog column>|<operator>|<operand>'

        where:

        - <lc_catalog column> is a column in the lc_catalog pickle file

        - <operator> is one of: 'lt', 'gt', 'le', 'ge', 'eq', 'ne', which
          correspond to the usual operators: <, >, <=, >=, ==, !=
          respectively.

        - <operand> is a float, int, or string.

    field_fitsfile : str or None
        If this is not None, it should be the path to a FITS image containing
        the objects these light curves are for. If this is provided,
        `filter_lclist` will use the WCS information in the FITS itself if
        `field_wcsfrom` is None (or from a WCS header file pointed to by
        `field_wcsfrom`) to obtain x and y pixel coordinates for all of the
        objects in the field. A finder chart will also be made using
        `astrobase.plotbase.fits_finder_chart` using the corresponding
        `field_scale`, `_stretch`, `_colormap`, `_findersize`, `_pltopts`,
        `_grid`, and `_gridcolor` kwargs for that function, reproduced here
        to enable customization of the finder chart plot.

    field_wcsfrom : str or None
        If `wcsfrom` is None, the WCS to transform the RA/Dec to pixel x/y
        will be taken from the FITS header of `fitsfile`. If this is not None,
        it must be a FITS or similar file that contains a WCS header in its
        first extension.

    field_scale : astropy.visualization.Interval object
        `scale` sets the normalization for the FITS pixel values. This is an
        astropy.visualization Interval object.
        See http://docs.astropy.org/en/stable/visualization/normalization.html
        for details on `scale` and `stretch` objects.

    field_stretch : astropy.visualization.Stretch object
        `stretch` sets the stretch function for mapping FITS pixel values to
        output pixel values. This is an astropy.visualization Stretch object.
        See http://docs.astropy.org/en/stable/visualization/normalization.html
        for details on `scale` and `stretch` objects.

    field_colormap : matplotlib Colormap object
        `colormap` is a matplotlib color map object to use for the output
        image.

    field_findersize : None or tuple of two ints
        If `findersize` is None, the output image size will be set by the
        NAXIS1 and NAXIS2 keywords in the input `fitsfile` FITS header.
        Otherwise, `findersize` must be a tuple with the intended x and y size
        of the image in inches (all output images will use a DPI = 100).

    field_pltopts : dict
        `field_pltopts` controls how the overlay points will be plotted. This
        is a dict with standard matplotlib marker, etc. kwargs as key-val
        pairs, e.g. 'markersize', 'markerfacecolor', etc. The default options
        make red outline circles at the location of each object in the
        overlay.

    field_grid : bool
        `grid` sets if a grid will be made on the output image.

    field_gridcolor : str
        `gridcolor` sets the color of the grid lines. This is a usual
        matplotlib color spec string.

    field_zoomcontain : bool
        `field_zoomcontain` controls if the finder chart will be zoomed to
        just contain the overlaid points. Everything outside the footprint of
        these points will be discarded.

    copylcsto : str or None
        If this is provided, it is interpreted as a directory target to copy
        all the light curves that match the specified conditions.

    Returns
    -------

    tuple
        Returns a two-element tuple: (matching_object_lcfiles,
        matching_objectids) if conesearch and/or column filters are used. If
        `xmatchexternal` is also used, a three-element tuple is returned:
        (matching_object_lcfiles, matching_objectids,
        extcat_matched_objectids).

    '''

    with open(lc_catalog,'rb') as infd:
        lclist = pickle.load(infd)

    # generate numpy arrays of the matching object indexes. we do it this way
    # so we can AND everything at the end, instead of having to look up the
    # objects at these indices and running the columnfilter on them
    xmatch_matching_index = np.full_like(lclist['objects'][objectidcol],
                                         False,
                                         dtype=bool)
    conesearch_matching_index = np.full_like(lclist['objects'][objectidcol],
                                             False,
                                             dtype=bool)

    # do the xmatch first
    ext_matches = []
    ext_matching_objects = []

    if (xmatchexternal and
        isinstance(xmatchexternal, str) and
        os.path.exists(xmatchexternal)):

        try:

            # read in the external file
            extcat = np.genfromtxt(xmatchexternal,
                                   usecols=externalcolnums,
                                   delimiter=externalcolsep,
                                   names=externalcolnames,
                                   dtype=externalcoldtypes,
                                   comments=externalcommentchar)

            ext_cosdecl = np.cos(np.radians(extcat['decl']))
            ext_sindecl = np.sin(np.radians(extcat['decl']))
            ext_cosra = np.cos(np.radians(extcat['ra']))
            ext_sinra = np.sin(np.radians(extcat['ra']))

            ext_xyz = np.column_stack((ext_cosra*ext_cosdecl,
                                       ext_sinra*ext_cosdecl,
                                       ext_sindecl))
            ext_xyzdist = 2.0 * np.sin(
                np.radians(xmatchdistarcsec/3600.0)/2.0
            )

            # get our kdtree
            our_kdt = lclist['kdtree']

            # get the external kdtree
            ext_kdt = sps.cKDTree(ext_xyz)

            # do a query_ball_tree
            extkd_matchinds = ext_kdt.query_ball_tree(our_kdt, ext_xyzdist)

            for extind, mind in enumerate(extkd_matchinds):
                if len(mind) > 0:
                    ext_matches.append(mind[0])
                    # get the whole matching row for the ext objects recarray
                    ext_matching_objects.append(extcat[extind])

            ext_matches = np.array(ext_matches)

            if ext_matches.size > 0:

                # update the xmatch_matching_index
                xmatch_matching_index[ext_matches] = True

                LOGINFO('xmatch: objects matched to %s within %.1f arcsec: %s'
                        % (xmatchexternal,
                           xmatchdistarcsec,
                           ext_matches.size))

            else:
                LOGERROR("xmatch: no objects were cross-matched to external "
                         "catalog spec: %s, can't continue" % xmatchexternal)
                return None, None, None

        except Exception:

            LOGEXCEPTION('could not match to external catalog spec: %s' %
                         repr(xmatchexternal))
            raise

    # do the cone search next
    if (conesearch and
        isinstance(conesearch, (list, tuple)) and
        len(conesearch) == 3):

        try:

            racenter, declcenter, searchradius = conesearch
            cosdecl = np.cos(np.radians(declcenter))
            sindecl = np.sin(np.radians(declcenter))
            cosra = np.cos(np.radians(racenter))
            sinra = np.sin(np.radians(racenter))

            # this is the search distance in xyz unit vectors
            xyzdist = 2.0 * np.sin(np.radians(searchradius)/2.0)

            # get the kdtree
            our_kdt = lclist['kdtree']

            # look up the coordinates
            kdtindices = our_kdt.query_ball_point([cosra*cosdecl,
                                                   sinra*cosdecl,
                                                   sindecl],
                                                  xyzdist,
                                                  n_jobs=conesearchworkers)

            if kdtindices and len(kdtindices) > 0:

                LOGINFO('cone search: objects within %.4f deg '
                        'of (%.3f, %.3f): %s' %
                        (searchradius, racenter, declcenter, len(kdtindices)))

                # update the conesearch_matching_index
                matchingind = kdtindices
                conesearch_matching_index[np.array(matchingind)] = True

            # we fail immediately if we found nothing. this assumes the user
            # cares more about the cone-search than the regular column filters
            else:

                LOGERROR("cone-search: no objects were found within "
                         "%.4f deg of (%.3f, %.3f): %s, can't continue" %
                         (searchradius, racenter,
                          declcenter, len(kdtindices)))
                return None, None

        except Exception:

            LOGEXCEPTION('cone-search: could not run a cone-search, '
                         'is there a kdtree present in %s?' % lc_catalog)
            raise

    # now that we're done with cone-search, do the column filtering
    allfilterinds = []
    if columnfilters and isinstance(columnfilters, list):

        # go through each filter
        for cfilt in columnfilters:

            try:

                fcol, foperator, foperand = cfilt.split('|')
                foperator = FILTEROPS[foperator]

                # generate the evalstring
                filterstr = (
                    "np.isfinite(lclist['objects']['%s']) & "
                    "(lclist['objects']['%s'] %s %s)"
                ) % (fcol, fcol, foperator, foperand)
                filterind = eval(filterstr)

                ngood = lclist['objects'][objectidcol][filterind].size
                LOGINFO('filter: %s -> objects matching: %s ' %
                        (cfilt, ngood))

                allfilterinds.append(filterind)

            except Exception:

                LOGEXCEPTION('filter: could not understand filter spec: %s'
                             % cfilt)
                LOGWARNING('filter: not applying this broken filter')

    # now that we have all the filter indices good to go
    # logical-AND all the things

    # make sure we only do filtering if we were told to do so
    if (xmatchexternal or conesearch or columnfilters):

        filterstack = []
        if xmatchexternal:
            filterstack.append(xmatch_matching_index)
        if conesearch:
            filterstack.append(conesearch_matching_index)
        if columnfilters:
            filterstack.extend(allfilterinds)

        finalfilterind = np.column_stack(filterstack)
        finalfilterind = np.all(finalfilterind, axis=1)

        # get the filtered object light curves and object names
        filteredobjectids = lclist['objects'][objectidcol][finalfilterind]
        filteredlcfnames = lclist['objects']['lcfname'][finalfilterind]

    else:

        # no filters were applied, so everything matches; this also keeps
        # `finalfilterind` defined for the finder chart step below
        finalfilterind = np.full(lclist['objects'][objectidcol].size,
                                 True, dtype=bool)
        filteredobjectids = lclist['objects'][objectidcol]
        filteredlcfnames = lclist['objects']['lcfname']

    # if we're told to make a finder chart with the selected objects
    if field_fitsfile is not None and os.path.exists(field_fitsfile):

        # get the RA and DEC of the matching objects
        matching_ra = lclist['objects'][racol][finalfilterind]
        matching_decl = lclist['objects'][declcol][finalfilterind]

        matching_postfix = []

        if xmatchexternal is not None:
            matching_postfix.append(
                'xmatch_%s' %
                os.path.splitext(os.path.basename(xmatchexternal))[0]
            )
        if conesearch is not None:
            matching_postfix.append('conesearch_RA%.3f_DEC%.3f_RAD%.5f' %
                                    tuple(conesearch))

        if columnfilters is not None:
            for cfi, cf in enumerate(columnfilters):
                if cfi == 0:
                    matching_postfix.append('filter_%s_%s_%s' %
                                            tuple(cf.split('|')))
                else:
                    matching_postfix.append('_and_%s_%s_%s' %
                                            tuple(cf.split('|')))

        if len(matching_postfix) > 0:
            matching_postfix = '-%s' % '_'.join(matching_postfix)
        else:
            matching_postfix = ''

        # next, we'll make a PNG plot for the finder
        finder_outfile = os.path.join(
            os.path.dirname(lc_catalog),
            '%s%s.png' %
            (os.path.splitext(os.path.basename(lc_catalog))[0],
             matching_postfix)
        )

        finder_png = fits_finder_chart(
            field_fitsfile,
            finder_outfile,
            wcsfrom=field_wcsfrom,
            scale=field_scale,
            stretch=field_stretch,
            colormap=field_colormap,
            findersize=field_findersize,
            overlay_ra=matching_ra,
            overlay_decl=matching_decl,
            overlay_pltopts=field_pltopts,
            overlay_zoomcontain=field_zoomcontain,
            grid=field_grid,
            gridcolor=field_gridcolor
        )

        if finder_png is not None:
            LOGINFO('generated a finder PNG '
                    'with an object position overlay '
                    'for this filtered LC list: %s' % finder_png)

    # if copylcsto is not None, copy LCs over to it
    if copylcsto is not None:

        if not os.path.exists(copylcsto):
            os.mkdir(copylcsto)

        if TQDM:
            lciter = tqdm(filteredlcfnames)
        else:
            lciter = filteredlcfnames

        LOGINFO('copying matching light curves to %s' % copylcsto)

        for lc in lciter:
            shutil.copy(lc, copylcsto)

    LOGINFO('done. objects matching all filters: %s' %
            filteredobjectids.size)

    if xmatchexternal and len(ext_matching_objects) > 0:
        return filteredlcfnames, filteredobjectids, ext_matching_objects
    else:
        return filteredlcfnames, filteredobjectids
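
# A usage sketch for filter_lclist; the catalog path, cone-search center, and
# column filter are hypothetical. Internally, angular search radii are
# converted to 3D chord lengths on the unit sphere, chord = 2*sin(radius/2),
# which is the distance the kdtree queries operate on.
#
#   lcfnames, objectids = filter_lclist(
#       'my-lightcurve-catalog.pkl',
#       conesearch=[270.0, -30.0, 0.5],    # center RA, Dec, radius (deg)
#       columnfilters=['ndet|gt|100']      # keep objects with ndet > 100
#   )
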

############################################################
## ADDING CHECKPLOT INFO BACK TO THE LIGHT CURVE CATALOGS ##
############################################################

def _cpinfo_key_worker(task):
    '''This wraps `checkplotlist.checkplot_infokey_worker`.

    This is used to get the correct dtype for each element in retrieved
    results.

    Parameters
    ----------

    task : tuple
        task[0] = cpfile
        task[1] = keyspeclist (infokeys kwarg from `add_cpinfo_to_lclist`)

    Returns
    -------

    list
        All of the requested key values from the checkplot are returned as a
        list, with the checkplot's objectid prepended as the first element.

    '''

    cpfile, keyspeclist = task

    keystoget = [x[0] for x in keyspeclist]
    nonesubs = [x[-2] for x in keyspeclist]
    nansubs = [x[-1] for x in keyspeclist]

    # reform the keystoget into a list of lists
    for i, k in enumerate(keystoget):

        thisk = k.split('.')
        thisk = [(int(x) if x.isdecimal() else x) for x in thisk]
        keystoget[i] = thisk

    # add in the objectid as well to match to the object catalog later
    keystoget.insert(0,['objectid'])
    nonesubs.insert(0, '')
    nansubs.insert(0,'')

    # get all the keys we need
    vals = checkplot_infokey_worker((cpfile, keystoget))

    # if they have some Nones, nans, etc., reform them as expected
    for val, nonesub, nansub, valind in zip(vals, nonesubs,
                                            nansubs, range(len(vals))):

        if val is None:
            outval = nonesub
        elif isinstance(val, float) and not np.isfinite(val):
            outval = nansub
        elif isinstance(val, (list, tuple)):
            outval = ', '.join(val)
        else:
            outval = val

        vals[valind] = outval

    return vals


CPINFO_DEFAULTKEYS = [
    # key, dtype, first level, overwrite=T|append=F, None sub, nan sub
    ('comments', np.unicode_, False, True, '', ''),
    ('objectinfo.objecttags', np.unicode_, True, True, '', ''),
    ('objectinfo.twomassid', np.unicode_, True, True, '', ''),
    ('objectinfo.bmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.vmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.rmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.imag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.jmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.hmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.kmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.sdssu', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.sdssg', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.sdssr', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.sdssi', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.sdssz', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_bmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_vmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_rmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_imag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_jmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_hmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_kmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_sdssu', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_sdssg', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_sdssr', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_sdssi', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.dered_sdssz', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_bmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_vmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_rmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_imag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_jmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_hmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_kmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_sdssu', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_sdssg', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_sdssr', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_sdssi', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.extinction_sdssz', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.color_classes', np.unicode_, True, True, '', ''),
    ('objectinfo.pmra', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.pmdecl', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.propermotion', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.rpmj', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.gl', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.gb', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.gaia_status', np.unicode_, True, True, '', ''),
    ('objectinfo.gaia_ids.0', np.unicode_, True, True, '', ''),
    ('objectinfo.gaiamag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.gaia_parallax', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.gaia_parallax_err', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.gaia_absmag', np.float_, True, True, np.nan, np.nan),
    ('objectinfo.simbad_best_mainid', np.unicode_, True, True, '', ''),
    ('objectinfo.simbad_best_objtype', np.unicode_, True, True, '', ''),
    ('objectinfo.simbad_best_allids', np.unicode_, True, True, '', ''),
    ('objectinfo.simbad_best_distarcsec', np.float_, True, True,
     np.nan, np.nan),
    #
    # TIC info
    #
    ('objectinfo.ticid', np.unicode_, True, True, '', ''),
    ('objectinfo.tic_version', np.unicode_, True, True, '', ''),
    ('objectinfo.tessmag', np.float_, True, True, np.nan, np.nan),
    #
    # variability info
    #
    ('varinfo.vartags', np.unicode_, False, True, '', ''),
    ('varinfo.varperiod', np.float_, False, True, np.nan, np.nan),
    ('varinfo.varepoch', np.float_, False, True, np.nan, np.nan),
    ('varinfo.varisperiodic', np.int_, False, True, 0, 0),
    ('varinfo.objectisvar', np.int_, False, True, 0, 0),
    ('varinfo.features.median', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.mad', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.stdev', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.mag_iqr', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.skew', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.kurtosis', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.stetsonj', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.stetsonk', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.eta_normal', np.float_, False, True, np.nan, np.nan),
    ('varinfo.features.linear_fit_slope', np.float_, False, True,
     np.nan, np.nan),
    ('varinfo.features.magnitude_ratio', np.float_, False, True,
     np.nan, np.nan),
    ('varinfo.features.beyond1std', np.float_, False, True, np.nan, np.nan)
]
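
# Each entry above is a six-element keyspec tuple:
# (key, dtype, is-first-level, overwrite, None-substitute, nan-substitute).
# A hypothetical custom key can be appended before calling
# add_cpinfo_to_lclist below:
#
#   my_infokeys = list(CPINFO_DEFAULTKEYS)
#   my_infokeys.append(
#       ('varinfo.features.amplitude', np.float_, False, True,
#        np.nan, np.nan)
#   )
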

def add_cpinfo_to_lclist(
        checkplots,  # list or a directory path
        initial_lc_catalog,
        magcol,  # to indicate checkplot magcol
        outfile,
        checkplotglob='checkplot*.pkl*',
        infokeys=CPINFO_DEFAULTKEYS,
        nworkers=NCPUS
):
    '''This adds checkplot info to the initial light curve catalogs generated
    by `make_lclist`.

    This is used to incorporate all the extra info checkplots can have for
    objects back into columns in the light curve catalog produced by
    `make_lclist`. Objects are matched between the checkplots and the light
    curve catalog using their `objectid`. This then allows one to search this
    'augmented' light curve catalog by these extra columns. The 'augmented'
    light curve catalog also forms the basis for the search interface provided
    by the LCC-Server.

    The default list of keys that will be extracted from a checkplot and
    added as columns in the initial light curve catalog is listed above in
    the `CPINFO_DEFAULTKEYS` list.

    Parameters
    ----------

    checkplots : str or list
        If this is a str, is interpreted as a directory which will be searched
        for checkplot pickle files using `checkplotglob`. If this is a list,
        it will be interpreted as a list of checkplot pickle files to process.

    initial_lc_catalog : str
        This is the path to the light curve catalog pickle made by
        `make_lclist`.

    magcol : str
        This is used to indicate the light curve magnitude column to extract
        magnitude column specific information. For example, Stetson
        variability indices can be generated using magnitude measurements in
        separate photometric apertures, which appear in separate `magcols` in
        the checkplot. To associate each such feature of the object with its
        specific `magcol`, pass that `magcol` in here. This `magcol` will then
        be added as a prefix to the resulting column in the 'augmented' LC
        catalog, e.g. Stetson J will appear as `magcol1_stetsonj` and
        `magcol2_stetsonj` for two separate magcols.

    outfile : str
        This is the file name of the output 'augmented' light curve catalog
        pickle file that will be written.

    checkplotglob : str
        If `checkplots` is a directory, this UNIX fileglob is used to search
        for checkplot pickle files in that directory.

    infokeys : list of tuples
        This is a list of keys to extract from the checkplot and some info on
        how this extraction is to be done. Each key entry is a six-element
        tuple of the following form:

        - key name in the checkplot

        - numpy dtype of the value of this key

        - False if key is associated with a magcol or True otherwise

        - False if subsequent updates to the same column name will append to
          existing key values in the output augmented light curve catalog or
          True if these will overwrite the existing key value

        - character to use to substitute a None value of the key in the
          checkplot in the output light curve catalog column

        - character to use to substitute a nan value of the key in the
          checkplot in the output light curve catalog column

        See the `CPINFO_DEFAULTKEYS` list above for examples.

    nworkers : int
        The number of parallel workers to launch to extract checkplot
        information.

    Returns
    -------

    str
        Returns the path to the generated 'augmented' light curve catalog
        pickle file.

    '''

    # get the checkplots from the directory if one is provided
    if not isinstance(checkplots, list) and os.path.exists(checkplots):
        checkplots = sorted(glob.glob(os.path.join(checkplots,
                                                   checkplotglob)))

    tasklist = [(cpf, infokeys) for cpf in checkplots]

    # the context manager shuts down the pool when the map is done, so no
    # explicit executor.shutdown() call is needed
    with ProcessPoolExecutor(max_workers=nworkers) as executor:
        results = list(executor.map(_cpinfo_key_worker, tasklist))

    # now that we have all the checkplot info, we need to match to the
    # objectlist in the lclist

    # open the lclist
    with open(initial_lc_catalog,'rb') as infd:
        lc_catalog = pickle.load(infd)

    # convert the lc_catalog['columns'] item to a list if it's not
    # this is so we can append columns to it later
    lc_catalog['columns'] = list(lc_catalog['columns'])

    catalog_objectids = np.array(lc_catalog['objects']['objectid'])
    checkplot_objectids = np.array([x[0] for x in results])

    # add the extra key arrays in the lclist dict
    extrainfokeys = []
    actualkeys = []

    # set up the extrainfokeys list
    for keyspec in infokeys:

        key, dtype, firstlevel, overwrite_append, nonesub, nansub = keyspec

        if firstlevel:
            eik = key
        else:
            eik = '%s.%s' % (magcol, key)

        extrainfokeys.append(eik)

        # now handle the output dicts and column list
        eactual = eik.split('.')

        # this handles dereferenced list indices
        if not eactual[-1].isdigit():

            if not firstlevel:
                eactual = '.'.join([eactual[0], eactual[-1]])
            else:
                eactual = eactual[-1]

        else:

            elastkey = eactual[-2]

            # for list columns, this converts stuff like errs -> err,
            # and parallaxes -> parallax
            if elastkey.endswith('es'):
                elastkey = elastkey[:-2]
            elif elastkey.endswith('s'):
                elastkey = elastkey[:-1]

            if not firstlevel:
                eactual = '.'.join([eactual[0], elastkey])
            else:
                eactual = elastkey

        actualkeys.append(eactual)

        # add a new column only if required
        if eactual not in lc_catalog['columns']:
            lc_catalog['columns'].append(eactual)

        # we'll overwrite earlier existing columns in any case
        lc_catalog['objects'][eactual] = []

    # now go through each objectid in the catalog and add the extra keys to
    # their respective arrays; fall back to a plain iterator if tqdm is not
    # installed
    if TQDM:
        cobjiter = tqdm(catalog_objectids)
    else:
        cobjiter = catalog_objectids

    for catobj in cobjiter:

        cp_objind = np.where(checkplot_objectids == catobj)

        if len(cp_objind[0]) > 0:

            # get the info line for this checkplot
            thiscpinfo = results[cp_objind[0][0]]

            # the first element is the objectid which we remove
            thiscpinfo = thiscpinfo[1:]

            # update the object catalog entries for this object
            for ekind, ek in enumerate(actualkeys):

                # add the actual thing to the output list
                lc_catalog['objects'][ek].append(
                    thiscpinfo[ekind]
                )

        else:

            # update the object catalog entries for this object
            for ekind, ek in enumerate(actualkeys):

                thiskeyspec = infokeys[ekind]
                nonesub = thiskeyspec[-2]

                lc_catalog['objects'][ek].append(
                    nonesub
                )

    # now we should have all the new keys in the object catalog
    # turn them into arrays
    for ek in actualkeys:

        lc_catalog['objects'][ek] = np.array(
            lc_catalog['objects'][ek]
        )

    # add the magcol to the lc_catalog
    if 'magcols' in lc_catalog:
        if magcol not in lc_catalog['magcols']:
            lc_catalog['magcols'].append(magcol)
    else:
        lc_catalog['magcols'] = [magcol]

    # write back the new object catalog
    with open(outfile, 'wb') as outfd:
        pickle.dump(lc_catalog, outfd, protocol=pickle.HIGHEST_PROTOCOL)

    return outfile
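
# A usage sketch for add_cpinfo_to_lclist; the paths and the magcol name are
# hypothetical (the magcol must match one used when the checkplots were
# made):
#
#   augmented_catalog = add_cpinfo_to_lclist(
#       '/data/checkplots',              # searched using checkplotglob
#       'my-lightcurve-catalog.pkl',
#       'aep_000',                       # checkplot magcol
#       'my-augmented-catalog.pkl',
#       nworkers=4
#   )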