# Source code for astrobase.cpserver.checkplotlist

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# checkplotlist.py - Waqas Bhatti (wbhatti@astro.princeton.edu) - Dec 2016
# License: MIT. See LICENSE for full text.

'''This makes a checkplot file list for use with the `checkplot-viewer.html` or
the `checkplotserver.py` webapps. Checkplots are quick-views of object info,
finder charts, light curves, phased light curves, and periodograms used to
examine their stellar variability.

These are produced by several functions in the `astrobase.checkplot` module:

- :py:func:`astrobase.checkplot.pkl.checkplot_pickle`: makes a checkplot pickle
  file for any number of independent period-finding methods. Use
  `checkplotserver.py` to view these pickle files.

- :py:func:`astrobase.checkplot.png.checkplot_png`: makes a checkplot PNG for a
  single period-finding method. Use `checkplot-viewer.html` to view these image
  files.

- :py:func:`astrobase.checkplot.png.twolsp_checkplot_png`: does the same for
  two independent period-finding methods. Use `checkplot-viewer.html` to view
  these image files.

'''

# short description shown at the top of the argparse --help output
PROGDESC = '''\
This makes a checkplot file list for use with the checkplot-viewer.html (for
checkplot PNGs) or the checkplotserver.py (for checkplot pickles) webapps.
'''

# long-form usage notes shown at the bottom of the argparse --help output. this
# is rendered verbatim (RawDescriptionHelpFormatter), so keep the layout as is.
PROGEPILOG = '''\
SEARCHING FOR CHECKPLOT PNGS OR PICKLES
---------------------------------------
If you have checkplots that don't have 'checkplot' somewhere in their file name,
use the optional checkplot file glob argument to checkplotlist to provide
this:

--search '<filename glob for prefix>'

Make sure to use the quotes around this argument, otherwise the shell will
expand it.

SORTING CHECKPLOT PICKLES
-------------------------
If you want to sort checkplot pickle files in the output list in some special
way other than the usual filename sort order, this requires an argument on the
commandline of the form:

--sortby '<sortkey>|<asc or desc>'

(use the | character to separate sortkey and order)

Here, sortkey is some key in the checkplot pickle. This can be a simple key:
e.g. objectid or it can be a composite key: e.g. varinfo.features.stetsonj.
sortorder is either 'asc' or 'desc' for ascending/descending sort. The sortkey
must exist in all checkplot pickles.

FILTERING CHECKPLOT PICKLES
---------------------------
You can filter the checkplot pickle files in the output list by using the
--filterby argument. Note that filtering takes place after any requested
sorting.  Provide a filterkey, filteroperator, and filteroperand in the form:

--filterby '<filterkey>|<filteroperator>@<filteroperand>'

(use the | character to separate the filter column-key and filter specification,
 use the @ character in the filter spec to separate filter operator and operand)

Here, filterkey is some key in the checkplot pickle, specified as the sortkey
discussed above. filteroperator is one of the following 2-character strings:

'gt' -> greater than, 'lt' -> less than, 'ge' -> greater than or equal to,
'le' -> less than or equal to, 'eq' -> equal to, 'ne' -> not equal to

filteroperand is the appropriate integer, float, or string for the filterkey and
operator.

EXAMPLES OF CHECKPLOT PICKLE SORTING AND FILTERING
--------------------------------------------------
Sort checkplots by their 2MASS J magnitudes in ascending order:

  $ checkplotlist pkl project/awesome-objects --sortby 'objectinfo.jmag|asc'

Sort checkplots by the power of the best peak in their PDM periodograms:

  $ checkplotlist pkl project/awesome-objects --sortby 'pdm.nbestlspvals.0|asc'

Get only those checkplots with Stetson J > 0.2:

  $ checkplotlist pkl project/awesome-objects       \\
      --filterby 'varinfo.features.stetsonj|gt@0.2'

Get only those checkplots for objects that have object r mag < 12.0 and sort
these by power of the best peak in their Lomb-Scargle periodogram:

  $ checkplotlist pkl project/awesome-objects \\
      --filterby 'objectinfo.sdssr|lt@12.0'   \\
      --sortby 'gls.nbestlspvals.0|desc'

Get only those checkplots for objects that have best-period transit depths
between 1 mmag and 10 mmag and sort these by the SNR of the best peak in the BLS
spectrum in descending order:

  $ checkplotlist pkl project/awesome-objects   \\
      --sortby 'bls.snr.0|desc'                 \\
      --filterby 'bls.transitdepth.0|lt@-0.001' \\
      --filterby 'bls.transitdepth.0|gt@-0.01'
'''

import os
import os.path
import sys
import glob
import json
import argparse


# suppress warnings
import warnings
warnings.filterwarnings('ignore')

# to turn a list of keys into a dict address
# from https://stackoverflow.com/a/14692747
# used to walk a checkplotdict for a specific key in the structure
from functools import reduce
from operator import getitem

import numpy as np
import multiprocessing as mp
CPU_COUNT = mp.cpu_count()

from astrobase.checkplot.pkl_io import _read_checkplot_picklefile


######################
## HELPER FUNCTIONS ##
######################

def _dict_get(datadict, keylist):
    '''This gets a requested dict key by walking the dict.

    Parameters
    ----------

    datadict : dict
        The dict to get the specified key from.

    keylist : list of str
        This is a list of keys to use to walk the dict and get to the key that
        is provided as the last element in `keylist`. For example::

            keylist = ['key1','key2','key3']

        will walk `datadict` recursively to get to `datadict[key1][key2][key3]`.

    Returns
    -------

    object
        The dict value of the specified key address.

    '''
    return reduce(getitem, keylist, datadict)


def checkplot_infokey_worker(task):
    '''This gets the required keys from the requested checkplot pickle.

    Parameters
    ----------

    task : tuple
        Task is a two element tuple::

        - task[0] is the checkplot pickle file to read; it is passed as-is to
          `_read_checkplot_picklefile`

        - task[1] is a list of lists indicating all the key addresses to
          extract items from the loaded checkplot for. Each key address is a
          list of keys (or list indexes) to follow, outermost first.

    Returns
    -------

    list
        This is a list of all of the items at the requested key addresses, in
        the same order as the key addresses in `task[1]`. Any item that can't
        be retrieved is replaced with `np.nan`.

    '''

    cpf, keys = task

    cpd = _read_checkplot_picklefile(cpf)

    resultkeys = []

    for k in keys:

        # deliberately best-effort: any failure to reach a key address is
        # recorded as nan so the result keeps one entry per requested key
        try:
            resultkeys.append(_dict_get(cpd, k))
        except Exception:
            resultkeys.append(np.nan)

    return resultkeys


############
## CONFIG ##
############

# maps the operator codes accepted in --filterby specs to the text that main()
# substitutes into the comparison expression it evaluates.
# NOTE: the 'cb' and 'ob' values are descriptive labels, not Python operators;
# substituting them into main()'s comparison template does not produce a valid
# expression, so filters using them are reported as broken and skipped.
FILTEROPS = {
    'eq':'==',
    'gt':'>',
    'ge':'>=',
    'lt':'<',
    'le':'<=',
    'ne':'!=',
    'cb':'closed-interval-between',
    'ob':'open-interval-between',
}


##########
## MAIN ##
##########

def _key_address(dottedkey):
    '''This turns a dotted key like 'pdm.nbestlspvals.0' into a key address.

    Elements made up only of decimal digits are converted to ints so they can
    be used as list indexes (or integer dict keys) when walking the checkplot.

    '''
    return [(int(x) if x.isdecimal() else x) for x in dottedkey.split('.')]


def _split_chunks(items, chunksize):
    '''This splits `items` into consecutive lists of at most `chunksize` items.

    Always returns at least one chunk (an empty one if `items` is empty), and
    never returns an empty trailing chunk when `len(items)` is an exact
    multiple of `chunksize`.

    '''
    nchunks = max(1, (len(items) + chunksize - 1) // chunksize)
    return [items[x*chunksize:(x + 1)*chunksize] for x in range(nchunks)]


def main():
    '''This is the main function of this script.

    It finds the checkplot files in the requested directory, optionally sorts
    and filters checkplot pickles by values stored inside them, and writes the
    resulting file list to one or more `checkplot-filelist` JSON files in the
    current working directory.

    The current script args are shown below ::

        Usage: checkplotlist [-h] [--search SEARCH] [--sortby SORTBY]
                             [--filterby FILTERBY] [--splitout SPLITOUT]
                             [--outprefix OUTPREFIX]
                             [--maxkeyworkers MAXKEYWORKERS]
                             {pkl,png} cpdir

        This makes a checkplot file list for use with the
        checkplot-viewer.html (for checkplot PNGs) or the checkplotserver.py
        (for checkplot pickles) webapps.

        positional arguments:
          {pkl,png}             type of checkplot to search for: pkl ->
                                checkplot pickles, png -> checkplot PNGs
          cpdir                 directory containing the checkplots to process

        optional arguments:
          -h, --help            show this help message and exit
          --search SEARCH       file glob prefix to use when searching for
                                checkplots, default: '*checkplot*', (the
                                extension is added automatically - .png or
                                .pkl)
          --sortby SORTBY       the sort key and order to use when sorting
          --filterby FILTERBY   the filter key and condition to use when
                                filtering. you can specify this multiple times
                                to filter by several keys at once. all filters
                                are joined with a logical AND operation in the
                                order they're given.
          --splitout SPLITOUT   if there are more than SPLITOUT objects in the
                                target directory (default: 5000), checkplotlist
                                will split the output JSON into multiple files.
                                this helps keep the checkplotserver webapp
                                responsive.
          --outprefix OUTPREFIX
                                a prefix string to use for the output JSON
                                file(s). use this to separate out different
                                sort orders or filter conditions, for example.
                                if this isn't provided, but --sortby or
                                --filterby are, will use those to figure out
                                the output files' prefixes
          --maxkeyworkers MAXKEYWORKERS
                                the number of parallel workers that will be
                                launched to retrieve checkplot key values used
                                for sorting and filtering (default: a quarter
                                of the CPU count, but never less than 1)

    '''

    ####################
    ## PARSE THE ARGS ##
    ####################

    aparser = argparse.ArgumentParser(
        epilog=PROGEPILOG,
        description=PROGDESC,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    aparser.add_argument(
        'cptype',
        action='store',
        choices=['pkl','png'],
        type=str,
        help=("type of checkplot to search for: pkl -> checkplot pickles, "
              "png -> checkplot PNGs")
    )
    aparser.add_argument(
        'cpdir',
        action='store',
        type=str,
        help=("directory containing the checkplots to process")
    )

    # TODO: here, make the --search kwarg an array (i.e. allow multiple search
    # statements). the use of this will be to make checkplotserver able to load
    # more than one checkplot per object (i.e. different mag types -- epd
    # vs. tfa -- or different bands -- r vs. i -- at the SAME time).

    # TODO: we'll fix checkplotserver and its js so there's a vertical tab
    # column between the left period/epoch/tags panel and main
    # periodogram/phased-LCs panel on the right. the user will be able to flip
    # between tabs to look at the object in all loaded alternative checkplots.

    # TODO: need to also think about to sort/filter; for now let's make it so
    # the sorting works on a chosen checkplot search list, if we give --search
    # 'checkplot*iep1' and --search 'checkplot*itf1', specify --sortpkls and
    # --filterpkls kwargs, which match the given globs for the --search
    # kwargs. e.g. we'd specify --sortpkls 'checkplot*iep1' to sort everything
    # by the specified --sortby values in those pickles.

    # TODO: we'll have to change the output JSON so it's primarily by objectid
    # instead of checkplot filenames. each objectid will have its own list of
    # checkplots to use for the frontend.

    aparser.add_argument(
        '--search',
        action='store',
        default='*checkplot*',
        type=str,
        help=("file glob prefix to use when searching for checkplots, "
              "default: '%(default)s', "
              "(the extension is added automatically - .png or .pkl)")
    )
    aparser.add_argument(
        '--sortby',
        action='store',
        type=str,
        help=("the sort key and order to use when sorting")
    )
    aparser.add_argument(
        '--filterby',
        action='append',
        type=str,
        help=("the filter key and condition to use when filtering. "
              "you can specify this multiple times to filter by "
              "several keys at once. all filters are joined with a "
              "logical AND operation in the order they're given.")
    )
    aparser.add_argument(
        '--splitout',
        action='store',
        type=int,
        default=5000,
        help=("if there are more than SPLITOUT objects in "
              "the target directory (default: %(default)s), "
              "checkplotlist will split the output JSON into multiple files. "
              "this helps keep the checkplotserver webapp responsive.")
    )
    aparser.add_argument(
        '--outprefix',
        action='store',
        type=str,
        help=("a prefix string to use for the output JSON file(s). "
              "use this to separate out different sort orders "
              "or filter conditions, for example. "
              "if this isn't provided, but --sortby or --filterby are, "
              "will use those to figure out the output files' prefixes")
    )
    aparser.add_argument(
        '--maxkeyworkers',
        action='store',
        type=int,
        # a quarter of the CPUs, but never zero: multiprocessing.Pool refuses
        # to start with fewer than one worker
        default=max(1, CPU_COUNT // 4),
        help=("the number of parallel workers that will be launched "
              "to retrieve checkplot key values used for "
              "sorting and filtering (default: %(default)s)")
    )

    args = aparser.parse_args()

    checkplotbasedir = args.cpdir
    fileglob = args.search
    splitout = args.splitout
    outprefix = args.outprefix if args.outprefix else None

    # these would otherwise blow up much later as a ZeroDivisionError or a
    # ValueError from multiprocessing.Pool
    if splitout < 1:
        aparser.error('--splitout must be a positive integer')
    if args.maxkeyworkers < 1:
        aparser.error('--maxkeyworkers must be a positive integer')

    # see if there's a sorting order
    if args.sortby:

        try:
            sortkey, sortorder = args.sortby.split('|')
        except ValueError:
            aparser.error(
                "--sortby must be of the form '<sortkey>|<asc or desc>'"
            )

        if outprefix is None:
            outprefix = args.sortby

    else:
        sortkey, sortorder = 'objectid', 'asc'

    # see if there's a filter condition
    if args.filterby:

        filterkeys, filterconditions = [], []

        # load all the filters
        for filt in args.filterby:

            f = filt.split('|')

            if len(f) < 2:
                aparser.error(
                    "--filterby must be of the form "
                    "'<filterkey>|<filteroperator>@<filteroperand>'"
                )

            filterkeys.append(f[0])
            filterconditions.append(f[1])

        # generate the output file's prefix
        if outprefix is None:
            outprefix = '-'.join(args.filterby)
        else:
            outprefix = '%s-%s' % ('-'.join(args.filterby), outprefix)

    else:
        filterkeys, filterconditions = None, None

    if args.cptype == 'pkl':
        checkplotext = 'pkl'
    elif args.cptype == 'png':
        checkplotext = 'png'
    else:
        print("unknown format for checkplots: %s! can't continue!"
              % args.cptype)
        sys.exit(1)

    #######################
    ## NOW START WORKING ##
    #######################

    currdir = os.getcwd()

    checkplotglob = os.path.join(checkplotbasedir,
                                 '%s.%s' % (fileglob, checkplotext))

    print('searching for checkplots: %s' % checkplotglob)

    searchresults = glob.glob(checkplotglob)

    if searchresults:

        print('found %s checkplot files in dir: %s' %
              (len(searchresults), checkplotbasedir))

        # sorting requires an arg on the commandline of the form:
        # '<sortkey>|<asc or desc>'
        # where sortkey is some key in the checkplot pickle:
        #   this can be a simple key: e.g. objectid
        #   or it can be a composite key: e.g. varinfo.varfeatures.stetsonj
        # and sortorder is either 'asc' or 'desc' for ascending/descending sort

        # filter conditions are of the form:
        # '<filterkey>|<condition>@<operand>' where <condition> is one of the
        # keys of FILTEROPS and <operand> is a string, float, or int to use
        # when applying <condition>

        # first, take care of sort keys
        sortdone = False

        # second, take care of any filters
        filterok = False
        filterstatements = []

        havesort = bool(sortkey and sortorder)
        havefilters = bool(filterkeys and filterconditions)

        # make sure we only run these operations on checkplot pickles
        if (args.cptype == 'pkl') and (havesort or havefilters):

            # the sort key address (if any) always goes first, followed by the
            # filter key addresses in the order the filters were given
            keystoget = []

            # handle sorting
            if havesort:

                print('sorting checkplot pickles by %s in order: %s' %
                      (sortkey, sortorder))
                keystoget.append(_key_address(sortkey))

            # handle filtering
            if havefilters:

                print('filtering checkplot pickles by %s using: %s'
                      % (filterkeys, filterconditions))
                keystoget.extend(_key_address(fdk) for fdk in filterkeys)

            print('retrieving checkplot info using %s workers...'
                  % args.maxkeyworkers)

            # launch the key retrieval; always shut the pool down, even if a
            # worker raises
            tasks = [(x, keystoget) for x in searchresults]
            pool = mp.Pool(args.maxkeyworkers)
            try:
                keytargets = pool.map(checkplot_infokey_worker, tasks)
            finally:
                pool.close()
                pool.join()

            # each element of keytargets holds the values for one checkplot in
            # the same order as keystoget
            if havesort:
                sorttargets = [x[0] for x in keytargets]
                filtertargets = (
                    [x[1:] for x in keytargets] if havefilters else None
                )
            else:
                sorttargets = None
                filtertargets = keytargets

            # turn the search results into an np.array before we do
            # sorting/filtering
            searchresults = np.array(searchresults)

            if sorttargets is not None:

                sorttargets = np.ravel(np.array(sorttargets))

                sortind = np.argsort(sorttargets)
                if sortorder == 'desc':
                    sortind = sortind[::-1]

                # sort the search results in the requested order
                searchresults = searchresults[sortind]
                sortdone = True

            if filtertargets is not None:

                # one boolean mask per filter that could be evaluated
                filtermasks = []

                for ind, fcond in enumerate(filterconditions):

                    thisftarget = np.array([x[ind] for x in filtertargets])

                    # the search results have already been reordered by the
                    # sort, so put the filter targets in the same order
                    if sortdone:
                        thisftarget = thisftarget[sortind]

                    try:

                        foperator, foperand = fcond.split('@')
                        foperator = FILTEROPS[foperator]

                        # SECURITY NOTE: this is a straight eval of text taken
                        # from the command line. it's only acceptable because
                        # the person running this script supplies the filter
                        # spec; never feed it untrusted input.
                        filterstr = (
                            'np.isfinite(thisftarget) & (thisftarget %s %s)'
                        ) % (foperator, foperand)
                        filterind = eval(filterstr)

                        filtermasks.append(filterind)

                        # update the filterstatements
                        filterstatements.append('%s %s %s' % (filterkeys[ind],
                                                              foperator,
                                                              foperand))

                    except Exception as e:

                        print('ERR! could not understand filter spec: %s'
                              '\nexception was: %s' %
                              (args.filterby[ind], e))
                        print('WRN! not applying broken filter')

                #
                # DONE with evaluating each filter, get final results below
                #
                if filtermasks:

                    # column stack the masks and do a logical AND across rows
                    finalfilterind = np.all(np.column_stack(filtermasks),
                                            axis=1)

                    # these are the final results after ANDing all the filters
                    filterresults = searchresults[finalfilterind]

                    # make sure we got some results
                    if filterresults.size > 0:

                        print('filters applied: %s -> objects found: %s ' %
                              (repr(args.filterby), filterresults.size))
                        searchresults = filterresults
                        filterok = True

                    # otherwise, applying all of the filters killed everything
                    else:

                        print('WRN! filtering failed! %s -> '
                              'ZERO objects found!' %
                              (repr(args.filterby), ))
                        print('WRN! not applying any filters')

                # every single filter spec was broken: there's nothing to
                # stack, so leave the search results unfiltered
                else:

                    print('WRN! none of the filters in %s could be applied' %
                          (repr(args.filterby), ))
                    print('WRN! not applying any filters')

            # all done with sorting and filtering
            # turn the searchresults back into a list
            searchresults = searchresults.tolist()

            # if there's no special sort order defined, use the usual sort
            # order at the end after filtering
            if not havesort:

                print('WRN! no special sort key and order/'
                      'filter key and condition specified, '
                      'sorting checkplot pickles '
                      'using usual alphanumeric sort...')
                searchresults = sorted(searchresults)
                sortkey = 'filename'
                sortorder = 'asc'

        searchchunks = _split_chunks(searchresults, splitout)
        nchunks = len(searchchunks)

        if nchunks > 1:
            print('WRN! more than %s checkplots in final list, '
                  'splitting into %s chunks' % (splitout, nchunks))

        # if the filter failed, zero out the filter statements
        if havefilters and not filterok:
            filterstatements = []

        # generate the output
        for chunkind, chunk in enumerate(searchchunks):

            # only add a chunk suffix if we need to split the JSON file
            outfname = '%scheckplot-filelist%s.json' % (
                ('%s-' % outprefix if outprefix is not None else ''),
                ('-%02i' % chunkind if nchunks > 1 else ''),
            )

            # the prefix may contain the | and @ separators from the sort and
            # filter specs. sanitize only the file name, so a working
            # directory that contains these characters is left alone
            outfname = outfname.replace('|','_').replace('@','_')
            outjson = os.path.abspath(os.path.join(currdir, outfname))

            outdict = {
                'checkplots':chunk,
                'nfiles':len(chunk),
                'sortkey':sortkey,
                'sortorder':sortorder,
                'filterstatements':filterstatements
            }

            # ask if the checkplot list JSON should be updated
            if os.path.exists(outjson):

                answer = input(
                    'There is an existing '
                    'checkplot list file in this '
                    'directory:\n    %s\nDo you want to '
                    'overwrite it completely? (default: no) [y/n] ' %
                    outjson
                )

                # if it's OK to overwrite, then do so
                if answer and answer == 'y':

                    print('WRN! completely overwriting '
                          'existing checkplot list %s' % outjson)
                    with open(outjson,'w') as outfd:
                        json.dump(outdict, outfd)

                # if it's not OK to overwrite, then keep whatever other keys
                # the existing file has and replace only the ones we generate
                else:

                    print('only updating existing checkplot list '
                          'file with any new checkplot pickles')

                    with open(outjson,'r') as infd:
                        indict = json.load(infd)

                    indict.update(outdict)

                    # write the updated dict back to the file
                    with open(outjson,'w') as outfd:
                        json.dump(indict, outfd)

            # if this is a new output file
            else:

                with open(outjson,'w') as outfd:
                    json.dump(outdict, outfd)

            if os.path.exists(outjson):
                print('checkplot file list written to %s' % outjson)
            else:
                print('ERR! writing the checkplot file list failed!')

    else:

        print('ERR! no checkplots found in %s' % checkplotbasedir)


# run the command-line interface when this file is executed as a script
if __name__ == '__main__':
    main()