# Source code for astrobase.cpserver.checkplotlist

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# checkplotlist.py - Waqas Bhatti (wbhatti@astro.princeton.edu) - Dec 2016
# License: MIT. See LICENSE for full text.

'''This makes a checkplot file list for use with the `checkplot-viewer.html` or
the `checkplotserver.py` webapps. Checkplots are quick-views of object info,
finder charts, light curves, phased light curves, and periodograms used to
examine their stellar variability.

These are produced by several functions in the `astrobase.checkplot` module:

- :py:func:`astrobase.checkplot.pkl.checkplot_pickle`: makes a checkplot pickle
  file for any number of independent period-finding methods. Use
  `checkplotserver.py` to view these pickle files.

- :py:func:`astrobase.checkplot.png.checkplot_png`: makes a checkplot PNG for a
  single period-finding method. Use `checkplot-viewer.html` to view these image
  files.

- :py:func:`astrobase.checkplot.png.twolsp_checkplot_png`: does the same for
  two independent period-finding methods. Use `checkplot-viewer.html` to view
  these image files.

'''

# short program description shown at the top of the argparse --help output
PROGDESC = '''\
This makes a checkplot file list for use with the checkplot-viewer.html (for
checkplot PNGs) or the checkplotserver.py (for checkplot pickles) webapps.
'''

# long-form usage notes shown at the bottom of the argparse --help output
PROGEPILOG = '''\
SEARCHING FOR CHECKPLOT PNGS OR PICKLES
---------------------------------------
If you have checkplots that don't have 'checkplot' somewhere in their file name,
use the optional checkplot file glob argument to checkplotlist to provide
this:

--search '<filename glob for prefix>'

Make sure to use the quotes around this argument, otherwise the shell will
expand it.

SORTING CHECKPLOT PICKLES
-------------------------
If you want to sort checkplot pickle files in the output list in some special
way other than the usual filename sort order, this requires an argument on the
commandline of the form:

--sortby '<sortkey>|<asc or desc>'

(use the | character to separate sortkey and order)

Here, sortkey is some key in the checkplot pickle. This can be a simple key:
e.g. objectid or it can be a composite key: e.g. varinfo.features.stetsonj.
sortorder is either 'asc' or 'desc' for ascending/descending sort. The sortkey
must exist in all checkplot pickles.

FILTERING CHECKPLOT PICKLES
---------------------------
You can filter the checkplot pickle files in the output list by using the
--filterby argument. Note that filtering takes place after any requested
sorting.  Provide a filterkey, filteroperator, and filteroperand in the form:

--filterby '<filterkey>|<filteroperator>@<filteroperand>'

(use the | character to separate the filter column-key and filter specification,
 use the @ character in the filter spec to separate filter operator and operand)

Here, filterkey is some key in the checkplot pickle, specified as the sortkey
discussed above. filteroperator is one of the following 2-character strings:

'gt' -> greater than, 'lt' -> less than, 'ge' -> greater than or equal to,
'le' -> less than or equal to, 'eq' -> equal to, 'ne' -> not equal to

filteroperand is the appropriate integer, float, or string for the filterkey and
operator.

EXAMPLES OF CHECKPLOT PICKLE SORTING AND FILTERING
--------------------------------------------------
Sort checkplots by their 2MASS J magnitudes in ascending order:

  $ checkplotlist pkl project/awesome-objects --sortby 'objectinfo.jmag|asc'

Sort checkplots by the power of the best peak in their PDM periodograms:

  $ checkplotlist pkl project/awesome-objects --sortby 'pdm.nbestlspvals.0|asc'

Get only those checkplots with Stetson J > 0.2:

  $ checkplotlist pkl project/awesome-objects       \\
      --filterby 'varinfo.features.stetsonj|gt@0.2'

Get only those checkplots for objects that have object r mag < 12.0 and sort
these by power of the best peak in their Lomb-Scargle periodogram:

  $ checkplotlist pkl project/awesome-objects \\
      --filterby 'objectinfo.sdssr|lt@12.0'   \\
      --sortby 'gls.nbestlspvals.0|desc'

Get only those checkplots for objects that have best-period transit depths
between 1 mmag and 10 mmag and sort these by the SNR of the best peak in the BLS
spectrum in descending order:

  $ checkplotlist pkl project/awesome-objects   \\
      --sortby 'bls.snr.0|desc'                 \\
      --filterby 'bls.transitdepth.0|lt@-0.001' \\
      --filterby 'bls.transitdepth.0|gt@-0.01'
'''

import os
import os.path
import sys
import glob
import json
import argparse


# suppress warnings
import warnings
warnings.filterwarnings('ignore')

# to turn a list of keys into a dict address
# from https://stackoverflow.com/a/14692747
# used to walk a checkplotdict for a specific key in the structure
from functools import reduce
from operator import getitem

import numpy as np
import multiprocessing as mp
CPU_COUNT = mp.cpu_count()

from astrobase.checkplot.pkl_io import _read_checkplot_picklefile


######################
## HELPER FUNCTIONS ##
######################

def _dict_get(datadict, keylist):
    '''This gets a requested dict key by walking the dict.

    Parameters
    ----------

    datadict : dict
        The dict to get the specified key from.

    keylist : list of str
        This is a list of keys to use to walk the dict and get to the key that
        is provided as the last element in `keylist`. For example::

            keylist = ['key1','key2','key3']

        will walk `datadict` recursively to get to `datadict[key1][key2][key3]`.

    Returns
    -------

    object
        The dict value of the specified key address.

    '''
    return reduce(getitem, keylist, datadict)


def checkplot_infokey_worker(task):
    '''This gets the required keys from the requested checkplot pickle file.

    Parameters
    ----------

    task : tuple
        Task is a two element tuple::

        - task[0] is the checkplot pickle file to read; it is passed as-is to
          `_read_checkplot_picklefile`

        - task[1] is a list of lists of str/int indicating all the key
          addresses to extract items from the loaded checkplot for

    Returns
    -------

    list
        This is a list of all of the items at the requested key addresses, in
        the same order as the addresses in `task[1]`. Any address that can't
        be resolved is replaced with `np.nan`.

    '''
    cpf, keys = task

    cpd = _read_checkplot_picklefile(cpf)

    resultkeys = []

    for keyaddress in keys:

        # best-effort lookup: a missing/unresolvable key address yields nan so
        # the result always lines up one-to-one with the requested addresses
        try:
            resultkeys.append(_dict_get(cpd, keyaddress))
        except Exception:
            resultkeys.append(np.nan)

    return resultkeys


############
## CONFIG ##
############

# maps the two-character operator codes used in --filterby specs to the text
# that main() substitutes into the filter expression it evaluates.
# NOTE: 'cb' and 'ob' map to descriptive labels, not Python operators, so a
# filter expression built from them will not evaluate.
FILTEROPS = dict(
    eq='==',
    gt='>',
    ge='>=',
    lt='<',
    le='<=',
    ne='!=',
    cb='closed-interval-between',
    ob='open-interval-between',
)


##########
## MAIN ##
##########

def _parse_key_address(keyaddress):
    '''This turns a dotted key address string into a list of keys/indices.

    Any element made up only of decimal digits is converted to an int so it can
    be used as a list index (or an integer dict key), e.g.::

        'pdm.nbestlspvals.0' -> ['pdm', 'nbestlspvals', 0]

    '''
    return [(int(x) if x.isdecimal() else x) for x in keyaddress.split('.')]


def main():
    '''This is the main function of this script.

    The current script args are shown below ::

        Usage: checkplotlist [-h] [--search SEARCH] [--sortby SORTBY]
                             [--filterby FILTERBY] [--splitout SPLITOUT]
                             [--outprefix OUTPREFIX] [--maxkeyworkers MAXKEYWORKERS]
                             {pkl,png} cpdir

        This makes a checkplot file list for use with the checkplot-viewer.html
        (for checkplot PNGs) or the checkplotserver.py (for checkplot pickles)
        webapps.

        positional arguments:
          {pkl,png}             type of checkplot to search for: pkl -> checkplot
                                pickles, png -> checkplot PNGs
          cpdir                 directory containing the checkplots to process

        optional arguments:
          -h, --help            show this help message and exit
          --search SEARCH       file glob prefix to use when searching for checkplots,
                                default: '*checkplot*', (the extension is added
                                automatically - .png or .pkl)
          --sortby SORTBY       the sort key and order to use when sorting
          --filterby FILTERBY   the filter key and condition to use when filtering.
                                you can specify this multiple times to filter by
                                several keys at once. all filters are joined with a
                                logical AND operation in the order they're given.
          --splitout SPLITOUT   if there are more than SPLITOUT objects in the target
                                directory (default: 5000), checkplotlist will split
                                the output JSON into multiple files. this helps keep
                                the checkplotserver webapp responsive.
          --outprefix OUTPREFIX
                                a prefix string to use for the output JSON file(s).
                                use this to separate out different sort orders or
                                filter conditions, for example. if this isn't
                                provided, but --sortby or --filterby are, will use
                                those to figure out the output files' prefixes
          --maxkeyworkers MAXKEYWORKERS
                                the number of parallel workers that will be launched
                                to retrieve checkplot key values used for sorting and
                                filtering (default: a quarter of the available
                                CPUs, but never fewer than 1)

    The output JSON file(s) are written to the current working directory. If an
    output file already exists, the user is asked interactively whether it
    should be overwritten completely or only updated.

    '''

    ####################
    ## PARSE THE ARGS ##
    ####################

    aparser = argparse.ArgumentParser(
        epilog=PROGEPILOG,
        description=PROGDESC,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    aparser.add_argument(
        'cptype',
        action='store',
        choices=['pkl','png'],
        type=str,
        help=("type of checkplot to search for: pkl -> checkplot pickles, "
              "png -> checkplot PNGs")
    )
    aparser.add_argument(
        'cpdir',
        action='store',
        type=str,
        help=("directory containing the checkplots to process")
    )

    # TODO: here, make the --search kwarg an array (i.e. allow multiple search
    # statements). the use of this will be to make checkplotserver able to load
    # more than one checkplot per object (i.e. different mag types -- epd
    # vs. tfa -- or different bands -- r vs. i -- at the SAME time).

    # TODO: we'll fix checkplotserver and its js so there's a vertical tab
    # column between the left period/epoch/tags panel and main
    # periodogram/phased-LCs panel on the right. the user will be able to flip
    # between tabs to look at the object in all loaded alternative checkplots.

    # TODO: need to also think about how to sort/filter; for now let's make it
    # so the sorting works on a chosen checkplot search list, if we give
    # --search 'checkplot*iep1' and --search 'checkplot*itf1', specify
    # --sortpkls and --filterpkls kwargs, which match the given globs for the
    # --search kwargs. e.g. we'd specify --sortpkls 'checkplot*iep1' to sort
    # everything by the specified --sortby values in those pickles.

    # TODO: we'll have to change the output JSON so it's primarily by objectid
    # instead of checkplot filenames. each objectid will have its own list of
    # checkplots to use for the frontend.
    aparser.add_argument(
        '--search',
        action='store',
        default='*checkplot*',
        type=str,
        help=("file glob prefix to use when searching for checkplots, "
              "default: '%(default)s', "
              "(the extension is added automatically - .png or .pkl)")
    )

    aparser.add_argument(
        '--sortby',
        action='store',
        type=str,
        help=("the sort key and order to use when sorting")
    )
    aparser.add_argument(
        '--filterby',
        action='append',
        type=str,
        help=("the filter key and condition to use when filtering. "
              "you can specify this multiple times to filter by "
              "several keys at once. all filters are joined with a "
              "logical AND operation in the order they're given.")
    )
    aparser.add_argument(
        '--splitout',
        action='store',
        type=int,
        default=5000,
        help=("if there are more than SPLITOUT objects in "
              "the target directory (default: %(default)s), "
              "checkplotlist will split the output JSON into multiple files. "
              "this helps keep the checkplotserver webapp responsive.")
    )
    aparser.add_argument(
        '--outprefix',
        action='store',
        type=str,
        help=("a prefix string to use for the output JSON file(s). "
              "use this to separate out different sort orders "
              "or filter conditions, for example. "
              "if this isn't provided, but --sortby or --filterby are, "
              "will use those to figure out the output files' prefixes")
    )
    aparser.add_argument(
        '--maxkeyworkers',
        action='store',
        type=int,
        # never default to zero workers on machines with fewer than 4 CPUs:
        # mp.Pool refuses to start with less than one process
        default=max(1, CPU_COUNT // 4),
        help=("the number of parallel workers that will be launched "
              "to retrieve checkplot key values used for "
              "sorting and filtering (default: %(default)s)")
    )

    args = aparser.parse_args()

    # a chunk size below 1 can't be used to split the output list
    if args.splitout < 1:
        aparser.error('--splitout must be a positive integer, got: %s' %
                      args.splitout)

    checkplotbasedir = args.cpdir
    fileglob = args.search
    splitout = args.splitout
    outprefix = args.outprefix if args.outprefix else None

    # the pool needs at least one worker
    nworkers = max(1, args.maxkeyworkers)

    # see if there's a sorting order
    if args.sortby:

        sortspec = args.sortby.split('|')
        if len(sortspec) != 2:
            aparser.error(
                "--sortby must be of the form '<sortkey>|<asc or desc>', "
                "got: %r" % args.sortby
            )

        sortkey, sortorder = sortspec
        if outprefix is None:
            outprefix = args.sortby
    else:
        sortkey, sortorder = 'objectid', 'asc'

    # see if there's a filter condition
    if args.filterby:

        filterkeys, filterconditions = [], []

        # load all the filters
        for filt in args.filterby:

            f = filt.split('|')
            if len(f) < 2:
                aparser.error(
                    "--filterby must be of the form "
                    "'<filterkey>|<filteroperator>@<filteroperand>', "
                    "got: %r" % filt
                )

            filterkeys.append(f[0])
            filterconditions.append(f[1])

        # generate the output file's prefix
        if outprefix is None:
            outprefix = '-'.join(args.filterby)
        else:
            outprefix = '%s-%s' % ('-'.join(args.filterby), outprefix)
    else:
        filterkeys, filterconditions = None, None

    # argparse's `choices` already restricts cptype to 'pkl' or 'png', and the
    # file extension to search for is the same string
    checkplotext = args.cptype

    # these decide which of the sort/filter operations are requested
    dosort = bool(sortkey and sortorder)
    dofilter = bool(filterkeys and filterconditions)

    #######################
    ## NOW START WORKING ##
    #######################

    currdir = os.getcwd()

    checkplotglob = os.path.join(checkplotbasedir,
                                 '%s.%s' % (fileglob, checkplotext))

    print('searching for checkplots: %s' % checkplotglob)

    searchresults = glob.glob(checkplotglob)

    if searchresults:

        print('found %s checkplot files in dir: %s' %
              (len(searchresults), checkplotbasedir))

        # see if we should sort the searchresults in some special order
        # this requires an arg on the commandline of the form:
        # '<sortkey>|<asc or desc>'
        # where sortkey is some key in the checkplot pickle:
        #   this can be a simple key: e.g. objectid
        #   or it can be a composite key: e.g. varinfo.varfeatures.stetsonj
        # and sortorder is either 'asc' or 'desc' for ascending/descending sort

        # filter conditions are of the form:
        # '<filterkey>|<condition>@<operand>' where <condition> is one of the
        # keys of FILTEROPS and <operand> is a string, float, or int to use
        # when applying <condition>. several of these may be given; they are
        # ANDed together.

        # first, take care of sort keys
        sortdone = False

        # second, take care of any filters
        filterok = False
        filterstatements = []

        # make sure we only run these operations on checkplot pickles
        if (args.cptype == 'pkl') and (dosort or dofilter):

            keystoget = []

            # handle sorting
            if dosort:

                print('sorting checkplot pickles by %s in order: %s' %
                      (sortkey, sortorder))

                # dereference the sort key. if there are any integers in the
                # sortkey string, these are interpreted as actual integer
                # indexes of lists or integer keys for dicts; this allows us to
                # move into arrays easily by indexing them
                keystoget.append(_parse_key_address(sortkey))

            # handle filtering
            if dofilter:

                print('filtering checkplot pickles by %s using: %s' %
                      (filterkeys, filterconditions))

                # add all the filtkeys to the list of keys to get
                for fdk in filterkeys:
                    keystoget.append(_parse_key_address(fdk))

            print('retrieving checkplot info using %s workers...'
                  % nworkers)

            # launch the key retrieval; always shut the pool down, even if one
            # of the workers raises
            pool = mp.Pool(nworkers)
            try:
                tasks = [(x, keystoget) for x in searchresults]
                keytargets = pool.map(checkplot_infokey_worker, tasks)
            finally:
                pool.close()
                pool.join()

            # now that we have keys, we need to use them
            # keys will be returned in the order we put them into keystoget:
            # the sort key (if any) first, followed by all the filter keys
            sorttargets, filtertargets = None, None

            if dosort and dofilter:

                # the first elem is sort key targets
                sorttargets = [x[0] for x in keytargets]

                # all of the rest are filter targets
                filtertargets = [x[1:] for x in keytargets]

            elif dosort:

                # each result is a one-element list holding the sort target
                sorttargets = keytargets

            elif dofilter:

                # every element of each result is a filter target
                filtertargets = keytargets

            # turn the search results into an np.array before we do
            # sorting/filtering
            searchresults = np.array(searchresults)

            if sorttargets:

                sorttargets = np.ravel(np.array(sorttargets))

                sortind = np.argsort(sorttargets)
                if sortorder == 'desc':
                    sortind = sortind[::-1]

                # sort the search results in the requested order
                searchresults = searchresults[sortind]
                sortdone = True

            if filtertargets:

                # now figure out the filter conditions: <condition>@<operand>
                # where <condition> is one of the keys of FILTEROPS and
                # <operand> is a string, float, or int to use when applying
                # <condition>

                finalfilterind = []

                for ind, fcond in enumerate(filterconditions):

                    thisftarget = np.array([x[ind] for x in filtertargets])

                    # don't forget to also sort the filtertargets in the same
                    # order as sorttargets so we get the correct objects
                    if sortdone:
                        thisftarget = thisftarget[sortind]

                    try:

                        foperator, foperand = fcond.split('@')
                        foperator = FILTEROPS[foperator]

                        # SECURITY NOTE: this is a straight eval of text taken
                        # from the command line, which is unsafe. it's
                        # tolerable only because the person running this
                        # script is the one supplying the filter spec. never
                        # feed this untrusted input.
                        filterstr = (
                            'np.isfinite(thisftarget) & (thisftarget %s %s)' %
                            (foperator, foperand)
                        )
                        filterind = eval(filterstr)

                        # add this filter to the finalfilterind
                        finalfilterind.append(filterind)

                        # update the filterstatements
                        filterstatements.append('%s %s %s' % (filterkeys[ind],
                                                              foperator,
                                                              foperand))

                    except Exception as e:

                        print('ERR! could not understand filter spec: %s'
                              '\nexception was: %s' %
                              (args.filterby[ind], e))
                        print('WRN! not applying broken filter')

                #
                # DONE with evaluating each filter, get final results below
                #
                if finalfilterind:

                    # column stack the overall filter ind and do a logical AND
                    # across the rows
                    finalfilterind = np.all(np.column_stack(finalfilterind),
                                            axis=1)

                    # these are the final results after ANDing all the filters
                    filterresults = searchresults[finalfilterind]

                    # make sure we got some results
                    if filterresults.size > 0:

                        print('filters applied: %s -> objects found: %s ' %
                              (repr(args.filterby), filterresults.size))
                        searchresults = filterresults
                        filterok = True

                    # otherwise, applying all of the filters killed everything
                    else:
                        print('WRN! filtering failed! %s -> ZERO objects '
                              'found!' % (repr(args.filterby), ))
                        print('WRN! not applying any filters')

                # every single filter spec was broken, so there's nothing to
                # stack; keep the unfiltered list instead of crashing
                else:
                    print('WRN! none of the filters %s could be applied' %
                          (repr(args.filterby), ))
                    print('WRN! not applying any filters')

            # all done with sorting and filtering
            # turn the searchresults back into a list
            searchresults = searchresults.tolist()

            # if there's no special sort order defined, use the usual sort order
            # at the end after filtering
            if not dosort:

                print('WRN! no special sort key and order/'
                      'filter key and condition specified, '
                      'sorting checkplot pickles '
                      'using usual alphanumeric sort...')

                searchresults = sorted(searchresults)
                sortkey = 'filename'
                sortorder = 'asc'

        # ceiling division: only start a new chunk when there are actually
        # more than a multiple of `splitout` items, so no empty trailing chunk
        # is produced
        nchunks = max(1, (len(searchresults) + splitout - 1) // splitout)

        searchchunks = [searchresults[x*splitout:x*splitout+splitout] for x
                        in range(nchunks)]

        if nchunks > 1:
            print('WRN! more than %s checkplots in final list, '
                  'splitting into %s chunks' % (splitout, nchunks))

        # if the filter failed, zero out the filter statements
        if dofilter and not filterok:
            filterstatements = []

        # generate the output
        for chunkind, chunk in enumerate(searchchunks):

            # figure out if we need to split the JSON file
            outfname = '%scheckplot-filelist%s.json' % (
                ('%s-' % outprefix if outprefix is not None else ''),
                ('-%02i' % chunkind if len(searchchunks) > 1 else ''),
            )

            # the sort/filter specs used for the prefix contain | and @; take
            # these out of the file name only, so the working directory part
            # of the path is left untouched
            outfname = outfname.replace('|','_').replace('@','_')

            outjson = os.path.abspath(os.path.join(currdir, outfname))

            outdict = {
                'checkplots':chunk,
                'nfiles':len(chunk),
                'sortkey':sortkey,
                'sortorder':sortorder,
                'filterstatements':filterstatements
            }

            # ask if the checkplot list JSON should be updated
            if os.path.exists(outjson):

                answer = input(
                    'There is an existing '
                    'checkplot list file in this '
                    'directory:\n    %s\nDo you want to '
                    'overwrite it completely? (default: no) [y/n] ' %
                    outjson
                )

                # if it's OK to overwrite, then do so
                if answer == 'y':

                    print('WRN! completely overwriting '
                          'existing checkplot list %s' % outjson)

                # if it's not OK to overwrite, then keep whatever other keys
                # the existing file has and only replace the checkplot list,
                # sort info, and filter statements
                else:

                    print('only updating existing checkplot list '
                          'file with any new checkplot pickles')

                    with open(outjson,'r') as infd:
                        indict = json.load(infd)

                    indict.update(outdict)
                    outdict = indict

            # write the new or updated list to the output file
            with open(outjson,'w') as outfd:
                json.dump(outdict, outfd)

            if os.path.exists(outjson):
                print('checkplot file list written to %s' % outjson)
            else:
                print('ERR! writing the checkplot file list failed!')

    else:

        print('ERR! no checkplots found in %s' % checkplotbasedir)


if __name__ == '__main__':
    main()