Source code for astrobase.lcproc

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# lcformat.py - Waqas Bhatti (wbhatti@astro.princeton.edu) - Feb 2019

'''This package contains functions that help drive large batch-processing jobs
for light curves.

This top level module contains functions to import custom light curve
formats. Once you have your own LC format registered with `lcproc`, all of the
submodules in this package can be used to process these LCs:

- :py:mod:`astrobase.lcproc.awsrun`: contains driver functions that run
  batch-processing of light curve period-finding and checkplot making using
  resources from Amazon AWS: EC2 for processing, S3 for storage, and SQS for
  queuing work.

- :py:mod:`astrobase.lcproc.catalogs`: contains functions that generate catalogs
  from collections of light curves, make KD-Trees for fast spatial matching, and
  augment these catalogs from the rich object information contained in checkplot
  pickles.

- :py:mod:`astrobase.lcproc.checkplotgen`: contains functions that drive
  batch-jobs to make checkplot pickles for a large collection of light curves
  (and optional period-finding results).

- :py:mod:`astrobase.lcproc.checkplotproc`: contains functions that add extra
  information to checkplot pickles, including color-magnitude diagrams, updating
  neighbor light curves, and cross-matches to external catalogs.

- :py:mod:`astrobase.lcproc.epd`: contains functions that drive batch-jobs for
  External Parameter Decorrelation on collections of light curves.

- :py:mod:`astrobase.lcproc.lcbin`: contains functions that drive batch-jobs
  for time-binning collections of light curves to a specified cadence.

- :py:mod:`astrobase.lcproc.lcpfeatures`: contains functions that drive
  batch-jobs to calculate features of phased light curves, if period-finding
  results for these are available. These periodic light curve features can be
  used later to do variable star classification.

- :py:mod:`astrobase.lcproc.lcsfeatures`: contains functions that drive
  batch-jobs to calculate color, coordinate, and neighbor proximity features for
  a collection of light curves. These can be used later to do variable star
  classification.

- :py:mod:`astrobase.lcproc.lcvfeatures`: contains functions that drive
  batch-jobs to calculate non-periodic features of unphased light curves
  (e.g. time-series moments and variability indices). These can be used later to
  do variable star classification.

- :py:mod:`astrobase.lcproc.periodsearch`: contains functions that drive
  batch-jobs to run period-finding using any of the methods in
  :py:mod:`astrobase.periodbase` on collections of light curves. These produce
  period-finder result pickles that can be used transparently by the functions
  in :py:mod:`astrobase.lcproc.checkplotgen` and
  :py:mod:`astrobase.lcproc.checkplotproc` to generate and update checkplot
  pickles.

- :py:mod:`astrobase.lcproc.tfa`: contains functions that drive the application
  of the Trend Filtering Algorithm (TFA) to large collections of light curves.

- :py:mod:`astrobase.lcproc.varthreshold`: contains functions that help decide
  where to place thresholds on several variability indices for a collection of
  light curves to maximize recovery of actual variable stars.

'''

#############
## LOGGING ##
#############

import logging
from astrobase import log_sub, log_fmt, log_date_fmt

DEBUG = False
if DEBUG:
    level = logging.DEBUG
else:
    level = logging.INFO
LOGGER = logging.getLogger(__name__)
logging.basicConfig(
    level=level,
    style=log_sub,
    format=log_fmt,
    datefmt=log_date_fmt,
)

LOGDEBUG = LOGGER.debug
LOGINFO = LOGGER.info
LOGWARNING = LOGGER.warning
LOGERROR = LOGGER.error
LOGEXCEPTION = LOGGER.exception


#############
## IMPORTS ##
#############

try:
    import cPickle as pickle
except Exception:
    import pickle
import gzip
import os.path
import os
import importlib
import sys
import json


# to turn a list of keys into a dict address
# from https://stackoverflow.com/a/14692747
from functools import reduce, partial
from operator import getitem


def _dict_get(datadict, keylist):
    return reduce(getitem, keylist, datadict)


#################################
## PICKLE LC READING FUNCTIONS ##
#################################

def _read_pklc(lcfile):
    '''
    This just reads a light curve pickle file.

    Parameters
    ----------

    lcfile : str
        The file name of the pickle to open.

    Returns
    -------

    dict
        This returns an lcdict.

    '''

    if lcfile.endswith('.gz'):

        try:
            with gzip.open(lcfile,'rb') as infd:
                lcdict = pickle.load(infd)
        except UnicodeDecodeError:
            with gzip.open(lcfile,'rb') as infd:
                lcdict = pickle.load(infd, encoding='latin1')

    else:

        try:
            with open(lcfile,'rb') as infd:
                lcdict = pickle.load(infd)
        except UnicodeDecodeError:
            with open(lcfile,'rb') as infd:
                lcdict = pickle.load(infd, encoding='latin1')

    return lcdict


#################################
## LIGHT CURVE FORMAT HANDLING ##
#################################

def _check_extmodule(module, formatkey):
    '''This imports the module specified.

    Used to dynamically import Python modules that are needed to support LC
    formats not natively supported by astrobase.

    Parameters
    ----------

    module : str
        This is either:

        - a Python module import path, e.g. 'astrobase.lcproc.catalogs' or
        - a path to a Python file, e.g. '/astrobase/hatsurveys/hatlc.py'

        that contains the Python module that contains functions used to open
        (and optionally normalize) a custom LC format that's not natively
        supported by astrobase.

    formatkey : str
        A str used as the unique ID of this LC format for all lcproc functions
        and can be used to look it up later and import the correct functions
        needed to support it for lcproc operations. For example, we use
        'kep-fits' as a the specifier for Kepler FITS light curves, which can be
        read by the `astrobase.astrokep.read_kepler_fitslc` function as
        specified by the `<astrobase install path>/data/lcformats/kep-fits.json`
        LC format specification JSON.

    Returns
    -------

    Python module
        This returns a Python module if it's able to successfully import it.

    '''

    try:

        if os.path.exists(module):

            sys.path.append(os.path.dirname(module))
            importedok = importlib.import_module(
                os.path.basename(module.replace('.py',''))
            )

        else:
            importedok = importlib.import_module(module)

    except Exception:

        LOGEXCEPTION('could not import the module: %s for LC format: %s. '
                     'check the file path or fully qualified module name?'
                     % (module, formatkey))
        importedok = False

    return importedok


[docs]def register_lcformat(formatkey, fileglob, timecols, magcols, errcols, readerfunc_module, readerfunc, readerfunc_kwargs=None, normfunc_module=None, normfunc=None, normfunc_kwargs=None, magsarefluxes=False, overwrite_existing=False, lcformat_dir='~/.astrobase/lcformat-jsons'): '''This adds a new LC format to the astrobase LC format registry. Allows handling of custom format light curves for astrobase lcproc drivers. Once the format is successfully registered, light curves should work transparently with all of the functions in this module, by simply calling them with the `formatkey` in the `lcformat` keyword argument. LC format specifications are generated as JSON files. astrobase comes with several of these in `<astrobase install path>/data/lcformats`. LC formats you add by using this function will have their specifiers written to the `~/.astrobase/lcformat-jsons` directory in your home directory. Parameters ---------- formatkey : str A str used as the unique ID of this LC format for all lcproc functions and can be used to look it up later and import the correct functions needed to support it for lcproc operations. For example, we use 'kep-fits' as a the specifier for Kepler FITS light curves, which can be read by the `astrobase.astrokep.read_kepler_fitslc` function as specified by the `<astrobase install path>/data/lcformats/kep-fits.json` LC format specification JSON produced by `register_lcformat`. fileglob : str The default UNIX fileglob to use to search for light curve files in this LC format. This is a string like '*-whatever-???-*.*??-.lc'. timecols,magcols,errcols : list of str These are all lists of strings indicating which keys in the lcdict produced by your `lcreader_func` that will be extracted and used by lcproc functions for processing. The lists must all have the same dimensions, e.g. if timecols = ['timecol1','timecol2'], then magcols must be something like ['magcol1','magcol2'] and errcols must be something like ['errcol1', 'errcol2']. This allows you to process multiple apertures or multiple types of measurements in one go. Each element in these lists can be a simple key, e.g. 'time' (which would correspond to lcdict['time']), or a composite key, e.g. 'aperture1.times.rjd' (which would correspond to lcdict['aperture1']['times']['rjd']). See the examples in the lcformat specification JSON files in `<astrobase install path>/data/lcformats`. readerfunc_module : str This is either: - a Python module import path, e.g. 'astrobase.lcproc.catalogs' or - a path to a Python file, e.g. '/astrobase/hatsurveys/hatlc.py' that contains the Python module that contains functions used to open (and optionally normalize) a custom LC format that's not natively supported by astrobase. readerfunc : str This is the function name in `readerfunc_module` to use to read light curves in the custom format. This MUST always return a dictionary (the 'lcdict') with the following signature (the keys listed below are required, but others are allowed):: {'objectid': this object's identifier as a string, 'objectinfo':{'ra': this object's right ascension in decimal deg, 'decl': this object's declination in decimal deg, 'ndet': the number of observations in this LC, 'objectid': the object ID again for legacy reasons}, ...other time columns, mag columns go in as their own keys} normfunc_kwargs : dict or None This is a dictionary containing any kwargs to pass through to the light curve norm function. normfunc_module : str or None This is either: - a Python module import path, e.g. 'astrobase.lcproc.catalogs' or - a path to a Python file, e.g. '/astrobase/hatsurveys/hatlc.py' - None, in which case we'll use default normalization that contains the Python module that contains functions used to normalize a custom LC format that's not natively supported by astrobase. normfunc : str or None This is the function name in `normfunc_module` to use to normalize light curves in the custom format. If None, the default normalization method used by lcproc is to find gaps in the time-series, normalize measurements grouped by these gaps to zero, then normalize the entire magnitude time series to global time series median using the `astrobase.lcmath.normalize_magseries` function. If this is provided, the normalization function should take and return an lcdict of the same form as that produced by `readerfunc` above. For an example of a specific normalization function, see `normalize_lcdict_by_inst` in the `astrobase.hatsurveys.hatlc` module. normfunc_kwargs : dict or None This is a dictionary containing any kwargs to pass through to the light curve normalization function. magsarefluxes : bool If this is True, then all lcproc functions will treat the measurement columns in the lcdict produced by your `readerfunc` as flux instead of mags, so things like default normalization and sigma-clipping will be done correctly. If this is False, magnitudes will be treated as magnitudes. overwrite_existing : bool If this is True, this function will overwrite any existing LC format specification JSON with the same name as that provided in the `formatkey` arg. This can be used to update LC format specifications while keeping the `formatkey` the same. lcformat_dir : str This specifies the directory where the the LC format specification JSON produced by this function will be written. By default, this goes to the `.astrobase/lcformat-jsons` directory in your home directory. Returns ------- str Returns the file path to the generated LC format specification JSON file. ''' LOGINFO('adding %s to LC format registry...' % formatkey) # search for the lcformat_dir and create it if it doesn't exist lcformat_dpath = os.path.abspath( os.path.expanduser(lcformat_dir) ) if not os.path.exists(lcformat_dpath): os.makedirs(lcformat_dpath) lcformat_jsonpath = os.path.join(lcformat_dpath,'%s.json' % formatkey) if os.path.exists(lcformat_jsonpath) and not overwrite_existing: LOGERROR('There is an existing lcformat JSON: %s ' 'for this formatkey: %s and ' 'overwrite_existing = False, skipping...' % (lcformat_jsonpath, formatkey)) return None # see if we can import the reader module readermodule = _check_extmodule(readerfunc_module, formatkey) if not readermodule: LOGERROR("could not import the required " "module: %s to read %s light curves" % (readerfunc_module, formatkey)) return None # then, get the function we need to read the light curve try: getattr(readermodule, readerfunc) readerfunc_in = readerfunc except AttributeError: LOGEXCEPTION('Could not get the specified reader ' 'function: %s for lcformat: %s ' 'from module: %s' % (formatkey, readerfunc_module, readerfunc)) raise # see if we can import the normalization module if normfunc_module: normmodule = _check_extmodule(normfunc_module, formatkey) if not normmodule: LOGERROR("could not import the required " "module: %s to normalize %s light curves" % (normfunc_module, formatkey)) return None else: normmodule = None # finally, get the function we need to normalize the light curve if normfunc_module and normfunc: try: getattr(normmodule, normfunc) normfunc_in = normfunc except AttributeError: LOGEXCEPTION('Could not get the specified norm ' 'function: %s for lcformat: %s ' 'from module: %s' % (normfunc, formatkey, normfunc_module)) raise else: normfunc_in = None # if we made it to here, then everything's good. generate the JSON # structure formatdict = {'fileglob':fileglob, 'timecols':timecols, 'magcols':magcols, 'errcols':errcols, 'magsarefluxes':magsarefluxes, 'lcreader_module':readerfunc_module, 'lcreader_func':readerfunc_in, 'lcreader_kwargs':readerfunc_kwargs, 'lcnorm_module':normfunc_module, 'lcnorm_func':normfunc_in, 'lcnorm_kwargs':normfunc_kwargs} # write this to the lcformat directory with open(lcformat_jsonpath,'w') as outfd: json.dump(formatdict, outfd, indent=4) return lcformat_jsonpath
[docs]def get_lcformat(formatkey, use_lcformat_dir=None): '''This loads an LC format description from a previously-saved JSON file. Parameters ---------- formatkey : str The key used to refer to the LC format. This is part of the JSON file's name, e.g. the format key 'hat-csv' maps to the format JSON file: '<astrobase install path>/data/lcformats/hat-csv.json'. use_lcformat_dir : str or None If provided, must be the path to a directory that contains the corresponding lcformat JSON file for `formatkey`. If this is None, this function will look for lcformat JSON files corresponding to the given `formatkey`: - first, in the directory specified in this kwarg, - if not found there, in the home directory: ~/.astrobase/lcformat-jsons - if not found there, in: <astrobase install path>/data/lcformats Returns ------- tuple A tuple of the following form is returned:: (fileglob : the file glob of the associated LC files, readerfunc_in : the imported Python function for reading LCs, timecols : list of time col keys to get from the lcdict, magcols : list of mag col keys to get from the lcdict , errcols : list of err col keys to get from the lcdict, magsarefluxes : True if the measurements are fluxes not mags, normfunc_in : the imported Python function for normalizing LCs) All `astrobase.lcproc` functions can then use this tuple to dynamically import your LC reader and normalization functions to work with your LC format transparently. ''' if isinstance(use_lcformat_dir, str): # look for the lcformat JSON lcformat_jsonpath = os.path.join( use_lcformat_dir, '%s.json' % formatkey ) if not os.path.exists(lcformat_jsonpath): lcformat_jsonpath = os.path.join( os.path.expanduser('~/.astrobase/lcformat-jsons'), '%s.json' % formatkey ) if not os.path.exists(lcformat_jsonpath): install_path = os.path.dirname(__file__) install_path = os.path.abspath( os.path.join(install_path, '..', 'data','lcformats') ) lcformat_jsonpath = os.path.join( install_path, '%s.json' % formatkey ) if not os.path.exists(lcformat_jsonpath): LOGERROR('could not find an lcformat JSON ' 'for formatkey: %s in any of: ' 'use_lcformat_dir, home directory, ' 'astrobase installed data directory' % formatkey) return None else: lcformat_jsonpath = os.path.join( os.path.expanduser('~/.astrobase/lcformat-jsons'), '%s.json' % formatkey ) if not os.path.exists(lcformat_jsonpath): install_path = os.path.dirname(__file__) install_path = os.path.abspath( os.path.join(install_path, '..', 'data','lcformats') ) lcformat_jsonpath = os.path.join( install_path, '%s.json' % formatkey ) if not os.path.exists(lcformat_jsonpath): LOGERROR('could not find an lcformat JSON ' 'for formatkey: %s in any of: ' 'use_lcformat_dir, home directory, ' 'astrobase installed data directory' % formatkey) return None # load the found lcformat JSON with open(lcformat_jsonpath) as infd: lcformatdict = json.load(infd) readerfunc_module = lcformatdict['lcreader_module'] readerfunc = lcformatdict['lcreader_func'] readerfunc_kwargs = lcformatdict['lcreader_kwargs'] normfunc_module = lcformatdict['lcnorm_module'] normfunc = lcformatdict['lcnorm_func'] normfunc_kwargs = lcformatdict['lcnorm_kwargs'] fileglob = lcformatdict['fileglob'] timecols = lcformatdict['timecols'] magcols = lcformatdict['magcols'] errcols = lcformatdict['errcols'] magsarefluxes = lcformatdict['magsarefluxes'] # import all the required bits # see if we can import the reader module readermodule = _check_extmodule(readerfunc_module, formatkey) if not readermodule: LOGERROR("could not import the required " "module: %s to read %s light curves" % (readerfunc_module, formatkey)) return None # then, get the function we need to read the light curve try: readerfunc_in = getattr(readermodule, readerfunc) except AttributeError: LOGEXCEPTION('Could not get the specified reader ' 'function: %s for lcformat: %s ' 'from module: %s' % (formatkey, readerfunc_module, readerfunc)) raise # see if we can import the normalization module if normfunc_module: normmodule = _check_extmodule(normfunc_module, formatkey) if not normmodule: LOGERROR("could not import the required " "module: %s to normalize %s light curves" % (normfunc_module, formatkey)) return None else: normmodule = None # finally, get the function we need to normalize the light curve if normfunc_module and normfunc: try: normfunc_in = getattr(normmodule, normfunc) except AttributeError: LOGEXCEPTION('Could not get the specified norm ' 'function: %s for lcformat: %s ' 'from module: %s' % (formatkey, normfunc_module, normfunc)) raise else: normfunc_in = None # add in any optional kwargs that need to be there for readerfunc if isinstance(readerfunc_kwargs, dict): readerfunc_in = partial(readerfunc_in, **readerfunc_kwargs) # add in any optional kwargs that need to be there for normfunc if normfunc_in is not None: if isinstance(normfunc_kwargs, dict): normfunc_in = partial(normfunc_in, **normfunc_kwargs) # assemble the return tuple # this can be used directly by other lcproc functions returntuple = ( fileglob, readerfunc_in, timecols, magcols, errcols, magsarefluxes, normfunc_in, ) return returntuple