
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# astrokep.py - Waqas Bhatti (wbhatti@astro.princeton.edu) - 05/2016

'''
Contains various useful tools for analyzing Kepler light curves.

'''

#############
## LOGGING ##
#############

import logging
from astrobase import log_sub, log_fmt, log_date_fmt

DEBUG = False
if DEBUG:
    level = logging.DEBUG
else:
    level = logging.INFO
LOGGER = logging.getLogger(__name__)
logging.basicConfig(
    level=level,
    style=log_sub,
    format=log_fmt,
    datefmt=log_date_fmt,
)

LOGDEBUG = LOGGER.debug
LOGINFO = LOGGER.info
LOGWARNING = LOGGER.warning
LOGERROR = LOGGER.error
LOGEXCEPTION = LOGGER.exception


#############
## IMPORTS ##
#############

import glob
import os.path
import pickle
import gzip
import os

import numpy as np

from numpy import (
    nan as npnan, sum as npsum, abs as npabs,
    isfinite as npisfinite,
    median as npmedian,
    array as nparray,
    max as npmax, min as npmin,
    log10 as nplog10, pi as MPI, floor as npfloor,
    argsort as npargsort, cos as npcos, sin as npsin,
    zeros_like as npzeros_like, full_like as npfull_like,
    ones as npones,
    column_stack as npcolumn_stack, in1d as npin1d, append as npappend,
    unique as npunique, concatenate as npconcatenate
)

from numpy.polynomial.legendre import Legendre
from scipy.optimize import leastsq
from scipy.ndimage import median_filter
from astropy.io import fits as pyfits
# sklearn is optional; it's only needed for rfepd_kepler_lightcurve below,
# so set a flag instead of failing the whole module import
try:
    from sklearn.ensemble import RandomForestRegressor
    SKLEARN = True
except ImportError:
    SKLEARN = False

from .lcmath import sigclip_magseries, find_lc_timegroups


###########################################
## UTILITY FUNCTIONS FOR FLUXES AND MAGS ##
###########################################

def keplerflux_to_keplermag(keplerflux, f12=1.74e5):
    '''This converts the Kepler flux in electrons/sec to Kepler magnitude.

    The Kepler mag/flux relation is::

        fkep = (10.0**(-0.4*(kepmag - 12.0)))*f12
        f12 = 1.74e5  # electrons/sec

    Parameters
    ----------

    keplerflux : float or array-like
        The flux value(s) to convert to magnitudes.

    f12 : float
        The flux value in the Kepler band corresponding to Kepler mag = 12.0.

    Returns
    -------

    np.array
        Magnitudes in the Kepler band corresponding to the input `keplerflux`
        flux value(s).

    '''

    kepmag = 12.0 - 2.5*nplog10(keplerflux/f12)
    return kepmag


def keplermag_to_keplerflux(keplermag, f12=1.74e5):
    '''This converts the Kepler mag back to Kepler flux.

    Parameters
    ----------

    keplermag : float or array-like
        The Kepler magnitude value(s) to convert to fluxes.

    f12 : float
        The flux value in the Kepler band corresponding to Kepler mag = 12.0.

    Returns
    -------

    np.array
        Fluxes in the Kepler band corresponding to the input `keplermag`
        magnitude value(s).

    '''

    kepflux = (10.0**(-0.4*(keplermag - 12.0)))*f12
    return kepflux


def keplermag_to_sdssr(keplermag, kic_sdssg, kic_sdssr):
    '''Converts magnitude measurements in the Kepler band to SDSS r band.

    Parameters
    ----------

    keplermag : float or array-like
        The Kepler magnitude value(s) to convert to SDSS r.

    kic_sdssg,kic_sdssr : float or array-like
        The SDSS g and r magnitudes of the object(s) from the Kepler Input
        Catalog. The .llc.fits MAST light curve file for a Kepler object
        contains these values in the FITS extension 0 header.

    Returns
    -------

    float or array-like
        SDSS r band magnitude(s) converted from the Kepler band magnitude.

    '''

    kic_sdssgr = kic_sdssg - kic_sdssr

    # np.where is used here instead of a plain if-else so that this works
    # for both scalar and array-like inputs, as the docstring promises
    kepsdssr = np.where(kic_sdssgr < 0.8,
                        (keplermag - 0.2*kic_sdssg)/0.8,
                        (keplermag - 0.1*kic_sdssg)/0.9)
    return kepsdssr


def flux_ppm_to_magnitudes(ppm):
    '''This converts Kepler's flux parts-per-million to magnitudes.

    Mostly useful for turning PPMs reported by Kepler or TESS into millimag
    values to compare with ground-based surveys.

    Parameters
    ----------

    ppm : float or array-like
        Kepler flux measurement errors or RMS values in parts-per-million.

    Returns
    -------

    float or array-like
        Measurement errors or RMS values expressed in magnitudes.

    '''

    return -2.5*nplog10(1.0 - ppm/1.0e6)
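

# Example usage for the flux/mag utilities above (a minimal sketch; the
# numbers are illustrative only):
#
# >>> flux = keplermag_to_keplerflux(12.0)   # -> 1.74e5 electrons/sec
# >>> keplerflux_to_keplermag(flux)          # -> 12.0, round-trips exactly
# >>> keplermag_to_sdssr(12.5, kic_sdssg=13.0, kic_sdssr=12.6)  # g-r < 0.8
# >>> flux_ppm_to_magnitudes(1000.0)         # 1000 ppm -> ~1.09 mmag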


######################################################
## FUNCTIONS FOR READING KEPLER AND K2 LIGHT CURVES ##
######################################################

# this is the list of keys to pull out of the light curve FITS table
LCDATAKEYS = ['TIME','TIMECORR','CADENCENO',
              'SAP_QUALITY',
              'PSF_CENTR1','PSF_CENTR1_ERR','PSF_CENTR2','PSF_CENTR2_ERR',
              'MOM_CENTR1','MOM_CENTR1_ERR','MOM_CENTR2','MOM_CENTR2_ERR']

LCSAPKEYS = ['SAP_FLUX','SAP_FLUX_ERR','SAP_BKG','SAP_BKG_ERR']
LCPDCKEYS = ['PDCSAP_FLUX','PDCSAP_FLUX_ERR']

# this is the list of keys to pull out of the light curve header
LCHEADERKEYS = ['TIMESYS','BJDREFI','BJDREFF',
                'OBJECT','KEPLERID',
                'RA_OBJ','DEC_OBJ','EQUINOX',
                'EXPOSURE',
                'CDPP3_0','CDPP6_0','CDPP12_0',
                'PDCVAR','PDCMETHD','CROWDSAP','FLFRCSAP']

# this is the list of keys to pull out of the top header of the FITS
LCTOPKEYS = ['CHANNEL','SKYGROUP','MODULE','OUTPUT',
             'QUARTER','SEASON','CAMPAIGN',
             'DATA_REL','OBSMODE',
             'PMRA','PMDEC','PMTOTAL','PARALLAX',
             'GLON','GLAT',
             'GMAG','RMAG','IMAG','ZMAG','D51MAG',
             'JMAG','HMAG','KMAG','KEPMAG',
             'GRCOLOR','JKCOLOR','GKCOLOR',
             'TEFF','LOGG','FEH',
             'EBMINUSV','AV','RADIUS','TMINDEX']

# this is the list of keys to pull out of the aperture part of the light
# curve. we also pull out the whole pixel mask, which looks something like:
#
# array([[0, 1, 1, 1, 1, 1, 1, 0],
#        [1, 1, 1, 3, 3, 1, 1, 1],
#        [1, 1, 3, 3, 3, 3, 1, 1],
#        [1, 1, 3, 3, 3, 3, 3, 1],
#        [1, 1, 3, 3, 3, 3, 3, 1],
#        [1, 1, 1, 1, 3, 3, 1, 1],
#        [0, 1, 1, 1, 1, 1, 1, 0]], dtype=int32)
#
# where the value 3 marks the pixels actually used to sum the flux for this
# particular object (the optimal aperture), and 1 means the pixel was
# collected by the telescope, so its flux is available.
# we use CDELT1 and CDELT2 below to get the pixel scale in arcsec/px;
# it should be about 3.98 arcsec/pixel in most cases
LCAPERTUREKEYS = ['NPIXSAP','NPIXMISS','CDELT1','CDELT2']


def read_kepler_fitslc(
        lcfits,
        headerkeys=LCHEADERKEYS,
        datakeys=LCDATAKEYS,
        sapkeys=LCSAPKEYS,
        pdckeys=LCPDCKEYS,
        topkeys=LCTOPKEYS,
        apkeys=LCAPERTUREKEYS,
        appendto=None,
        normalize=False,
):
    '''This extracts the light curve from a single Kepler or K2 LC FITS file.

    This works on the light curves available at MAST:

    - `kplr{kepid}-{somedatething}_llc.fits` files from the Kepler mission
    - `ktwo{epicid}-c{campaign}_llc.fits` files from the K2 mission

    Parameters
    ----------

    lcfits : str
        The filename of a MAST Kepler/K2 light curve FITS file.

    headerkeys : list
        A list of FITS header keys that will be extracted from the FITS light
        curve file. These describe the observations. The default value for
        this is given in `LCHEADERKEYS` above.

    datakeys : list
        A list of FITS column names that correspond to the auxiliary
        measurements in the light curve. The default is `LCDATAKEYS` above.

    sapkeys : list
        A list of FITS column names that correspond to the SAP flux
        measurements in the light curve. The default is `LCSAPKEYS` above.

    pdckeys : list
        A list of FITS column names that correspond to the PDC flux
        measurements in the light curve. The default is `LCPDCKEYS` above.

    topkeys : list
        A list of FITS header keys that describe the object in the light
        curve. The default is `LCTOPKEYS` above.

    apkeys : list
        A list of FITS header keys that describe the flux measurement
        apertures used by the Kepler/K2 pipeline. The default is
        `LCAPERTUREKEYS` above.

    appendto : lcdict or None
        If appendto is an `lcdict`, will append measurements of this `lcdict`
        to that `lcdict`. This is used for consolidating light curves for the
        same object across different files (quarters). The appending does not
        care about the time order. To consolidate light curves in time order,
        use `consolidate_kepler_fitslc` below.

    normalize : bool
        If True, then each component light curve's SAP_FLUX and PDCSAP_FLUX
        measurements will be normalized to 1.0 by dividing out the median
        flux for the component light curve.

    Returns
    -------

    lcdict
        Returns an `lcdict` (this is useable by most astrobase functions for
        LC processing).

    '''

    # read the fits file
    hdulist = pyfits.open(lcfits)
    lchdr, lcdata = hdulist[1].header, hdulist[1].data
    lctophdr, lcaperturehdr, lcaperturedata = (hdulist[0].header,
                                               hdulist[2].header,
                                               hdulist[2].data)
    hdulist.close()

    hdrinfo = {}

    # now get the values we want from the header
    for key in headerkeys:
        if key in lchdr and lchdr[key] is not None:
            hdrinfo[key.lower()] = lchdr[key]
        else:
            hdrinfo[key.lower()] = None

    # get the number of detections
    ndet = lchdr['NAXIS2']

    # get the info from the topheader
    for key in topkeys:
        if key in lctophdr and lctophdr[key] is not None:
            hdrinfo[key.lower()] = lctophdr[key]
        else:
            hdrinfo[key.lower()] = None

    # get the aperture info from the lcaperturehdr
    # NOTE: this iterates over apkeys (instead of the header itself) so the
    # apkeys kwarg actually takes effect
    for key in apkeys:
        if key in lcaperturehdr and lcaperturehdr[key] is not None:
            hdrinfo[key.lower()] = lcaperturehdr[key]
        else:
            hdrinfo[key.lower()] = None

    # if we're appending to another lcdict
    if appendto and isinstance(appendto, dict):

        lcdict = appendto

        lcdict['quarter'].append(hdrinfo['quarter'])
        lcdict['season'].append(hdrinfo['season'])
        lcdict['datarelease'].append(hdrinfo['data_rel'])
        lcdict['obsmode'].append(hdrinfo['obsmode'])
        lcdict['campaign'].append(hdrinfo['campaign'])
        # we don't update the objectid

        # update lcinfo
        lcdict['lcinfo']['timesys'].append(hdrinfo['timesys'])
        lcdict['lcinfo']['bjdoffset'].append(
            hdrinfo['bjdrefi'] + hdrinfo['bjdreff']
        )
        lcdict['lcinfo']['exptime'].append(hdrinfo['exposure'])
        lcdict['lcinfo']['lcaperture'].append(lcaperturedata)
        lcdict['lcinfo']['aperpixused'].append(hdrinfo['npixsap'])
        lcdict['lcinfo']['aperpixunused'].append(hdrinfo['npixmiss'])
        lcdict['lcinfo']['pixarcsec'].append(
            (npabs(hdrinfo['cdelt1']) +
             npabs(hdrinfo['cdelt2']))*3600.0/2.0
        )
        lcdict['lcinfo']['channel'].append(hdrinfo['channel'])
        lcdict['lcinfo']['skygroup'].append(hdrinfo['skygroup'])
        lcdict['lcinfo']['module'].append(hdrinfo['module'])
        lcdict['lcinfo']['output'].append(hdrinfo['output'])
        lcdict['lcinfo']['ndet'].append(ndet)

        # the objectinfo is not updated for the same object when appending to
        # a light curve. FIXME: maybe it should be?

        # update the varinfo for this light curve
        lcdict['varinfo']['cdpp3_0'].append(hdrinfo['cdpp3_0'])
        lcdict['varinfo']['cdpp6_0'].append(hdrinfo['cdpp6_0'])
        lcdict['varinfo']['cdpp12_0'].append(hdrinfo['cdpp12_0'])
        lcdict['varinfo']['pdcvar'].append(hdrinfo['pdcvar'])
        lcdict['varinfo']['pdcmethod'].append(hdrinfo['pdcmethd'])
        lcdict['varinfo']['aper_target_total_ratio'].append(
            hdrinfo['crowdsap']
        )
        lcdict['varinfo']['aper_target_frac'].append(hdrinfo['flfrcsap'])

        # update the light curve columns now
        for key in datakeys:
            if key.lower() in lcdict:
                lcdict[key.lower()] = (
                    npconcatenate((lcdict[key.lower()], lcdata[key]))
                )

        # the SAP keys are all normalized by this component LC's median SAP
        # flux if requested
        sapflux_median = np.nanmedian(lcdata['SAP_FLUX'])

        for key in sapkeys:

            if key.lower() in lcdict['sap']:

                if normalize and key in ('SAP_FLUX','SAP_FLUX_ERR',
                                         'SAP_BKG','SAP_BKG_ERR'):
                    thislcdata = lcdata[key] / sapflux_median
                else:
                    thislcdata = lcdata[key]

                lcdict['sap'][key.lower()] = (
                    np.concatenate((lcdict['sap'][key.lower()], thislcdata))
                )

        # same for the PDC keys, using this component LC's median PDCSAP flux
        pdcsap_flux_median = np.nanmedian(lcdata['PDCSAP_FLUX'])

        for key in pdckeys:

            if key.lower() in lcdict['pdc']:

                if normalize and key in ('PDCSAP_FLUX','PDCSAP_FLUX_ERR'):
                    thislcdata = lcdata[key] / pdcsap_flux_median
                else:
                    thislcdata = lcdata[key]

                lcdict['pdc'][key.lower()] = (
                    np.concatenate((lcdict['pdc'][key.lower()], thislcdata))
                )

        # append some of the light curve information into existing numpy
        # arrays so we can sort on them later
        lcdict['lc_channel'] = npconcatenate(
            (lcdict['lc_channel'],
             npfull_like(lcdata['TIME'], hdrinfo['channel']))
        )
        lcdict['lc_skygroup'] = npconcatenate(
            (lcdict['lc_skygroup'],
             npfull_like(lcdata['TIME'], hdrinfo['skygroup']))
        )
        lcdict['lc_module'] = npconcatenate(
            (lcdict['lc_module'],
             npfull_like(lcdata['TIME'], hdrinfo['module']))
        )
        lcdict['lc_output'] = npconcatenate(
            (lcdict['lc_output'],
             npfull_like(lcdata['TIME'], hdrinfo['output']))
        )
        lcdict['lc_quarter'] = npconcatenate(
            (lcdict['lc_quarter'],
             npfull_like(lcdata['TIME'], hdrinfo['quarter']))
        )
        lcdict['lc_season'] = npconcatenate(
            (lcdict['lc_season'],
             npfull_like(lcdata['TIME'], hdrinfo['season']))
        )
        lcdict['lc_campaign'] = npconcatenate(
            (lcdict['lc_campaign'],
             npfull_like(lcdata['TIME'], hdrinfo['campaign']))
        )

    # otherwise, this is a new lcdict
    else:

        # form the lcdict
        # the metadata is one-elem arrays because we might add on to them
        # later
        lcdict = {
            'quarter':[hdrinfo['quarter']],
            'season':[hdrinfo['season']],
            'datarelease':[hdrinfo['data_rel']],
            'campaign':[hdrinfo['campaign']],  # this is None for KepPrime
            'obsmode':[hdrinfo['obsmode']],
            'objectid':hdrinfo['object'],
            'lcinfo':{
                'timesys':[hdrinfo['timesys']],
                'bjdoffset':[hdrinfo['bjdrefi'] + hdrinfo['bjdreff']],
                'exptime':[hdrinfo['exposure']],
                'lcaperture':[lcaperturedata],
                'aperpixused':[hdrinfo['npixsap']],
                'aperpixunused':[hdrinfo['npixmiss']],
                'pixarcsec':[(npabs(hdrinfo['cdelt1']) +
                              npabs(hdrinfo['cdelt2']))*3600.0/2.0],
                'channel':[hdrinfo['channel']],
                'skygroup':[hdrinfo['skygroup']],
                'module':[hdrinfo['module']],
                'output':[hdrinfo['output']],
                'ndet':[ndet],
            },
            'objectinfo':{
                'objectid':hdrinfo['object'],  # repeated for checkplot use
                'keplerid':hdrinfo['keplerid'],
                'ra':hdrinfo['ra_obj'],
                'decl':hdrinfo['dec_obj'],
                'pmra':hdrinfo['pmra'],
                'pmdecl':hdrinfo['pmdec'],
                'pmtotal':hdrinfo['pmtotal'],
                'sdssg':hdrinfo['gmag'],
                'sdssr':hdrinfo['rmag'],
                'sdssi':hdrinfo['imag'],
                'sdssz':hdrinfo['zmag'],
                'kepmag':hdrinfo['kepmag'],
                'teff':hdrinfo['teff'],
                'logg':hdrinfo['logg'],
                'feh':hdrinfo['feh'],
                'ebminusv':hdrinfo['ebminusv'],
                'extinction':hdrinfo['av'],
                'starradius':hdrinfo['radius'],
                'twomassuid':hdrinfo['tmindex'],
            },
            'varinfo':{
                'cdpp3_0':[hdrinfo['cdpp3_0']],
                'cdpp6_0':[hdrinfo['cdpp6_0']],
                'cdpp12_0':[hdrinfo['cdpp12_0']],
                'pdcvar':[hdrinfo['pdcvar']],
                'pdcmethod':[hdrinfo['pdcmethd']],
                'aper_target_total_ratio':[hdrinfo['crowdsap']],
                'aper_target_frac':[hdrinfo['flfrcsap']],
            },
            'sap':{},
            'pdc':{},
        }

        # get the LC columns
        for key in datakeys:
            lcdict[key.lower()] = lcdata[key]
        for key in sapkeys:
            lcdict['sap'][key.lower()] = lcdata[key]
        for key in pdckeys:
            lcdict['pdc'][key.lower()] = lcdata[key]

        # turn some of the light curve information into numpy arrays so we
        # can sort on them later
        lcdict['lc_channel'] = npfull_like(lcdict['time'],
                                           lcdict['lcinfo']['channel'][0])
        lcdict['lc_skygroup'] = npfull_like(lcdict['time'],
                                            lcdict['lcinfo']['skygroup'][0])
        lcdict['lc_module'] = npfull_like(lcdict['time'],
                                          lcdict['lcinfo']['module'][0])
        lcdict['lc_output'] = npfull_like(lcdict['time'],
                                          lcdict['lcinfo']['output'][0])
        lcdict['lc_quarter'] = npfull_like(lcdict['time'],
                                           lcdict['quarter'][0])
        lcdict['lc_season'] = npfull_like(lcdict['time'],
                                          lcdict['season'][0])
        lcdict['lc_campaign'] = npfull_like(lcdict['time'],
                                            lcdict['campaign'][0])

        # normalize the SAP and PDCSAP fluxes if needed
        if normalize:

            sapflux_median = np.nanmedian(lcdict['sap']['sap_flux'])
            pdcsap_flux_median = np.nanmedian(lcdict['pdc']['pdcsap_flux'])

            lcdict['sap']['sap_flux'] = (
                lcdict['sap']['sap_flux'] / sapflux_median
            )
            lcdict['sap']['sap_flux_err'] = (
                lcdict['sap']['sap_flux_err'] / sapflux_median
            )
            lcdict['sap']['sap_bkg'] = (
                lcdict['sap']['sap_bkg'] / sapflux_median
            )
            lcdict['sap']['sap_bkg_err'] = (
                lcdict['sap']['sap_bkg_err'] / sapflux_median
            )
            lcdict['pdc']['pdcsap_flux'] = (
                lcdict['pdc']['pdcsap_flux'] / pdcsap_flux_median
            )
            lcdict['pdc']['pdcsap_flux_err'] = (
                lcdict['pdc']['pdcsap_flux_err'] / pdcsap_flux_median
            )

    ## END OF LIGHT CURVE CONSTRUCTION ##

    # update the lcdict columns with the actual columns
    # NOTE: 'lc_campaign' is included here so downstream filtering keeps all
    # of the lc_* arrays in sync with the other columns
    lcdict['columns'] = (
        [x.lower() for x in datakeys] +
        ['sap.%s' % x.lower() for x in sapkeys] +
        ['pdc.%s' % x.lower() for x in pdckeys] +
        ['lc_channel','lc_skygroup','lc_module',
         'lc_output','lc_quarter','lc_season','lc_campaign']
    )

    # return the lcdict at the end
    return lcdict
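

# Example usage for `read_kepler_fitslc` (a minimal sketch; the filename
# below is hypothetical and stands in for a Kepler long-cadence FITS file
# downloaded from MAST):
#
# >>> lcd = read_kepler_fitslc('kplr010666592-2009131105131_llc.fits',
# ...                          normalize=True)
# >>> lcd['objectid']          # e.g. 'KIC 10666592'
# >>> lcd['columns'][:3]       # ['time', 'timecorr', 'cadenceno']
# >>> lcd['sap']['sap_flux']   # median-normalized SAP fluxes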


def consolidate_kepler_fitslc(keplerid,
                              lcfitsdir,
                              normalize=True,
                              headerkeys=LCHEADERKEYS,
                              datakeys=LCDATAKEYS,
                              sapkeys=LCSAPKEYS,
                              pdckeys=LCPDCKEYS,
                              topkeys=LCTOPKEYS,
                              apkeys=LCAPERTUREKEYS):
    '''This gets all Kepler/K2 light curves for the given `keplerid` in
    `lcfitsdir`.

    Searches recursively in `lcfitsdir` for all of the files belonging to the
    specified `keplerid`. Sorts the light curves by time. Returns an
    `lcdict`. This is meant to be used to consolidate light curves for a
    single object across Kepler quarters.

    NOTE: `keplerid` is an integer (without the leading zeros). This is
    usually the KIC ID.

    NOTE: if light curve time arrays contain `nans`, these and their
    associated measurements will be sorted to the end of the final combined
    arrays.

    Parameters
    ----------

    keplerid : int
        The Kepler ID of the object to consolidate LCs for, as an integer
        without any leading zeros. This is usually the KIC or EPIC ID.

    lcfitsdir : str
        The directory to look in for LCs of the specified object.

    normalize : bool
        If True, then each component light curve's SAP_FLUX and PDCSAP_FLUX
        measurements will be normalized to 1.0 by dividing out the median
        flux for the component light curve.

    headerkeys : list
        A list of FITS header keys that will be extracted from the FITS light
        curve file. These describe the observations. The default value for
        this is given in `LCHEADERKEYS` above.

    datakeys : list
        A list of FITS column names that correspond to the auxiliary
        measurements in the light curve. The default is `LCDATAKEYS` above.

    sapkeys : list
        A list of FITS column names that correspond to the SAP flux
        measurements in the light curve. The default is `LCSAPKEYS` above.

    pdckeys : list
        A list of FITS column names that correspond to the PDC flux
        measurements in the light curve. The default is `LCPDCKEYS` above.

    topkeys : list
        A list of FITS header keys that describe the object in the light
        curve. The default is `LCTOPKEYS` above.

    apkeys : list
        A list of FITS header keys that describe the flux measurement
        apertures used by the Kepler/K2 pipeline. The default is
        `LCAPERTUREKEYS` above.

    Returns
    -------

    lcdict
        Returns an `lcdict` (this is useable by most astrobase functions for
        LC processing).

    '''

    LOGINFO('looking for Kepler light curve FITS in %s for %s...' %
            (lcfitsdir, keplerid))

    matching = glob.glob(os.path.join(lcfitsdir,
                                      '**',
                                      'kplr%09i-*_llc.fits' % keplerid),
                         recursive=True)
    LOGINFO('found %s files: %s' % (len(matching), repr(matching)))

    # now that we've found everything, read them all in
    if len(matching) > 0:

        LOGINFO('consolidating...')

        # read the first file
        consolidated = read_kepler_fitslc(matching[0],
                                          headerkeys=headerkeys,
                                          datakeys=datakeys,
                                          sapkeys=sapkeys,
                                          pdckeys=pdckeys,
                                          topkeys=topkeys,
                                          apkeys=apkeys,
                                          normalize=normalize)

        # append the rest of the files, skipping the first one since it has
        # been read in already
        for lcf in matching[1:]:
            consolidated = read_kepler_fitslc(lcf,
                                              appendto=consolidated,
                                              headerkeys=headerkeys,
                                              datakeys=datakeys,
                                              sapkeys=sapkeys,
                                              pdckeys=pdckeys,
                                              topkeys=topkeys,
                                              apkeys=apkeys,
                                              normalize=normalize)

        # get the sort indices. we use time for the columns and quarters for
        # the headers
        LOGINFO('sorting by time...')

        # NOTE: nans in time will be sorted to the end of the array
        finiteind = npisfinite(consolidated['time'])
        if npsum(finiteind) < consolidated['time'].size:
            LOGWARNING('some time values are nan! '
                       'measurements at these times will be '
                       'sorted to the end of the column arrays.')

        # get the sort index
        column_sort_ind = npargsort(consolidated['time'])

        # sort the columns by time
        for col in consolidated['columns']:
            if '.' in col:
                key, subkey = col.split('.')
                consolidated[key][subkey] = (
                    consolidated[key][subkey][column_sort_ind]
                )
            else:
                consolidated[col] = consolidated[col][column_sort_ind]

        # now sort the headers by quarters
        header_sort_ind = npargsort(consolidated['quarter']).tolist()

        # this is a bit convoluted, but whatever: list -> array -> list
        for key in ('quarter', 'season', 'datarelease', 'obsmode'):
            consolidated[key] = (
                nparray(consolidated[key])[header_sort_ind].tolist()
            )

        for key in ('timesys','bjdoffset','exptime','lcaperture',
                    'aperpixused','aperpixunused','pixarcsec',
                    'channel','skygroup','module','output','ndet'):
            consolidated['lcinfo'][key] = (
                nparray(consolidated['lcinfo'][key])[header_sort_ind].tolist()
            )

        for key in ('cdpp3_0','cdpp6_0','cdpp12_0','pdcvar','pdcmethod',
                    'aper_target_total_ratio','aper_target_frac'):
            consolidated['varinfo'][key] = (
                nparray(consolidated['varinfo'][key])[header_sort_ind].tolist()
            )

        # finally, return the consolidated lcdict
        return consolidated

    # if we didn't find anything, complain
    else:

        LOGERROR('could not find any light curves '
                 'for %s in %s or its subdirectories' % (keplerid, lcfitsdir))
        return None
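

# Example usage for `consolidate_kepler_fitslc` (a minimal sketch; the
# directory below is hypothetical and should contain kplr*_llc.fits files):
#
# >>> lcd = consolidate_kepler_fitslc(10666592, '/data/kepler-lcs',
# ...                                 normalize=True)
# >>> lcd['quarter']   # all quarters found, sorted in quarter order
# >>> lcd['time']      # measurements from all quarters, sorted in time order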


########################
## READING K2 SFF LCs ##
########################

SFFTOPKEYS = LCTOPKEYS
SFFHEADERKEYS = LCHEADERKEYS + ['MASKTYPE','MASKINDE','NPIXSAP']
SFFDATAKEYS = ['T','FRAW','FCOR','ARCLENGTH','MOVING','CADENCENO']


def read_k2sff_lightcurve(lcfits):
    '''This reads a K2 SFF (Vanderburg & Johnson 2014) light curve into an
    `lcdict`.

    Use this with the light curves from the K2 SFF project at MAST.

    Parameters
    ----------

    lcfits : str
        The filename of the FITS light curve file downloaded from MAST.

    Returns
    -------

    lcdict
        Returns an `lcdict` (this is useable by most astrobase functions for
        LC processing).

    '''

    # read the fits file
    hdulist = pyfits.open(lcfits)
    lchdr, lcdata = hdulist[1].header, hdulist[1].data
    lctophdr = hdulist[0].header
    hdulist.close()

    hdrinfo = {}

    # get the number of detections
    ndet = lchdr['NAXIS2']

    # get the info from the topheader
    for key in SFFTOPKEYS:
        if key in lctophdr and lctophdr[key] is not None:
            hdrinfo[key.lower()] = lctophdr[key]
        else:
            hdrinfo[key.lower()] = None

    # now get the values we want from the header
    for key in SFFHEADERKEYS:
        if key in lchdr and lchdr[key] is not None:
            hdrinfo[key.lower()] = lchdr[key]
        else:
            hdrinfo[key.lower()] = None

    # form the lcdict
    # the metadata is one-elem arrays because we might add on to them later
    lcdict = {
        'quarter':[hdrinfo['quarter']],
        'season':[hdrinfo['season']],
        'datarelease':[hdrinfo['data_rel']],
        'obsmode':[hdrinfo['obsmode']],
        'objectid':hdrinfo['object'],
        'campaign':[hdrinfo['campaign']],
        'lcinfo':{
            'timesys':[hdrinfo['timesys']],
            'bjdoffset':[hdrinfo['bjdrefi'] + hdrinfo['bjdreff']],
            'exptime':[hdrinfo['exposure']],
            'lcapermaskidx':[hdrinfo['maskinde']],
            'lcapermasktype':[hdrinfo['masktype']],
            'aperpixused':[hdrinfo['npixsap']],
            'aperpixunused':[None],
            'pixarcsec':[None],
            'channel':[hdrinfo['channel']],
            'skygroup':[hdrinfo['skygroup']],
            'module':[hdrinfo['module']],
            'output':[hdrinfo['output']],
            'ndet':[ndet],
        },
        'objectinfo':{
            'keplerid':hdrinfo['keplerid'],
            'ra':hdrinfo['ra_obj'],
            'decl':hdrinfo['dec_obj'],
            'pmra':hdrinfo['pmra'],
            'pmdecl':hdrinfo['pmdec'],
            'pmtotal':hdrinfo['pmtotal'],
            'sdssg':hdrinfo['gmag'],
            'sdssr':hdrinfo['rmag'],
            'sdssi':hdrinfo['imag'],
            'sdssz':hdrinfo['zmag'],
            'kepmag':hdrinfo['kepmag'],
            'teff':hdrinfo['teff'],
            'logg':hdrinfo['logg'],
            'feh':hdrinfo['feh'],
            'ebminusv':hdrinfo['ebminusv'],
            'extinction':hdrinfo['av'],
            'starradius':hdrinfo['radius'],
            'twomassuid':hdrinfo['tmindex'],
        },
        'varinfo':{
            'cdpp3_0':[hdrinfo['cdpp3_0']],
            'cdpp6_0':[hdrinfo['cdpp6_0']],
            'cdpp12_0':[hdrinfo['cdpp12_0']],
            'pdcvar':[hdrinfo['pdcvar']],
            'pdcmethod':[hdrinfo['pdcmethd']],
            'aptgttotrat':[hdrinfo['crowdsap']],
            'aptgtfrac':[hdrinfo['flfrcsap']],
        },
    }

    # get the LC columns
    for key in SFFDATAKEYS:
        lcdict[key.lower()] = lcdata[key]

    # add some of the light curve information to the data arrays so we can
    # sort on them later
    lcdict['channel'] = npfull_like(lcdict['t'],
                                    lcdict['lcinfo']['channel'][0])
    lcdict['skygroup'] = npfull_like(lcdict['t'],
                                     lcdict['lcinfo']['skygroup'][0])
    lcdict['module'] = npfull_like(lcdict['t'],
                                   lcdict['lcinfo']['module'][0])
    lcdict['output'] = npfull_like(lcdict['t'],
                                   lcdict['lcinfo']['output'][0])
    lcdict['quarter'] = npfull_like(lcdict['t'],
                                    lcdict['quarter'][0])
    lcdict['season'] = npfull_like(lcdict['t'],
                                   lcdict['season'][0])
    lcdict['campaign'] = npfull_like(lcdict['t'],
                                     lcdict['campaign'][0])

    # update the lcdict columns with the actual columns
    lcdict['columns'] = (
        [x.lower() for x in SFFDATAKEYS] +
        ['channel','skygroup','module','output','quarter','season','campaign']
    )

    # return the lcdict at the end
    return lcdict
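

# Example usage for `read_k2sff_lightcurve` (a minimal sketch; the filename
# below is hypothetical and stands in for a K2 SFF FITS light curve
# downloaded from MAST):
#
# >>> lcd = read_k2sff_lightcurve('k2sff-lightcurve.fits')
# >>> lcd['columns'][:4]   # ['t', 'fraw', 'fcor', 'arclength']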


##################
## INPUT/OUTPUT ##
##################

def kepler_lcdict_to_pkl(lcdict, outfile=None):
    '''This writes the `lcdict` to a Python pickle.

    Parameters
    ----------

    lcdict : lcdict
        This is the input `lcdict` to write to a pickle.

    outfile : str or None
        If this is None, the object's Kepler ID/EPIC ID will be determined
        from the `lcdict` and used to form the filename of the output pickle
        file. If this is a `str`, the provided filename will be used.

    Returns
    -------

    str
        The absolute path to the written pickle file.

    '''

    if not outfile:
        outfile = '%s-keplc.pkl' % lcdict['objectid'].replace(' ','-')

    # we're using pickle.HIGHEST_PROTOCOL here, this will make Py3 pickles
    # unreadable for Python 2.7
    with open(outfile,'wb') as outfd:
        pickle.dump(lcdict, outfd, protocol=pickle.HIGHEST_PROTOCOL)

    return os.path.abspath(outfile)


def read_kepler_pklc(picklefile):
    '''This turns the pickled lightcurve file back into an `lcdict`.

    Parameters
    ----------

    picklefile : str
        The path to a previously written Kepler LC picklefile generated by
        `kepler_lcdict_to_pkl` above.

    Returns
    -------

    lcdict
        Returns an `lcdict` (this is useable by most astrobase functions for
        LC processing).

    '''

    if picklefile.endswith('.gz'):
        infd = gzip.open(picklefile, 'rb')
    else:
        infd = open(picklefile, 'rb')

    try:
        with infd:
            lcdict = pickle.load(infd)
    except UnicodeDecodeError:
        with open(picklefile,'rb') as infd:
            lcdict = pickle.load(infd, encoding='latin1')
        LOGWARNING('pickle %s was probably from Python 2 '
                   'and failed to load without using "latin1" encoding. '
                   'This is probably a numpy issue: '
                   'http://stackoverflow.com/q/11305790' % picklefile)

    return lcdict
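

# Example: round-tripping an lcdict through a pickle (a minimal sketch,
# following on from the read_kepler_fitslc example above):
#
# >>> pklpath = kepler_lcdict_to_pkl(lcd)   # writes <objectid>-keplc.pkl
# >>> lcd2 = read_kepler_pklc(pklpath)
# >>> lcd2['objectid'] == lcd['objectid']   # True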


##########################
## KEPLER LC PROCESSING ##
##########################

def stitch_kepler_lcdict(lcdict):
    '''This stitches Kepler light curves together across quarters.

    FIXME: implement this.

    Parameters
    ----------

    lcdict : lcdict
        An `lcdict` produced by `consolidate_kepler_fitslc`. The flux
        measurements between quarters will be stitched together.

    Returns
    -------

    lcdict
        Returns an `lcdict` (this is useable by most astrobase functions for
        LC processing). The flux measurements will have been shifted to form
        a seamless light curve across quarters suitable for long-term
        variability investigation.

    '''


def filter_kepler_lcdict(lcdict,
                         filterflags=True,
                         nanfilter='sap,pdc',
                         timestoignore=None):
    '''This filters the Kepler `lcdict`, removing nans and bad observations.

    By default, this function removes points in the Kepler LC that have ANY
    quality flags set.

    Parameters
    ----------

    lcdict : lcdict
        An `lcdict` produced by `consolidate_kepler_fitslc` or
        `read_kepler_fitslc`.

    filterflags : bool
        If True, will remove any measurements that have non-zero quality
        flags present. This usually indicates an issue with the instrument or
        spacecraft.

    nanfilter : {'sap','pdc','sap,pdc'}
        Indicates the flux measurement type(s) to apply the filtering to.

    timestoignore : list of tuples or None
        This is of the form::

            [(time1_start, time1_end), (time2_start, time2_end), ...]

        and indicates the start and end times to mask out of the final
        lcdict. Use this to remove anything that wasn't caught by the quality
        flags.

    Returns
    -------

    lcdict
        Returns an `lcdict` (this is useable by most astrobase functions for
        LC processing). The `lcdict` is filtered IN PLACE!

    '''

    cols = lcdict['columns']

    # filter all bad LC points as noted by quality flags
    if filterflags:

        nbefore = lcdict['time'].size
        filterind = lcdict['sap_quality'] == 0

        for col in cols:
            if '.' in col:
                key, subkey = col.split('.')
                lcdict[key][subkey] = lcdict[key][subkey][filterind]
            else:
                lcdict[col] = lcdict[col][filterind]

        nafter = lcdict['time'].size
        LOGINFO('applied quality flag filter, '
                'ndet before = %s, ndet after = %s' % (nbefore, nafter))

    if nanfilter == 'sap,pdc':
        notnanind = (
            npisfinite(lcdict['sap']['sap_flux']) &
            npisfinite(lcdict['pdc']['pdcsap_flux']) &
            npisfinite(lcdict['time'])
        )
    elif nanfilter == 'sap':
        notnanind = (
            npisfinite(lcdict['sap']['sap_flux']) &
            npisfinite(lcdict['time'])
        )
    elif nanfilter == 'pdc':
        notnanind = (
            npisfinite(lcdict['pdc']['pdcsap_flux']) &
            npisfinite(lcdict['time'])
        )

    # remove nans from all columns
    if nanfilter:

        nbefore = lcdict['time'].size

        for col in cols:
            if '.' in col:
                key, subkey = col.split('.')
                lcdict[key][subkey] = lcdict[key][subkey][notnanind]
            else:
                lcdict[col] = lcdict[col][notnanind]

        nafter = lcdict['time'].size
        LOGINFO('removed nans, ndet before = %s, ndet after = %s' %
                (nbefore, nafter))

    # exclude all times in timestoignore
    if (timestoignore and
            isinstance(timestoignore, list) and
            len(timestoignore) > 0):

        exclind = npfull_like(lcdict['time'], True, dtype=np.bool_)
        nbefore = exclind.size

        # get all the masks
        for ignoretime in timestoignore:
            time0, time1 = ignoretime[0], ignoretime[1]
            thismask = ~((lcdict['time'] >= time0) &
                         (lcdict['time'] <= time1))
            exclind = exclind & thismask

        # apply the masks
        for col in cols:
            if '.' in col:
                key, subkey = col.split('.')
                lcdict[key][subkey] = lcdict[key][subkey][exclind]
            else:
                lcdict[col] = lcdict[col][exclind]

        nafter = lcdict['time'].size
        LOGINFO('removed timestoignore, ndet before = %s, ndet after = %s' %
                (nbefore, nafter))

    return lcdict
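

# Example usage for `filter_kepler_lcdict` (a minimal sketch; the time
# values in timestoignore are hypothetical and must be in the same units as
# lcdict['time']):
#
# >>> lcd = filter_kepler_lcdict(lcd, filterflags=True, nanfilter='sap,pdc',
# ...                            timestoignore=[(1325.0, 1327.5)])
# >>> # note that the lcdict is also filtered in place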


###################
## KEPLER LC EPD ##
###################

def _epd_function(coeffs, fluxes, xcc, ycc, bgv, bge):
    '''This is the EPD function to fit.

    Parameters
    ----------

    coeffs : array-like of floats
        Contains the EPD coefficients that will be used to generate the EPD
        fit function.

    fluxes : array-like
        The flux measurement array being used.

    xcc,ycc : array-like
        Arrays of the x and y coordinates associated with each measurement in
        `fluxes`.

    bgv,bge : array-like
        Arrays of the flux background value and the flux background error
        associated with each measurement in `fluxes`.

    Returns
    -------

    np.array
        Contains the fit function evaluated at each flux measurement value.

    '''

    epdf = (
        coeffs[0] +
        coeffs[1]*npsin(2*MPI*xcc) + coeffs[2]*npcos(2*MPI*xcc) +
        coeffs[3]*npsin(2*MPI*ycc) + coeffs[4]*npcos(2*MPI*ycc) +
        coeffs[5]*npsin(4*MPI*xcc) + coeffs[6]*npcos(4*MPI*xcc) +
        coeffs[7]*npsin(4*MPI*ycc) + coeffs[8]*npcos(4*MPI*ycc) +
        coeffs[9]*bgv +
        coeffs[10]*bge
    )

    return epdf


def _epd_residual(coeffs, fluxes, xcc, ycc, bgv, bge):
    '''This is the residual function to minimize using
    scipy.optimize.leastsq.

    Parameters
    ----------

    coeffs : array-like of floats
        Contains the EPD coefficients that will be used to generate the EPD
        fit function.

    fluxes : array-like
        The flux measurement array being used.

    xcc,ycc : array-like
        Arrays of the x and y coordinates associated with each measurement in
        `fluxes`.

    bgv,bge : array-like
        Arrays of the flux background value and the flux background error
        associated with each measurement in `fluxes`.

    Returns
    -------

    np.array
        Contains the fit function residual evaluated at each flux measurement
        value.

    '''

    f = _epd_function(coeffs, fluxes, xcc, ycc, bgv, bge)
    residual = fluxes - f
    return residual


def epd_kepler_lightcurve(lcdict,
                          xccol='mom_centr1',
                          yccol='mom_centr2',
                          timestoignore=None,
                          filterflags=True,
                          writetodict=True,
                          epdsmooth=5):
    '''This runs EPD on the Kepler light curve.

    Following Huang et al. 2015, we fit the following EPD function to a
    smoothed light curve, and then subtract it to obtain EPD corrected
    magnitudes::

        f = c0 +
            c1*sin(2*pi*x) + c2*cos(2*pi*x) +
            c3*sin(2*pi*y) + c4*cos(2*pi*y) +
            c5*sin(4*pi*x) + c6*cos(4*pi*x) +
            c7*sin(4*pi*y) + c8*cos(4*pi*y) +
            c9*bgv + c10*bge

    By default, this function removes points in the Kepler LC that have ANY
    quality flags set.

    Parameters
    ----------

    lcdict : lcdict
        An `lcdict` produced by `consolidate_kepler_fitslc` or
        `read_kepler_fitslc`.

    xccol,yccol : str
        Indicates the x and y coordinate column names to use from the Kepler
        LC in the EPD fit.

    timestoignore : list of tuples
        This is of the form::

            [(time1_start, time1_end), (time2_start, time2_end), ...]

        and indicates the start and end times to mask out of the final
        lcdict. Use this to remove anything that wasn't caught by the quality
        flags.

    filterflags : bool
        If True, will remove any measurements that have non-zero quality
        flags present. This usually indicates an issue with the instrument or
        spacecraft.

    writetodict : bool
        If writetodict is True, adds the following columns to the lcdict::

            epd_time = time array
            epd_sapflux = uncorrected flux before EPD
            epd_epdsapflux = corrected flux after EPD
            epd_epdsapcorr = EPD flux corrections
            epd_bkg = background array
            epd_bkg_err = background errors array
            epd_xcc = xcoord array
            epd_ycc = ycoord array
            epd_quality = quality flag array

        and updates the 'columns' list in the lcdict as well.

    epdsmooth : int
        Sets the number of light curve points to smooth over when generating
        the EPD fit function.

    Returns
    -------

    tuple
        Returns a tuple of the form: (times, epdfluxes, fitcoeffs, epdfit)

    '''

    times, fluxes, background, background_err = (
        lcdict['time'],
        lcdict['sap']['sap_flux'],
        lcdict['sap']['sap_bkg'],
        lcdict['sap']['sap_bkg_err']
    )
    xcc = lcdict[xccol]
    ycc = lcdict[yccol]
    flags = lcdict['sap_quality']

    # filter all bad LC points as noted by quality flags
    if filterflags:

        nbefore = times.size
        filterind = flags == 0

        times = times[filterind]
        fluxes = fluxes[filterind]
        background = background[filterind]
        background_err = background_err[filterind]
        xcc = xcc[filterind]
        ycc = ycc[filterind]
        flags = flags[filterind]

        nafter = times.size
        LOGINFO('applied quality flag filter, '
                'ndet before = %s, ndet after = %s' % (nbefore, nafter))

    # remove nans
    find = (npisfinite(xcc) & npisfinite(ycc) &
            npisfinite(times) & npisfinite(fluxes) &
            npisfinite(background) & npisfinite(background_err))

    nbefore = times.size

    times = times[find]
    fluxes = fluxes[find]
    background = background[find]
    background_err = background_err[find]
    xcc = xcc[find]
    ycc = ycc[find]
    flags = flags[find]

    nafter = times.size
    LOGINFO('removed nans, ndet before = %s, ndet after = %s' %
            (nbefore, nafter))

    # exclude all times in timestoignore
    if (timestoignore and
            isinstance(timestoignore, list) and
            len(timestoignore) > 0):

        # NOTE: the mask array must be boolean, so pass dtype=np.bool_ here
        # (full_like would otherwise inherit the float dtype of times)
        exclind = npfull_like(times, True, dtype=np.bool_)
        nbefore = times.size

        # apply all the masks. each mask is negated so that points *inside*
        # an ignore window are dropped, matching filter_kepler_lcdict above
        for ignoretime in timestoignore:
            time0, time1 = ignoretime[0], ignoretime[1]
            thismask = ~((times > time0) & (times < time1))
            exclind = exclind & thismask

        # quantities after masks have been applied
        times = times[exclind]
        fluxes = fluxes[exclind]
        background = background[exclind]
        background_err = background_err[exclind]
        xcc = xcc[exclind]
        ycc = ycc[exclind]
        flags = flags[exclind]

        nafter = times.size
        LOGINFO('removed timestoignore, ndet before = %s, ndet after = %s' %
                (nbefore, nafter))

    # now that we're all done, we can do EPD
    # first, smooth the light curve
    smoothedfluxes = median_filter(fluxes, size=epdsmooth)

    # initial fit coeffs
    initcoeffs = npones(11)

    # fit to the smoothed fluxes to find better coeffs
    leastsqfit = leastsq(_epd_residual,
                         initcoeffs,
                         args=(smoothedfluxes,
                               xcc, ycc,
                               background, background_err))

    # if the fit succeeds, then get the EPD fluxes
    if leastsqfit[-1] in (1,2,3,4):

        fitcoeffs = leastsqfit[0]
        epdfit = _epd_function(fitcoeffs,
                               fluxes,
                               xcc,
                               ycc,
                               background,
                               background_err)
        epdfluxes = npmedian(fluxes) + fluxes - epdfit

        # write these to the dictionary if requested
        if writetodict:

            lcdict['epd'] = {}
            lcdict['epd']['time'] = times
            lcdict['epd']['sapflux'] = fluxes
            lcdict['epd']['epdsapflux'] = epdfluxes
            lcdict['epd']['epdsapcorr'] = epdfit
            lcdict['epd']['bkg'] = background
            lcdict['epd']['bkg_err'] = background_err
            lcdict['epd']['xcc'] = xcc
            lcdict['epd']['ycc'] = ycc
            lcdict['epd']['quality'] = flags

            # NOTE: 'epd.bkg_err' (not 'epd.bkg.err') so the column name
            # splits cleanly on '.' into (key, subkey)
            for newcol in ['epd.time','epd.sapflux',
                           'epd.epdsapflux','epd.epdsapcorr',
                           'epd.bkg','epd.bkg_err',
                           'epd.xcc','epd.ycc',
                           'epd.quality']:
                if newcol not in lcdict['columns']:
                    lcdict['columns'].append(newcol)

        return times, epdfluxes, fitcoeffs, epdfit

    else:
        LOGERROR('could not fit EPD function to light curve')
        return None, None, None, None
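

# Example usage for `epd_kepler_lightcurve` (a minimal sketch, assuming `lcd`
# is an lcdict from read_kepler_fitslc or consolidate_kepler_fitslc):
#
# >>> times, epdfluxes, fitcoeffs, epdfit = epd_kepler_lightcurve(
# ...     lcd, xccol='mom_centr1', yccol='mom_centr2', epdsmooth=5
# ... )
# >>> # if writetodict=True, the corrected LC is also in lcd['epd']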


def rfepd_kepler_lightcurve(
        lcdict,
        xccol='mom_centr1',
        yccol='mom_centr2',
        timestoignore=None,
        filterflags=True,
        writetodict=True,
        epdsmooth=23,
        decorr='xcc,ycc',
        nrftrees=200
):
    '''This uses a `RandomForestRegressor` to fit and decorrelate Kepler
    light curves.

    Fits the X and Y positions, the background, and the background error.

    By default, this function removes points in the Kepler LC that have ANY
    quality flags set.

    Parameters
    ----------

    lcdict : lcdict
        An `lcdict` produced by `consolidate_kepler_fitslc` or
        `read_kepler_fitslc`.

    xccol,yccol : str
        Indicates the x and y coordinate column names to use from the Kepler
        LC in the EPD fit.

    timestoignore : list of tuples
        This is of the form::

            [(time1_start, time1_end), (time2_start, time2_end), ...]

        and indicates the start and end times to mask out of the final
        lcdict. Use this to remove anything that wasn't caught by the quality
        flags.

    filterflags : bool
        If True, will remove any measurements that have non-zero quality
        flags present. This usually indicates an issue with the instrument or
        spacecraft.

    writetodict : bool
        If writetodict is True, adds the following columns to the lcdict::

            rfepd_time = time array
            rfepd_sapflux = uncorrected flux before EPD
            rfepd_epdsapflux = corrected flux after EPD
            rfepd_epdsapcorr = EPD flux corrections
            rfepd_bkg = background array
            rfepd_bkg_err = background errors array
            rfepd_xcc = xcoord array
            rfepd_ycc = ycoord array
            rfepd_quality = quality flag array

        and updates the 'columns' list in the lcdict as well.

    epdsmooth : int
        Sets the number of light curve points to smooth over when generating
        the EPD fit function.

    decorr : {'xcc,ycc','bgv,bge','xcc,ycc,bgv,bge'}
        Indicates whether to use the x,y coords alone; the background value
        and error alone; or the x,y coords and the background value and error
        in combination as the features to train the `RandomForestRegressor`
        on before performing the fit.

    nrftrees : int
        The number of trees to use in the `RandomForestRegressor`.

    Returns
    -------

    tuple
        Returns a tuple of the form: (times, corrected_fluxes,
        flux_corrections)

    '''

    # bail out early if scikit-learn could not be imported at the top of
    # this module
    if not SKLEARN:
        LOGERROR('scikit-learn is required for this function, '
                 'but it could not be imported')
        return None

    times, fluxes, background, background_err = (
        lcdict['time'],
        lcdict['sap']['sap_flux'],
        lcdict['sap']['sap_bkg'],
        lcdict['sap']['sap_bkg_err']
    )
    xcc = lcdict[xccol]
    ycc = lcdict[yccol]
    flags = lcdict['sap_quality']

    # filter all bad LC points as noted by quality flags
    if filterflags:

        nbefore = times.size
        filterind = flags == 0

        times = times[filterind]
        fluxes = fluxes[filterind]
        background = background[filterind]
        background_err = background_err[filterind]
        xcc = xcc[filterind]
        ycc = ycc[filterind]
        flags = flags[filterind]

        nafter = times.size
        LOGINFO('applied quality flag filter, ndet before = %s, '
                'ndet after = %s' % (nbefore, nafter))

    # remove nans
    find = (npisfinite(xcc) & npisfinite(ycc) &
            npisfinite(times) & npisfinite(fluxes) &
            npisfinite(background) & npisfinite(background_err))

    nbefore = times.size

    times = times[find]
    fluxes = fluxes[find]
    background = background[find]
    background_err = background_err[find]
    xcc = xcc[find]
    ycc = ycc[find]
    flags = flags[find]

    nafter = times.size
    LOGINFO('removed nans, ndet before = %s, ndet after = %s' %
            (nbefore, nafter))

    # exclude all times in timestoignore
    if (timestoignore and
            isinstance(timestoignore, list) and
            len(timestoignore) > 0):

        # NOTE: the mask array must be boolean, so pass dtype=np.bool_ here
        exclind = npfull_like(times, True, dtype=np.bool_)
        nbefore = times.size

        # apply all the masks. each mask is negated so that points *inside*
        # an ignore window are dropped, matching filter_kepler_lcdict above
        for ignoretime in timestoignore:
            time0, time1 = ignoretime[0], ignoretime[1]
            thismask = ~((times > time0) & (times < time1))
            exclind = exclind & thismask

        # quantities after masks have been applied
        times = times[exclind]
        fluxes = fluxes[exclind]
        background = background[exclind]
        background_err = background_err[exclind]
        xcc = xcc[exclind]
        ycc = ycc[exclind]
        flags = flags[exclind]

        nafter = times.size
        LOGINFO('removed timestoignore, ndet before = %s, ndet after = %s' %
                (nbefore, nafter))

    # now that we're all done, we can do EPD

    # set up the regressor
    RFR = RandomForestRegressor(n_estimators=nrftrees)

    # collect the features and target variable
    if decorr == 'xcc,ycc,bgv,bge':
        features = npcolumn_stack((xcc, ycc, background, background_err))
    elif decorr == 'xcc,ycc':
        features = npcolumn_stack((xcc, ycc))
    elif decorr == 'bgv,bge':
        features = npcolumn_stack((background, background_err))
    else:
        LOGERROR("couldn't understand decorr, not decorrelating...")
        return None

    # smooth the light curve
    if epdsmooth:
        smoothedfluxes = median_filter(fluxes, size=epdsmooth)
    else:
        smoothedfluxes = fluxes

    # fit, then generate the predicted values, then get corrected values
    RFR.fit(features, smoothedfluxes)
    flux_corrections = RFR.predict(features)
    corrected_fluxes = npmedian(fluxes) + fluxes - flux_corrections

    # remove the random forest to save RAM
    del RFR

    # write these to the dictionary if requested
    if writetodict:

        lcdict['rfepd'] = {}
        lcdict['rfepd']['time'] = times
        lcdict['rfepd']['sapflux'] = fluxes
        lcdict['rfepd']['epdsapflux'] = corrected_fluxes
        lcdict['rfepd']['epdsapcorr'] = flux_corrections
        lcdict['rfepd']['bkg'] = background
        lcdict['rfepd']['bkg_err'] = background_err
        lcdict['rfepd']['xcc'] = xcc
        lcdict['rfepd']['ycc'] = ycc
        lcdict['rfepd']['quality'] = flags

        # NOTE: 'rfepd.bkg_err' (not 'rfepd.bkg.err') so the column name
        # splits cleanly on '.' into (key, subkey)
        for newcol in ['rfepd.time','rfepd.sapflux',
                       'rfepd.epdsapflux','rfepd.epdsapcorr',
                       'rfepd.bkg','rfepd.bkg_err',
                       'rfepd.xcc','rfepd.ycc',
                       'rfepd.quality']:
            if newcol not in lcdict['columns']:
                lcdict['columns'].append(newcol)

    return times, corrected_fluxes, flux_corrections
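

# Example usage for `rfepd_kepler_lightcurve` (a minimal sketch; requires
# scikit-learn, and assumes `lcd` is an lcdict as in the examples above):
#
# >>> times, corr_fluxes, corrections = rfepd_kepler_lightcurve(
# ...     lcd, decorr='xcc,ycc,bgv,bge', nrftrees=200
# ... )
# >>> # if writetodict=True, the corrected LC is also in lcd['rfepd']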


#######################
## CENTROID ANALYSIS ##
#######################

def detrend_centroid(lcd, detrend='legendre', sigclip=None, mingap=0.5):
    '''Detrends the x and y coordinate centroids for a Kepler light curve.

    Given an `lcdict` for a single quarter of Kepler data, returned by
    `read_kepler_fitslc`, this function returns this same dictionary,
    appending detrended centroid_x and centroid_y values.

    Here "detrended" means "finite, SAP quality flag set to 0, sigma clipped,
    timegroups selected based on `mingap` day gaps, then fit vs time by a
    Legendre polynomial of lowish degree".

    Parameters
    ----------

    lcd : lcdict
        An `lcdict` generated by the `read_kepler_fitslc` function.

    detrend : {'legendre'}
        Method by which to detrend the LC. 'legendre' is the only thing
        implemented at the moment.

    sigclip : None or float or int or sequence of floats/ints
        Determines the type and amount of sigma-clipping done on the light
        curve to remove outliers. If None, no sigma-clipping is performed. If
        a two element sequence of floats/ints, the first element corresponds
        to the fainter sigma-clip limit, and the second element corresponds
        to the brighter sigma-clip limit.

    mingap : float
        Number of days by which to define "timegroups" (each timegroup is fit
        individually, and the splitting also helps eliminate the "burn-in" of
        the Kepler spacecraft). For long cadence data, 0.5 days is typical.

    Returns
    -------

    tuple
        This is of the form `(lcd, errflag)`, where:

        `lcd` : an `lcdict` with the new key `lcd['ctd_dtr']`, containing the
        detrended times, (centroid_x, centroid_y) values, and their errors.

        `errflag` : boolean error flag, could be raised at various points.

    '''

    qnum = npunique(lcd['quarter'])
    assert qnum.size == 1, 'light curve dict should be for a unique quarter'
    assert detrend == 'legendre', 'only legendre is supported for detrend kwarg'
    qnum = int(qnum)

    # Get finite, QUALITY_FLAG == 0 times, centroids, and their errors.
    # Fraquelli & Thompson (2012), or perhaps also newer papers, give the
    # list of exclusions for quality flags.
    nbefore = lcd['time'].size

    # "ctd" for centroid.
    times = lcd['time'][lcd['sap_quality'] == 0]

    # Kepler Archive Manual KDMC-10008-006, pg 18. MOM_CENTR1 is the *column*
    # value for the flux-weighted centroid, MOM_CENTR2 is the row value.
    ctd_x = lcd['mom_centr2'][lcd['sap_quality'] == 0]
    ctd_y = lcd['mom_centr1'][lcd['sap_quality'] == 0]
    ctd_x_err = lcd['mom_centr2_err'][lcd['sap_quality'] == 0]
    ctd_y_err = lcd['mom_centr1_err'][lcd['sap_quality'] == 0]

    find = npisfinite(times) & npisfinite(ctd_x) & npisfinite(ctd_y)
    find &= (npisfinite(ctd_x_err)) & (npisfinite(ctd_y_err))

    f_times, f_ctd_x, f_ctd_y = times[find], ctd_x[find], ctd_y[find]
    f_ctd_x_err, f_ctd_y_err = ctd_x_err[find], ctd_y_err[find]

    # Sigma clip whopping outliers. It'd be better to have a general purpose
    # function for this, but sigclip_magseries works.
    stimes_x, s_ctd_x, s_ctd_x_err = sigclip_magseries(
        f_times, f_ctd_x, f_ctd_x_err,
        magsarefluxes=True, sigclip=30.0
    )
    stimes_y, s_ctd_y, s_ctd_y_err = sigclip_magseries(
        f_times, f_ctd_y, f_ctd_y_err,
        magsarefluxes=True, sigclip=30.0
    )

    # Get times and centroids where everything is finite and sigma clipped.
    mask_x = npin1d(stimes_x, stimes_y)
    s_times, s_ctd_x, s_ctd_x_err = (stimes_x[mask_x],
                                     s_ctd_x[mask_x],
                                     s_ctd_x_err[mask_x])

    mask_y = npin1d(stimes_y, stimes_x)
    tmp, s_ctd_y, s_ctd_y_err = (stimes_y[mask_y],
                                 s_ctd_y[mask_y],
                                 s_ctd_y_err[mask_y])

    try:
        np.testing.assert_array_equal(s_times, tmp)
        assert len(s_ctd_y) == len(s_times)
        assert len(s_ctd_y_err) == len(s_times)
        assert len(s_ctd_x) == len(s_times)
        assert len(s_ctd_x_err) == len(s_times)
    except AssertionError:
        return lcd, True

    nqflag = s_times.size

    # Drop intra-quarter and interquarter gaps in the timeseries. These are
    # the same limits set by Armstrong et al (2014): split each quarter's
    # timegroups by whether points are within 0.5 day limits. Then drop
    # points within 0.5 days of any boundary. Finally, since the interquarter
    # burn-in time is more like 1 day, drop a further 0.5 days from the edges
    # of each quarter. A nicer way to implement this would be with numpy
    # masks, but this approach just constructs the full arrays for any given
    # quarter.
    ngroups, groups = find_lc_timegroups(s_times, mingap=mingap)

    tmp_times, tmp_ctd_x, tmp_ctd_y = [], [], []
    tmp_ctd_x_err, tmp_ctd_y_err = [], []

    for group in groups:

        tg_times = s_times[group]
        tg_ctd_x = s_ctd_x[group]
        tg_ctd_y = s_ctd_y[group]
        tg_ctd_x_err = s_ctd_x_err[group]
        tg_ctd_y_err = s_ctd_y_err[group]

        try:
            sel = ((tg_times > npmin(tg_times)+mingap) &
                   (tg_times < npmax(tg_times)-mingap))
        except ValueError:
            # If tg_times is empty, continue to next timegroup.
            continue

        tmp_times.append(tg_times[sel])
        tmp_ctd_x.append(tg_ctd_x[sel])
        tmp_ctd_y.append(tg_ctd_y[sel])
        tmp_ctd_x_err.append(tg_ctd_x_err[sel])
        tmp_ctd_y_err.append(tg_ctd_y_err[sel])

    s_times, s_ctd_x, s_ctd_y, s_ctd_x_err, s_ctd_y_err = (
        nparray([]), nparray([]), nparray([]), nparray([]), nparray([])
    )

    # N.b.: works fine with empty arrays.
    for ix, _ in enumerate(tmp_times):
        s_times = npappend(s_times, tmp_times[ix])
        s_ctd_x = npappend(s_ctd_x, tmp_ctd_x[ix])
        s_ctd_y = npappend(s_ctd_y, tmp_ctd_y[ix])
        s_ctd_x_err = npappend(s_ctd_x_err, tmp_ctd_x_err[ix])
        s_ctd_y_err = npappend(s_ctd_y_err, tmp_ctd_y_err[ix])

    # Extra inter-quarter burn-in of 0.5 days.
    try:
        s_ctd_x = s_ctd_x[(s_times > (npmin(s_times)+mingap)) &
                          (s_times < (npmax(s_times)-mingap))]
    except Exception:
        # Case: s_times is wonky, all across this quarter. (Implemented
        # because of a rare bug with a singleton s_times array).
        LOGERROR('DETREND FAILED, qnum {:d}'.format(qnum))
        return npnan, True

    s_ctd_y = s_ctd_y[(s_times > (npmin(s_times)+mingap)) &
                      (s_times < (npmax(s_times)-mingap))]
    s_ctd_x_err = s_ctd_x_err[(s_times > (npmin(s_times)+mingap)) &
                              (s_times < (npmax(s_times)-mingap))]
    s_ctd_y_err = s_ctd_y_err[(s_times > (npmin(s_times)+mingap)) &
                              (s_times < (npmax(s_times)-mingap))]

    # Careful to do this last...
    s_times = s_times[(s_times > (npmin(s_times)+mingap)) &
                      (s_times < (npmax(s_times)-mingap))]

    nafter = s_times.size

    LOGINFO(
        'CLIPPING (SAP), qnum: {:d}'.format(qnum) +
        '\nndet before qflag & sigclip: {:d} ({:.3g}),'.format(
            nbefore, 1.
        ) +
        '\nndet after qflag & finite & sigclip: {:d} ({:.3g})'.format(
            nqflag, nqflag/float(nbefore)
        ) +
        '\nndet after dropping pts near gaps: {:d} ({:.3g})'.format(
            nafter, nafter/float(nbefore)
        )
    )

    # DETREND: fit a "low" order Legendre series (see
    # "legendredeg_vs_npts_per_timegroup_ctd.pdf"), and save it to the output
    # dictionary. Save the fit (residuals to be computed after).
    ctd_dtr = {}

    if detrend == 'legendre':

        mingap = 0.5  # days
        ngroups, groups = find_lc_timegroups(s_times, mingap=mingap)

        tmpctdxlegfit, tmpctdylegfit, legdegs = [], [], []

        for group in groups:

            tg_times = s_times[group]
            tg_ctd_x = s_ctd_x[group]
            tg_ctd_x_err = s_ctd_x_err[group]
            tg_ctd_y = s_ctd_y[group]
            tg_ctd_y_err = s_ctd_y_err[group]

            legdeg = _get_legendre_deg_ctd(len(tg_times))
            tg_ctd_x_fit, _, _ = _legendre_dtr(tg_times,
                                               tg_ctd_x,
                                               tg_ctd_x_err,
                                               legendredeg=legdeg)
            tg_ctd_y_fit, _, _ = _legendre_dtr(tg_times,
                                               tg_ctd_y,
                                               tg_ctd_y_err,
                                               legendredeg=legdeg)

            tmpctdxlegfit.append(tg_ctd_x_fit)
            tmpctdylegfit.append(tg_ctd_y_fit)
            legdegs.append(legdeg)

        fit_ctd_x, fit_ctd_y = nparray([]), nparray([])
        for ix, _ in enumerate(tmpctdxlegfit):
            fit_ctd_x = npappend(fit_ctd_x, tmpctdxlegfit[ix])
            fit_ctd_y = npappend(fit_ctd_y, tmpctdylegfit[ix])

    ctd_dtr = {'times':s_times,
               'ctd_x':s_ctd_x,
               'ctd_x_err':s_ctd_x_err,
               'fit_ctd_x':fit_ctd_x,
               'ctd_y':s_ctd_y,
               'ctd_y_err':s_ctd_y_err,
               'fit_ctd_y':fit_ctd_y}

    lcd['ctd_dtr'] = ctd_dtr

    return lcd, False
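

# Example usage for `detrend_centroid` (a minimal sketch; `lcd` must be a
# single-quarter lcdict from read_kepler_fitslc):
#
# >>> lcd, errflag = detrend_centroid(lcd, detrend='legendre', mingap=0.5)
# >>> if not errflag:
# ...     resid_x = lcd['ctd_dtr']['ctd_x'] - lcd['ctd_dtr']['fit_ctd_x']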


def get_centroid_offsets(lcd, t_ing_egr,
                         oot_buffer_time=0.1,
                         sample_factor=3):
    '''After running `detrend_centroid`, this gets positions of centroids
    during transits, and outside of transits.

    These positions can then be used in a false positive analysis.

    This routine requires knowing the ingress and egress times for every
    transit of interest within the quarter this routine is being called for.
    There is currently no astrobase routine that automates this for periodic
    transits (it must be done in a calling routine).

    To get out of transit centroids, this routine takes points outside of the
    "buffer" set by `oot_buffer_time`, sampling a window on either side of
    the transit whose length is `sample_factor` times the transit duration.

    Parameters
    ----------

    lcd : lcdict
        An `lcdict` generated by the `read_kepler_fitslc` function. We assume
        that the `detrend_centroid` function has been run on this `lcdict`.

    t_ing_egr : list of tuples
        This is of the form::

            [(ingress time of i^th transit, egress time of i^th transit)]

        for i the transit number index in this quarter (starts at zero at the
        beginning of every quarter). Assumes units of BJD.

    oot_buffer_time : float
        Number of days away from ingress and egress times to begin sampling
        "out of transit" centroid points.

    sample_factor : float
        The length of the out of transit window from which to sample, as a
        multiple of the transit duration, on each side of the transit.

    Returns
    -------

    dict
        This is a dictionary keyed by transit number (i.e., the same index as
        `t_ing_egr`), where each key contains the following value::

            {'ctd_x_in_tra':ctd_x_in_tra,
             'ctd_y_in_tra':ctd_y_in_tra,
             'ctd_x_oot':ctd_x_oot,
             'ctd_y_oot':ctd_y_oot,
             'npts_in_tra':len(ctd_x_in_tra),
             'npts_oot':len(ctd_x_oot),
             'in_tra_times':in_tra_times,
             'oot_times':oot_times}

    '''

    # NOTE: Bryson+ (2013) gives a more complicated and more correct approach
    # to this problem, computing offsets relative to positions defined on the
    # SKY. This requires using a Kepler focal plane geometry model. I don't
    # have that model, or know how to get it. So I use a simpler approach.

    qnum = int(np.unique(lcd['quarter']))
    LOGINFO('Getting centroid offsets (qnum: {:d})...'.format(qnum))

    # Kepler pixel scale, cf.
    # https://keplerscience.arc.nasa.gov/the-kepler-space-telescope.html
    arcsec_per_px = 3.98

    # Get the residuals (units: pixel offset).
    times = lcd['ctd_dtr']['times']
    ctd_resid_x = lcd['ctd_dtr']['ctd_x'] - lcd['ctd_dtr']['fit_ctd_x']
    ctd_resid_y = lcd['ctd_dtr']['ctd_y'] - lcd['ctd_dtr']['fit_ctd_y']

    # Return results in "centroid dictionary" (has keys of transit number).
    cd = {}

    for ix, (t_ing, t_egr) in enumerate(t_ing_egr):

        # We have in-transit times as input.
        in_tra_times = times[(times > t_ing) & (times < t_egr)]

        # Compute out of transit times on either side of the in-transit
        # times.
        transit_dur = t_egr - t_ing
        oot_window_len = sample_factor * transit_dur

        oot_before = times[
            (times < (t_ing - oot_buffer_time)) &
            (times > (t_ing - oot_buffer_time - oot_window_len))
        ]
        oot_after = times[
            (times > (t_egr + oot_buffer_time)) &
            (times < (t_egr + oot_buffer_time + oot_window_len))
        ]

        oot_times = npconcatenate([oot_before, oot_after])

        mask_tra = npin1d(times, in_tra_times)
        mask_oot = npin1d(times, oot_times)

        # Convert to units of arcseconds.
        ctd_x_in_tra = ctd_resid_x[mask_tra]*arcsec_per_px
        ctd_y_in_tra = ctd_resid_y[mask_tra]*arcsec_per_px
        ctd_x_oot = ctd_resid_x[mask_oot]*arcsec_per_px
        ctd_y_oot = ctd_resid_y[mask_oot]*arcsec_per_px

        cd[ix] = {'ctd_x_in_tra':ctd_x_in_tra,
                  'ctd_y_in_tra':ctd_y_in_tra,
                  'ctd_x_oot':ctd_x_oot,
                  'ctd_y_oot':ctd_y_oot,
                  'npts_in_tra':len(ctd_x_in_tra),
                  'npts_oot':len(ctd_x_oot),
                  'in_tra_times':in_tra_times,
                  'oot_times':oot_times}

    LOGINFO('Got centroid offsets (qnum: {:d}).'.format(qnum))

    return cd
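

# Example usage for `get_centroid_offsets` (a minimal sketch; the ephemeris
# values below are hypothetical and would come from a transit search):
#
# >>> epoch, period, tdur = 133.7, 3.52, 0.15   # BJD-offset, days, days
# >>> t = lcd['ctd_dtr']['times']
# >>> n_tra = np.arange(np.floor((t.min() - epoch)/period),
# ...                   np.ceil((t.max() - epoch)/period) + 1)
# >>> mid = epoch + n_tra*period
# >>> t_ing_egr = [(m - tdur/2., m + tdur/2.) for m in mid]
# >>> cd = get_centroid_offsets(lcd, t_ing_egr)
# >>> cd[0]['ctd_x_in_tra'], cd[0]['ctd_x_oot']   # arcsec offsets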


############################################
# UTILITY FUNCTION FOR CENTROID DETRENDING #
############################################

def _get_legendre_deg_ctd(npts):
    '''This is a helper function for centroid detrending.

    Maps the number of points in a timegroup to a suitable Legendre degree by
    linear interpolation between pre-chosen anchor points, clamping to the
    smallest/largest degree outside the anchor range.

    '''

    from scipy.interpolate import interp1d

    degs = nparray([4,5,6,10,15])
    pts = nparray([1e2,3e2,5e2,1e3,3e3])

    fn = interp1d(pts, degs, kind='linear',
                  bounds_error=False,
                  fill_value=(min(degs), max(degs)))

    legendredeg = int(npfloor(fn(npts)))

    return legendredeg


#######################################
# UTILITY FUNCTION FOR ANY DETRENDING #
#######################################

def _legendre_dtr(x, y, y_err, legendredeg=10):
    '''This calculates the residual and chi-sq values for a Legendre function
    fit.

    Parameters
    ----------

    x : np.array
        Array of the independent variable.

    y : np.array
        Array of the dependent variable.

    y_err : np.array
        Array of errors associated with each `y` value. Used to calculate fit
        weights.

    legendredeg : int
        The degree of the Legendre function to use when fitting.

    Returns
    -------

    tuple
        The tuple returned is of the form: (fit_y, fitchisq, fitredchisq)

    '''

    try:
        p = Legendre.fit(x, y, legendredeg)
        fit_y = p(x)
    except Exception:
        fit_y = npzeros_like(y)

    fitchisq = npsum(
        ((fit_y - y)*(fit_y - y)) / (y_err*y_err)
    )

    nparams = legendredeg + 1
    fitredchisq = fitchisq/(len(y) - nparams - 1)

    LOGINFO(
        'legendre detrend applied. chisq = %.5f, reduced chisq = %.5f' %
        (fitchisq, fitredchisq)
    )

    return fit_y, fitchisq, fitredchisq
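

# Example: `_legendre_dtr` on synthetic data (a minimal sketch illustrating
# the fit and the returned chi-squared values; with errors matching the
# noise level, the reduced chi-sq should come out near 1):
#
# >>> x = np.linspace(0.0, 10.0, 500)
# >>> y = 0.01*x**2 + np.random.normal(0.0, 0.001, x.size)
# >>> fit_y, chisq, redchisq = _legendre_dtr(x, y, np.full_like(y, 0.001),
# ...                                        legendredeg=4)
# >>> detrended = y - fit_y   # residuals after removing the smooth trend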