#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# lcpfeatures.py - Waqas Bhatti (wbhatti@astro.princeton.edu) - Feb 2019
'''
This contains functions to generate periodic light curve features for later
variable star classification.
'''
#############
## LOGGING ##
#############
import logging
from astrobase import log_sub, log_fmt, log_date_fmt
DEBUG = False
if DEBUG:
level = logging.DEBUG
else:
level = logging.INFO
LOGGER = logging.getLogger(__name__)
logging.basicConfig(
level=level,
style=log_sub,
format=log_fmt,
datefmt=log_date_fmt,
)
LOGDEBUG = LOGGER.debug
LOGINFO = LOGGER.info
LOGWARNING = LOGGER.warning
LOGERROR = LOGGER.error
LOGEXCEPTION = LOGGER.exception
#############
## IMPORTS ##
#############
import pickle
import os
import os.path
import glob
import gzip
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor
from tornado.escape import squeeze
# to turn a list of keys into a dict address
# from https://stackoverflow.com/a/14692747
from functools import reduce
from operator import getitem
def _dict_get(datadict, keylist):
return reduce(getitem, keylist, datadict)
import numpy as np
try:
from tqdm import tqdm
TQDM = True
except Exception:
TQDM = False
pass
############
## CONFIG ##
############
NCPUS = mp.cpu_count()
###################
## LOCAL IMPORTS ##
###################
from astrobase.lcmath import normalize_magseries
from astrobase.varclass import periodicfeatures
from astrobase.lcproc import get_lcformat
from astrobase.lcproc.periodsearch import PFMETHODS
#######################
## PERIODIC FEATURES ##
#######################
[docs]def get_periodicfeatures(
pfpickle,
lcbasedir,
outdir,
fourierorder=5,
# these are depth, duration, ingress duration
transitparams=(-0.01,0.1,0.1),
# these are depth, duration, depth ratio, secphase
ebparams=(-0.2,0.3,0.7,0.5),
pdiff_threshold=1.0e-4,
sidereal_threshold=1.0e-4,
sampling_peak_multiplier=5.0,
sampling_startp=None,
sampling_endp=None,
starfeatures=None,
timecols=None,
magcols=None,
errcols=None,
lcformat='hat-sql',
lcformatdir=None,
sigclip=10.0,
verbose=True,
raiseonfail=False
):
'''This gets all periodic features for the object.
Parameters
----------
pfpickle : str
The period-finding result pickle containing period-finder results to use
for the calculation of LC fit, periodogram, and phased LC features.
lcbasedir : str
The base directory where the light curve for the current object is
located.
outdir : str
The output directory where the results will be written.
fourierorder : int
The Fourier order to use to generate sinusoidal function and fit that to
the phased light curve.
transitparams : list of floats
The transit depth, duration, and ingress duration to use to generate a
trapezoid planet transit model fit to the phased light curve. The period
used is the one provided in `period`, while the epoch is automatically
obtained from a spline fit to the phased light curve.
ebparams : list of floats
The primary eclipse depth, eclipse duration, the primary-secondary depth
ratio, and the phase of the secondary eclipse to use to generate an
eclipsing binary model fit to the phased light curve. The period used is
the one provided in `period`, while the epoch is automatically obtained
from a spline fit to the phased light curve.
pdiff_threshold : float
This is the max difference between periods to consider them the same.
sidereal_threshold : float
This is the max difference between any of the 'best' periods and the
sidereal day periods to consider them the same.
sampling_peak_multiplier : float
This is the minimum multiplicative factor of a 'best' period's
normalized periodogram peak over the sampling periodogram peak at the
same period required to accept the 'best' period as possibly real.
sampling_startp, sampling_endp : float
If the `pgramlist` doesn't have a time-sampling Lomb-Scargle
periodogram, it will be obtained automatically. Use these kwargs to
control the minimum and maximum period interval to be searched when
generating this periodogram.
starfeatures : str or None
If not None, this should be the filename of the
`starfeatures-<objectid>.pkl` created by
:py:func:`astrobase.lcproc.lcsfeatures.get_starfeatures` for this
object. This is used to get the neighbor's light curve and phase it with
this object's period to see if this object is blended.
timecols : list of str or None
The timecol keys to use from the lcdict in calculating the features.
magcols : list of str or None
The magcol keys to use from the lcdict in calculating the features.
errcols : list of str or None
The errcol keys to use from the lcdict in calculating the features.
lcformat : str
This is the `formatkey` associated with your light curve format, which
you previously passed in to the `lcproc.register_lcformat`
function. This will be used to look up how to find and read the light
curves specified in `basedir` or `use_list_of_filenames`.
lcformatdir : str or None
If this is provided, gives the path to a directory when you've stored
your lcformat description JSONs, other than the usual directories lcproc
knows to search for them in. Use this along with `lcformat` to specify
an LC format JSON file that's not currently registered with lcproc.
sigclip : float or int or sequence of two floats/ints or None
If a single float or int, a symmetric sigma-clip will be performed using
the number provided as the sigma-multiplier to cut out from the input
time-series.
If a list of two ints/floats is provided, the function will perform an
'asymmetric' sigma-clip. The first element in this list is the sigma
value to use for fainter flux/mag values; the second element in this
list is the sigma value to use for brighter flux/mag values. For
example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
dimmings and greater than 3-sigma brightenings. Here the meaning of
"dimming" and "brightening" is set by *physics* (not the magnitude
system), which is why the `magsarefluxes` kwarg must be correctly set.
If `sigclip` is None, no sigma-clipping will be performed, and the
time-series (with non-finite elems removed) will be passed through to
the output.
verbose : bool
If True, will indicate progress while working.
raiseonfail : bool
If True, will raise an Exception if something goes wrong.
Returns
-------
str
Returns a filename for the output pickle containing all of the periodic
features for the input object's LC.
'''
try:
formatinfo = get_lcformat(lcformat,
use_lcformat_dir=lcformatdir)
if formatinfo:
(fileglob, readerfunc,
dtimecols, dmagcols, derrcols,
magsarefluxes, normfunc) = formatinfo
else:
LOGERROR("can't figure out the light curve format")
return None
except Exception:
LOGEXCEPTION("can't figure out the light curve format")
return None
# open the pfpickle
if pfpickle.endswith('.gz'):
infd = gzip.open(pfpickle)
else:
infd = open(pfpickle)
pf = pickle.load(infd)
infd.close()
lcfile = os.path.join(lcbasedir, pf['lcfbasename'])
objectid = pf['objectid']
if 'kwargs' in pf:
kwargs = pf['kwargs']
else:
kwargs = None
# override the default timecols, magcols, and errcols
# using the ones provided to the periodfinder
# if those don't exist, use the defaults from the lcformat def
if kwargs and 'timecols' in kwargs and timecols is None:
timecols = kwargs['timecols']
elif not kwargs and not timecols:
timecols = dtimecols
if kwargs and 'magcols' in kwargs and magcols is None:
magcols = kwargs['magcols']
elif not kwargs and not magcols:
magcols = dmagcols
if kwargs and 'errcols' in kwargs and errcols is None:
errcols = kwargs['errcols']
elif not kwargs and not errcols:
errcols = derrcols
# check if the light curve file exists
if not os.path.exists(lcfile):
LOGERROR("can't find LC %s for object %s" % (lcfile, objectid))
return None
# check if we have neighbors we can get the LCs for
if starfeatures is not None and os.path.exists(starfeatures):
with open(starfeatures,'rb') as infd:
starfeat = pickle.load(infd)
if starfeat['closestnbrlcfname'].size > 0:
nbr_full_lcf = starfeat['closestnbrlcfname'][0]
# check for this LC in the lcbasedir
if os.path.exists(os.path.join(lcbasedir,
os.path.basename(nbr_full_lcf))):
nbrlcf = os.path.join(lcbasedir,
os.path.basename(nbr_full_lcf))
# if it's not there, check for this file at the full LC location
elif os.path.exists(nbr_full_lcf):
nbrlcf = nbr_full_lcf
# otherwise, we can't find it, so complain
else:
LOGWARNING("can't find neighbor light curve file: %s in "
"its original directory: %s, or in this object's "
"lcbasedir: %s, skipping neighbor processing..." %
(os.path.basename(nbr_full_lcf),
os.path.dirname(nbr_full_lcf),
lcbasedir))
nbrlcf = None
else:
nbrlcf = None
else:
nbrlcf = None
# now, start processing for periodic feature extraction
try:
# get the object LC into a dict
lcdict = readerfunc(lcfile)
# this should handle lists/tuples being returned by readerfunc
# we assume that the first element is the actual lcdict
# FIXME: figure out how to not need this assumption
if ( (isinstance(lcdict, (list, tuple))) and
(isinstance(lcdict[0], dict)) ):
lcdict = lcdict[0]
# get the nbr object LC into a dict if there is one
if nbrlcf is not None:
nbrlcdict = readerfunc(nbrlcf)
# this should handle lists/tuples being returned by readerfunc
# we assume that the first element is the actual lcdict
# FIXME: figure out how to not need this assumption
if ( (isinstance(nbrlcdict, (list, tuple))) and
(isinstance(nbrlcdict[0], dict)) ):
nbrlcdict = nbrlcdict[0]
# this will be the output file
outfile = os.path.join(outdir, 'periodicfeatures-%s.pkl' %
squeeze(objectid).replace(' ','-'))
# normalize using the special function if specified
if normfunc is not None:
lcdict = normfunc(lcdict)
if nbrlcf:
nbrlcdict = normfunc(nbrlcdict)
resultdict = {}
for tcol, mcol, ecol in zip(timecols, magcols, errcols):
# dereference the columns and get them from the lcdict
if '.' in tcol:
tcolget = tcol.split('.')
else:
tcolget = [tcol]
times = _dict_get(lcdict, tcolget)
if nbrlcf:
nbrtimes = _dict_get(nbrlcdict, tcolget)
else:
nbrtimes = None
if '.' in mcol:
mcolget = mcol.split('.')
else:
mcolget = [mcol]
mags = _dict_get(lcdict, mcolget)
if nbrlcf:
nbrmags = _dict_get(nbrlcdict, mcolget)
else:
nbrmags = None
if '.' in ecol:
ecolget = ecol.split('.')
else:
ecolget = [ecol]
errs = _dict_get(lcdict, ecolget)
if nbrlcf:
nbrerrs = _dict_get(nbrlcdict, ecolget)
else:
nbrerrs = None
#
# filter out nans, etc. from the object and any neighbor LC
#
# get the finite values
finind = np.isfinite(times) & np.isfinite(mags) & np.isfinite(errs)
ftimes, fmags, ferrs = times[finind], mags[finind], errs[finind]
if nbrlcf:
nfinind = (np.isfinite(nbrtimes) &
np.isfinite(nbrmags) &
np.isfinite(nbrerrs))
nbrftimes, nbrfmags, nbrferrs = (nbrtimes[nfinind],
nbrmags[nfinind],
nbrerrs[nfinind])
# get nonzero errors
nzind = np.nonzero(ferrs)
ftimes, fmags, ferrs = ftimes[nzind], fmags[nzind], ferrs[nzind]
if nbrlcf:
nnzind = np.nonzero(nbrferrs)
nbrftimes, nbrfmags, nbrferrs = (nbrftimes[nnzind],
nbrfmags[nnzind],
nbrferrs[nnzind])
# normalize here if not using special normalization
if normfunc is None:
ntimes, nmags = normalize_magseries(
ftimes, fmags,
magsarefluxes=magsarefluxes
)
times, mags, errs = ntimes, nmags, ferrs
if nbrlcf:
nbrntimes, nbrnmags = normalize_magseries(
nbrftimes, nbrfmags,
magsarefluxes=magsarefluxes
)
nbrtimes, nbrmags, nbrerrs = nbrntimes, nbrnmags, nbrferrs
else:
nbrtimes, nbrmags, nbrerrs = None, None, None
else:
times, mags, errs = ftimes, fmags, ferrs
if times.size > 999:
#
# now we have times, mags, errs (and nbrtimes, nbrmags, nbrerrs)
#
available_pfmethods = []
available_pgrams = []
available_bestperiods = []
for k in pf[mcol].keys():
if k in PFMETHODS:
available_pgrams.append(pf[mcol][k])
if k != 'win':
available_pfmethods.append(
pf[mcol][k]['method']
)
available_bestperiods.append(
pf[mcol][k]['bestperiod']
)
#
# process periodic features for this magcol
#
featkey = 'periodicfeatures-%s' % mcol
resultdict[featkey] = {}
# first, handle the periodogram features
pgramfeat = periodicfeatures.periodogram_features(
available_pgrams, times, mags, errs,
sigclip=sigclip,
pdiff_threshold=pdiff_threshold,
sidereal_threshold=sidereal_threshold,
sampling_peak_multiplier=sampling_peak_multiplier,
sampling_startp=sampling_startp,
sampling_endp=sampling_endp,
verbose=verbose
)
resultdict[featkey].update(pgramfeat)
resultdict[featkey]['pfmethods'] = available_pfmethods
# then for each bestperiod, get phasedlc and lcfit features
for _ind, pfm, bp in zip(range(len(available_bestperiods)),
available_pfmethods,
available_bestperiods):
resultdict[featkey][pfm] = periodicfeatures.lcfit_features(
times, mags, errs, bp,
fourierorder=fourierorder,
transitparams=transitparams,
ebparams=ebparams,
sigclip=sigclip,
magsarefluxes=magsarefluxes,
verbose=verbose
)
phasedlcfeat = periodicfeatures.phasedlc_features(
times, mags, errs, bp,
nbrtimes=nbrtimes,
nbrmags=nbrmags,
nbrerrs=nbrerrs
)
resultdict[featkey][pfm].update(phasedlcfeat)
else:
LOGERROR('not enough finite measurements in magcol: %s, for '
'pfpickle: %s, skipping this magcol'
% (mcol, pfpickle))
featkey = 'periodicfeatures-%s' % mcol
resultdict[featkey] = None
#
# end of per magcol processing
#
# write resultdict to pickle
outfile = os.path.join(outdir, 'periodicfeatures-%s.pkl' %
squeeze(objectid).replace(' ','-'))
with open(outfile,'wb') as outfd:
pickle.dump(resultdict, outfd, pickle.HIGHEST_PROTOCOL)
return outfile
except Exception:
LOGEXCEPTION('failed to run for pf: %s, lcfile: %s' %
(pfpickle, lcfile))
if raiseonfail:
raise
else:
return None
def _periodicfeatures_worker(task):
'''
This is a parallel worker for the drivers below.
'''
pfpickle, lcbasedir, outdir, starfeatures, kwargs = task
try:
return get_periodicfeatures(pfpickle,
lcbasedir,
outdir,
starfeatures=starfeatures,
**kwargs)
except Exception:
LOGEXCEPTION('failed to get periodicfeatures for %s' % pfpickle)
[docs]def serial_periodicfeatures(pfpkl_list,
lcbasedir,
outdir,
starfeaturesdir=None,
fourierorder=5,
# these are depth, duration, ingress duration
transitparams=(-0.01,0.1,0.1),
# these are depth, duration, depth ratio, secphase
ebparams=(-0.2,0.3,0.7,0.5),
pdiff_threshold=1.0e-4,
sidereal_threshold=1.0e-4,
sampling_peak_multiplier=5.0,
sampling_startp=None,
sampling_endp=None,
starfeatures=None,
timecols=None,
magcols=None,
errcols=None,
lcformat='hat-sql',
lcformatdir=None,
sigclip=10.0,
verbose=False,
maxobjects=None):
'''This drives the periodicfeatures collection for a list of periodfinding
pickles.
Parameters
----------
pfpkl_list : list of str
The list of period-finding pickles to use.
lcbasedir : str
The base directory where the associated light curves are located.
outdir : str
The directory where the results will be written.
starfeaturesdir : str or None
The directory containing the `starfeatures-<objectid>.pkl` files for
each object to use calculate neighbor proximity light curve features.
fourierorder : int
The Fourier order to use to generate sinusoidal function and fit that to
the phased light curve.
transitparams : list of floats
The transit depth, duration, and ingress duration to use to generate a
trapezoid planet transit model fit to the phased light curve. The period
used is the one provided in `period`, while the epoch is automatically
obtained from a spline fit to the phased light curve.
ebparams : list of floats
The primary eclipse depth, eclipse duration, the primary-secondary depth
ratio, and the phase of the secondary eclipse to use to generate an
eclipsing binary model fit to the phased light curve. The period used is
the one provided in `period`, while the epoch is automatically obtained
from a spline fit to the phased light curve.
pdiff_threshold : float
This is the max difference between periods to consider them the same.
sidereal_threshold : float
This is the max difference between any of the 'best' periods and the
sidereal day periods to consider them the same.
sampling_peak_multiplier : float
This is the minimum multiplicative factor of a 'best' period's
normalized periodogram peak over the sampling periodogram peak at the
same period required to accept the 'best' period as possibly real.
sampling_startp, sampling_endp : float
If the `pgramlist` doesn't have a time-sampling Lomb-Scargle
periodogram, it will be obtained automatically. Use these kwargs to
control the minimum and maximum period interval to be searched when
generating this periodogram.
timecols : list of str or None
The timecol keys to use from the lcdict in calculating the features.
magcols : list of str or None
The magcol keys to use from the lcdict in calculating the features.
errcols : list of str or None
The errcol keys to use from the lcdict in calculating the features.
lcformat : str
This is the `formatkey` associated with your light curve format, which
you previously passed in to the `lcproc.register_lcformat`
function. This will be used to look up how to find and read the light
curves specified in `basedir` or `use_list_of_filenames`.
lcformatdir : str or None
If this is provided, gives the path to a directory when you've stored
your lcformat description JSONs, other than the usual directories lcproc
knows to search for them in. Use this along with `lcformat` to specify
an LC format JSON file that's not currently registered with lcproc.
sigclip : float or int or sequence of two floats/ints or None
If a single float or int, a symmetric sigma-clip will be performed using
the number provided as the sigma-multiplier to cut out from the input
time-series.
If a list of two ints/floats is provided, the function will perform an
'asymmetric' sigma-clip. The first element in this list is the sigma
value to use for fainter flux/mag values; the second element in this
list is the sigma value to use for brighter flux/mag values. For
example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
dimmings and greater than 3-sigma brightenings. Here the meaning of
"dimming" and "brightening" is set by *physics* (not the magnitude
system), which is why the `magsarefluxes` kwarg must be correctly set.
If `sigclip` is None, no sigma-clipping will be performed, and the
time-series (with non-finite elems removed) will be passed through to
the output.
verbose : bool
If True, will indicate progress while working.
maxobjects : int
The total number of objects to process from `pfpkl_list`.
Returns
-------
Nothing.
'''
try:
formatinfo = get_lcformat(lcformat,
use_lcformat_dir=lcformatdir)
if formatinfo:
(fileglob, readerfunc,
dtimecols, dmagcols, derrcols,
magsarefluxes, normfunc) = formatinfo
else:
LOGERROR("can't figure out the light curve format")
return None
except Exception:
LOGEXCEPTION("can't figure out the light curve format")
return None
# make sure to make the output directory if it doesn't exist
if not os.path.exists(outdir):
os.makedirs(outdir)
if maxobjects:
pfpkl_list = pfpkl_list[:maxobjects]
LOGINFO('%s periodfinding pickles to process' % len(pfpkl_list))
# if the starfeaturedir is provided, try to find a starfeatures pickle for
# each periodfinding pickle in pfpkl_list
if starfeaturesdir and os.path.exists(starfeaturesdir):
starfeatures_list = []
LOGINFO('collecting starfeatures pickles...')
for pfpkl in pfpkl_list:
sfpkl1 = os.path.basename(pfpkl).replace('periodfinding',
'starfeatures')
sfpkl2 = sfpkl1.replace('.gz','')
sfpath1 = os.path.join(starfeaturesdir, sfpkl1)
sfpath2 = os.path.join(starfeaturesdir, sfpkl2)
if os.path.exists(sfpath1):
starfeatures_list.append(sfpkl1)
elif os.path.exists(sfpath2):
starfeatures_list.append(sfpkl2)
else:
starfeatures_list.append(None)
else:
starfeatures_list = [None for x in pfpkl_list]
# generate the task list
kwargs = {'fourierorder':fourierorder,
'transitparams':transitparams,
'ebparams':ebparams,
'pdiff_threshold':pdiff_threshold,
'sidereal_threshold':sidereal_threshold,
'sampling_peak_multiplier':sampling_peak_multiplier,
'sampling_startp':sampling_startp,
'sampling_endp':sampling_endp,
'timecols':timecols,
'magcols':magcols,
'errcols':errcols,
'lcformat':lcformat,
'lcformatdir':lcformatdir,
'sigclip':sigclip,
'verbose':verbose}
tasks = [(x, lcbasedir, outdir, y, kwargs) for (x,y) in
zip(pfpkl_list, starfeatures_list)]
LOGINFO('processing periodfinding pickles...')
for task in tqdm(tasks):
_periodicfeatures_worker(task)
[docs]def parallel_periodicfeatures(pfpkl_list,
lcbasedir,
outdir,
starfeaturesdir=None,
fourierorder=5,
# these are depth, duration, ingress duration
transitparams=(-0.01,0.1,0.1),
# these are depth, duration, depth ratio, secphase
ebparams=(-0.2,0.3,0.7,0.5),
pdiff_threshold=1.0e-4,
sidereal_threshold=1.0e-4,
sampling_peak_multiplier=5.0,
sampling_startp=None,
sampling_endp=None,
timecols=None,
magcols=None,
errcols=None,
lcformat='hat-sql',
lcformatdir=None,
sigclip=10.0,
verbose=False,
maxobjects=None,
nworkers=NCPUS):
'''This runs periodic feature generation in parallel for all periodfinding
pickles in the input list.
Parameters
----------
pfpkl_list : list of str
The list of period-finding pickles to use.
lcbasedir : str
The base directory where the associated light curves are located.
outdir : str
The directory where the results will be written.
starfeaturesdir : str or None
The directory containing the `starfeatures-<objectid>.pkl` files for
each object to use calculate neighbor proximity light curve features.
fourierorder : int
The Fourier order to use to generate sinusoidal function and fit that to
the phased light curve.
transitparams : list of floats
The transit depth, duration, and ingress duration to use to generate a
trapezoid planet transit model fit to the phased light curve. The period
used is the one provided in `period`, while the epoch is automatically
obtained from a spline fit to the phased light curve.
ebparams : list of floats
The primary eclipse depth, eclipse duration, the primary-secondary depth
ratio, and the phase of the secondary eclipse to use to generate an
eclipsing binary model fit to the phased light curve. The period used is
the one provided in `period`, while the epoch is automatically obtained
from a spline fit to the phased light curve.
pdiff_threshold : float
This is the max difference between periods to consider them the same.
sidereal_threshold : float
This is the max difference between any of the 'best' periods and the
sidereal day periods to consider them the same.
sampling_peak_multiplier : float
This is the minimum multiplicative factor of a 'best' period's
normalized periodogram peak over the sampling periodogram peak at the
same period required to accept the 'best' period as possibly real.
sampling_startp, sampling_endp : float
If the `pgramlist` doesn't have a time-sampling Lomb-Scargle
periodogram, it will be obtained automatically. Use these kwargs to
control the minimum and maximum period interval to be searched when
generating this periodogram.
timecols : list of str or None
The timecol keys to use from the lcdict in calculating the features.
magcols : list of str or None
The magcol keys to use from the lcdict in calculating the features.
errcols : list of str or None
The errcol keys to use from the lcdict in calculating the features.
lcformat : str
This is the `formatkey` associated with your light curve format, which
you previously passed in to the `lcproc.register_lcformat`
function. This will be used to look up how to find and read the light
curves specified in `basedir` or `use_list_of_filenames`.
lcformatdir : str or None
If this is provided, gives the path to a directory when you've stored
your lcformat description JSONs, other than the usual directories lcproc
knows to search for them in. Use this along with `lcformat` to specify
an LC format JSON file that's not currently registered with lcproc.
sigclip : float or int or sequence of two floats/ints or None
If a single float or int, a symmetric sigma-clip will be performed using
the number provided as the sigma-multiplier to cut out from the input
time-series.
If a list of two ints/floats is provided, the function will perform an
'asymmetric' sigma-clip. The first element in this list is the sigma
value to use for fainter flux/mag values; the second element in this
list is the sigma value to use for brighter flux/mag values. For
example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
dimmings and greater than 3-sigma brightenings. Here the meaning of
"dimming" and "brightening" is set by *physics* (not the magnitude
system), which is why the `magsarefluxes` kwarg must be correctly set.
If `sigclip` is None, no sigma-clipping will be performed, and the
time-series (with non-finite elems removed) will be passed through to
the output.
verbose : bool
If True, will indicate progress while working.
maxobjects : int
The total number of objects to process from `pfpkl_list`.
nworkers : int
The number of parallel workers to launch to process the input.
Returns
-------
dict
A dict containing key: val pairs of the input period-finder result and
the output periodic feature result pickles for each input pickle is
returned.
'''
# make sure to make the output directory if it doesn't exist
if not os.path.exists(outdir):
os.makedirs(outdir)
if maxobjects:
pfpkl_list = pfpkl_list[:maxobjects]
LOGINFO('%s periodfinding pickles to process' % len(pfpkl_list))
# if the starfeaturedir is provided, try to find a starfeatures pickle for
# each periodfinding pickle in pfpkl_list
if starfeaturesdir and os.path.exists(starfeaturesdir):
starfeatures_list = []
LOGINFO('collecting starfeatures pickles...')
for pfpkl in pfpkl_list:
sfpkl1 = os.path.basename(pfpkl).replace('periodfinding',
'starfeatures')
sfpkl2 = sfpkl1.replace('.gz','')
sfpath1 = os.path.join(starfeaturesdir, sfpkl1)
sfpath2 = os.path.join(starfeaturesdir, sfpkl2)
if os.path.exists(sfpath1):
starfeatures_list.append(sfpkl1)
elif os.path.exists(sfpath2):
starfeatures_list.append(sfpkl2)
else:
starfeatures_list.append(None)
else:
starfeatures_list = [None for x in pfpkl_list]
# generate the task list
kwargs = {'fourierorder':fourierorder,
'transitparams':transitparams,
'ebparams':ebparams,
'pdiff_threshold':pdiff_threshold,
'sidereal_threshold':sidereal_threshold,
'sampling_peak_multiplier':sampling_peak_multiplier,
'sampling_startp':sampling_startp,
'sampling_endp':sampling_endp,
'timecols':timecols,
'magcols':magcols,
'errcols':errcols,
'lcformat':lcformat,
'lcformatdir':lcformat,
'sigclip':sigclip,
'verbose':verbose}
tasks = [(x, lcbasedir, outdir, y, kwargs) for (x,y) in
zip(pfpkl_list, starfeatures_list)]
LOGINFO('processing periodfinding pickles...')
with ProcessPoolExecutor(max_workers=nworkers) as executor:
resultfutures = executor.map(_periodicfeatures_worker, tasks)
results = list(resultfutures)
resdict = {os.path.basename(x):y for (x,y) in zip(pfpkl_list, results)}
return resdict
[docs]def parallel_periodicfeatures_lcdir(
pfpkl_dir,
lcbasedir,
outdir,
pfpkl_glob='periodfinding-*.pkl*',
starfeaturesdir=None,
fourierorder=5,
# these are depth, duration, ingress duration
transitparams=(-0.01,0.1,0.1),
# these are depth, duration, depth ratio, secphase
ebparams=(-0.2,0.3,0.7,0.5),
pdiff_threshold=1.0e-4,
sidereal_threshold=1.0e-4,
sampling_peak_multiplier=5.0,
sampling_startp=None,
sampling_endp=None,
timecols=None,
magcols=None,
errcols=None,
lcformat='hat-sql',
lcformatdir=None,
sigclip=10.0,
verbose=False,
maxobjects=None,
nworkers=NCPUS,
recursive=True,
):
'''This runs parallel periodicfeature extraction for a directory of
periodfinding result pickles.
Parameters
----------
pfpkl_dir : str
The directory containing the pickles to process.
lcbasedir : str
The directory where all of the associated light curve files are located.
outdir : str
The directory where all the output will be written.
pfpkl_glob : str
The UNIX file glob to use to search for period-finder result pickles in
`pfpkl_dir`.
starfeaturesdir : str or None
The directory containing the `starfeatures-<objectid>.pkl` files for
each object to use calculate neighbor proximity light curve features.
fourierorder : int
The Fourier order to use to generate sinusoidal function and fit that to
the phased light curve.
transitparams : list of floats
The transit depth, duration, and ingress duration to use to generate a
trapezoid planet transit model fit to the phased light curve. The period
used is the one provided in `period`, while the epoch is automatically
obtained from a spline fit to the phased light curve.
ebparams : list of floats
The primary eclipse depth, eclipse duration, the primary-secondary depth
ratio, and the phase of the secondary eclipse to use to generate an
eclipsing binary model fit to the phased light curve. The period used is
the one provided in `period`, while the epoch is automatically obtained
from a spline fit to the phased light curve.
pdiff_threshold : float
This is the max difference between periods to consider them the same.
sidereal_threshold : float
This is the max difference between any of the 'best' periods and the
sidereal day periods to consider them the same.
sampling_peak_multiplier : float
This is the minimum multiplicative factor of a 'best' period's
normalized periodogram peak over the sampling periodogram peak at the
same period required to accept the 'best' period as possibly real.
sampling_startp, sampling_endp : float
If the `pgramlist` doesn't have a time-sampling Lomb-Scargle
periodogram, it will be obtained automatically. Use these kwargs to
control the minimum and maximum period interval to be searched when
generating this periodogram.
timecols : list of str or None
The timecol keys to use from the lcdict in calculating the features.
magcols : list of str or None
The magcol keys to use from the lcdict in calculating the features.
errcols : list of str or None
The errcol keys to use from the lcdict in calculating the features.
lcformat : str
This is the `formatkey` associated with your light curve format, which
you previously passed in to the `lcproc.register_lcformat`
function. This will be used to look up how to find and read the light
curves specified in `basedir` or `use_list_of_filenames`.
lcformatdir : str or None
If this is provided, gives the path to a directory when you've stored
your lcformat description JSONs, other than the usual directories lcproc
knows to search for them in. Use this along with `lcformat` to specify
an LC format JSON file that's not currently registered with lcproc.
sigclip : float or int or sequence of two floats/ints or None
If a single float or int, a symmetric sigma-clip will be performed using
the number provided as the sigma-multiplier to cut out from the input
time-series.
If a list of two ints/floats is provided, the function will perform an
'asymmetric' sigma-clip. The first element in this list is the sigma
value to use for fainter flux/mag values; the second element in this
list is the sigma value to use for brighter flux/mag values. For
example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
dimmings and greater than 3-sigma brightenings. Here the meaning of
"dimming" and "brightening" is set by *physics* (not the magnitude
system), which is why the `magsarefluxes` kwarg must be correctly set.
If `sigclip` is None, no sigma-clipping will be performed, and the
time-series (with non-finite elems removed) will be passed through to
the output.
verbose : bool
If True, will indicate progress while working.
maxobjects : int
The total number of objects to process from `pfpkl_list`.
nworkers : int
The number of parallel workers to launch to process the input.
Returns
-------
dict
A dict containing key: val pairs of the input period-finder result and
the output periodic feature result pickles for each input pickle is
returned.
'''
try:
formatinfo = get_lcformat(lcformat,
use_lcformat_dir=lcformatdir)
if formatinfo:
(dfileglob, readerfunc,
dtimecols, dmagcols, derrcols,
magsarefluxes, normfunc) = formatinfo
else:
LOGERROR("can't figure out the light curve format")
return None
except Exception:
LOGEXCEPTION("can't figure out the light curve format")
return None
fileglob = pfpkl_glob
# now find the files
LOGINFO('searching for periodfinding pickles in %s ...' % pfpkl_dir)
if recursive is False:
matching = glob.glob(os.path.join(pfpkl_dir, fileglob))
else:
matching = glob.glob(os.path.join(pfpkl_dir,
'**',
fileglob),
recursive=True)
# now that we have all the files, process them
if matching and len(matching) > 0:
LOGINFO('found %s periodfinding pickles, getting periodicfeatures...' %
len(matching))
return parallel_periodicfeatures(
matching,
lcbasedir,
outdir,
starfeaturesdir=starfeaturesdir,
fourierorder=fourierorder,
transitparams=transitparams,
ebparams=ebparams,
pdiff_threshold=pdiff_threshold,
sidereal_threshold=sidereal_threshold,
sampling_peak_multiplier=sampling_peak_multiplier,
sampling_startp=sampling_startp,
sampling_endp=sampling_endp,
timecols=timecols,
magcols=magcols,
errcols=errcols,
lcformat=lcformat,
lcformatdir=lcformatdir,
sigclip=sigclip,
verbose=verbose,
maxobjects=maxobjects,
nworkers=nworkers,
)
else:
LOGERROR('no periodfinding pickles found in %s' % (pfpkl_dir))
return None