Source code for astrobase.fakelcs.generation

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# generation - Waqas Bhatti (wbhatti@astro.princeton.edu) - Oct 2017
# License: MIT. See the LICENSE file for more details.

'''This generates light curves of variable stars using the astrobase.lcmodels
package, adds noise and observation sampling to them based on given parameters
(or example light curves). See fakelcrecovery.py for functions that run a full
recovery simulation.

NOTE 1: the parameters for these light curves are currently all chosen from
uniform distributions, which obviously doesn't reflect reality (some of the
current parameter upper and lower limits are realistic, however). Some of these
will be updated with real-life distributions as soon as I find them, especially
for periods and amplitudes (along with references).

NOTE 2: To generate custom distributions, one can subclass
scipy.stats.rv_continuous and override the _pdf and _cdf methods (or just the
_rvs method directly to get the distributed variables if distribution's location
and scale don't really matter). This is described here:

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rv_continuous.html

and doesn't seem to be restricted to distributions described by analytic
functions only. It's probably possible to get a histogram of some complex
distribution in parameter space and use a kernel-density estimator to get the
PDF and CDF, e.g.:

http://scikit-learn.org/stable/modules/density.html#kernel-density.

NOTE 3: any distribution parameter below having to do with magnitudes/flux is by
default in MAGNITUDES. So depth, amplitude, etc. distributions and their limits
will have to be adjusted appropriately for fluxes. IT IS NOT SUFFICIENT to just
set magsarefluxes = True.

FIXME: in the future, we'll do all the amplitude, etc. distributions in
differential fluxes canonically, and then take logs where appropriate if
magsarefluxes = False.

FIXME: we should add RA/DEC values that are taken from GAIA if we provide a
radec box for the simulation to take place in. in this way, we can parameterize
blending and take it into account in the recovery as well.

FIXME: check if object coordinates end up so that two or more objects lie within
a chosen blend radius. if this happens, we should check if the blender(s) are
variable, and add in some fraction of their phased light curve to the
blendee. if the blender(s) are not variable, add in a constant fraction of the
brightness to the blendee's light curve. the blending fraction is multiplied
into the light curve of the blender(s) and the resulting flux added to the
blendee's light curve.

- given the FWHM of the instrument, figure out the overlap

- we need to calculate the pixel area for blendee and the sum of pixel areas
  covered by the blenders. this will require input kwargs for pixel size of the
  detector and FWHM of the star (this might need to be calculated based on the
  brightness of the star)

FIXME: add input from TRILEGAL produced .dat files for color and mag
information. This will let us generate pulsating variables with their actual
colors.

'''

#############
## LOGGING ##
#############

import logging
from astrobase import log_sub, log_fmt, log_date_fmt

DEBUG = False
if DEBUG:
    level = logging.DEBUG
else:
    level = logging.INFO
LOGGER = logging.getLogger(__name__)
logging.basicConfig(
    level=level,
    style=log_sub,
    format=log_fmt,
    datefmt=log_date_fmt,
)

LOGDEBUG = LOGGER.debug
LOGINFO = LOGGER.info
LOGWARNING = LOGGER.warning
LOGERROR = LOGGER.error
LOGEXCEPTION = LOGGER.exception


#############
## IMPORTS ##
#############

import os
import os.path
import pickle
import shutil

from hashlib import md5, sha512

# to turn a list of keys into a dict address
# from https://stackoverflow.com/a/14692747
from functools import reduce
from operator import getitem


def _dict_get(datadict, keylist):
    return reduce(getitem, keylist, datadict)


import numpy as np
import numpy.random as npr

# seed the numpy random generator
# we'll use RANDSEED for scipy.stats distribution functions as well
RANDSEED = 0xdecaff
npr.seed(RANDSEED)

import scipy.stats as sps
import scipy.interpolate as spi


###################
## LOCAL IMPORTS ##
###################

# light curve models
from ..lcmodels import transits, eclipses, flares, sinusoidal

# magnitude conversion
from ..magnitudes import jhk_to_sdssr

# get the lcformat functions
from ..lcproc import get_lcformat, _read_pklc


#############################################
## FUNCTIONS TO GENERATE FAKE LIGHT CURVES ##
#############################################

[docs]def generate_transit_lightcurve(
        times,
        mags=None,
        errs=None,
        paramdists={'transitperiod':sps.uniform(loc=0.1,scale=49.9),
                    'transitdepth':sps.uniform(loc=1.0e-4,scale=2.0e-2),
                    'transitduration':sps.uniform(loc=0.01,scale=0.29)},
        magsarefluxes=False,
):
    '''This generates fake planet transit light curves.

    Parameters
    ----------

    times : np.array
        This is an array of time values that will be used as the time base.

    mags,errs : np.array
        These arrays will have the model added to them. If either is
        None, `np.full_like(times, 0.0)` will used as a substitute and the model
        light curve will be centered around 0.0.

    paramdists : dict
        This is a dict containing parameter distributions to use for the
        model params, containing the following keys ::

            {'transitperiod', 'transitdepth', 'transitduration'}

        The values of these keys should all be 'frozen' scipy.stats distribution
        objects, e.g.:

        https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions
        The variability epoch will be automatically chosen from a uniform
        distribution between `times.min()` and `times.max()`.

        The ingress duration will be automatically chosen from a uniform
        distribution ranging from 0.05 to 0.5 of the transitduration.

        The transitdepth will be flipped automatically as appropriate if
        `magsarefluxes=True`.

    magsarefluxes : bool
        If the generated time series is meant to be a flux time-series, set this
        to True to get the correct sign of variability amplitude.

    Returns
    -------

    dict
        A dict of the form below is returned::

            {'vartype': 'planet',
             'params': {'transitperiod': generated value of period,
                        'transitepoch': generated value of epoch,
                        'transitdepth': generated value of transit depth,
                        'transitduration': generated value of transit duration,
                        'ingressduration': generated value of transit ingress
                                           duration},
             'times': the model times,
             'mags': the model mags,
             'errs': the model errs,
             'varperiod': the generated period of variability == 'transitperiod'
             'varamplitude': the generated amplitude of
                             variability == 'transitdepth'}

    '''

    if mags is None:
        mags = np.full_like(times, 0.0)

    if errs is None:
        errs = np.full_like(times, 0.0)

    # choose the epoch
    epoch = npr.random()*(times.max() - times.min()) + times.min()

    # choose the period, depth, duration
    period = paramdists['transitperiod'].rvs(size=1)
    depth = paramdists['transitdepth'].rvs(size=1)
    duration = paramdists['transitduration'].rvs(size=1)

    # figure out the ingress duration
    ingduration = npr.random()*(0.5*duration - 0.05*duration) + 0.05*duration

    # fix the transit depth if it needs to be flipped
    if magsarefluxes and depth < 0.0:
        depth = -depth
    elif not magsarefluxes and depth > 0.0:
        depth = -depth

    # generate the model
    modelmags, phase, ptimes, pmags, perrs = (
        transits.trapezoid_transit_func([period, epoch, depth,
                                         duration, ingduration],
                                        times,
                                        mags,
                                        errs)
    )

    # resort in original time order
    timeind = np.argsort(ptimes)
    mtimes = ptimes[timeind]
    mmags = modelmags[timeind]
    merrs = perrs[timeind]

    # return a dict with everything
    modeldict = {
        'vartype':'planet',
        'params':{x:np.asscalar(y) for x,y in zip(['transitperiod',
                                                   'transitepoch',
                                                   'transitdepth',
                                                   'transitduration',
                                                   'ingressduration'],
                                                  [period,
                                                   epoch,
                                                   depth,
                                                   duration,
                                                   ingduration])},
        'times':mtimes,
        'mags':mmags,
        'errs':merrs,
        # these are standard keys that help with later characterization of
        # variability as a function period, variability amplitude, object mag,
        # ndet, etc.
        'varperiod':period,
        'varamplitude':depth
    }

    return modeldict


[docs]def generate_eb_lightcurve(
        times,
        mags=None,
        errs=None,
        paramdists={'period':sps.uniform(loc=0.2,scale=99.8),
                    'pdepth':sps.uniform(loc=1.0e-4,scale=0.7),
                    'pduration':sps.uniform(loc=0.01,scale=0.44),
                    'depthratio':sps.uniform(loc=0.01,scale=0.99),
                    'secphase':sps.norm(loc=0.5,scale=0.1)},
        magsarefluxes=False,
):
    '''This generates fake EB light curves.

    Parameters
    ----------

    times : np.array
        This is an array of time values that will be used as the time base.

    mags,errs : np.array
        These arrays will have the model added to them. If either is
        None, `np.full_like(times, 0.0)` will used as a substitute and the model
        light curve will be centered around 0.0.

    paramdists : dict
        This is a dict containing parameter distributions to use for the
        model params, containing the following keys ::

            {'period', 'pdepth', 'pduration', 'depthratio', 'secphase'}

        The values of these keys should all be 'frozen' scipy.stats distribution
        objects, e.g.:

        https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions
        The variability epoch will be automatically chosen from a uniform
        distribution between `times.min()` and `times.max()`.

        The `pdepth` will be flipped automatically as appropriate if
        `magsarefluxes=True`.

    magsarefluxes : bool
        If the generated time series is meant to be a flux time-series, set this
        to True to get the correct sign of variability amplitude.

    Returns
    -------

    dict
        A dict of the form below is returned::

            {'vartype': 'EB',
             'params': {'period': generated value of period,
                        'epoch': generated value of epoch,
                        'pdepth': generated value of priary eclipse depth,
                        'pduration': generated value of prim eclipse duration,
                        'depthratio': generated value of prim/sec eclipse
                                      depth ratio},
             'times': the model times,
             'mags': the model mags,
             'errs': the model errs,
             'varperiod': the generated period of variability == 'period'
             'varamplitude': the generated amplitude of
                             variability == 'pdepth'}

    '''

    if mags is None:
        mags = np.full_like(times, 0.0)

    if errs is None:
        errs = np.full_like(times, 0.0)

    # choose the epoch
    epoch = npr.random()*(times.max() - times.min()) + times.min()

    # choose the period, pdepth, duration, depthratio
    period = paramdists['period'].rvs(size=1)
    pdepth = paramdists['pdepth'].rvs(size=1)
    pduration = paramdists['pduration'].rvs(size=1)
    depthratio = paramdists['depthratio'].rvs(size=1)
    secphase = paramdists['secphase'].rvs(size=1)

    # fix the transit depth if it needs to be flipped
    if magsarefluxes and pdepth < 0.0:
        pdepth = -pdepth
    elif not magsarefluxes and pdepth > 0.0:
        pdepth = -pdepth

    # generate the model
    modelmags, phase, ptimes, pmags, perrs = (
        eclipses.invgauss_eclipses_func([period, epoch, pdepth,
                                         pduration, depthratio, secphase],
                                        times,
                                        mags,
                                        errs)
    )

    # resort in original time order
    timeind = np.argsort(ptimes)
    mtimes = ptimes[timeind]
    mmags = modelmags[timeind]
    merrs = perrs[timeind]

    # return a dict with everything
    modeldict = {
        'vartype':'EB',
        'params':{x:np.asscalar(y) for x,y in zip(['period',
                                                   'epoch',
                                                   'pdepth',
                                                   'pduration',
                                                   'depthratio'],
                                                  [period,
                                                   epoch,
                                                   pdepth,
                                                   pduration,
                                                   depthratio])},
        'times':mtimes,
        'mags':mmags,
        'errs':merrs,
        'varperiod':period,
        'varamplitude':pdepth,
    }

    return modeldict


[docs]def generate_flare_lightcurve(
        times,
        mags=None,
        errs=None,
        paramdists={
            # flare peak amplitude from 0.01 mag to 1.0 mag above median.  this
            # is tuned for redder bands, flares are much stronger in bluer
            # bands, so tune appropriately for your situation.
            'amplitude':sps.uniform(loc=0.01,scale=0.99),
            # up to 5 flares per LC and at least 1
            'nflares':[1,5],
            # 10 minutes to 1 hour for rise stdev
            'risestdev':sps.uniform(loc=0.007, scale=0.04),
            # 1 hour to 4 hours for decay time constant
            'decayconst':sps.uniform(loc=0.04, scale=0.163)
        },
        magsarefluxes=False,
):
    '''This generates fake flare light curves.

    Parameters
    ----------

    times : np.array
        This is an array of time values that will be used as the time base.

    mags,errs : np.array
        These arrays will have the model added to them. If either is
        None, `np.full_like(times, 0.0)` will used as a substitute and the model
        light curve will be centered around 0.0.

    paramdists : dict
        This is a dict containing parameter distributions to use for the
        model params, containing the following keys ::

            {'amplitude', 'nflares', 'risestdev', 'decayconst'}

        The values of these keys should all be 'frozen' scipy.stats distribution
        objects, e.g.:

        https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions
        The `flare_peak_time` for each flare will be generated automatically
        between `times.min()` and `times.max()` using a uniform distribution.

        The `amplitude` will be flipped automatically as appropriate if
        `magsarefluxes=True`.

    magsarefluxes : bool
        If the generated time series is meant to be a flux time-series, set this
        to True to get the correct sign of variability amplitude.

    Returns
    -------

    dict
        A dict of the form below is returned::

            {'vartype': 'flare',
             'params': {'amplitude': generated value of flare amplitudes,
                        'nflares': generated value of number of flares,
                        'risestdev': generated value of stdev of rise time,
                        'decayconst': generated value of decay constant,
                        'peaktime': generated value of flare peak time},
             'times': the model times,
             'mags': the model mags,
             'errs': the model errs,
             'varamplitude': the generated amplitude of
                             variability == 'amplitude'}

    '''

    if mags is None:
        mags = np.full_like(times, 0.0)

    if errs is None:
        errs = np.full_like(times, 0.0)

    nflares = npr.randint(paramdists['nflares'][0],
                          high=paramdists['nflares'][1])

    # generate random flare peak times based on the number of flares
    flarepeaktimes = (
        npr.random(
            size=nflares
        )*(times.max() - times.min()) + times.min()
    )

    # now add the flares to the time-series
    params = {'nflares':nflares}

    for flareind, peaktime in zip(range(nflares), flarepeaktimes):

        # choose the amplitude, rise stdev and decay time constant
        amp = paramdists['amplitude'].rvs(size=1)
        risestdev = paramdists['risestdev'].rvs(size=1)
        decayconst = paramdists['decayconst'].rvs(size=1)

        # fix the transit depth if it needs to be flipped
        if magsarefluxes and amp < 0.0:
            amp = -amp
        elif not magsarefluxes and amp > 0.0:
            amp = -amp

        # add this flare to the light curve
        modelmags, ptimes, pmags, perrs = (
            flares.flare_model(
                [amp, peaktime, risestdev, decayconst],
                times,
                mags,
                errs
            )
        )

        # update the mags
        mags = modelmags

        # add the flare params to the modeldict
        params[flareind] = {'peaktime':peaktime,
                            'amplitude':amp,
                            'risestdev':risestdev,
                            'decayconst':decayconst}

    #
    # done with all flares
    #

    # return a dict with everything
    modeldict = {
        'vartype':'flare',
        'params':params,
        'times':times,
        'mags':mags,
        'errs':errs,
        'varperiod':None,
        # FIXME: this is complicated because we can have multiple flares
        # figure out a good way to handle this upstream
        'varamplitude':[params[x]['amplitude']
                        for x in range(params['nflares'])],
    }

    return modeldict


[docs]def generate_sinusoidal_lightcurve(
        times,
        mags=None,
        errs=None,
        paramdists={
            'period':sps.uniform(loc=0.04,scale=500.0),
            'fourierorder':[2,10],
            'amplitude':sps.uniform(loc=0.1,scale=0.9),
            'phioffset':0.0,
        },
        magsarefluxes=False
):
    '''This generates fake sinusoidal light curves.

    This can be used for a variety of sinusoidal variables, e.g. RRab, RRc,
    Cepheids, Miras, etc. The functions that generate these model LCs below
    implement the following table::

        ## FOURIER PARAMS FOR SINUSOIDAL VARIABLES
        #
        # type        fourier           period [days]
        #             order    dist     limits         dist

        # RRab        8 to 10  uniform  0.45--0.80     uniform
        # RRc         3 to 6   uniform  0.10--0.40     uniform
        # HADS        7 to 9   uniform  0.04--0.10     uniform
        # rotator     2 to 5   uniform  0.80--120.0    uniform
        # LPV         2 to 5   uniform  250--500.0     uniform

    FIXME: for better model LCs, figure out how scipy.signal.butter works and
    low-pass filter using scipy.signal.filtfilt.

    Parameters
    ----------

    times : np.array
        This is an array of time values that will be used as the time base.

    mags,errs : np.array
        These arrays will have the model added to them. If either is
        None, `np.full_like(times, 0.0)` will used as a substitute and the model
        light curve will be centered around 0.0.

    paramdists : dict
        This is a dict containing parameter distributions to use for the
        model params, containing the following keys ::

            {'period', 'fourierorder', 'amplitude', 'phioffset'}

        The values of these keys should all be 'frozen' scipy.stats distribution
        objects, e.g.:

        https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions
        The variability epoch will be automatically chosen from a uniform
        distribution between `times.min()` and `times.max()`.

        The `amplitude` will be flipped automatically as appropriate if
        `magsarefluxes=True`.

    magsarefluxes : bool
        If the generated time series is meant to be a flux time-series, set this
        to True to get the correct sign of variability amplitude.

    Returns
    -------

    dict
        A dict of the form below is returned::

            {'vartype': 'sinusoidal',
             'params': {'period': generated value of period,
                        'epoch': generated value of epoch,
                        'amplitude': generated value of amplitude,
                        'fourierorder': generated value of fourier order,
                        'fourieramps': generated values of fourier amplitudes,
                        'fourierphases': generated values of fourier phases},
             'times': the model times,
             'mags': the model mags,
             'errs': the model errs,
             'varperiod': the generated period of variability == 'period'
             'varamplitude': the generated amplitude of
                             variability == 'amplitude'}

    '''

    if mags is None:
        mags = np.full_like(times, 0.0)

    if errs is None:
        errs = np.full_like(times, 0.0)

    # choose the epoch
    epoch = npr.random()*(times.max() - times.min()) + times.min()

    # choose the period, fourierorder, and amplitude
    period = paramdists['period'].rvs(size=1)
    fourierorder = npr.randint(paramdists['fourierorder'][0],
                               high=paramdists['fourierorder'][1])
    amplitude = paramdists['amplitude'].rvs(size=1)

    # fix the amplitude if it needs to be flipped
    if magsarefluxes and amplitude < 0.0:
        amplitude = -amplitude
    elif not magsarefluxes and amplitude > 0.0:
        amplitude = -amplitude

    # generate the amplitudes and phases of the Fourier components
    ampcomps = [abs(amplitude/2.0)/float(x)
                for x in range(1,fourierorder+1)]
    phacomps = [paramdists['phioffset']*float(x)
                for x in range(1,fourierorder+1)]

    # now that we have our amp and pha components, generate the light curve
    modelmags, phase, ptimes, pmags, perrs = sinusoidal.sine_series_sum(
        [period, epoch, ampcomps, phacomps],
        times,
        mags,
        errs
    )

    # resort in original time order
    timeind = np.argsort(ptimes)
    mtimes = ptimes[timeind]
    mmags = modelmags[timeind]
    merrs = perrs[timeind]
    mphase = phase[timeind]

    # return a dict with everything
    modeldict = {
        'vartype':'sinusoidal',
        'params':{x:y for x,y in zip(['period',
                                      'epoch',
                                      'amplitude',
                                      'fourierorder',
                                      'fourieramps',
                                      'fourierphases'],
                                     [period,
                                      epoch,
                                      amplitude,
                                      fourierorder,
                                      ampcomps,
                                      phacomps])},
        'times':mtimes,
        'mags':mmags,
        'errs':merrs,
        'phase':mphase,
        # these are standard keys that help with later characterization of
        # variability as a function period, variability amplitude, object mag,
        # ndet, etc.
        'varperiod':period,
        'varamplitude':amplitude
    }

    return modeldict


[docs]def generate_rrab_lightcurve(
        times,
        mags=None,
        errs=None,
        paramdists={
            'period':sps.uniform(loc=0.45,scale=0.35),
            'fourierorder':[8,11],
            'amplitude':sps.uniform(loc=0.4,scale=0.5),
            'phioffset':np.pi,
        },
        magsarefluxes=False
):
    '''This generates fake RRab light curves.

    Parameters
    ----------

    times : np.array
        This is an array of time values that will be used as the time base.

    mags,errs : np.array
        These arrays will have the model added to them. If either is
        None, `np.full_like(times, 0.0)` will used as a substitute and the model
        light curve will be centered around 0.0.

    paramdists : dict
        This is a dict containing parameter distributions to use for the
        model params, containing the following keys ::

            {'period', 'fourierorder', 'amplitude'}

        The values of these keys should all be 'frozen' scipy.stats distribution
        objects, e.g.:

        https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions
        The variability epoch will be automatically chosen from a uniform
        distribution between `times.min()` and `times.max()`.

        The `amplitude` will be flipped automatically as appropriate if
        `magsarefluxes=True`.

    magsarefluxes : bool
        If the generated time series is meant to be a flux time-series, set this
        to True to get the correct sign of variability amplitude.

    Returns
    -------

    dict
        A dict of the form below is returned::

            {'vartype': 'RRab',
             'params': {'period': generated value of period,
                        'epoch': generated value of epoch,
                        'amplitude': generated value of amplitude,
                        'fourierorder': generated value of fourier order,
                        'fourieramps': generated values of fourier amplitudes,
                        'fourierphases': generated values of fourier phases},
             'times': the model times,
             'mags': the model mags,
             'errs': the model errs,
             'varperiod': the generated period of variability == 'period'
             'varamplitude': the generated amplitude of
                             variability == 'amplitude'}

    '''

    modeldict = generate_sinusoidal_lightcurve(times,
                                               mags=mags,
                                               errs=errs,
                                               paramdists=paramdists,
                                               magsarefluxes=magsarefluxes)
    modeldict['vartype'] = 'RRab'
    return modeldict


[docs]def generate_rrc_lightcurve(
        times,
        mags=None,
        errs=None,
        paramdists={
            'period':sps.uniform(loc=0.10,scale=0.30),
            'fourierorder':[2,3],
            'amplitude':sps.uniform(loc=0.1,scale=0.3),
            'phioffset':1.5*np.pi,
        },
        magsarefluxes=False
):
    '''This generates fake RRc light curves.

    Parameters
    ----------

    times : np.array
        This is an array of time values that will be used as the time base.

    mags,errs : np.array
        These arrays will have the model added to them. If either is
        None, `np.full_like(times, 0.0)` will used as a substitute and the model
        light curve will be centered around 0.0.

    paramdists : dict
        This is a dict containing parameter distributions to use for the
        model params, containing the following keys ::

            {'period', 'fourierorder', 'amplitude'}

        The values of these keys should all be 'frozen' scipy.stats distribution
        objects, e.g.:

        https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions
        The variability epoch will be automatically chosen from a uniform
        distribution between `times.min()` and `times.max()`.

        The `amplitude` will be flipped automatically as appropriate if
        `magsarefluxes=True`.

    magsarefluxes : bool
        If the generated time series is meant to be a flux time-series, set this
        to True to get the correct sign of variability amplitude.

    Returns
    -------

    dict
        A dict of the form below is returned::

            {'vartype': 'RRc',
             'params': {'period': generated value of period,
                        'epoch': generated value of epoch,
                        'amplitude': generated value of amplitude,
                        'fourierorder': generated value of fourier order,
                        'fourieramps': generated values of fourier amplitudes,
                        'fourierphases': generated values of fourier phases},
             'times': the model times,
             'mags': the model mags,
             'errs': the model errs,
             'varperiod': the generated period of variability == 'period'
             'varamplitude': the generated amplitude of
                             variability == 'amplitude'}

    '''

    modeldict = generate_sinusoidal_lightcurve(times,
                                               mags=mags,
                                               errs=errs,
                                               paramdists=paramdists,
                                               magsarefluxes=magsarefluxes)
    modeldict['vartype'] = 'RRc'
    return modeldict


[docs]def generate_hads_lightcurve(
        times,
        mags=None,
        errs=None,
        paramdists={
            'period':sps.uniform(loc=0.04,scale=0.06),
            'fourierorder':[5,10],
            'amplitude':sps.uniform(loc=0.1,scale=0.6),
            'phioffset':np.pi,
        },
        magsarefluxes=False
):
    '''This generates fake HADS light curves.

    Parameters
    ----------

    times : np.array
        This is an array of time values that will be used as the time base.

    mags,errs : np.array
        These arrays will have the model added to them. If either is
        None, `np.full_like(times, 0.0)` will used as a substitute and the model
        light curve will be centered around 0.0.

    paramdists : dict
        This is a dict containing parameter distributions to use for the
        model params, containing the following keys ::

            {'period', 'fourierorder', 'amplitude'}

        The values of these keys should all be 'frozen' scipy.stats distribution
        objects, e.g.:

        https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions
        The variability epoch will be automatically chosen from a uniform
        distribution between `times.min()` and `times.max()`.

        The `amplitude` will be flipped automatically as appropriate if
        `magsarefluxes=True`.

    magsarefluxes : bool
        If the generated time series is meant to be a flux time-series, set this
        to True to get the correct sign of variability amplitude.

    Returns
    -------

    dict
        A dict of the form below is returned::

            {'vartype': 'HADS',
             'params': {'period': generated value of period,
                        'epoch': generated value of epoch,
                        'amplitude': generated value of amplitude,
                        'fourierorder': generated value of fourier order,
                        'fourieramps': generated values of fourier amplitudes,
                        'fourierphases': generated values of fourier phases},
             'times': the model times,
             'mags': the model mags,
             'errs': the model errs,
             'varperiod': the generated period of variability == 'period'
             'varamplitude': the generated amplitude of
                             variability == 'amplitude'}

    '''

    modeldict = generate_sinusoidal_lightcurve(times,
                                               mags=mags,
                                               errs=errs,
                                               paramdists=paramdists,
                                               magsarefluxes=magsarefluxes)
    modeldict['vartype'] = 'HADS'
    return modeldict


[docs]def generate_rotator_lightcurve(
        times,
        mags=None,
        errs=None,
        paramdists={
            'period':sps.uniform(loc=0.80,scale=119.20),
            'fourierorder':[2,3],
            'amplitude':sps.uniform(loc=0.01,scale=0.7),
            'phioffset':1.5*np.pi,
        },
        magsarefluxes=False
):
    '''This generates fake rotator light curves.

    Parameters
    ----------

    times : np.array
        This is an array of time values that will be used as the time base.

    mags,errs : np.array
        These arrays will have the model added to them. If either is
        None, `np.full_like(times, 0.0)` will used as a substitute and the model
        light curve will be centered around 0.0.

    paramdists : dict
        This is a dict containing parameter distributions to use for the
        model params, containing the following keys ::

            {'period', 'fourierorder', 'amplitude'}

        The values of these keys should all be 'frozen' scipy.stats distribution
        objects, e.g.:

        https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions
        The variability epoch will be automatically chosen from a uniform
        distribution between `times.min()` and `times.max()`.

        The `amplitude` will be flipped automatically as appropriate if
        `magsarefluxes=True`.

    magsarefluxes : bool
        If the generated time series is meant to be a flux time-series, set this
        to True to get the correct sign of variability amplitude.

    Returns
    -------

    dict
        A dict of the form below is returned::

            {'vartype': 'rotator',
             'params': {'period': generated value of period,
                        'epoch': generated value of epoch,
                        'amplitude': generated value of amplitude,
                        'fourierorder': generated value of fourier order,
                        'fourieramps': generated values of fourier amplitudes,
                        'fourierphases': generated values of fourier phases},
             'times': the model times,
             'mags': the model mags,
             'errs': the model errs,
             'varperiod': the generated period of variability == 'period'
             'varamplitude': the generated amplitude of
                             variability == 'amplitude'}

    '''

    modeldict = generate_sinusoidal_lightcurve(times,
                                               mags=mags,
                                               errs=errs,
                                               paramdists=paramdists,
                                               magsarefluxes=magsarefluxes)
    modeldict['vartype'] = 'rotator'
    return modeldict


[docs]def generate_lpv_lightcurve(
        times,
        mags=None,
        errs=None,
        paramdists={
            'period':sps.uniform(loc=250.0,scale=250.0),
            'fourierorder':[2,3],
            'amplitude':sps.uniform(loc=0.1,scale=0.8),
            'phioffset':1.5*np.pi,
        },
        magsarefluxes=False
):
    '''This generates fake long-period-variable (LPV) light curves.

    Parameters
    ----------

    times : np.array
        This is an array of time values that will be used as the time base.

    mags,errs : np.array
        These arrays will have the model added to them. If either is
        None, `np.full_like(times, 0.0)` will used as a substitute and the model
        light curve will be centered around 0.0.

    paramdists : dict
        This is a dict containing parameter distributions to use for the
        model params, containing the following keys ::

            {'period', 'fourierorder', 'amplitude'}

        The values of these keys should all be 'frozen' scipy.stats distribution
        objects, e.g.:

        https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions
        The variability epoch will be automatically chosen from a uniform
        distribution between `times.min()` and `times.max()`.

        The `amplitude` will be flipped automatically as appropriate if
        `magsarefluxes=True`.

    magsarefluxes : bool
        If the generated time series is meant to be a flux time-series, set this
        to True to get the correct sign of variability amplitude.

    Returns
    -------

    dict
        A dict of the form below is returned::

            {'vartype': 'LPV',
             'params': {'period': generated value of period,
                        'epoch': generated value of epoch,
                        'amplitude': generated value of amplitude,
                        'fourierorder': generated value of fourier order,
                        'fourieramps': generated values of fourier amplitudes,
                        'fourierphases': generated values of fourier phases},
             'times': the model times,
             'mags': the model mags,
             'errs': the model errs,
             'varperiod': the generated period of variability == 'period'
             'varamplitude': the generated amplitude of
                             variability == 'amplitude'}

    '''

    modeldict = generate_sinusoidal_lightcurve(times,
                                               mags=mags,
                                               errs=errs,
                                               paramdists=paramdists,
                                               magsarefluxes=magsarefluxes)
    modeldict['vartype'] = 'LPV'
    return modeldict


[docs]def generate_cepheid_lightcurve(
        times,
        mags=None,
        errs=None,
        paramdists={
            'period':sps.uniform(loc=1.5,scale=108.5),
            'fourierorder':[8,11],
            'amplitude':sps.uniform(loc=0.1,scale=0.9),
            'phioffset':np.pi,
        },
        magsarefluxes=False
):
    '''This generates fake Cepheid light curves.

    Parameters
    ----------

    times : np.array
        This is an array of time values that will be used as the time base.

    mags,errs : np.array
        These arrays will have the model added to them. If either is
        None, `np.full_like(times, 0.0)` will used as a substitute and the model
        light curve will be centered around 0.0.

    paramdists : dict
        This is a dict containing parameter distributions to use for the
        model params, containing the following keys ::

            {'period', 'fourierorder', 'amplitude'}

        The values of these keys should all be 'frozen' scipy.stats distribution
        objects, e.g.:

        https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions
        The variability epoch will be automatically chosen from a uniform
        distribution between `times.min()` and `times.max()`.

        The `amplitude` will be flipped automatically as appropriate if
        `magsarefluxes=True`.

    magsarefluxes : bool
        If the generated time series is meant to be a flux time-series, set this
        to True to get the correct sign of variability amplitude.

    Returns
    -------

    dict
        A dict of the form below is returned::

            {'vartype': 'cepheid',
             'params': {'period': generated value of period,
                        'epoch': generated value of epoch,
                        'amplitude': generated value of amplitude,
                        'fourierorder': generated value of fourier order,
                        'fourieramps': generated values of fourier amplitudes,
                        'fourierphases': generated values of fourier phases},
             'times': the model times,
             'mags': the model mags,
             'errs': the model errs,
             'varperiod': the generated period of variability == 'period'
             'varamplitude': the generated amplitude of
                             variability == 'amplitude'}

    '''

    modeldict = generate_sinusoidal_lightcurve(times,
                                               mags=mags,
                                               errs=errs,
                                               paramdists=paramdists,
                                               magsarefluxes=magsarefluxes)
    modeldict['vartype'] = 'cepheid'
    return modeldict


# this maps functions to generate light curves to their vartype codes as put
# into the make_fakelc_collection function.
VARTYPE_LCGEN_MAP = {
    'EB': generate_eb_lightcurve,
    'RRab': generate_rrab_lightcurve,
    'RRc': generate_rrc_lightcurve,
    'rotator': generate_rotator_lightcurve,
    'flare': generate_flare_lightcurve,
    'HADS': generate_hads_lightcurve,
    'planet': generate_transit_lightcurve,
    'LPV': generate_lpv_lightcurve,
    'cepheid':generate_cepheid_lightcurve,
}


###############################################
## FUNCTIONS TO COLLECT LIGHT CURVES FOR SIM ##
###############################################

[docs]def make_fakelc(lcfile,
                outdir,
                magrms=None,
                randomizemags=True,
                randomizecoords=False,
                lcformat='hat-sql',
                lcformatdir=None,
                timecols=None,
                magcols=None,
                errcols=None):
    '''This preprocesses an input real LC and sets it up to be a fake LC.

    Parameters
    ----------

    lcfile : str
        This is an input light curve file that will be used to copy over the
        time-base. This will be used to generate the time-base for fake light
        curves to provide a realistic simulation of the observing window
        function.

    outdir : str
        The output directory where the the fake light curve will be written.

    magrms : dict
        This is a dict containing the SDSS r mag-RMS (SDSS rmag-MAD preferably)
        relation based on all light curves that the input lcfile is from. This
        will be used to generate the median mag and noise corresponding to the
        magnitude chosen for this fake LC.

    randomizemags : bool
        If this is True, then a random mag between the first and last magbin in
        magrms will be chosen as the median mag for this light curve. This
        choice will be weighted by the mag bin probability obtained from the
        magrms kwarg. Otherwise, the median mag will be taken from the input
        lcfile's lcdict['objectinfo']['sdssr'] key or a transformed SDSS r mag
        generated from the input lcfile's lcdict['objectinfo']['jmag'],
        ['hmag'], and ['kmag'] keys. The magrms relation for each magcol will be
        used to generate Gaussian noise at the correct level for the magbin this
        light curve's median mag falls into.

    randomizecoords : bool
        If this is True, will randomize the RA, DEC of the output fake object
        and not copy over the RA/DEC from the real input object.

    lcformat : str
        This is the `formatkey` associated with your input real light curve
        format, which you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curve specified in `lcfile`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    timecols : list of str or None
        The timecol keys to use from the input lcdict in generating the fake
        light curve. Fake LCs will be generated for each each
        timecol/magcol/errcol combination in the input light curve.

    magcols : list of str or None
        The magcol keys to use from the input lcdict in generating the fake
        light curve. Fake LCs will be generated for each each
        timecol/magcol/errcol combination in the input light curve.

    errcols : list of str or None
        The errcol keys to use from the input lcdict in generating the fake
        light curve. Fake LCs will be generated for each each
        timecol/magcol/errcol combination in the input light curve.

    Returns
    -------

    tuple
        A tuple of the following form is returned::

            (fakelc_fpath,
             fakelc_lcdict['columns'],
             fakelc_lcdict['objectinfo'],
             fakelc_lcdict['moments'])

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (fileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # override the default timecols, magcols, and errcols
    # using the ones provided to the function
    if timecols is None:
        timecols = dtimecols
    if magcols is None:
        magcols = dmagcols
    if errcols is None:
        errcols = derrcols

    # read in the light curve
    lcdict = readerfunc(lcfile)
    if isinstance(lcdict, tuple) and isinstance(lcdict[0],dict):
        lcdict = lcdict[0]

    # set up the fakelcdict with a randomly assigned objectid
    fakeobjectid = sha512(npr.bytes(12)).hexdigest()[-8:]
    fakelcdict = {
        'objectid':fakeobjectid,
        'objectinfo':{'objectid':fakeobjectid},
        'columns':[],
        'moments':{},
        'origformat':lcformat,
    }

    # now, get the actual mag of this object and other info and use that to
    # populate the corresponding entries of the fakelcdict objectinfo
    if ('objectinfo' in lcdict and
        isinstance(lcdict['objectinfo'], dict)):

        objectinfo = lcdict['objectinfo']

        # get the RA
        if (not randomizecoords and 'ra' in objectinfo and
            objectinfo['ra'] is not None and
            np.isfinite(objectinfo['ra'])):

            fakelcdict['objectinfo']['ra'] = objectinfo['ra']

        else:

            # if there's no RA available, we'll assign a random one between 0
            # and 360.0
            LOGWARNING('%s: assigning a random right ascension' % lcfile)
            fakelcdict['objectinfo']['ra'] = npr.random()*360.0

        # get the DEC
        if (not randomizecoords and 'decl' in objectinfo and
            objectinfo['decl'] is not None and
            np.isfinite(objectinfo['decl'])):

            fakelcdict['objectinfo']['decl'] = objectinfo['decl']

        else:

            # if there's no DECL available, we'll assign a random one between
            # -90.0 and +90.0
            LOGWARNING(' %s: assigning a random declination' % lcfile)
            fakelcdict['objectinfo']['decl'] = npr.random()*180.0 - 90.0

        # get the SDSS r mag for this object
        # this will be used for getting the eventual mag-RMS relation later
        if ((not randomizemags) and 'sdssr' in objectinfo and
            objectinfo['sdssr'] is not None and
            np.isfinite(objectinfo['sdssr'])):

            fakelcdict['objectinfo']['sdssr'] = objectinfo['sdssr']

        # if the SDSS r is unavailable, but we have J, H, K: use those to get
        # the SDSS r by using transformations
        elif ((not randomizemags) and ('jmag' in objectinfo and
                                       objectinfo['jmag'] is not None and
                                       np.isfinite(objectinfo['jmag'])) and
              ('hmag' in objectinfo and
               objectinfo['hmag'] is not None and
               np.isfinite(objectinfo['hmag'])) and
              ('kmag' in objectinfo and
               objectinfo['kmag'] is not None and
               np.isfinite(objectinfo['kmag']))):

            LOGWARNING('used JHK mags to generate an SDSS r mag for %s' %
                       lcfile)
            fakelcdict['objectinfo']['sdssr'] = jhk_to_sdssr(
                objectinfo['jmag'],
                objectinfo['hmag'],
                objectinfo['kmag']
            )

        # if there are no mags available or we're specically told to randomize
        # them, generate a random mag between 8 and 16.0
        elif randomizemags and magrms:

            LOGWARNING(' %s: assigning a random mag weighted by mag '
                       'bin probabilities' % lcfile)

            magbins = magrms[magcols[0]]['binned_sdssr_median']
            binprobs = magrms[magcols[0]]['magbin_probabilities']

            # this is the center of the magbin chosen
            magbincenter = npr.choice(magbins,size=1,p=binprobs)

            # in this magbin, choose between center and -+ 0.25 mag
            chosenmag = (
                npr.random()*((magbincenter+0.25) - (magbincenter-0.25)) +
                (magbincenter-0.25)
            )

            fakelcdict['objectinfo']['sdssr'] = np.asscalar(chosenmag)

        # if there are no mags available at all, generate a random mag
        # between 8 and 16.0
        else:

            LOGWARNING(' %s: assigning a random mag from '
                       'uniform distribution between 8.0 and 16.0' % lcfile)

            fakelcdict['objectinfo']['sdssr'] = npr.random()*8.0 + 8.0

    # if there's no info available, generate fake info
    else:

        LOGWARNING('no object information found in %s, '
                   'generating random ra, decl, sdssr' %
                   lcfile)
        fakelcdict['objectinfo']['ra'] = npr.random()*360.0
        fakelcdict['objectinfo']['decl'] = npr.random()*180.0 - 90.0
        fakelcdict['objectinfo']['sdssr'] = npr.random()*8.0 + 8.0

    #
    # NOW FILL IN THE TIMES, MAGS, ERRS
    #

    # get the time columns
    for tcind, tcol in enumerate(timecols):

        if '.' in tcol:
            tcolget = tcol.split('.')
        else:
            tcolget = [tcol]

        if tcol not in fakelcdict:
            fakelcdict[tcol] = _dict_get(lcdict, tcolget)
            fakelcdict['columns'].append(tcol)

            # update the ndet with the first time column's size. it's possible
            # that different time columns have different lengths, but that would
            # be weird and we won't deal with it for now
            if tcind == 0:
                fakelcdict['objectinfo']['ndet'] = fakelcdict[tcol].size

    # get the mag columns
    for mcol in magcols:

        if '.' in mcol:
            mcolget = mcol.split('.')
        else:
            mcolget = [mcol]

        # put the mcol in only once
        if mcol not in fakelcdict:

            measuredmags = _dict_get(lcdict, mcolget)
            measuredmags = measuredmags[np.isfinite(measuredmags)]

            # if we're randomizing, get the mags from the interpolated mag-RMS
            # relation
            if (randomizemags and magrms and mcol in magrms and
                'interpolated_magmad' in magrms[mcol] and
                magrms[mcol]['interpolated_magmad'] is not None):

                interpfunc = magrms[mcol]['interpolated_magmad']
                lcmad = interpfunc(fakelcdict['objectinfo']['sdssr'])

                fakelcdict['moments'][mcol] = {
                    'median': fakelcdict['objectinfo']['sdssr'],
                    'mad': lcmad
                }

            # if we're not randomizing, get the median and MAD from the light
            # curve itself
            else:

                # we require at least 10 finite measurements
                if measuredmags.size > 9:

                    measuredmedian = np.median(measuredmags)
                    measuredmad = np.median(
                        np.abs(measuredmags - measuredmedian)
                    )
                    fakelcdict['moments'][mcol] = {'median':measuredmedian,
                                                   'mad':measuredmad}

                # if there aren't enough measurements in this LC, try to get the
                # median and RMS from the interpolated mag-RMS relation first
                else:

                    if (magrms and mcol in magrms and
                        'interpolated_magmad' in magrms[mcol] and
                        magrms[mcol]['interpolated_magmad'] is not None):

                        LOGWARNING(
                            'input LC %s does not have enough '
                            'finite measurements, '
                            'generating mag moments from '
                            'fakelc sdssr and the mag-RMS relation' % lcfile
                        )

                        interpfunc = magrms[mcol]['interpolated_magmad']
                        lcmad = interpfunc(fakelcdict['objectinfo']['sdssr'])

                        fakelcdict['moments'][mcol] = {
                            'median': fakelcdict['objectinfo']['sdssr'],
                            'mad': lcmad
                        }

                    # if we don't have the mag-RMS relation either, then we
                    # can't do anything for this light curve, generate a random
                    # MAD between 5e-4 and 0.1
                    else:

                        LOGWARNING(
                            'input LC %s does not have enough '
                            'finite measurements and '
                            'no mag-RMS relation provided '
                            'assigning a random MAD between 5.0e-4 and 0.1'
                            % lcfile
                        )

                        fakelcdict['moments'][mcol] = {
                            'median':fakelcdict['objectinfo']['sdssr'],
                            'mad':npr.random()*(0.1 - 5.0e-4) + 5.0e-4
                        }

            # the magnitude column is set to all zeros initially. this will be
            # filled in by the add_fakelc_variability function below
            fakelcdict[mcol] = np.full_like(_dict_get(lcdict, mcolget), 0.0)
            fakelcdict['columns'].append(mcol)

    # get the err columns
    for mcol, ecol in zip(magcols, errcols):

        if '.' in ecol:
            ecolget = ecol.split('.')
        else:
            ecolget = [ecol]

        if ecol not in fakelcdict:

            measurederrs = _dict_get(lcdict, ecolget)
            measurederrs = measurederrs[np.isfinite(measurederrs)]

            # if we're randomizing, get the errs from the interpolated mag-RMS
            # relation
            if (randomizemags and magrms and mcol in magrms and
                'interpolated_magmad' in magrms[mcol] and
                magrms[mcol]['interpolated_magmad'] is not None):

                interpfunc = magrms[mcol]['interpolated_magmad']
                lcmad = interpfunc(fakelcdict['objectinfo']['sdssr'])

                # the median of the errs = lcmad
                # the mad of the errs is 0.1 x lcmad
                fakelcdict['moments'][ecol] = {
                    'median': lcmad,
                    'mad': 0.1*lcmad
                }

            else:

                # we require at least 10 finite measurements
                # we'll calculate the median and MAD of the errs to use later on
                if measurederrs.size > 9:
                    measuredmedian = np.median(measurederrs)
                    measuredmad = np.median(
                        np.abs(measurederrs - measuredmedian)
                    )
                    fakelcdict['moments'][ecol] = {'median':measuredmedian,
                                                   'mad':measuredmad}
                else:

                    if (magrms and mcol in magrms and
                        'interpolated_magmad' in magrms[mcol] and
                        magrms[mcol]['interpolated_magmad'] is not None):

                        LOGWARNING(
                            'input LC %s does not have enough '
                            'finite measurements, '
                            'generating err moments from '
                            'the mag-RMS relation' % lcfile
                        )

                        interpfunc = magrms[mcol]['interpolated_magmad']
                        lcmad = interpfunc(fakelcdict['objectinfo']['sdssr'])

                        fakelcdict['moments'][ecol] = {
                            'median': lcmad,
                            'mad': 0.1*lcmad
                        }

                    # if we don't have the mag-RMS relation either, then we
                    # can't do anything for this light curve, generate a random
                    # MAD between 5e-4 and 0.1
                    else:

                        LOGWARNING(
                            'input LC %s does not have '
                            'enough finite measurements and '
                            'no mag-RMS relation provided, '
                            'generating errs randomly' % lcfile
                        )
                        fakelcdict['moments'][ecol] = {
                            'median':npr.random()*(0.01 - 5.0e-4) + 5.0e-4,
                            'mad':npr.random()*(0.01 - 5.0e-4) + 5.0e-4
                        }

            # the errors column is set to all zeros initially. this will be
            # filled in by the add_fakelc_variability function below.
            fakelcdict[ecol] = np.full_like(_dict_get(lcdict, ecolget), 0.0)
            fakelcdict['columns'].append(ecol)

    # add the timecols, magcols, errcols to the lcdict
    fakelcdict['timecols'] = timecols
    fakelcdict['magcols'] = magcols
    fakelcdict['errcols'] = errcols

    # generate an output file name
    fakelcfname = '%s-fakelc.pkl' % fakelcdict['objectid']
    fakelcfpath = os.path.abspath(os.path.join(outdir, fakelcfname))

    # write this out to the output directory
    with open(fakelcfpath,'wb') as outfd:
        pickle.dump(fakelcdict, outfd, protocol=pickle.HIGHEST_PROTOCOL)

    # return the fakelc path, its columns, info, and moments so we can put them
    # into a collection DB later on
    LOGINFO('real LC %s -> fake LC %s OK' % (lcfile, fakelcfpath))

    return (fakelcfpath, fakelcdict['columns'],
            fakelcdict['objectinfo'], fakelcdict['moments'])


##########################
## COLLECTION FUNCTIONS ##
##########################

[docs]def collection_worker(task):
    '''
    This wraps `process_fakelc` for `make_fakelc_collection` below.

    Parameters
    ----------

    task : tuple
        This is of the form::

            task[0] = lcfile
            task[1] = outdir
            task[2] = magrms
            task[3] = dict with keys: {'lcformat', 'timecols', 'magcols',
                                       'errcols', 'randomizeinfo'}

    Returns
    -------

    tuple
        This returns a tuple of the form::

            (fakelc_fpath,
             fakelc_lcdict['columns'],
             fakelc_lcdict['objectinfo'],
             fakelc_lcdict['moments'])
    '''

    lcfile, outdir, kwargs = task

    try:

        fakelcresults = make_fakelc(
            lcfile,
            outdir,
            **kwargs
        )

        return fakelcresults

    except Exception:

        LOGEXCEPTION('could not process %s into a fakelc' % lcfile)
        return None


[docs]def make_fakelc_collection(lclist,
                           simbasedir,
                           magrmsfrom,
                           magrms_interpolate='quadratic',
                           magrms_fillvalue='extrapolate',
                           maxlcs=25000,
                           maxvars=2000,
                           randomizemags=True,
                           randomizecoords=False,
                           vartypes=('EB','RRab','RRc','cepheid',
                                     'rotator','flare','HADS',
                                     'planet','LPV'),
                           lcformat='hat-sql',
                           lcformatdir=None,
                           timecols=None,
                           magcols=None,
                           errcols=None):

    '''This prepares light curves for the recovery sim.

    Collects light curves from `lclist` using a uniform sampling among
    them. Copies them to the `simbasedir`, zeroes out their mags and errs but
    keeps their time bases, also keeps their RMS and median mags for later
    use. Calculates the mag-rms relation for the entire collection and writes
    that to the `simbasedir` as well.

    The purpose of this function is to copy over the time base and mag-rms
    relation of an existing light curve collection to use it as the basis for a
    variability recovery simulation.

    This returns a pickle written to the `simbasedir` that contains all the
    information for the chosen ensemble of fake light curves and writes all
    generated light curves to the `simbasedir/lightcurves` directory. Run the
    `add_variability_to_fakelc_collection` function after this function to add
    variability of the specified type to these generated light curves.

    Parameters
    ----------

    lclist : list of str
        This is a list of existing project light curves. This can be generated
        from :py:func:`astrobase.lcproc.catalogs.make_lclist` or similar.

    simbasedir : str
        This is the directory to where the fake light curves and their
        information will be copied to.

    magrmsfrom : str or dict
        This is used to generate magnitudes and RMSes for the objects in the
        output collection of fake light curves. This arg is either a string
        pointing to an existing pickle file that must contain a dict or a dict
        variable that MUST have the following key-vals at a minimum::

            {'<magcol1_name>': {
                  'binned_sdssr_median': array of median mags for each magbin
                  'binned_lcmad_median': array of LC MAD values per magbin
             },
             '<magcol2_name>': {
                  'binned_sdssr_median': array of median mags for each magbin
                  'binned_lcmad_median': array of LC MAD values per magbin
             },
             .
             .
             ...}

        where `magcol1_name`, etc. are the same as the `magcols` listed in the
        magcols kwarg (or the default magcols for the specified
        lcformat). Examples of the magrmsfrom dict (or pickle) required can be
        generated by the
        :py:func:`astrobase.lcproc.varthreshold.variability_threshold` function.

    magrms_interpolate,magrms_fillvalue : str
        These are arguments that will be passed directly to the
        scipy.interpolate.interp1d function to generate interpolating functions
        for the mag-RMS relation. See:

        https://docs.scipy.org/doc/scipy/reference/generated/scipy.interpolate.interp1d.html

        for details.

    maxlcs : int
        This is the total number of light curves to choose from `lclist` and
        generate as fake LCs.

    maxvars : int
        This is the total number of fake light curves that will be marked as
        variable.

    vartypes : list of str
        This is a list of variable types to put into the collection. The
        vartypes for each fake variable star will be chosen uniformly from this
        list.

    lcformat : str
        This is the `formatkey` associated with your input real light curves'
        format, which you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `lclist`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    timecols : list of str or None
        The timecol keys to use from the input lcdict in generating the fake
        light curve. Fake LCs will be generated for each each
        timecol/magcol/errcol combination in the input light curves.

    magcols : list of str or None
        The magcol keys to use from the input lcdict in generating the fake
        light curve. Fake LCs will be generated for each each
        timecol/magcol/errcol combination in the input light curves.

    errcols : list of str or None
        The errcol keys to use from the input lcdict in generating the fake
        light curve. Fake LCs will be generated for each each
        timecol/magcol/errcol combination in the input light curves.

    Returns
    -------

    str
        Returns the string file name of a pickle containing all of the
        information for the fake LC collection that has been generated.

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (fileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # override the default timecols, magcols, and errcols
    # using the ones provided to the function
    if timecols is None:
        timecols = dtimecols
    if magcols is None:
        magcols = dmagcols
    if errcols is None:
        errcols = derrcols

    if not isinstance(lclist, np.ndarray):
        lclist = np.array(lclist)

    chosenlcs = npr.choice(lclist, maxlcs, replace=False)

    fakelcdir = os.path.join(simbasedir, 'lightcurves')
    if not os.path.exists(fakelcdir):
        os.makedirs(fakelcdir)

    # get the magrms relation needed from the pickle or input dict
    if isinstance(magrmsfrom, str) and os.path.exists(magrmsfrom):
        with open(magrmsfrom,'rb') as infd:
            xmagrms = pickle.load(infd)
    elif isinstance(magrmsfrom, dict):
        xmagrms = magrmsfrom

    magrms = {}

    # get the required items from the magrms dict. interpolate the mag-rms
    # relation for the magcol so the make_fake_lc function can use it directly.
    for magcol in magcols:

        if (magcol in xmagrms and
            'binned_sdssr_median' in xmagrms[magcol] and
            'binned_lcmad_median' in xmagrms[magcol]):

            magrms[magcol] = {
                'binned_sdssr_median':np.array(
                    xmagrms[magcol]['binned_sdssr_median']
                ),
                'binned_lcmad_median':np.array(
                    xmagrms[magcol]['binned_lcmad_median']
                ),
            }

            # interpolate the mag-MAD relation
            interpolated_magmad = spi.interp1d(
                xmagrms[magcol]['binned_sdssr_median'],
                xmagrms[magcol]['binned_lcmad_median'],
                kind=magrms_interpolate,
                fill_value=magrms_fillvalue,
            )

            # save the magrms
            magrms[magcol]['interpolated_magmad'] = interpolated_magmad

            # generate the probability distribution in magbins. this is needed
            # to correctly sample the objects in this population
            bincounts = np.array(xmagrms[magcol]['binned_count'])
            binprobs = bincounts/np.sum(bincounts)

            # save the bin probabilities as well
            magrms[magcol]['magbin_probabilities'] = binprobs

        else:

            LOGWARNING('input magrms dict does not have '
                       'required info for magcol: %s' % magcol)

            magrms[magcol] = {
                'binned_sdssr_median':None,
                'binned_lcmad_median':None,
                'interpolated_magmad':None,
                'magbin_probabilities':None,
            }

    tasks = [(x, fakelcdir, {'lcformat':lcformat,
                             'timecols':timecols,
                             'magcols':magcols,
                             'errcols':errcols,
                             'magrms':magrms,
                             'randomizemags':randomizemags,
                             'randomizecoords':randomizecoords})
             for x in chosenlcs]

    # we can't parallelize because it messes up the random number generation,
    # causing all the IDs to clash. FIXME: figure out a way around this
    # (probably initial a seed in each worker process?)
    fakeresults = [collection_worker(task) for task in tasks]

    fakedb = {'simbasedir':simbasedir,
              'lcformat':lcformat,
              'timecols':timecols,
              'magcols':magcols,
              'errcols':errcols,
              'magsarefluxes':magsarefluxes}

    fobjects, fpaths = [], []
    fras, fdecls, fndets = [], [], []

    fmags, fmagmads = [], []
    ferrmeds, ferrmads = [], []

    # these are the indices for the variable objects chosen randomly
    isvariableind = npr.randint(0,high=len(fakeresults), size=maxvars)
    isvariable = np.full(len(fakeresults), False, dtype=np.bool)
    isvariable[isvariableind] = True
    fakedb['isvariable'] = isvariable

    LOGINFO('added %s variable stars' % maxvars)

    # these are the variable types for each variable object
    vartypeind = npr.randint(0,high=len(vartypes), size=maxvars)
    vartypearr = np.array([vartypes[x] for x in vartypeind])
    fakedb['vartype'] = vartypearr

    for vt in sorted(vartypes):
        LOGINFO('%s: %s stars' % (vt, vartypearr[vartypearr == vt].size))

    # now go through the collection and get the mag/rms and err/rms for each
    # star. these will be used later to add noise to light curves
    LOGINFO('collecting info...')

    for fr in fakeresults:

        if fr is not None:

            fpath, fcols, finfo, fmoments = fr

            fobjects.append(finfo['objectid'])
            fpaths.append(fpath)

            fras.append(finfo['ra'])
            fdecls.append(finfo['decl'])
            fndets.append(finfo['ndet'])

            fmags.append(finfo['sdssr'])
            # this is per magcol
            fmagmads.append([fmoments[x]['mad'] for x in magcols])

            # these are per errcol
            ferrmeds.append([fmoments[x]['median'] for x in errcols])
            ferrmads.append([fmoments[x]['mad'] for x in errcols])

    # convert to nparrays
    fobjects = np.array(fobjects)
    fpaths = np.array(fpaths)

    fras = np.array(fras)
    fdecls = np.array(fdecls)
    fndets = np.array(fndets)

    fmags = np.array(fmags)
    fmagmads = np.array(fmagmads)
    ferrmeds = np.array(ferrmeds)
    ferrmads = np.array(ferrmads)

    # put these in the fakedb
    fakedb['objectid'] = fobjects
    fakedb['lcfpath'] = fpaths

    fakedb['ra'] = fras
    fakedb['decl'] = fdecls
    fakedb['ndet'] = fndets

    fakedb['sdssr'] = fmags
    fakedb['mad'] = fmagmads
    fakedb['errmedian'] = ferrmeds
    fakedb['errmad'] = ferrmads

    # get the mag-RMS curve for this light curve collection for each magcol
    fakedb['magrms'] = magrms

    # finally, write the collection DB to a pickle in simbasedir
    dboutfname = os.path.join(simbasedir,'fakelcs-info.pkl')
    with open(dboutfname, 'wb') as outfd:
        pickle.dump(fakedb, outfd)

    LOGINFO('wrote %s fake LCs to: %s' % (len(fakeresults), simbasedir))
    LOGINFO('fake LC info written to: %s' % dboutfname)

    return dboutfname


########################################################
## FUNCTIONS TO ADD VARIABLE LCS TO FAKELC COLLECTION ##
########################################################

[docs]def add_fakelc_variability(fakelcfile,
                           vartype,
                           override_paramdists=None,
                           magsarefluxes=False,
                           overwrite=False):
    '''This adds variability of the specified type to the fake LC.

    The procedure is (for each `magcol`):

    - read the fakelcfile, get the stored moments and vartype info

    - add the periodic variability specified in vartype and varparamdists. if
      `vartype == None`, then do nothing in this step. If `override_vartype` is
      not None, override stored vartype with specified vartype. If
      `override_varparamdists` provided, override with specified
      `varparamdists`. NOTE: the varparamdists must make sense for the vartype,
      otherwise, weird stuff will happen.

    - add the median mag level stored in `fakelcfile` to the time series

    - add Gaussian noise to the light curve as specified in `fakelcfile`

    - add a varinfo key and dict to the lcdict with `varperiod`, `varepoch`,
      `varparams`

    - write back to fake LC pickle

    - return the `varinfo` dict to the caller

    Parameters
    ----------

    fakelcfile : str
        The name of the fake LC file to process.

    vartype : str
        The type of variability to add to this fake LC file.

    override_paramdists : dict
        A parameter distribution dict as in the `generate_XX_lightcurve`
        functions above. If provided, will override the distribution stored in
        the input fake LC file itself.

    magsarefluxes : bool
        Sets if the variability amplitude is in fluxes and not magnitudes.

    overwite : bool
        This overwrites the input fake LC file with a new variable LC even if
        it's been processed before.

    Returns
    -------

    dict
        A dict of the following form is returned::

            {'objectid':lcdict['objectid'],
             'lcfname':fakelcfile,
             'actual_vartype':vartype,
             'actual_varparams':lcdict['actual_varparams']}

    '''

    # read in the fakelcfile
    lcdict = _read_pklc(fakelcfile)

    # make sure to bail out if this light curve already has fake variability
    # added
    if ('actual_vartype' in lcdict and
        'actual_varparams' in lcdict and
        not overwrite):
        LOGERROR('%s has existing variability type: %s '
                 'and params: %s and overwrite = False, '
                 'skipping this file...' %
                 (fakelcfile, lcdict['actual_vartype'],
                  repr(lcdict['actual_varparams'])))
        return None

    # get the times, mags, errs from this LC
    timecols, magcols, errcols = (lcdict['timecols'],
                                  lcdict['magcols'],
                                  lcdict['errcols'])

    # get the correct function to apply variability
    if vartype in VARTYPE_LCGEN_MAP:
        vargenfunc = VARTYPE_LCGEN_MAP[vartype]
    elif vartype is None:
        vargenfunc = None
    else:
        LOGERROR('unknown variability type: %s, choose from: %s' %
                 (vartype, repr(list(VARTYPE_LCGEN_MAP.keys()))))
        return None

    # 1. generate the variability, including the overrides if provided we do
    # this outside the loop below to get the period, etc. distributions once
    # only per object. NOTE: in doing so, we're assuming that the difference
    # between magcols is just additive and the timebases for each magcol are the
    # same; this is not strictly correct
    if vargenfunc is not None:
        if (override_paramdists is not None and
            isinstance(override_paramdists,dict)):

            variablelc = vargenfunc(lcdict[timecols[0]],
                                    paramdists=override_paramdists,
                                    magsarefluxes=magsarefluxes)

        else:

            variablelc = vargenfunc(lcdict[timecols[0]],
                                    magsarefluxes=magsarefluxes)

    # for nonvariables, don't execute vargenfunc, but return a similar dict
    # so we can add the required noise to it
    else:
        variablelc = {'vartype':None,
                      'params':None,
                      'times':lcdict[timecols[0]],
                      'mags':np.full_like(lcdict[timecols[0]], 0.0),
                      'errs':np.full_like(lcdict[timecols[0]], 0.0)}

    # now iterate over the time, mag, err columns
    for mcol, ecol in zip(magcols, errcols):

        # 2. get the moments for this magcol
        mag_median = lcdict['moments'][mcol]['median']
        mag_mad = lcdict['moments'][mcol]['mad']

        # add up to 5 mmag of extra RMS for systematics and red-noise
        mag_rms = mag_mad*1.483

        err_median = lcdict['moments'][ecol]['median']
        err_mad = lcdict['moments'][ecol]['mad']
        err_rms = err_mad*1.483

        # 3. add the median level + gaussian noise
        magnoise = npr.normal(size=variablelc['mags'].size)*mag_rms
        errnoise = npr.normal(size=variablelc['errs'].size)*err_rms

        finalmags = mag_median + (variablelc['mags'] + magnoise)
        finalerrs = err_median + (variablelc['errs'] + errnoise)

        # 4. update these tcol, mcol, ecol values in the lcdict
        lcdict[mcol] = finalmags
        lcdict[ecol] = finalerrs

    #
    # all done with updating mags and errs
    #

    # 5. update the light curve with the variability info
    lcdict['actual_vartype'] = variablelc['vartype']
    lcdict['actual_varparams'] = variablelc['params']

    # these standard keys are set to help out later with characterizing recovery
    # rates by magnitude, period, amplitude, ndet, etc.
    if vartype is not None:
        lcdict['actual_varperiod'] = variablelc['varperiod']
        lcdict['actual_varamplitude'] = variablelc['varamplitude']
    else:
        lcdict['actual_varperiod'] = np.nan
        lcdict['actual_varamplitude'] = np.nan

    # 6. write back, making sure to do it safely
    tempoutf = '%s.%s' % (fakelcfile, md5(npr.bytes(4)).hexdigest()[-8:])
    with open(tempoutf, 'wb') as outfd:
        pickle.dump(lcdict, outfd, pickle.HIGHEST_PROTOCOL)

    if os.path.exists(tempoutf):
        shutil.copy(tempoutf, fakelcfile)
        os.remove(tempoutf)
    else:
        LOGEXCEPTION('could not write output light curve file to dir: %s' %
                     os.path.dirname(tempoutf))
        # fail here
        raise

    LOGINFO('object: %s, vartype: %s -> %s OK' % (
        lcdict['objectid'],
        vartype,
        fakelcfile)
    )

    return {'objectid':lcdict['objectid'],
            'lcfname':fakelcfile,
            'actual_vartype':vartype,
            'actual_varparams':lcdict['actual_varparams']}


[docs]def add_variability_to_fakelc_collection(simbasedir,
                                         override_paramdists=None,
                                         overwrite_existingvar=False):
    '''This adds variability and noise to all fake LCs in `simbasedir`.

    If an object is marked as variable in the `fakelcs-info`.pkl file in
    `simbasedir`, a variable signal will be added to its light curve based on
    its selected type, default period and amplitude distribution, the
    appropriate params, etc. the epochs for each variable object will be chosen
    uniformly from its time-range (and may not necessarily fall on a actual
    observed time). Nonvariable objects will only have noise added as determined
    by their params, but no variable signal will be added.

    Parameters
    ----------

    simbasedir : str
        The directory containing the fake LCs to process.

    override_paramdists : dict
        This can be used to override the stored variable parameters in each fake
        LC. It should be a dict of the following form::

            {'<vartype1>': {'<param1>: a scipy.stats distribution function or
                                       the np.random.randint function,
                            .
                            .
                            .
                            '<paramN>: a scipy.stats distribution function
                                       or the np.random.randint function}

        for any vartype in VARTYPE_LCGEN_MAP. These are used to override the
        default parameter distributions for each variable type.

    overwrite_existingvar : bool
        If this is True, then will overwrite any existing variability in the
        input fake LCs in `simbasedir`.

    Returns
    -------

    dict
        This returns a dict containing the fake LC filenames as keys and
        variability info for each as values.

    '''

    # open the fakelcs-info.pkl
    infof = os.path.join(simbasedir,'fakelcs-info.pkl')
    with open(infof, 'rb') as infd:
        lcinfo = pickle.load(infd)

    lclist = lcinfo['lcfpath']
    varflag = lcinfo['isvariable']
    vartypes = lcinfo['vartype']

    vartind = 0

    varinfo = {}

    # go through all the LCs and add the required type of variability
    for lc, varf, _lcind in zip(lclist, varflag, range(len(lclist))):

        # if this object is variable, add variability
        if varf:

            thisvartype = vartypes[vartind]

            if (override_paramdists and
                isinstance(override_paramdists, dict) and
                thisvartype in override_paramdists and
                isinstance(override_paramdists[thisvartype], dict)):

                thisoverride_paramdists = override_paramdists[thisvartype]
            else:
                thisoverride_paramdists = None

            varlc = add_fakelc_variability(
                lc, thisvartype,
                override_paramdists=thisoverride_paramdists,
                overwrite=overwrite_existingvar
            )
            varinfo[varlc['objectid']] = {'params': varlc['actual_varparams'],
                                          'vartype': varlc['actual_vartype']}

            # update vartind
            vartind = vartind + 1

        else:

            varlc = add_fakelc_variability(
                lc, None,
                overwrite=overwrite_existingvar
            )
            varinfo[varlc['objectid']] = {'params': varlc['actual_varparams'],
                                          'vartype': varlc['actual_vartype']}
    #
    # done with all objects
    #

    # write the varinfo back to the dict and fakelcs-info.pkl
    lcinfo['varinfo'] = varinfo

    tempoutf = '%s.%s' % (infof, md5(npr.bytes(4)).hexdigest()[-8:])
    with open(tempoutf, 'wb') as outfd:
        pickle.dump(lcinfo, outfd, pickle.HIGHEST_PROTOCOL)

    if os.path.exists(tempoutf):
        shutil.copy(tempoutf, infof)
        os.remove(tempoutf)
    else:
        LOGEXCEPTION('could not write output light curve file to dir: %s' %
                     os.path.dirname(tempoutf))
        # fail here
        raise

    return lcinfo