Source code for astrobase.checkplot.pkl_xmatch

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# pkl_xmatch.py - Waqas Bhatti (wbhatti@astro.princeton.edu) - Feb 2019
# License: MIT.

'''
This contains utility functions that support the checkplot.pkl xmatch
functionality.

'''

#############
## LOGGING ##
#############

import logging
from astrobase import log_sub, log_fmt, log_date_fmt

DEBUG = False
if DEBUG:
    level = logging.DEBUG
else:
    level = logging.INFO
LOGGER = logging.getLogger(__name__)
logging.basicConfig(
    level=level,
    style=log_sub,
    format=log_fmt,
    datefmt=log_date_fmt,
)

LOGDEBUG = LOGGER.debug
LOGINFO = LOGGER.info
LOGWARNING = LOGGER.warning
LOGERROR = LOGGER.error
LOGEXCEPTION = LOGGER.exception


#############
## IMPORTS ##
#############

import os
import os.path
import gzip
import sys
import json
import pickle

import numpy as np

# import sps.cKDTree for external catalog xmatches
from scipy.spatial import cKDTree


###################
## LOCAL IMPORTS ##
###################

from .pkl_utils import _xyzdist_to_distarcsec
from .pkl_io import _write_checkplot_picklefile


#########################################
## XMATCHING AGAINST EXTERNAL CATALOGS ##
#########################################

def _parse_xmatch_catalog_header(xc, xk):
    '''
    This parses the header for a catalog file and returns it as a file object.

    Parameters
    ----------

    xc : str
        The file name of an xmatch catalog prepared previously.

    xk : list of str
        This is a list of column names to extract from the xmatch catalog.

    Returns
    -------

    tuple
        The tuple returned is of the form::

            (infd: the file object associated with the opened xmatch catalog,
             catdefdict: a dict describing the catalog column definitions,
             catcolinds: column number indices of the catalog,
             catcoldtypes: the numpy dtypes of the catalog columns,
             catcolnames: the names of each catalog column,
             catcolunits: the units associated with each catalog column)

    '''

    catdef = []

    # read in this catalog and transparently handle gzipped files
    if xc.endswith('.gz'):
        infd = gzip.open(xc,'rb')
    else:
        infd = open(xc,'rb')

    # read in the defs
    for line in infd:
        if line.decode().startswith('#'):
            catdef.append(
                line.decode().replace('#','').strip().rstrip('\n')
            )
        if not line.decode().startswith('#'):
            break

    if not len(catdef) > 0:
        LOGERROR("catalog definition not parseable "
                 "for catalog: %s, skipping..." % xc)
        return None

    catdef = ' '.join(catdef)
    catdefdict = json.loads(catdef)

    catdefkeys = [x['key'] for x in catdefdict['columns']]
    catdefdtypes = [x['dtype'] for x in catdefdict['columns']]
    catdefnames = [x['name'] for x in catdefdict['columns']]
    catdefunits = [x['unit'] for x in catdefdict['columns']]

    # get the correct column indices and dtypes for the requested columns
    # from the catdefdict

    catcolinds = []
    catcoldtypes = []
    catcolnames = []
    catcolunits = []

    for xkcol in xk:

        if xkcol in catdefkeys:

            xkcolind = catdefkeys.index(xkcol)

            catcolinds.append(xkcolind)
            catcoldtypes.append(catdefdtypes[xkcolind])
            catcolnames.append(catdefnames[xkcolind])
            catcolunits.append(catdefunits[xkcolind])

    return (infd, catdefdict,
            catcolinds, catcoldtypes, catcolnames, catcolunits)


[docs]def load_xmatch_external_catalogs(xmatchto, xmatchkeys, outfile=None):
    '''This loads the external xmatch catalogs into a dict for use in an xmatch.

    Parameters
    ----------

    xmatchto : list of str
        This is a list of paths to all the catalog text files that will be
        loaded.

        The text files must be 'CSVs' that use the '|' character as the
        separator betwen columns. These files should all begin with a header in
        JSON format on lines starting with the '#' character. this header will
        define the catalog and contains the name of the catalog and the column
        definitions. Column definitions must have the column name and the numpy
        dtype of the columns (in the same format as that expected for the
        numpy.genfromtxt function). Any line that does not begin with '#' is
        assumed to be part of the columns in the catalog. An example is shown
        below::

            # {"name":"NSVS catalog of variable stars",
            #  "columns":[
            #   {"key":"objectid", "dtype":"U20", "name":"Object ID", "unit": null},
            #   {"key":"ra", "dtype":"f8", "name":"RA", "unit":"deg"},
            #   {"key":"decl","dtype":"f8", "name": "Declination", "unit":"deg"},
            #   {"key":"sdssr","dtype":"f8","name":"SDSS r", "unit":"mag"},
            #   {"key":"vartype","dtype":"U20","name":"Variable type", "unit":null}
            #  ],
            #  "colra":"ra",
            #  "coldec":"decl",
            #  "description":"Contains variable stars from the NSVS catalog"}
            objectid1 | 45.0  | -20.0 | 12.0 | detached EB
            objectid2 | 145.0 | 23.0  | 10.0 | RRab
            objectid3 | 12.0  | 11.0  | 14.0 | Cepheid
            .
            .
            .

    xmatchkeys : list of lists
        This is the list of lists of column names (as str) to get out of each
        `xmatchto` catalog. This should be the same length as `xmatchto` and
        each element here will apply to the respective file in `xmatchto`.

    outfile : str or None
        If this is not None, set this to the name of the pickle to write the
        collected xmatch catalogs to. this pickle can then be loaded
        transparently by the :py:func:`astrobase.checkplot.pkl.checkplot_dict`,
        :py:func:`astrobase.checkplot.pkl.checkplot_pickle` functions to provide
        xmatch info to the
        :py:func:`astrobase.checkplot.pkl_xmatch.xmatch_external_catalogs`
        function below.

        If this is None, will return the loaded xmatch catalogs directly. This
        will be a huge dict, so make sure you have enough RAM.

    Returns
    -------

    str or dict
        Based on the `outfile` kwarg, will either return the path to a collected
        xmatch pickle file or the collected xmatch dict.

    '''

    outdict = {}

    for xc, xk in zip(xmatchto, xmatchkeys):

        parsed_catdef = _parse_xmatch_catalog_header(xc, xk)

        if not parsed_catdef:
            continue

        (infd, catdefdict,
         catcolinds, catcoldtypes,
         catcolnames, catcolunits) = parsed_catdef

        # get the specified columns out of the catalog
        catarr = np.genfromtxt(infd,
                               usecols=catcolinds,
                               names=xk,
                               dtype=','.join(catcoldtypes),
                               comments='#',
                               delimiter='|',
                               autostrip=True)
        infd.close()

        catshortname = os.path.splitext(os.path.basename(xc))[0]
        catshortname = catshortname.replace('.csv','')

        #
        # make a kdtree for this catalog
        #

        # get the ra and decl columns
        objra, objdecl = (catarr[catdefdict['colra']],
                          catarr[catdefdict['coldec']])

        # get the xyz unit vectors from ra,decl
        cosdecl = np.cos(np.radians(objdecl))
        sindecl = np.sin(np.radians(objdecl))
        cosra = np.cos(np.radians(objra))
        sinra = np.sin(np.radians(objra))
        xyz = np.column_stack((cosra*cosdecl,sinra*cosdecl, sindecl))

        # generate the kdtree
        kdt = cKDTree(xyz,copy_data=True)

        # generate the outdict element for this catalog
        catoutdict = {'kdtree':kdt,
                      'data':catarr,
                      'columns':xk,
                      'colnames':catcolnames,
                      'colunits':catcolunits,
                      'name':catdefdict['name'],
                      'desc':catdefdict['description']}

        outdict[catshortname] = catoutdict

    if outfile is not None:

        # if we're on OSX, we apparently need to save the file in chunks smaller
        # than 2 GB to make it work right. can't load pickles larger than 4 GB
        # either, but 3 GB < total size < 4 GB appears to be OK when loading.
        # also see: https://bugs.python.org/issue24658.
        # fix adopted from: https://stackoverflow.com/a/38003910
        if sys.platform == 'darwin':

            dumpbytes = pickle.dumps(outdict, protocol=pickle.HIGHEST_PROTOCOL)
            max_bytes = 2**31 - 1

            with open(outfile, 'wb') as outfd:
                for idx in range(0, len(dumpbytes), max_bytes):
                    outfd.write(dumpbytes[idx:idx+max_bytes])

        else:
            with open(outfile, 'wb') as outfd:
                pickle.dump(outdict, outfd, pickle.HIGHEST_PROTOCOL)

        return outfile

    else:

        return outdict


[docs]def xmatch_external_catalogs(checkplotdict,
                             xmatchinfo,
                             xmatchradiusarcsec=2.0,
                             returndirect=False,
                             updatexmatch=True,
                             savepickle=None):
    '''This matches the current object in the checkplotdict to all of the
    external match catalogs specified.

    Parameters
    ----------

    checkplotdict : dict
        This is a checkplotdict, generated by either the `checkplot_dict`
        function, or read in from a `_read_checkplot_picklefile` function. This
        must have a structure somewhat like the following, where the indicated
        keys below are required::

            {'objectid': the ID assigned to this object
             'objectinfo': {'objectid': ID assigned to this object,
                            'ra': right ascension of the object in decimal deg,
                            'decl': declination of the object in decimal deg}}

    xmatchinfo : str or dict
        This is either the xmatch dict produced by the function
        :py:func:`astrobase.checkplot.pkl_xmatch.load_xmatch_external_catalogs`
        above, or the path to the xmatch info pickle file produced by that
        function.

    xmatchradiusarcsec : float
        This is the cross-matching radius to use in arcseconds.

    returndirect : bool
        If this is True, will only return the xmatch results as a dict. If this
        False, will return the checkplotdict with the xmatch results added in as
        a key-val pair.

    updatexmatch : bool
        This function will look for an existing 'xmatch' key in the input
        checkplotdict indicating that an xmatch has been performed before. If
        `updatexmatch` is set to True, the xmatch results will be added onto
        (e.g. when xmatching to additional catalogs after the first run). If
        this is set to False, the xmatch key-val pair will be completely
        overwritten.

    savepickle : str or None
        If this is None, it must be a path to where the updated checkplotdict
        will be written to as a new checkplot pickle. If this is False, only the
        updated checkplotdict is returned.

    Returns
    -------

    dict or str
        If `savepickle` is False, this returns a checkplotdict, with the xmatch
        results added in. An 'xmatch' key will be added to this dict, with
        something like the following dict as the value::

            {'xmatchradiusarcsec':xmatchradiusarcsec,
             'catalog1':{'name':'Catalog of interesting things',
                        'found':True,
                        'distarcsec':0.7,
                        'info':{'objectid':...,'ra':...,'decl':...,'desc':...}},
             'catalog2':{'name':'Catalog of more interesting things',
                         'found':False,
                         'distarcsec':nan,
                         'info':None},
            .
            .
            .
            ....}

        This will contain the matches of the object in the input checkplotdict
        to all of the catalogs provided in `xmatchinfo`.

        If `savepickle` is True, will return the path to the saved checkplot
        pickle file.

    '''

    # load the xmatch info
    if isinstance(xmatchinfo, str) and os.path.exists(xmatchinfo):
        with open(xmatchinfo,'rb') as infd:
            xmatchdict = pickle.load(infd)
    elif isinstance(xmatchinfo, dict):
        xmatchdict = xmatchinfo
    else:
        LOGERROR("can't figure out xmatch info, can't xmatch, skipping...")
        return checkplotdict

    #
    # generate the xmatch spec
    #

    # get our ra, decl
    objra = checkplotdict['objectinfo']['ra']
    objdecl = checkplotdict['objectinfo']['decl']

    cosdecl = np.cos(np.radians(objdecl))
    sindecl = np.sin(np.radians(objdecl))
    cosra = np.cos(np.radians(objra))
    sinra = np.sin(np.radians(objra))

    objxyz = np.column_stack((cosra*cosdecl,
                              sinra*cosdecl,
                              sindecl))

    # this is the search distance in xyz unit vectors
    xyzdist = 2.0 * np.sin(np.radians(xmatchradiusarcsec/3600.0)/2.0)

    #
    # now search in each external catalog
    #

    xmatchresults = {}

    extcats = sorted(xmatchdict.keys())

    for ecat in extcats:

        # get the kdtree
        kdt = xmatchdict[ecat]['kdtree']

        # look up the coordinates
        kdt_dist, kdt_ind = kdt.query(objxyz,
                                      k=1,
                                      distance_upper_bound=xyzdist)

        # sort by matchdist
        mdsorted = np.argsort(kdt_dist)
        matchdists = kdt_dist[mdsorted]
        matchinds = kdt_ind[mdsorted]

        if matchdists[np.isfinite(matchdists)].size == 0:

            xmatchresults[ecat] = {'name':xmatchdict[ecat]['name'],
                                   'desc':xmatchdict[ecat]['desc'],
                                   'found':False,
                                   'distarcsec':None,
                                   'info':None}

        else:

            for md, mi in zip(matchdists, matchinds):

                if np.isfinite(md) and md < xyzdist:

                    infodict = {}

                    distarcsec = _xyzdist_to_distarcsec(md)

                    for col in xmatchdict[ecat]['columns']:

                        coldata = xmatchdict[ecat]['data'][col][mi]

                        if isinstance(coldata, str):
                            coldata = coldata.strip()

                        infodict[col] = coldata

                    xmatchresults[ecat] = {
                        'name':xmatchdict[ecat]['name'],
                        'desc':xmatchdict[ecat]['desc'],
                        'found':True,
                        'distarcsec':distarcsec,
                        'info':infodict,
                        'colkeys':xmatchdict[ecat]['columns'],
                        'colnames':xmatchdict[ecat]['colnames'],
                        'colunit':xmatchdict[ecat]['colunits'],
                    }
                    break

    #
    # should now have match results for all external catalogs
    #

    if returndirect:

        return xmatchresults

    else:

        if updatexmatch and 'xmatch' in checkplotdict:
            checkplotdict['xmatch'].update(xmatchresults)
        else:
            checkplotdict['xmatch'] = xmatchresults

        if savepickle:

            cpf = _write_checkplot_picklefile(checkplotdict,
                                              outfile=savepickle,
                                              protocol=4)
            return cpf

        else:
            return checkplotdict