#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# pkl_xmatch.py - Waqas Bhatti (wbhatti@astro.princeton.edu) - Feb 2019
# License: MIT.
'''
This contains utility functions that support the checkplot.pkl xmatch
functionality.
'''
#############
## LOGGING ##
#############
import logging
from astrobase import log_sub, log_fmt, log_date_fmt
DEBUG = False
if DEBUG:
level = logging.DEBUG
else:
level = logging.INFO
LOGGER = logging.getLogger(__name__)
logging.basicConfig(
level=level,
style=log_sub,
format=log_fmt,
datefmt=log_date_fmt,
)
LOGDEBUG = LOGGER.debug
LOGINFO = LOGGER.info
LOGWARNING = LOGGER.warning
LOGERROR = LOGGER.error
LOGEXCEPTION = LOGGER.exception
#############
## IMPORTS ##
#############
import os
import os.path
import gzip
import sys
import json
import pickle
import numpy as np
# import sps.cKDTree for external catalog xmatches
from scipy.spatial import cKDTree
###################
## LOCAL IMPORTS ##
###################
from .pkl_utils import _xyzdist_to_distarcsec
from .pkl_io import _write_checkplot_picklefile
#########################################
## XMATCHING AGAINST EXTERNAL CATALOGS ##
#########################################
def _parse_xmatch_catalog_header(xc, xk):
'''
This parses the header for a catalog file and returns it as a file object.
Parameters
----------
xc : str
The file name of an xmatch catalog prepared previously.
xk : list of str
This is a list of column names to extract from the xmatch catalog.
Returns
-------
tuple
The tuple returned is of the form::
(infd: the file object associated with the opened xmatch catalog,
catdefdict: a dict describing the catalog column definitions,
catcolinds: column number indices of the catalog,
catcoldtypes: the numpy dtypes of the catalog columns,
catcolnames: the names of each catalog column,
catcolunits: the units associated with each catalog column)
'''
catdef = []
# read in this catalog and transparently handle gzipped files
if xc.endswith('.gz'):
infd = gzip.open(xc,'rb')
else:
infd = open(xc,'rb')
# read in the defs
for line in infd:
if line.decode().startswith('#'):
catdef.append(
line.decode().replace('#','').strip().rstrip('\n')
)
if not line.decode().startswith('#'):
break
if not len(catdef) > 0:
LOGERROR("catalog definition not parseable "
"for catalog: %s, skipping..." % xc)
return None
catdef = ' '.join(catdef)
catdefdict = json.loads(catdef)
catdefkeys = [x['key'] for x in catdefdict['columns']]
catdefdtypes = [x['dtype'] for x in catdefdict['columns']]
catdefnames = [x['name'] for x in catdefdict['columns']]
catdefunits = [x['unit'] for x in catdefdict['columns']]
# get the correct column indices and dtypes for the requested columns
# from the catdefdict
catcolinds = []
catcoldtypes = []
catcolnames = []
catcolunits = []
for xkcol in xk:
if xkcol in catdefkeys:
xkcolind = catdefkeys.index(xkcol)
catcolinds.append(xkcolind)
catcoldtypes.append(catdefdtypes[xkcolind])
catcolnames.append(catdefnames[xkcolind])
catcolunits.append(catdefunits[xkcolind])
return (infd, catdefdict,
catcolinds, catcoldtypes, catcolnames, catcolunits)
[docs]def load_xmatch_external_catalogs(xmatchto, xmatchkeys, outfile=None):
'''This loads the external xmatch catalogs into a dict for use in an xmatch.
Parameters
----------
xmatchto : list of str
This is a list of paths to all the catalog text files that will be
loaded.
The text files must be 'CSVs' that use the '|' character as the
separator betwen columns. These files should all begin with a header in
JSON format on lines starting with the '#' character. this header will
define the catalog and contains the name of the catalog and the column
definitions. Column definitions must have the column name and the numpy
dtype of the columns (in the same format as that expected for the
numpy.genfromtxt function). Any line that does not begin with '#' is
assumed to be part of the columns in the catalog. An example is shown
below::
# {"name":"NSVS catalog of variable stars",
# "columns":[
# {"key":"objectid", "dtype":"U20", "name":"Object ID", "unit": null},
# {"key":"ra", "dtype":"f8", "name":"RA", "unit":"deg"},
# {"key":"decl","dtype":"f8", "name": "Declination", "unit":"deg"},
# {"key":"sdssr","dtype":"f8","name":"SDSS r", "unit":"mag"},
# {"key":"vartype","dtype":"U20","name":"Variable type", "unit":null}
# ],
# "colra":"ra",
# "coldec":"decl",
# "description":"Contains variable stars from the NSVS catalog"}
objectid1 | 45.0 | -20.0 | 12.0 | detached EB
objectid2 | 145.0 | 23.0 | 10.0 | RRab
objectid3 | 12.0 | 11.0 | 14.0 | Cepheid
.
.
.
xmatchkeys : list of lists
This is the list of lists of column names (as str) to get out of each
`xmatchto` catalog. This should be the same length as `xmatchto` and
each element here will apply to the respective file in `xmatchto`.
outfile : str or None
If this is not None, set this to the name of the pickle to write the
collected xmatch catalogs to. this pickle can then be loaded
transparently by the :py:func:`astrobase.checkplot.pkl.checkplot_dict`,
:py:func:`astrobase.checkplot.pkl.checkplot_pickle` functions to provide
xmatch info to the
:py:func:`astrobase.checkplot.pkl_xmatch.xmatch_external_catalogs`
function below.
If this is None, will return the loaded xmatch catalogs directly. This
will be a huge dict, so make sure you have enough RAM.
Returns
-------
str or dict
Based on the `outfile` kwarg, will either return the path to a collected
xmatch pickle file or the collected xmatch dict.
'''
outdict = {}
for xc, xk in zip(xmatchto, xmatchkeys):
parsed_catdef = _parse_xmatch_catalog_header(xc, xk)
if not parsed_catdef:
continue
(infd, catdefdict,
catcolinds, catcoldtypes,
catcolnames, catcolunits) = parsed_catdef
# get the specified columns out of the catalog
catarr = np.genfromtxt(infd,
usecols=catcolinds,
names=xk,
dtype=','.join(catcoldtypes),
comments='#',
delimiter='|',
autostrip=True)
infd.close()
catshortname = os.path.splitext(os.path.basename(xc))[0]
catshortname = catshortname.replace('.csv','')
#
# make a kdtree for this catalog
#
# get the ra and decl columns
objra, objdecl = (catarr[catdefdict['colra']],
catarr[catdefdict['coldec']])
# get the xyz unit vectors from ra,decl
cosdecl = np.cos(np.radians(objdecl))
sindecl = np.sin(np.radians(objdecl))
cosra = np.cos(np.radians(objra))
sinra = np.sin(np.radians(objra))
xyz = np.column_stack((cosra*cosdecl,sinra*cosdecl, sindecl))
# generate the kdtree
kdt = cKDTree(xyz,copy_data=True)
# generate the outdict element for this catalog
catoutdict = {'kdtree':kdt,
'data':catarr,
'columns':xk,
'colnames':catcolnames,
'colunits':catcolunits,
'name':catdefdict['name'],
'desc':catdefdict['description']}
outdict[catshortname] = catoutdict
if outfile is not None:
# if we're on OSX, we apparently need to save the file in chunks smaller
# than 2 GB to make it work right. can't load pickles larger than 4 GB
# either, but 3 GB < total size < 4 GB appears to be OK when loading.
# also see: https://bugs.python.org/issue24658.
# fix adopted from: https://stackoverflow.com/a/38003910
if sys.platform == 'darwin':
dumpbytes = pickle.dumps(outdict, protocol=pickle.HIGHEST_PROTOCOL)
max_bytes = 2**31 - 1
with open(outfile, 'wb') as outfd:
for idx in range(0, len(dumpbytes), max_bytes):
outfd.write(dumpbytes[idx:idx+max_bytes])
else:
with open(outfile, 'wb') as outfd:
pickle.dump(outdict, outfd, pickle.HIGHEST_PROTOCOL)
return outfile
else:
return outdict
[docs]def xmatch_external_catalogs(checkplotdict,
xmatchinfo,
xmatchradiusarcsec=2.0,
returndirect=False,
updatexmatch=True,
savepickle=None):
'''This matches the current object in the checkplotdict to all of the
external match catalogs specified.
Parameters
----------
checkplotdict : dict
This is a checkplotdict, generated by either the `checkplot_dict`
function, or read in from a `_read_checkplot_picklefile` function. This
must have a structure somewhat like the following, where the indicated
keys below are required::
{'objectid': the ID assigned to this object
'objectinfo': {'objectid': ID assigned to this object,
'ra': right ascension of the object in decimal deg,
'decl': declination of the object in decimal deg}}
xmatchinfo : str or dict
This is either the xmatch dict produced by the function
:py:func:`astrobase.checkplot.pkl_xmatch.load_xmatch_external_catalogs`
above, or the path to the xmatch info pickle file produced by that
function.
xmatchradiusarcsec : float
This is the cross-matching radius to use in arcseconds.
returndirect : bool
If this is True, will only return the xmatch results as a dict. If this
False, will return the checkplotdict with the xmatch results added in as
a key-val pair.
updatexmatch : bool
This function will look for an existing 'xmatch' key in the input
checkplotdict indicating that an xmatch has been performed before. If
`updatexmatch` is set to True, the xmatch results will be added onto
(e.g. when xmatching to additional catalogs after the first run). If
this is set to False, the xmatch key-val pair will be completely
overwritten.
savepickle : str or None
If this is None, it must be a path to where the updated checkplotdict
will be written to as a new checkplot pickle. If this is False, only the
updated checkplotdict is returned.
Returns
-------
dict or str
If `savepickle` is False, this returns a checkplotdict, with the xmatch
results added in. An 'xmatch' key will be added to this dict, with
something like the following dict as the value::
{'xmatchradiusarcsec':xmatchradiusarcsec,
'catalog1':{'name':'Catalog of interesting things',
'found':True,
'distarcsec':0.7,
'info':{'objectid':...,'ra':...,'decl':...,'desc':...}},
'catalog2':{'name':'Catalog of more interesting things',
'found':False,
'distarcsec':nan,
'info':None},
.
.
.
....}
This will contain the matches of the object in the input checkplotdict
to all of the catalogs provided in `xmatchinfo`.
If `savepickle` is True, will return the path to the saved checkplot
pickle file.
'''
# load the xmatch info
if isinstance(xmatchinfo, str) and os.path.exists(xmatchinfo):
with open(xmatchinfo,'rb') as infd:
xmatchdict = pickle.load(infd)
elif isinstance(xmatchinfo, dict):
xmatchdict = xmatchinfo
else:
LOGERROR("can't figure out xmatch info, can't xmatch, skipping...")
return checkplotdict
#
# generate the xmatch spec
#
# get our ra, decl
objra = checkplotdict['objectinfo']['ra']
objdecl = checkplotdict['objectinfo']['decl']
cosdecl = np.cos(np.radians(objdecl))
sindecl = np.sin(np.radians(objdecl))
cosra = np.cos(np.radians(objra))
sinra = np.sin(np.radians(objra))
objxyz = np.column_stack((cosra*cosdecl,
sinra*cosdecl,
sindecl))
# this is the search distance in xyz unit vectors
xyzdist = 2.0 * np.sin(np.radians(xmatchradiusarcsec/3600.0)/2.0)
#
# now search in each external catalog
#
xmatchresults = {}
extcats = sorted(xmatchdict.keys())
for ecat in extcats:
# get the kdtree
kdt = xmatchdict[ecat]['kdtree']
# look up the coordinates
kdt_dist, kdt_ind = kdt.query(objxyz,
k=1,
distance_upper_bound=xyzdist)
# sort by matchdist
mdsorted = np.argsort(kdt_dist)
matchdists = kdt_dist[mdsorted]
matchinds = kdt_ind[mdsorted]
if matchdists[np.isfinite(matchdists)].size == 0:
xmatchresults[ecat] = {'name':xmatchdict[ecat]['name'],
'desc':xmatchdict[ecat]['desc'],
'found':False,
'distarcsec':None,
'info':None}
else:
for md, mi in zip(matchdists, matchinds):
if np.isfinite(md) and md < xyzdist:
infodict = {}
distarcsec = _xyzdist_to_distarcsec(md)
for col in xmatchdict[ecat]['columns']:
coldata = xmatchdict[ecat]['data'][col][mi]
if isinstance(coldata, str):
coldata = coldata.strip()
infodict[col] = coldata
xmatchresults[ecat] = {
'name':xmatchdict[ecat]['name'],
'desc':xmatchdict[ecat]['desc'],
'found':True,
'distarcsec':distarcsec,
'info':infodict,
'colkeys':xmatchdict[ecat]['columns'],
'colnames':xmatchdict[ecat]['colnames'],
'colunit':xmatchdict[ecat]['colunits'],
}
break
#
# should now have match results for all external catalogs
#
if returndirect:
return xmatchresults
else:
if updatexmatch and 'xmatch' in checkplotdict:
checkplotdict['xmatch'].update(xmatchresults)
else:
checkplotdict['xmatch'] = xmatchresults
if savepickle:
cpf = _write_checkplot_picklefile(checkplotdict,
outfile=savepickle,
protocol=4)
return cpf
else:
return checkplotdict