Source code for astrobase.services.gaia

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# gaia - Waqas Bhatti (wbhatti@astro.princeton.edu) - Dec 2017
# License: MIT. See the LICENSE file for more details.

'''
This queries the GAIA catalog for object lists in specified areas of the
sky. The main use of this module is to generate realistic spatial distributions
of stars for variability recovery simulations in combination with colors and
luminosities from the TRILEGAL galaxy model.

If you use this module, please cite the GAIA papers as outlined at:

https://gaia.esac.esa.int/documentation//GDR1/Miscellaneous/sec_credit_and_citation_instructions.html

Much of this module is derived from the example given at:

http://gea.esac.esa.int/archive-help/commandline/index.html

For a more general and useful interface to the GAIA catalog, see the astroquery
package by A. Ginsburg, B. Sipocz, et al.:

http://astroquery.readthedocs.io/en/latest/gaia/gaia.html

'''

#############
## LOGGING ##
#############

import logging
from astrobase import log_sub, log_fmt, log_date_fmt

# flip this to True to get debug-level messages from this module
DEBUG = False
level = logging.DEBUG if DEBUG else logging.INFO

# module-level logger; the root logger is configured with the package-wide
# format settings imported from astrobase
LOGGER = logging.getLogger(__name__)
logging.basicConfig(level=level,
                    style=log_sub,
                    format=log_fmt,
                    datefmt=log_date_fmt)

# short aliases for the logger methods used throughout this module
LOGDEBUG, LOGINFO, LOGWARNING, LOGERROR, LOGEXCEPTION = (
    LOGGER.debug,
    LOGGER.info,
    LOGGER.warning,
    LOGGER.error,
    LOGGER.exception,
)


#############
## IMPORTS ##
#############

import os
import os.path
import gzip
import hashlib
import time
import pickle
import random

# to do the queries
import requests
import requests.exceptions

# to read the XML returned by the TAP service
from xml.dom.minidom import parseString


###################
## FORM SETTINGS ##
###################

# Known GAIA TAP mirrors. For each mirror:
#   url              - the async TAP endpoint that queries are POSTed to
#   table            - the default table, used when no table can be selected
#                      for the requested data release
#   available_tables - data release -> table name for automatic selection
#   phasekeyword     - XML tag holding the job phase in the status document
#   resultkeyword    - XML tag holding the result link in the status document
GAIA_URLS = {
    'gaia': {
        'url': "https://gea.esac.esa.int/tap-server/tap/async",
        'table': 'gaiadr2.gaia_source',
        'available_tables': {
            'dr2': 'gaiadr2.gaia_source',
            'edr3': 'gaiaedr3.gaia_source',
        },
        'phasekeyword': 'uws:phase',
        'resultkeyword': 'uws:result',
    },
    'heidelberg': {
        'url': "https://gaia.ari.uni-heidelberg.de/tap/async",
        'table': 'gaiadr2.gaia_source',
        'available_tables': {
            'dr2': 'gaiadr2.gaia_source',
            'edr3': 'gaiaedr3.gaia_source',
        },
        'phasekeyword': 'phase',
        'resultkeyword': 'result',
    },
    'vizier': {
        'url': "http://tapvizier.u-strasbg.fr/TAPVizieR/tap/async",
        'table': '"I/345/gaia2"',
        'available_tables': {
            'dr2': '"I/345/gaia2"',
            # NOTE: EDR3 is deliberately left out for Vizier. Vizier uses
            # different column names, so the objectlist_search and
            # cone_search functions below would fail if this mirror were
            # auto-selected with EDR3.
            # 'edr3': '"I/350/gaiaedr3"'
        },
        'phasekeyword': 'phase',
        'resultkeyword': 'result',
    },
}

# Template for the form fields sent with every TAP request. Callers take a
# copy of this and fill in the query, format, and job name/description.
TAP_PARAMS = dict(
    REQUEST='doQuery',
    LANG='ADQL',
    FORMAT='json',
    PHASE='RUN',
    JOBNAME='',
    JOBDESCRIPTION='',
    QUERY='',
)

# Accepted values for the `returnformat` kwarg, each mapped to the file
# extension given to the result file written to the cache directory.
RETURN_FORMATS = dict(
    json='json.gz',
    csv='csv.gz',
    votable='vot',
)


#####################
## QUERY FUNCTIONS ##
#####################

def _resolve_gaia_mirror(mirror_key, querystr, data_release, inputparams):
    '''This looks up the TAP settings for a mirror and fills in its table name.

    Parameters
    ----------

    mirror_key : str
        A key of the `GAIA_URLS` dict.

    querystr : str
        The ADQL query string as provided by the caller. If it contains a
        ``{table}`` placeholder, the placeholder is replaced with the table
        name for this mirror and `data_release`.

    data_release : str
        The Gaia data release used to pick a table from the mirror's
        `available_tables`. If the mirror has no table for this release, the
        mirror's default table is used instead and an error is logged.

    inputparams : dict
        The TAP request parameters. ``inputparams['QUERY']`` is overwritten in
        place with the formatted query if `querystr` has a ``{table}``
        placeholder; otherwise this dict is left untouched.

    Returns
    -------

    tuple
        ``(tapurl, phasekeyword, resultkeyword, table)``. `table` is None if
        `querystr` has no ``{table}`` placeholder.

    '''

    mirror = GAIA_URLS[mirror_key]
    table = None

    if '{table}' in querystr:

        table = mirror['available_tables'].get(data_release)

        if table is None:
            LOGERROR(
                "Could not automatically select the "
                "appropriate data table for "
                "mirror: %s and data release: %s. "
                "It may not have been enabled "
                "for this mirror yet. Will fall back to the "
                "default table: %s" %
                (mirror_key, data_release, mirror['table'])
            )
            table = mirror['table']

        inputparams['QUERY'] = querystr.format(table=table)

    return (mirror['url'],
            mirror['phasekeyword'],
            mirror['resultkeyword'],
            table)


def _wait_for_gaia_job(status_url,
                       phasekeyword,
                       inputparams,
                       timeout,
                       refresh,
                       maxtimeout,
                       verbose,
                       jobstatus=None,
                       wait_first=False):
    '''This polls a TAP job status document until the job reaches an end state.

    Parameters
    ----------

    status_url : str
        The URL of the job's status XML document.

    phasekeyword : str
        The XML tag name that holds the job phase in the status document.

    inputparams : dict
        The TAP request parameters. Only used in log messages.

    timeout : float
        Timeout in seconds for each individual status request.

    refresh : float
        Seconds to wait between status requests (a random extra wait of less
        than one second is added to each wait).

    maxtimeout : float
        Polling stops once the accumulated `refresh` waits exceed this value.

    verbose : bool
        If True, progress messages are logged.

    jobstatus : str or None
        The last job phase already known to the caller, if any. This is
        returned unchanged if polling stops before a new phase is read.

    wait_first : bool
        If True, wait before each status request. If False, wait after each
        status request, so the first request is sent immediately.

    Returns
    -------

    tuple
        ``(outcome, resxml, jobstatus)``. `outcome` is one of: 'completed',
        'error' (the job ended in an ERROR or ABORTED phase), 'maxtimeout'
        (gave up waiting), 'timeout' (a status request timed out), or 'failed'
        (a status request or parsing its response failed). `resxml` is the last
        successfully parsed status document or None. `jobstatus` is the last
        known job phase.

    '''

    timeelapsed = 0.0
    resxml = None

    while True:

        if timeelapsed > maxtimeout:
            return 'maxtimeout', resxml, jobstatus

        if wait_first:
            time.sleep(refresh + random.random())
            timeelapsed = timeelapsed + refresh

        # bound before the try block so the handlers below can safely check
        # whether a response was received at all
        resreq = None

        try:

            resreq = requests.get(status_url, timeout=timeout)
            resreq.raise_for_status()

            # parse the response XML and get the job status
            resxml = parseString(resreq.text)
            jobstatuselem = resxml.getElementsByTagName(phasekeyword)[0]
            jobstatus = jobstatuselem.firstChild.toxml()

        except requests.exceptions.Timeout:

            LOGEXCEPTION(
                'GAIA query timed out while checking the job status.\n'
                'query: %s\n'
                'status URL: %s' %
                (repr(inputparams), status_url)
            )
            return 'timeout', resxml, jobstatus

        except Exception:

            LOGEXCEPTION(
                'GAIA query failed while waiting for status\n'
                'query: %s\n'
                'status URL: %s\n'
                'status contents: %s' %
                (repr(inputparams),
                 status_url,
                 resreq.text if resreq is not None else None)
            )
            return 'failed', resxml, jobstatus

        if jobstatus == 'COMPLETED':

            if verbose:
                LOGINFO('GAIA query completed, '
                        'retrieving results...')
            return 'completed', resxml, jobstatus

        # ERROR and ABORTED are end states: the job will never complete, so
        # there's no point in polling until maxtimeout
        if jobstatus in ('ERROR', 'ABORTED'):

            LOGERROR('GAIA TAP query failed due to a server error.\n'
                     'job status: %s\n'
                     'status URL: %s\n'
                     'status contents: %s' %
                     (jobstatus, status_url, resreq.text))
            return 'error', resxml, jobstatus

        if verbose:
            LOGINFO('elapsed time: %.1f, '
                    'current status: %s, '
                    'status URL: %s, waiting...'
                    % (timeelapsed, jobstatus, status_url))

        if not wait_first:
            time.sleep(refresh + random.random())
            timeelapsed = timeelapsed + refresh


def _fetch_gaia_result(resxml,
                       resultkeyword,
                       cachefname,
                       inputparams,
                       timeout,
                       verbose):
    '''This downloads the result of a completed TAP job to the cache file.

    The data is first written to ``cachefname + '.part'`` and only moved to
    `cachefname` once it has been written completely, so a failed write never
    leaves a truncated file under the cache file name.

    Parameters
    ----------

    resxml : xml.dom.minidom.Document
        The parsed status document of the completed job.

    resultkeyword : str
        The XML tag name that holds the result link in the status document.

    cachefname : str
        The path to write the result to. The data is gzip-compressed if this
        ends with '.gz'.

    inputparams : dict
        The TAP request parameters. Only used in log messages.

    timeout : float
        Timeout in seconds for the download request.

    verbose : bool
        If True, a message is logged when the download is done.

    Returns
    -------

    str
        'ok' if the result was written to `cachefname`, 'timeout' if the
        download request timed out, 'failed' for any other error.

    '''

    partfname = cachefname + '.part'

    # bound before the try block so the handlers below can use them even if
    # the failure happened before they were assigned
    result_url = None
    resreq = None

    try:

        result_url_elem = resxml.getElementsByTagName(resultkeyword)[0]
        result_url = result_url_elem.getAttribute('xlink:href')
        result_nrows = result_url_elem.getAttribute('rows')

        resreq = requests.get(result_url, timeout=timeout)
        resreq.raise_for_status()

        opener = gzip.open if cachefname.endswith('.gz') else open

        with opener(partfname, 'wb') as outfd:
            for chunk in resreq.iter_content(chunk_size=65536):
                outfd.write(chunk)

        os.replace(partfname, cachefname)

    except requests.exceptions.Timeout:

        LOGEXCEPTION(
            'GAIA query timed out while trying to '
            'download results.\n'
            'query: %s\n'
            'result URL: %s' %
            (repr(inputparams), result_url)
        )
        outcome = 'timeout'

    except Exception:

        LOGEXCEPTION(
            'GAIA query failed because of an error '
            'while trying to download results.\n'
            'query: %s\n'
            'result URL: %s\n'
            'response status code: %s' %
            (repr(inputparams),
             result_url,
             resreq.status_code if resreq is not None else None)
        )
        outcome = 'failed'

    else:

        if verbose:
            LOGINFO('done. rows in result: %s' % result_nrows)
        return 'ok'

    # the download failed: don't leave a partial file behind
    try:
        os.remove(partfname)
    except OSError:
        pass

    return outcome


def tap_query(querystr,
              gaia_mirror=None,
              data_release="dr2",
              returnformat='csv',
              forcefetch=False,
              cachedir='~/.astrobase/gaia-cache',
              verbose=True,
              timeout=15.0,
              refresh=2.0,
              maxtimeout=300.0,
              maxtries=3,
              complete_query_later=False):
    '''This queries the GAIA TAP service using an ADQL query string.

    Parameters
    ----------

    querystr : str
        This is the ADQL query string. See:
        http://www.ivoa.net/documents/ADQL/2.0 for the specification and
        http://gea.esac.esa.int/archive-help/adql/index.html for GAIA-specific
        additions. If this contains a ``{table}`` placeholder, it will be
        replaced with the table name appropriate for the mirror being used and
        the requested `data_release`.

    gaia_mirror : {'gaia','heidelberg','vizier'} or None
        This is the key used to select a GAIA catalog mirror from the
        `GAIA_URLS` dict above. If set, the specified mirror will be used. If
        None (or not a key of `GAIA_URLS`), a random mirror chosen from that
        dict will be used. If submitting the query to a mirror fails with an
        HTTP error or a timeout, another randomly chosen mirror is tried.

    data_release: {'dr2', 'edr3'}
        The Gaia data release to use for the query. This provides hints for
        which table to use for the GAIA mirror being queried.

    returnformat : {'csv','votable','json'}
        The returned file format to request from the GAIA catalog service. Any
        other value is replaced with 'csv' and a warning is logged.

    forcefetch : bool
        If this is True, the query will be retried even if cached results for
        it exist.

    cachedir : str
        This points to the directory where results will be downloaded.

    verbose : bool
        If True, will indicate progress and warn of any issues.

    timeout : float
        This sets the amount of time in seconds to wait for the service to
        respond to our initial request.

    refresh : float
        This sets the amount of time in seconds to wait before checking if the
        result file is available. If the results file isn't available after
        `refresh` seconds have elapsed, the function will wait for `refresh`
        seconds continuously, until `maxtimeout` is reached or the results file
        becomes available.

    maxtimeout : float
        The maximum amount of time in seconds to wait for a result to become
        available after submitting our query request.

    maxtries : int
        The maximum number of tries (across all mirrors tried) to make to
        submit the request, before giving up.

    complete_query_later : bool
        If set to True, a submitted query that does not return a result before
        `maxtimeout` has passed will have its input request parameters and the
        status URL provided by the service saved to the cache directory. If
        this function is then called later with these same input request
        parameters, it will check if the query finally finished and a result is
        available. If so, will download the results instead of submitting a new
        query. If it's not done yet, will start waiting for results again. To
        force launch a new query with the same request parameters, set the
        `forcefetch` kwarg to True.

    Returns
    -------

    dict or None
        This returns a dict of the following form::

            {'params':dict of the input params used for the query,
             'provenance':'cache' or 'new download',
             'result':path to the file on disk with the downloaded data table}

        Returns None if the query could not be submitted, failed on the
        service, did not finish within `maxtimeout`, or its results could not
        be downloaded.

    '''

    # get the default params and update them with our input params
    inputparams = TAP_PARAMS.copy()
    inputparams['QUERY'] = querystr

    # fall back to CSV for unknown formats. returnformat itself is replaced so
    # the cache file name lookup below can't fail on the unknown format
    if returnformat not in RETURN_FORMATS:
        LOGWARNING('unknown result format: %s requested, using CSV' %
                   returnformat)
        returnformat = 'csv'

    inputparams['FORMAT'] = returnformat

    # see if the cachedir exists
    if '~' in cachedir:
        cachedir = os.path.expanduser(cachedir)
    if not os.path.exists(cachedir):
        # exist_ok guards against another process creating it in the meantime
        os.makedirs(cachedir, exist_ok=True)

    # generate the cachefname and look for it
    cachekeyitems = [repr(inputparams[x])
                     for x in sorted(inputparams.keys())]

    # the table placeholder is resolved using data_release, so queries that
    # differ only in data release must not share a cache file. the default
    # 'dr2' release is left out of the key so that cache files written by
    # earlier versions of this function remain valid.
    if '{table}' in querystr and data_release != 'dr2':
        cachekeyitems.append(repr(data_release))

    xcachekey = '-'.join(cachekeyitems)
    cachekey = hashlib.sha256(xcachekey.encode()).hexdigest()
    cachefname = os.path.join(
        cachedir,
        '%s.%s' % (cachekey, RETURN_FORMATS[returnformat])
    )
    provenance = 'cache'

    incomplete_qpklf = os.path.join(
        cachedir,
        'incomplete-query-%s' % cachekey
    )

    ##########################################
    ## COMPLETE A QUERY THAT MAY BE RUNNING ##
    ##########################################

    # first, check if this query can be resurrected
    if (not forcefetch and
        complete_query_later and
        os.path.exists(incomplete_qpklf)):

        # NOTE(security): unpickling executes arbitrary code if the file has
        # been tampered with. this file is written by this function into the
        # cache directory, so it's only as trustworthy as that directory.
        with open(incomplete_qpklf, 'rb') as infd:
            incomplete_qinfo = pickle.load(infd)

        LOGWARNING('complete_query_later = True, and '
                   'this query was not completed on a '
                   'previous run, will check if it is done now...')

        status_url = incomplete_qinfo['status_url']
        phasekeyword = incomplete_qinfo['phase_keyword']
        resultkeyword = incomplete_qinfo['result_keyword']

        # check the status immediately, then keep waiting if it's not done
        outcome, resxml, jobstatus = _wait_for_gaia_job(
            status_url,
            phasekeyword,
            inputparams,
            timeout,
            refresh,
            maxtimeout,
            verbose,
            wait_first=False,
        )

        if outcome == 'maxtimeout':

            LOGERROR('GAIA TAP query still not done '
                     'after waiting %s seconds for results.\n'
                     'status URL is: %s' %
                     (maxtimeout,
                      status_url))

            # the incomplete query pickle is kept so we can try again later
            return None

        if outcome == 'timeout':

            # the service may just be slow to respond right now, so the
            # incomplete query pickle is kept
            return None

        if outcome != 'completed':

            # the job failed, or the status URL doesn't exist any more, or
            # something else went wrong. remove the incomplete query pickle so
            # we can try this from scratch
            os.remove(incomplete_qpklf)
            return None

        #
        # at this point, we should be ready to get the query results
        #
        LOGINFO('query completed, retrieving results...')

        outcome = _fetch_gaia_result(resxml,
                                     resultkeyword,
                                     cachefname,
                                     inputparams,
                                     timeout,
                                     verbose)

        if outcome == 'timeout':
            return None

        # either all went well, or the result URL doesn't exist any more or
        # something else went wrong. in both cases, we're done with the
        # incomplete query pickle
        os.remove(incomplete_qpklf)

        if outcome != 'ok':
            return None

        # return a dict pointing to the result file
        # we'll parse this later
        return {'params':inputparams,
                'provenance':'cache',
                'result':cachefname}

    #####################
    ## RUN A NEW QUERY ##
    #####################

    # otherwise, we check the cache if it's done already, or run it again if not
    if forcefetch or (not os.path.exists(cachefname)):

        provenance = 'new download'

        # generate a jobid here and update the input params
        jobid = 'ab-gaia-%i' % time.time()
        inputparams['JOBNAME'] = jobid
        inputparams['JOBDESCRIPTION'] = 'astrobase-gaia-tap-ADQL-query'

        # bound before the try block so the catch-all handler at the end can
        # check if we got a response from the service
        req = None

        try:

            # set the gaia mirror to use; if no (valid) gaia mirror is
            # selected, pick a random one
            if gaia_mirror is not None and gaia_mirror in GAIA_URLS:
                randkey = gaia_mirror
            else:
                randkey = random.choice(list(GAIA_URLS.keys()))

            tapurl, phasekeyword, resultkeyword, data_release_table = (
                _resolve_gaia_mirror(randkey,
                                     querystr,
                                     data_release,
                                     inputparams)
            )

            #
            # send the query and get status
            #
            if verbose:

                if data_release_table is not None:
                    LOGINFO(
                        'using GAIA mirror TAP URL: %s, with table: %s' %
                        (tapurl, data_release_table)
                    )
                else:
                    # the query string names its table itself
                    LOGINFO('using GAIA mirror TAP URL: %s' % tapurl)

                LOGINFO('submitting GAIA TAP query request for input params: %s'
                        % repr(inputparams))

            # here, we'll make sure the GAIA mirror works before doing anything
            # else
            ntries = 1

            while True:

                if ntries > maxtries:

                    LOGERROR('maximum number of allowed GAIA query '
                             'submission tries (%s) reached, bailing out...' %
                             maxtries)
                    return None

                try:

                    resp = requests.post(tapurl,
                                         data=inputparams,
                                         timeout=timeout)
                    resp.raise_for_status()

                # this handles immediate 503s
                except requests.exceptions.HTTPError as e:

                    LOGWARNING(
                        'GAIA TAP server: %s raised an exception: %r, '
                        'trying another mirror...'
                        % (tapurl, e)
                    )

                # this handles initial query submission timeouts
                except requests.exceptions.Timeout:

                    LOGWARNING(
                        'GAIA TAP query submission timed out, '
                        'mirror: %s is probably down. Trying another mirror...'
                        % tapurl
                    )

                else:

                    req = resp
                    break

                # make sure not to hit current mirror again if it's down
                remainingmirrors = [x for x in GAIA_URLS if x != randkey]
                randkey = random.choice(remainingmirrors)

                tapurl, phasekeyword, resultkeyword, data_release_table = (
                    _resolve_gaia_mirror(randkey,
                                         querystr,
                                         data_release,
                                         inputparams)
                )

                # update the number of submission tries
                ntries = ntries + 1

            # NOTE: python-requests follows the "303 See Other" redirect
            # automatically, so we get the XML status doc immediately. We don't
            # need to look up the location of it in the initial response's
            # header as in the GAIA example.
            status_url = req.url

            # parse the response XML and get the job status
            resxml = parseString(req.text)
            jobstatuselems = resxml.getElementsByTagName(phasekeyword)

            if not jobstatuselems:

                LOGERROR('could not parse job phase using '
                         'keyword %s in result XML' % phasekeyword)
                LOGERROR(req.text)

                req.close()
                return None

            jobstatus = jobstatuselems[0].firstChild.toxml()

            # if the job failed already, bail out regardless of verbosity
            if jobstatus in ('ERROR', 'ABORTED'):

                LOGERROR(
                    'GAIA query failed immediately '
                    '(probably an ADQL error): %s, '
                    'status URL: %s, status contents: %s' %
                    (repr(inputparams),
                     status_url,
                     req.text)
                )
                return None

            # if the job completed already, jump down to retrieving results
            if jobstatus == 'COMPLETED':

                if verbose:
                    LOGINFO('GAIA query completed, '
                            'retrieving results...')

            # we wait for the job to complete if it's not done already
            else:

                if verbose:
                    LOGINFO(
                        'request submitted successfully, '
                        'current status is: %s. '
                        'waiting for results...' % jobstatus
                    )

                # we just got the status, so wait before asking again
                outcome, resxml, jobstatus = _wait_for_gaia_job(
                    status_url,
                    phasekeyword,
                    inputparams,
                    timeout,
                    refresh,
                    maxtimeout,
                    verbose,
                    jobstatus=jobstatus,
                    wait_first=True,
                )

                if outcome == 'maxtimeout':

                    LOGERROR('GAIA TAP query timed out '
                             'after waiting %s seconds for results.\n'
                             'request was: %s\n'
                             'status URL is: %s\n'
                             'last status was: %s' %
                             (maxtimeout,
                              repr(inputparams),
                              status_url,
                              jobstatus))

                    # here, we'll check if we're allowed to sleep on a query
                    # for a bit and return to it later if the last status
                    # was QUEUED or EXECUTING
                    if complete_query_later and jobstatus in ('EXECUTING',
                                                              'QUEUED'):

                        # write a pickle with the query params that we can
                        # pick up later to finish this query
                        savedict = inputparams.copy()

                        savedict['status_url'] = status_url
                        savedict['last_status'] = jobstatus
                        # record the mirror that actually accepted the query
                        savedict['gaia_mirror'] = randkey
                        savedict['data_release'] = data_release
                        savedict['phase_keyword'] = phasekeyword
                        savedict['result_keyword'] = resultkeyword

                        with open(incomplete_qpklf, 'wb') as outfd:
                            pickle.dump(savedict,
                                        outfd,
                                        pickle.HIGHEST_PROTOCOL)

                        LOGINFO('complete_query_later = True, '
                                'last state of query was: %s, '
                                'will resume later if this function '
                                'is called again with the same query' %
                                jobstatus)

                    return None

                if outcome != 'completed':
                    return None

            #
            # at this point, we should be ready to get the query results
            #
            outcome = _fetch_gaia_result(resxml,
                                         resultkeyword,
                                         cachefname,
                                         inputparams,
                                         timeout,
                                         verbose)

            if outcome != 'ok':
                return None

            tablefname = cachefname

        # this is the catch-all for anything not handled above, e.g. connection
        # errors on submission or unparseable responses
        except Exception:

            LOGEXCEPTION('GAIA TAP query request failed for '
                         '%s' % repr(inputparams))

            if req is not None:
                LOGERROR('HTTP response from service:\n%s' % req.text)

            return None

    ############################
    ## GET RESULTS FROM CACHE ##
    ############################

    else:

        if verbose:
            LOGINFO('getting cached GAIA query result for '
                    'request: %s' %
                    (repr(inputparams)))

        tablefname = cachefname

    #
    # all done with retrieval, now return the result dict
    #

    # return a dict pointing to the result file
    # we'll parse this later
    resdict = {'params':inputparams,
               'provenance':provenance,
               'result':tablefname}

    return resdict


def objectlist_conesearch(racenter,
                          declcenter,
                          searchradiusarcsec,
                          gaia_mirror=None,
                          data_release="dr2",
                          columns=('source_id',
                                   'ra','dec',
                                   'phot_g_mean_mag',
                                   'l','b',
                                   'parallax', 'parallax_error',
                                   'pmra','pmra_error',
                                   'pmdec','pmdec_error'),
                          extra_filter=None,
                          returnformat='csv',
                          forcefetch=False,
                          cachedir='~/.astrobase/gaia-cache',
                          verbose=True,
                          timeout=15.0,
                          refresh=2.0,
                          maxtimeout=300.0,
                          maxtries=3,
                          complete_query_later=True):
    '''This queries the GAIA TAP service for a list of objects near the coords.

    Runs a conesearch around `(racenter, declcenter)` with radius in arcsec of
    `searchradiusarcsec`. The ADQL query is built here and then handed to
    `tap_query`, which does the actual submission, waiting, and caching. The
    results are ordered by increasing distance from the search center, and
    that distance is returned in an extra `dist_arcsec` column.

    Parameters
    ----------

    racenter,declcenter : float
        The center equatorial coordinates in decimal degrees. These are written
        into the query with five decimal places.

    searchradiusarcsec : float
        The search radius of the cone-search in arcseconds. This is converted
        to degrees (divided by 3600) before being written into the query.

    gaia_mirror : {'gaia','heidelberg','vizier'} or None
        This is the key used to select a GAIA catalog mirror from the
        `GAIA_URLS` dict above. If set, the specified mirror will be used. If
        None, a random mirror chosen from that dict will be used.

    data_release: {'dr2', 'edr3'}
        The Gaia data release to use for the query.

    columns : sequence of str, or str
        This indicates which columns from the GAIA table to request for the
        objects found within the search radius. A sequence is joined with
        `', '`. A single string is used verbatim as the column list (e.g.
        `'ra, dec'`).

    extra_filter: str or None
        If this is provided, must be a valid ADQL filter string that is used to
        further filter the cone-search results. It is appended to the `where`
        clause with an `and`.

    returnformat : {'csv','votable','json'}
        The returned file format to request from the GAIA catalog service.

    forcefetch : bool
        If this is True, the query will be retried even if cached results for
        it exist.

    cachedir : str
        This points to the directory where results will be downloaded.

    verbose : bool
        If True, will indicate progress and warn of any issues.

    timeout : float
        This sets the amount of time in seconds to wait for the service to
        respond to our initial request.

    refresh : float
        This sets the amount of time in seconds to wait before checking if the
        result file is available. If the results file isn't available after
        `refresh` seconds have elapsed, the function will wait for `refresh`
        seconds continuously, until `maxtimeout` is reached or the results file
        becomes available.

    maxtimeout : float
        The maximum amount of time in seconds to wait for a result to become
        available after submitting our query request.

    maxtries : int
        The maximum number of tries (across all mirrors tried) to make to either
        submit the request or download the results, before giving up.

    complete_query_later : bool
        If set to True, a submitted query that does not return a result before
        `maxtimeout` has passed will be cancelled but its input request
        parameters and the result URL provided by the service will be saved. If
        this function is then called later with these same input request
        parameters, it will check if the query finally finished and a result is
        available. If so, will download the results instead of submitting a new
        query. If it's not done yet, will start waiting for results again. To
        force launch a new query with the same request parameters, set the
        `forcefetch` kwarg to True.

    Returns
    -------

    dict or None
        This returns whatever `tap_query` returns. On success, that is a dict
        of the following form::

            {'params':dict of the input params used for the query,
             'provenance':'cache' or 'new download',
             'result':path to the file on disk with the downloaded data table}

        `tap_query` returns None if the query or the result download failed.

    '''

    # this was generated using the awesome query generator at:
    # https://gea.esac.esa.int/archive/

    # NOTE: here we don't resolve the table name right away. this is because
    # some of the GAIA mirrors use different table names, so we leave the table
    # name to be resolved by the lower level tap_query function. this is done by
    # the {{table}} construct, which survives the .format() call below as a
    # literal {table} placeholder.
    query = (
        "select {columns}, "
        "(DISTANCE(POINT('ICRS', "
        "{{table}}.ra, {{table}}.dec), "
        "POINT('ICRS', {ra_center:.5f}, {decl_center:.5f})))*3600.0 "
        "AS dist_arcsec "
        "from {{table}} where "
        "CONTAINS(POINT('ICRS',{{table}}.ra, {{table}}.dec),"
        "CIRCLE('ICRS',{ra_center:.5f},{decl_center:.5f},"
        "{search_radius:.6f}))=1 "
        "{extra_filter_str}"
        "ORDER by dist_arcsec asc "
    )

    if extra_filter is not None:
        extra_filter_str = ' and %s ' % extra_filter
    else:
        extra_filter_str = ''

    # a bare string is itself an iterable of characters, so joining it would
    # produce 'r, a' for 'ra'; treat it as an already-formatted column list
    if isinstance(columns, str):
        column_str = columns
    else:
        column_str = ', '.join(columns)

    formatted_query = query.format(ra_center=racenter,
                                   decl_center=declcenter,
                                   search_radius=searchradiusarcsec/3600.0,
                                   extra_filter_str=extra_filter_str,
                                   columns=column_str)

    return tap_query(formatted_query,
                     gaia_mirror=gaia_mirror,
                     data_release=data_release,
                     returnformat=returnformat,
                     forcefetch=forcefetch,
                     cachedir=cachedir,
                     verbose=verbose,
                     timeout=timeout,
                     refresh=refresh,
                     maxtimeout=maxtimeout,
                     maxtries=maxtries,
                     complete_query_later=complete_query_later)


def objectlist_radeclbox(radeclbox,
                         gaia_mirror=None,
                         data_release='dr2',
                         columns=('source_id',
                                  'ra','dec',
                                  'phot_g_mean_mag',
                                  'l','b',
                                  'parallax','parallax_error',
                                  'pmra','pmra_error',
                                  'pmdec','pmdec_error'),
                         extra_filter=None,
                         returnformat='csv',
                         forcefetch=False,
                         cachedir='~/.astrobase/gaia-cache',
                         verbose=True,
                         timeout=15.0,
                         refresh=2.0,
                         maxtimeout=300.0,
                         maxtries=3,
                         complete_query_later=True):

    '''This queries the GAIA TAP service for a list of objects in an equatorial
    coordinate box.

    The box limits are converted to a center point plus a width and height for
    the ADQL `BOX` construct. The query is then handed to `tap_query`, which
    does the actual submission, waiting, and caching.

    Parameters
    ----------

    radeclbox : sequence of four floats
        This defines the box to search in::

            [ra_min, ra_max, decl_min, decl_max]

        The width and height are computed as `ra_max - ra_min` and
        `decl_max - decl_min`, so the minimum of each pair is expected to come
        first. A box that wraps through RA = 0 is not treated specially here.

    gaia_mirror : {'gaia','heidelberg','vizier'} or None
        This is the key used to select a GAIA catalog mirror from the
        `GAIA_URLS` dict above. If set, the specified mirror will be used. If
        None, a random mirror chosen from that dict will be used.

    data_release: {'dr2', 'edr3'}
        The Gaia data release to use for the query.

    columns : sequence of str, or str
        This indicates which columns from the GAIA table to request for the
        objects found inside the box. A sequence is joined with `', '`. A
        single string is used verbatim as the column list (e.g. `'ra, dec'`).

    extra_filter: str or None
        If this is provided, must be a valid ADQL filter string that is used to
        further filter the box-search results. It is appended to the `where`
        clause with an `and`.

    returnformat : {'csv','votable','json'}
        The returned file format to request from the GAIA catalog service.

    forcefetch : bool
        If this is True, the query will be retried even if cached results for
        it exist.

    cachedir : str
        This points to the directory where results will be downloaded.

    verbose : bool
        If True, will indicate progress and warn of any issues.

    timeout : float
        This sets the amount of time in seconds to wait for the service to
        respond to our initial request.

    refresh : float
        This sets the amount of time in seconds to wait before checking if the
        result file is available. If the results file isn't available after
        `refresh` seconds have elapsed, the function will wait for `refresh`
        seconds continuously, until `maxtimeout` is reached or the results file
        becomes available.

    maxtimeout : float
        The maximum amount of time in seconds to wait for a result to become
        available after submitting our query request.

    maxtries : int
        The maximum number of tries (across all mirrors tried) to make to either
        submit the request or download the results, before giving up.

    complete_query_later : bool
        If set to True, a submitted query that does not return a result before
        `maxtimeout` has passed will be cancelled but its input request
        parameters and the result URL provided by the service will be saved. If
        this function is then called later with these same input request
        parameters, it will check if the query finally finished and a result is
        available. If so, will download the results instead of submitting a new
        query. If it's not done yet, will start waiting for results again. To
        force launch a new query with the same request parameters, set the
        `forcefetch` kwarg to True.

    Returns
    -------

    dict or None
        This returns whatever `tap_query` returns. On success, that is a dict
        of the following form::

            {'params':dict of the input params used for the query,
             'provenance':'cache' or 'new download',
             'result':path to the file on disk with the downloaded data table}

        `tap_query` returns None if the query or the result download failed.

    '''

    # this was generated using the awesome query generator at:
    # https://gea.esac.esa.int/archive/

    # NOTE: here we don't resolve the table name right away. this is because
    # some of the GAIA mirrors use different table names, so we leave the table
    # name to be resolved by the lower level tap_query function. this is done by
    # the {{table}} construct, which survives the .format() call below as a
    # literal {table} placeholder.
    query = (
        "select {columns} from {{table}} where "
        "CONTAINS(POINT('ICRS',{{table}}.ra, {{table}}.dec),"
        "BOX('ICRS',{ra_center:.5f},{decl_center:.5f},"
        "{ra_width:.5f},{decl_height:.5f}))=1"
        "{extra_filter_str}"
    )

    # the BOX construct takes a center and full extents, not corner limits
    ra_min, ra_max, decl_min, decl_max = radeclbox
    ra_center = (ra_max + ra_min)/2.0
    decl_center = (decl_max + decl_min)/2.0
    ra_width = ra_max - ra_min
    decl_height = decl_max - decl_min

    if extra_filter is not None:
        extra_filter_str = ' and %s ' % extra_filter
    else:
        extra_filter_str = ''

    # a bare string is itself an iterable of characters, so joining it would
    # produce 'r, a' for 'ra'; treat it as an already-formatted column list
    if isinstance(columns, str):
        column_str = columns
    else:
        column_str = ', '.join(columns)

    formatted_query = query.format(columns=column_str,
                                   extra_filter_str=extra_filter_str,
                                   ra_center=ra_center,
                                   decl_center=decl_center,
                                   ra_width=ra_width,
                                   decl_height=decl_height)

    return tap_query(formatted_query,
                     gaia_mirror=gaia_mirror,
                     data_release=data_release,
                     returnformat=returnformat,
                     forcefetch=forcefetch,
                     cachedir=cachedir,
                     verbose=verbose,
                     timeout=timeout,
                     refresh=refresh,
                     maxtimeout=maxtimeout,
                     maxtries=maxtries,
                     complete_query_later=complete_query_later)


def objectid_search(gaiaid,
                    gaia_mirror=None,
                    data_release='dr2',
                    columns=('source_id',
                             'ra','dec',
                             'phot_g_mean_mag',
                             'phot_bp_mean_mag',
                             'phot_rp_mean_mag',
                             'l','b',
                             'parallax','parallax_error',
                             'pmra','pmra_error',
                             'pmdec','pmdec_error'),
                    returnformat='csv',
                    forcefetch=False,
                    cachedir='~/.astrobase/gaia-cache',
                    verbose=True,
                    timeout=15.0,
                    refresh=2.0,
                    maxtimeout=300.0,
                    maxtries=3,
                    complete_query_later=True):

    '''This queries the GAIA TAP service for a single GAIA source ID.

    The ADQL query is built here and then handed to `tap_query`, which does the
    actual submission, waiting, and caching.

    Parameters
    ----------

    gaiaid : str
        The source ID of the object whose info will be collected. This is
        inserted verbatim into the `source_id = ...` clause of the query.

    gaia_mirror : {'gaia','heidelberg','vizier'} or None
        This is the key used to select a GAIA catalog mirror from the
        `GAIA_URLS` dict above. If set, the specified mirror will be used. If
        None, a random mirror chosen from that dict will be used.

    data_release: {'dr2', 'edr3'}
        The Gaia data release to use for the query.

    columns : sequence of str, or str
        This indicates which columns from the GAIA table to request for the
        matching object. A sequence is joined with `', '`. A single string is
        used verbatim as the column list (e.g. `'ra, dec'`).

    returnformat : {'csv','votable','json'}
        The returned file format to request from the GAIA catalog service.

    forcefetch : bool
        If this is True, the query will be retried even if cached results for
        it exist.

    cachedir : str
        This points to the directory where results will be downloaded.

    verbose : bool
        If True, will indicate progress and warn of any issues.

    timeout : float
        This sets the amount of time in seconds to wait for the service to
        respond to our initial request.

    refresh : float
        This sets the amount of time in seconds to wait before checking if the
        result file is available. If the results file isn't available after
        `refresh` seconds have elapsed, the function will wait for `refresh`
        seconds continuously, until `maxtimeout` is reached or the results file
        becomes available.

    maxtimeout : float
        The maximum amount of time in seconds to wait for a result to become
        available after submitting our query request.

    maxtries : int
        The maximum number of tries (across all mirrors tried) to make to either
        submit the request or download the results, before giving up.

    complete_query_later : bool
        If set to True, a submitted query that does not return a result before
        `maxtimeout` has passed will be cancelled but its input request
        parameters and the result URL provided by the service will be saved. If
        this function is then called later with these same input request
        parameters, it will check if the query finally finished and a result is
        available. If so, will download the results instead of submitting a new
        query. If it's not done yet, will start waiting for results again. To
        force launch a new query with the same request parameters, set the
        `forcefetch` kwarg to True.

    Returns
    -------

    dict or None
        This returns whatever `tap_query` returns. On success, that is a dict
        of the following form::

            {'params':dict of the input params used for the query,
             'provenance':'cache' or 'new download',
             'result':path to the file on disk with the downloaded data table}

        `tap_query` returns None if the query or the result download failed.

    '''

    # NOTE: here we don't resolve the table name right away. this is because
    # some of the GAIA mirrors use different table names, so we leave the table
    # name to be resolved by the lower level tap_query function. this is done by
    # the {{table}} construct, which survives the .format() call below as a
    # literal {table} placeholder.
    query = (
        "select {columns} from {{table}} where "
        "source_id = {gaiaid}"
    )

    # a bare string is itself an iterable of characters, so joining it would
    # produce 'r, a' for 'ra'; treat it as an already-formatted column list
    if isinstance(columns, str):
        column_str = columns
    else:
        column_str = ', '.join(columns)

    formatted_query = query.format(columns=column_str,
                                   gaiaid=gaiaid)

    return tap_query(formatted_query,
                     gaia_mirror=gaia_mirror,
                     data_release=data_release,
                     returnformat=returnformat,
                     forcefetch=forcefetch,
                     cachedir=cachedir,
                     verbose=verbose,
                     timeout=timeout,
                     refresh=refresh,
                     maxtimeout=maxtimeout,
                     maxtries=maxtries,
                     complete_query_later=complete_query_later)