
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# rfclass.py - Waqas Bhatti (wbhatti@astro.princeton.edu) - Dec 2017
# License: MIT. See the LICENSE file for more details.

'''Does variable classification using random forests. Two types of
classification are supported:

- Variable classification using non-periodic features: this is used to perform
  a binary classification between non-variable and variable objects. Uses the
  features in
  :py:mod:`astrobase.varclass.varfeatures` and
  :py:mod:`astrobase.varclass.starfeatures`.

- TODO: Periodic variable classification using periodic features: this is used
  to perform multi-class classification for periodic variables using the
  features in :py:mod:`astrobase.varclass.periodicfeatures` and
  :py:mod:`astrobase.varclass.starfeatures`. The classes recognized are listed
  in PERIODIC_VARCLASSES below and were generated from manual classification
  runs on various HATNet, HATSouth, and HATPI fields.

'''
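
# A minimal end-to-end sketch of how the functions below fit together. This is
# an illustrative example only: the paths, the magcol name, the labeldict
# contents, and the output filenames are all hypothetical.
#
# from astrobase.varclass import rfclass
#
# # 1. collect features from varfeatures pickles into a training set
# fdict = rfclass.collect_nonperiodic_features(
#     '/path/to/varfeatures', 'sdssr', 'collected-features.pkl',
#     labeldict={'OBJ-1': True, 'OBJ-2': False}, labeltype='binary'
# )
#
# # 2. train a random forest classifier with cross-validation
# clfdict = rfclass.train_rf_classifier(
#     fdict, classifier_to_pickle='rf-classifier.pkl'
# )
#
# # 3. apply the trained classifier to new, unlabeled objects
# results = rfclass.apply_rf_classifier(
#     clfdict, '/path/to/new-varfeatures', 'classification-results.pkl'
# )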

#############
## LOGGING ##
#############

import logging
from astrobase import log_sub, log_fmt, log_date_fmt

DEBUG = False
if DEBUG:
    level = logging.DEBUG
else:
    level = logging.INFO
LOGGER = logging.getLogger(__name__)
logging.basicConfig(
    level=level,
    style=log_sub,
    format=log_fmt,
    datefmt=log_date_fmt,
)

LOGDEBUG = LOGGER.debug
LOGINFO = LOGGER.info
LOGWARNING = LOGGER.warning
LOGERROR = LOGGER.error
LOGEXCEPTION = LOGGER.exception


#############
## IMPORTS ##
#############

import glob
import os.path
import os
import itertools
import pickle

try:
    from tqdm import tqdm
    TQDM = True
except Exception:
    TQDM = False

import numpy as np
import numpy.random as npr
# seed the numpy random generator
# we'll use RANDSEED for scipy.stats distribution functions as well
RANDSEED = 0xdecaff
npr.seed(RANDSEED)

from scipy.stats import randint as sp_randint

# scikit imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    precision_score, recall_score, confusion_matrix, f1_score
)

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt


#######################
## UTILITY FUNCTIONS ##
#######################

# Utility function to report best scores
# modified from a snippet taken from:
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html
def _gridsearch_report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            LOGINFO("Model with rank: {0}".format(i))
            LOGINFO("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            LOGINFO("Parameters: {0}".format(results['params'][candidate]))


###################################
## NON-PERIODIC VAR FEATURE LIST ##
###################################

NONPERIODIC_FEATURES_TO_COLLECT = [
    'stetsonj',
    'stetsonk',
    'amplitude',
    'magnitude_ratio',
    'linear_fit_slope',
    'eta_normal',
    'percentile_difference_flux_percentile',
    'mad',
    'skew',
    'kurtosis',
    'mag_iqr',
    'beyond1std',
    'grcolor',
    'gicolor',
    'ricolor',
    'bvcolor',
    'jhcolor',
    'jkcolor',
    'hkcolor',
    'gkcolor',
    'propermotion',
]


########################
## FEATURE COLLECTION ##
########################

def collect_nonperiodic_features(
        featuresdir,
        magcol,
        outfile,
        pklglob='varfeatures-*.pkl',
        featurestouse=NONPERIODIC_FEATURES_TO_COLLECT,
        maxobjects=None,
        labeldict=None,
        labeltype='binary',
):
    '''This collects variability features into arrays for use with the
    classifier.

    Parameters
    ----------

    featuresdir : str
        This is the directory where all the varfeatures pickles are. Use
        `pklglob` to specify the glob to search for. The `varfeatures` pickles
        contain objectids, a light curve magcol, and features as dict
        key-vals. The :py:mod:`astrobase.lcproc.lcvfeatures` module can be
        used to produce these.

    magcol : str
        This is the key in each varfeatures pickle corresponding to the magcol
        of the light curve the variability features were extracted from.

    outfile : str
        This is the filename of the output pickle that will be written
        containing a dict of all the features extracted into np.arrays.

    pklglob : str
        This is the UNIX file glob to use to search for varfeatures pickle
        files in `featuresdir`.

    featurestouse : list of str
        Each varfeatures pickle can contain any combination of non-periodic,
        stellar, and periodic features; these must have the same names as
        elements in the list of strings provided in `featurestouse`. This
        tries to get all the features listed in
        NONPERIODIC_FEATURES_TO_COLLECT by default. If `featurestouse` is
        provided as a list, gets only the features listed in this kwarg
        instead.

    maxobjects : int or None
        This controls how many pickles from the featuresdir to process. If
        None, will process all varfeatures pickles.

    labeldict : dict or None
        If this is provided, it must be a dict with the following key:val
        pairs::

            '<objectid>':<label value>

        for each objectid collected from the varfeatures pickles. This will
        turn the collected information into a training set for classifiers.

        Example: to carry out non-periodic variable feature collection of fake
        LCs prepared by :py:mod:`astrobase.fakelcs.generation`, use the value
        of the 'isvariable' dict elem from the `fakelcs-info.pkl` here, like
        so::

            labeldict={x:y for x,y in zip(fakelcinfo['objectid'],
                                          fakelcinfo['isvariable'])}

    labeltype : {'binary', 'classes'}
        This is either 'binary' or 'classes' for binary/multi-class
        classification respectively.

    Returns
    -------

    dict
        This returns a dict with all of the features collected into
        np.arrays, ready to use as input to a scikit-learn classifier.
    '''

    # list of input pickles generated by varfeatures in lcproc.py
    pklist = glob.glob(os.path.join(featuresdir, pklglob))

    if maxobjects:
        pklist = pklist[:maxobjects]

    # fancy progress bar with tqdm if present
    if TQDM:
        listiterator = tqdm(pklist)
    else:
        listiterator = pklist

    # go through all the varfeatures pickles
    feature_dict = {'objectids':[],
                    'magcol':magcol,
                    'availablefeatures':[]}

    LOGINFO('collecting features for magcol: %s' % magcol)

    for pkl in listiterator:

        with open(pkl,'rb') as infd:
            varf = pickle.load(infd)

        # update the objectid list
        objectid = varf['objectid']
        if objectid not in feature_dict['objectids']:
            feature_dict['objectids'].append(objectid)

        thisfeatures = varf[magcol]

        if featurestouse and len(featurestouse) > 0:
            featurestoget = featurestouse
        else:
            featurestoget = NONPERIODIC_FEATURES_TO_COLLECT

        # collect all the features for this magcol/objectid combination
        for feature in featurestoget:

            # update the global feature list if necessary
            if ((feature not in feature_dict['availablefeatures']) and
                (feature in thisfeatures)):
                feature_dict['availablefeatures'].append(feature)
                feature_dict[feature] = []

            if feature in thisfeatures:
                feature_dict[feature].append(thisfeatures[feature])

    # now that we've collected all the objects and their features, turn the
    # lists into arrays, and then column-stack them into a 2D feature array
    for feat in feature_dict['availablefeatures']:
        feature_dict[feat] = np.array(feature_dict[feat])

    feature_dict['objectids'] = np.array(feature_dict['objectids'])

    feature_array = np.column_stack([feature_dict[feat] for feat in
                                     feature_dict['availablefeatures']])
    feature_dict['features_array'] = feature_array

    # if there's a labeldict available, use it to generate a label array. this
    # feature collection is now a training set.
    if isinstance(labeldict, dict):

        labelarray = np.zeros(feature_dict['objectids'].size, dtype=np.int64)

        # populate the labels for each object in the training set
        for ind, objectid in enumerate(feature_dict['objectids']):

            if objectid in labeldict:

                # if this is a binary classifier training set, convert bools
                # to ones and zeros
                if labeltype == 'binary':

                    if labeldict[objectid]:
                        labelarray[ind] = 1

                # otherwise, use the actual class label integer
                elif labeltype == 'classes':
                    labelarray[ind] = labeldict[objectid]

        feature_dict['labels_array'] = labelarray

    feature_dict['kwargs'] = {'pklglob':pklglob,
                              'featurestouse':featurestouse,
                              'maxobjects':maxobjects,
                              'labeltype':labeltype}

    # write the info to the output pickle
    with open(outfile,'wb') as outfd:
        pickle.dump(feature_dict, outfd, pickle.HIGHEST_PROTOCOL)

    # return the feature_dict
    return feature_dict
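
# Example: building a training set from fake LCs made by
# astrobase.fakelcs.generation, as described in the docstring above. The
# fakelcs-info.pkl path and the varfeatures directory are hypothetical.
#
# with open('fakelcs-info.pkl', 'rb') as infd:
#     fakelcinfo = pickle.load(infd)
#
# labeldict = {x: y for x, y in zip(fakelcinfo['objectid'],
#                                   fakelcinfo['isvariable'])}
#
# fdict = collect_nonperiodic_features(
#     '/path/to/varfeatures', 'sdssr', 'collected-features.pkl',
#     labeldict=labeldict, labeltype='binary'
# )
#
# fdict['features_array'] is then an (n_objects, n_features) np.array and
# fdict['labels_array'] is the matching vector of 0/1 labels.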
#################################
## TRAINING THE RF CLASSIFIERS ##
#################################
def train_rf_classifier(
        collected_features,
        test_fraction=0.25,
        n_crossval_iterations=20,
        n_kfolds=5,
        crossval_scoring_metric='f1',
        classifier_to_pickle=None,
        nworkers=-1,
):
    '''This gets the best RF classifier after running cross-validation.

    - splits the training set into test/train samples
    - does `KFold` stratified cross-validation using `RandomizedSearchCV`
    - gets the `RandomForestClassifier` with the best performance after CV
    - gets the confusion matrix for the test set

    Runs on the output dict from functions that produce dicts similar to that
    produced by `collect_nonperiodic_features` above.

    Parameters
    ----------

    collected_features : dict or str
        This is either the dict produced by a `collect_*_features` function or
        the pickle produced by the same.

    test_fraction : float
        This sets the fraction of the input set that will be held out as the
        test set after training.

    n_crossval_iterations : int
        This sets the number of iterations to use when running the
        cross-validation.

    n_kfolds : int
        This sets the number of K-folds to use when cross-validating on the
        training data.

    crossval_scoring_metric : str
        This is a string that describes how the cross-validation score is
        calculated for each iteration. See the URL below for how to specify
        this parameter:

        http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

        By default, this is tuned for binary classification and uses the F1
        scoring metric. Change the `crossval_scoring_metric` to another metric
        (probably 'accuracy') for multi-class classification, e.g. for
        periodic variable classification.

    classifier_to_pickle : str
        If this is a string indicating the name of a pickle file to write,
        will write the trained classifier to a pickle that can later be loaded
        and used to classify data.

    nworkers : int
        This is the number of parallel workers to use in the
        RandomForestClassifier. Set to -1 to use all CPUs on your machine.

    Returns
    -------

    dict
        A dict containing the trained classifier, the cross-validation
        results, the input data set, all the input kwargs used, and the
        cross-validation score metrics.
    '''

    if (isinstance(collected_features,str) and
        os.path.exists(collected_features)):

        with open(collected_features,'rb') as infd:
            fdict = pickle.load(infd)

    elif isinstance(collected_features, dict):

        fdict = collected_features

    else:

        LOGERROR("can't figure out the input collected_features arg")
        return None

    tfeatures = fdict['features_array']
    tlabels = fdict['labels_array']
    tfeaturenames = fdict['availablefeatures']
    tmagcol = fdict['magcol']
    tobjectids = fdict['objectids']

    # split the training set into training/test samples using stratification
    # to keep the same fraction of variables/nonvariables in each split
    training_features, testing_features, training_labels, testing_labels = (
        train_test_split(
            tfeatures,
            tlabels,
            test_size=test_fraction,
            random_state=RANDSEED,
            stratify=tlabels
        )
    )

    # get a random forest classifier
    clf = RandomForestClassifier(n_jobs=nworkers,
                                 random_state=RANDSEED)

    # this is the grid def for hyperparam optimization
    rf_hyperparams = {
        "max_depth": [3, 4, 5, None],
        "n_estimators": sp_randint(100, 2000),
        "max_features": sp_randint(1, 5),
        "min_samples_split": sp_randint(2, 11),
        "min_samples_leaf": sp_randint(2, 11),
    }

    # run the stratified kfold cross-validation on training features using our
    # random forest classifier object
    cvsearch = RandomizedSearchCV(
        clf,
        param_distributions=rf_hyperparams,
        n_iter=n_crossval_iterations,
        scoring=crossval_scoring_metric,
        cv=StratifiedKFold(n_splits=n_kfolds,
                           shuffle=True,
                           random_state=RANDSEED),
        random_state=RANDSEED
    )

    LOGINFO('running randomized search CV to optimize RF hyperparameters...')
    cvsearch_classifiers = cvsearch.fit(training_features, training_labels)

    # report on the classifiers' performance
    _gridsearch_report(cvsearch_classifiers.cv_results_)

    # get the best classifier after CV is done
    bestclf = cvsearch_classifiers.best_estimator_
    bestclf_score = cvsearch_classifiers.best_score_
    bestclf_hyperparams = cvsearch_classifiers.best_params_

    # test this classifier on the testing set
    test_predicted_labels = bestclf.predict(testing_features)

    recscore = recall_score(testing_labels, test_predicted_labels)
    precscore = precision_score(testing_labels, test_predicted_labels)
    f1score = f1_score(testing_labels, test_predicted_labels)
    confmatrix = confusion_matrix(testing_labels, test_predicted_labels)

    # write the classifier, its training/testing set, and its stats to the
    # pickle if requested
    outdict = {'features':tfeatures,
               'labels':tlabels,
               'feature_names':tfeaturenames,
               'magcol':tmagcol,
               'objectids':tobjectids,
               'kwargs':{'test_fraction':test_fraction,
                         'n_crossval_iterations':n_crossval_iterations,
                         'n_kfolds':n_kfolds,
                         'crossval_scoring_metric':crossval_scoring_metric,
                         'nworkers':nworkers},
               'collect_kwargs':fdict['kwargs'],
               'testing_features':testing_features,
               'testing_labels':testing_labels,
               'training_features':training_features,
               'training_labels':training_labels,
               'best_classifier':bestclf,
               'best_score':bestclf_score,
               'best_hyperparams':bestclf_hyperparams,
               'best_recall':recscore,
               'best_precision':precscore,
               'best_f1':f1score,
               'best_confmatrix':confmatrix}

    if classifier_to_pickle:

        with open(classifier_to_pickle,'wb') as outfd:
            pickle.dump(outdict, outfd, pickle.HIGHEST_PROTOCOL)

    # return this classifier and accompanying info
    return outdict
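
# Example: training on the feature collection pickle written by
# collect_nonperiodic_features above (the filenames here are hypothetical).
#
# clfdict = train_rf_classifier('collected-features.pkl',
#                               test_fraction=0.25,
#                               classifier_to_pickle='rf-classifier.pkl')
#
# # the trained scikit-learn RandomForestClassifier itself
# bestclf = clfdict['best_classifier']
#
# # cross-validated score and held-out test-set metrics
# print(clfdict['best_score'], clfdict['best_precision'],
#       clfdict['best_recall'])
# print(clfdict['best_confmatrix'])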
def apply_rf_classifier(classifier,
                        varfeaturesdir,
                        outpickle,
                        maxobjects=None):
    '''This applies an RF classifier trained using `train_rf_classifier` to
    varfeatures pickles in `varfeaturesdir`.

    Parameters
    ----------

    classifier : dict or str
        This is the output dict or pickle created by `train_rf_classifier`.
        This will contain a `feature_names` key that will be used to collect
        the same features used to train the classifier from the varfeatures
        pickles in varfeaturesdir.

    varfeaturesdir : str
        The directory containing the varfeatures pickles for objects that will
        be classified by the trained `classifier`.

    outpickle : str
        This is a filename for the pickle that will be written containing the
        result dict from this function.

    maxobjects : int
        This sets the number of objects to process in `varfeaturesdir`.

    Returns
    -------

    dict
        The classification results after running the trained `classifier` are
        returned as a dict. This contains predicted labels and their
        prediction probabilities.

    '''

    if isinstance(classifier,str) and os.path.exists(classifier):

        with open(classifier,'rb') as infd:
            clfdict = pickle.load(infd)

    elif isinstance(classifier, dict):

        clfdict = classifier

    else:

        LOGERROR("can't figure out the input classifier arg")
        return None

    # get the features to extract from clfdict
    if 'feature_names' not in clfdict:

        LOGERROR("feature_names not present in classifier input, "
                 "can't figure out which ones to extract from "
                 "varfeature pickles in %s" % varfeaturesdir)
        return None

    # get the feature names and magcol from the classifier dict, and the
    # pklglob from its collect_kwargs elem
    featurestouse = clfdict['feature_names']
    pklglob = clfdict['collect_kwargs']['pklglob']
    magcol = clfdict['magcol']

    # extract the features used by the classifier from the varfeatures pickles
    # in varfeaturesdir using the pklglob provided
    featfile = os.path.join(
        os.path.dirname(outpickle),
        'actual-collected-features.pkl'
    )

    features = collect_nonperiodic_features(
        varfeaturesdir,
        magcol,
        featfile,
        pklglob=pklglob,
        featurestouse=featurestouse,
        maxobjects=maxobjects
    )

    # now use the trained classifier on these features
    bestclf = clfdict['best_classifier']

    predicted_labels = bestclf.predict(features['features_array'])

    # FIXME: do we need to use the probability calibration curves to fix these
    # probabilities? probably. figure out how to do this.
    predicted_label_probs = bestclf.predict_proba(
        features['features_array']
    )

    outdict = {
        'features':features,
        'featfile':featfile,
        'classifier':clfdict,
        'predicted_labels':predicted_labels,
        'predicted_label_probs':predicted_label_probs,
    }

    with open(outpickle,'wb') as outfd:
        pickle.dump(outdict, outfd, pickle.HIGHEST_PROTOCOL)

    return outdict
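
# Example: classifying new objects with a previously written classifier pickle
# (the paths here are hypothetical).
#
# results = apply_rf_classifier('rf-classifier.pkl',
#                               '/path/to/new-varfeatures',
#                               'classification-results.pkl')
#
# # predicted_label_probs has shape (n_objects, n_classes); for the binary
# # case, column 1 is the probability that an object is a variable
# variable_probs = results['predicted_label_probs'][:, 1]
# variable_objectids = results['features']['objectids'][
#     results['predicted_labels'] == 1
# ]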
######################
## PLOTTING RESULTS ##
######################
def plot_training_results(classifier, classlabels, outfile):
    '''This plots the training results from the classifier run on the
    training set.

    - plots the confusion matrix
    - plots the feature importances
    - FIXME: plot the learning curves too, see:
      http://scikit-learn.org/stable/modules/learning_curve.html

    Parameters
    ----------

    classifier : dict or str
        This is the output dict or pickle created by `train_rf_classifier`
        containing the trained classifier.

    classlabels : list of str
        This contains all of the class labels for the current classification
        problem.

    outfile : str
        This is the filename where the plots will be written.

    Returns
    -------

    str
        The path to the generated plot file.

    '''

    if isinstance(classifier,str) and os.path.exists(classifier):

        with open(classifier,'rb') as infd:
            clfdict = pickle.load(infd)

    elif isinstance(classifier, dict):

        clfdict = classifier

    else:

        LOGERROR("can't figure out the input classifier arg")
        return None

    confmatrix = clfdict['best_confmatrix']
    overall_feature_importances = clfdict[
        'best_classifier'
    ].feature_importances_

    feature_importances_per_tree = np.array([
        tree.feature_importances_
        for tree in clfdict['best_classifier'].estimators_
    ])
    stdev_feature_importances = np.std(feature_importances_per_tree, axis=0)

    feature_names = np.array(clfdict['feature_names'])

    plt.figure(figsize=(6.4*3.0, 4.8))

    # confusion matrix
    plt.subplot(121)

    classes = np.array(classlabels)
    plt.imshow(confmatrix, interpolation='nearest', cmap=plt.cm.Blues)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)
    plt.title('evaluation set confusion matrix')

    # sklearn's confusion_matrix puts the actual classes along the rows and
    # the predicted classes along the columns
    plt.ylabel('actual class')
    plt.xlabel('predicted class')

    thresh = confmatrix.max() / 2.
    for i, j in itertools.product(range(confmatrix.shape[0]),
                                  range(confmatrix.shape[1])):
        plt.text(j, i, confmatrix[i, j],
                 horizontalalignment="center",
                 color="white" if confmatrix[i, j] > thresh else "black")

    # feature importances
    plt.subplot(122)

    features = np.array(feature_names)
    sorted_ind = np.argsort(overall_feature_importances)[::-1]

    features = features[sorted_ind]
    overall_feature_importances = overall_feature_importances[sorted_ind]
    stdev_feature_importances = stdev_feature_importances[sorted_ind]

    plt.bar(np.arange(0, features.size),
            overall_feature_importances,
            yerr=stdev_feature_importances,
            width=0.8,
            color='grey')
    plt.xticks(np.arange(0, features.size), features, rotation=90)
    plt.yticks([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0])
    plt.xlim(-0.75, features.size - 1.0 + 0.75)
    plt.ylim(0.0, 0.9)
    plt.ylabel('relative importance')
    plt.title('relative importance of features')

    plt.subplots_adjust(wspace=0.1)

    plt.savefig(outfile, bbox_inches='tight', dpi=100)
    plt.close('all')

    return outfile
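
# Example: plotting the confusion matrix and feature importances for a trained
# binary classifier. The class labels are given in the order of their integer
# label values, i.e. 0 -> 'nonvariable', 1 -> 'variable' (the filenames here
# are hypothetical).
#
# plot_training_results('rf-classifier.pkl',
#                       ['nonvariable', 'variable'],
#                       'rf-classifier-results.png')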