
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# rfclass.py - Waqas Bhatti (wbhatti@astro.princeton.edu) - Dec 2017
# License: MIT. See the LICENSE file for more details.

'''Does variable classification using random forests. Two types of
classification are supported:

- Variable classification using non-periodic features: this is used to perform
  a binary classification between non-variable and variable objects. Uses the
  features in
  :py:mod:`astrobase.varclass.varfeatures` and
  :py:mod:`astrobase.varclass.starfeatures`.

- TODO: Periodic variable classification using periodic features: this is used
  to perform multi-class classification for periodic variables using the
  features in :py:mod:`astrobase.varclass.periodicfeatures` and
  :py:mod:`astrobase.varclass.starfeatures`. The classes recognized are listed
  in PERIODIC_VARCLASSES below and were generated from manual classification
  runs on various HATNet, HATSouth, and HATPI fields.

'''
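
# A minimal end-to-end sketch of how the functions below fit together. This is
# an illustrative example only: the paths, the magcol name, the labeldict
# contents, and the output filenames are all hypothetical.
#
# from astrobase.varclass import rfclass
#
# # 1. collect features from varfeatures pickles into a training set
# fdict = rfclass.collect_nonperiodic_features(
#     '/path/to/varfeatures', 'sdssr', 'collected-features.pkl',
#     labeldict={'OBJ-1': True, 'OBJ-2': False}, labeltype='binary'
# )
#
# # 2. train a random forest classifier with cross-validation
# clfdict = rfclass.train_rf_classifier(
#     fdict, classifier_to_pickle='rf-classifier.pkl'
# )
#
# # 3. apply the trained classifier to new, unlabeled objects
# results = rfclass.apply_rf_classifier(
#     clfdict, '/path/to/new-varfeatures', 'classification-results.pkl'
# )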

#############
## LOGGING ##
#############

import logging
from astrobase import log_sub, log_fmt, log_date_fmt

DEBUG = False
if DEBUG:
    level = logging.DEBUG
else:
    level = logging.INFO
LOGGER = logging.getLogger(__name__)
logging.basicConfig(
    level=level,
    style=log_sub,
    format=log_fmt,
    datefmt=log_date_fmt,
)

LOGDEBUG = LOGGER.debug
LOGINFO = LOGGER.info
LOGWARNING = LOGGER.warning
LOGERROR = LOGGER.error
LOGEXCEPTION = LOGGER.exception


#############
## IMPORTS ##
#############

import glob
import os.path
import os
import itertools
import pickle

try:
    from tqdm import tqdm
    TQDM = True
except Exception:
    TQDM = False

import numpy as np
import numpy.random as npr
# seed the numpy random generator
# we'll use RANDSEED for scipy.stats distribution functions as well
RANDSEED = 0xdecaff
npr.seed(RANDSEED)

from scipy.stats import randint as sp_randint

# scikit imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    precision_score, recall_score, confusion_matrix, f1_score
)

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt


#######################
## UTILITY FUNCTIONS ##
#######################

# Utility function to report best scores
# modified from a snippet taken from:
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html
def _gridsearch_report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            LOGINFO("Model with rank: {0}".format(i))
            LOGINFO("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            LOGINFO("Parameters: {0}".format(results['params'][candidate]))


###################################
## NON-PERIODIC VAR FEATURE LIST ##
###################################

NONPERIODIC_FEATURES_TO_COLLECT = [
    'stetsonj',
    'stetsonk',
    'amplitude',
    'magnitude_ratio',
    'linear_fit_slope',
    'eta_normal',
    'percentile_difference_flux_percentile',
    'mad',
    'skew',
    'kurtosis',
    'mag_iqr',
    'beyond1std',
    'grcolor',
    'gicolor',
    'ricolor',
    'bvcolor',
    'jhcolor',
    'jkcolor',
    'hkcolor',
    'gkcolor',
    'propermotion',
]


########################
## FEATURE COLLECTION ##
########################

def collect_nonperiodic_features(
        featuresdir,
        magcol,
        outfile,
        pklglob='varfeatures-*.pkl',
        featurestouse=NONPERIODIC_FEATURES_TO_COLLECT,
        maxobjects=None,
        labeldict=None,
        labeltype='binary',
):
    '''This collects variability features into arrays for use with the
    classifier.

    Parameters
    ----------

    featuresdir : str
        This is the directory where all the varfeatures pickles are. Use
        `pklglob` to specify the glob to search for. The `varfeatures` pickles
        contain objectids, a light curve magcol, and features as dict
        key-vals. The :py:mod:`astrobase.lcproc.lcvfeatures` module can be
        used to produce these.

    magcol : str
        This is the key in each varfeatures pickle corresponding to the magcol
        of the light curve the variability features were extracted from.

    outfile : str
        This is the filename of the output pickle that will be written
        containing a dict of all the features extracted into np.arrays.

    pklglob : str
        This is the UNIX file glob to use to search for varfeatures pickle
        files in `featuresdir`.

    featurestouse : list of str
        Each varfeatures pickle can contain any combination of non-periodic,
        stellar, and periodic features; these must have the same names as
        elements in the list of strings provided in `featurestouse`. This
        tries to get all the features listed in
        NONPERIODIC_FEATURES_TO_COLLECT by default. If `featurestouse` is
        provided as a list, gets only the features listed in this kwarg
        instead.

    maxobjects : int or None
        This controls how many pickles from the featuresdir to process. If
        None, will process all varfeatures pickles.

    labeldict : dict or None
        If this is provided, it must be a dict with the following key:val
        pairs::

            '<objectid>':<label value>

        for each objectid collected from the varfeatures pickles. This will
        turn the collected information into a training set for classifiers.

        Example: to carry out non-periodic variable feature collection of fake
        LCs prepared by :py:mod:`astrobase.fakelcs.generation`, use the value
        of the 'isvariable' dict elem from the `fakelcs-info.pkl` here, like
        so::

            labeldict={x:y for x,y in zip(fakelcinfo['objectid'],
                                          fakelcinfo['isvariable'])}

    labeltype : {'binary', 'classes'}
        This is either 'binary' or 'classes' for binary/multi-class
        classification respectively.

    Returns
    -------

    dict
        This returns a dict with all of the features collected into
        np.arrays, ready to use as input to a scikit-learn classifier.
    '''

    # list of input pickles generated by varfeatures in lcproc.py
    pklist = glob.glob(os.path.join(featuresdir, pklglob))

    if maxobjects:
        pklist = pklist[:maxobjects]

    # fancy progress bar with tqdm if present
    if TQDM:
        listiterator = tqdm(pklist)
    else:
        listiterator = pklist

    # go through all the varfeatures pickles
    feature_dict = {'objectids':[],
                    'magcol':magcol,
                    'availablefeatures':[]}

    LOGINFO('collecting features for magcol: %s' % magcol)

    for pkl in listiterator:

        with open(pkl,'rb') as infd:
            varf = pickle.load(infd)

        # update the objectid list
        objectid = varf['objectid']
        if objectid not in feature_dict['objectids']:
            feature_dict['objectids'].append(objectid)

        thisfeatures = varf[magcol]

        if featurestouse and len(featurestouse) > 0:
            featurestoget = featurestouse
        else:
            featurestoget = NONPERIODIC_FEATURES_TO_COLLECT

        # collect all the features for this magcol/objectid combination
        for feature in featurestoget:

            # update the global feature list if necessary
            if ((feature not in feature_dict['availablefeatures']) and
                (feature in thisfeatures)):
                feature_dict['availablefeatures'].append(feature)
                feature_dict[feature] = []

            if feature in thisfeatures:
                feature_dict[feature].append(thisfeatures[feature])

    # now that we've collected all the objects and their features, turn the
    # lists into arrays, and then column-stack them into a 2D feature array
    for feat in feature_dict['availablefeatures']:
        feature_dict[feat] = np.array(feature_dict[feat])

    feature_dict['objectids'] = np.array(feature_dict['objectids'])

    feature_array = np.column_stack([feature_dict[feat] for feat in
                                     feature_dict['availablefeatures']])
    feature_dict['features_array'] = feature_array

    # if there's a labeldict available, use it to generate a label array. this
    # feature collection is now a training set.
    if isinstance(labeldict, dict):

        labelarray = np.zeros(feature_dict['objectids'].size, dtype=np.int64)

        # populate the labels for each object in the training set
        for ind, objectid in enumerate(feature_dict['objectids']):

            if objectid in labeldict:

                # if this is a binary classifier training set, convert bools
                # to ones and zeros
                if labeltype == 'binary':

                    if labeldict[objectid]:
                        labelarray[ind] = 1

                # otherwise, use the actual class label integer
                elif labeltype == 'classes':
                    labelarray[ind] = labeldict[objectid]

        feature_dict['labels_array'] = labelarray

    feature_dict['kwargs'] = {'pklglob':pklglob,
                              'featurestouse':featurestouse,
                              'maxobjects':maxobjects,
                              'labeltype':labeltype}

    # write the info to the output pickle
    with open(outfile,'wb') as outfd:
        pickle.dump(feature_dict, outfd, pickle.HIGHEST_PROTOCOL)

    # return the feature_dict
    return feature_dict
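
# Example: building a training set from fake LCs made by
# astrobase.fakelcs.generation, as described in the docstring above. The
# fakelcs-info.pkl path and the varfeatures directory are hypothetical.
#
# with open('fakelcs-info.pkl', 'rb') as infd:
#     fakelcinfo = pickle.load(infd)
#
# labeldict = {x: y for x, y in zip(fakelcinfo['objectid'],
#                                   fakelcinfo['isvariable'])}
#
# fdict = collect_nonperiodic_features(
#     '/path/to/varfeatures', 'sdssr', 'collected-features.pkl',
#     labeldict=labeldict, labeltype='binary'
# )
#
# fdict['features_array'] is then an (n_objects, n_features) np.array and
# fdict['labels_array'] is the matching vector of 0/1 labels.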
#################################
## TRAINING THE RF CLASSIFIERS ##
#################################
def train_rf_classifier(
        collected_features,
        test_fraction=0.25,
        n_crossval_iterations=20,
        n_kfolds=5,
        crossval_scoring_metric='f1',
        classifier_to_pickle=None,
        nworkers=-1,
):
    '''This gets the best RF classifier after running cross-validation.

    - splits the training set into test/train samples
    - does `KFold` stratified cross-validation using `RandomizedSearchCV`
    - gets the `RandomForestClassifier` with the best performance after CV
    - gets the confusion matrix for the test set

    Runs on the output dict from functions that produce dicts similar to that
    produced by `collect_nonperiodic_features` above.

    Parameters
    ----------

    collected_features : dict or str
        This is either the dict produced by a `collect_*_features` function or
        the pickle produced by the same.

    test_fraction : float
        This sets the fraction of the input set that will be held out as the
        test set after training.

    n_crossval_iterations : int
        This sets the number of iterations to use when running the
        cross-validation.

    n_kfolds : int
        This sets the number of K-folds to use when cross-validating on the
        training data.

    crossval_scoring_metric : str
        This is a string that describes how the cross-validation score is
        calculated for each iteration. See the URL below for how to specify
        this parameter:

        http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

        By default, this is tuned for binary classification and uses the F1
        scoring metric. Change the `crossval_scoring_metric` to another metric
        (probably 'accuracy') for multi-class classification, e.g. for
        periodic variable classification.

    classifier_to_pickle : str
        If this is a string indicating the name of a pickle file to write,
        will write the trained classifier to a pickle that can later be loaded
        and used to classify data.

    nworkers : int
        This is the number of parallel workers to use in the
        RandomForestClassifier. Set to -1 to use all CPUs on your machine.

    Returns
    -------

    dict
        A dict containing the trained classifier, the cross-validation
        results, the input data set, all the input kwargs used, and the
        cross-validation score metrics.
    '''

    if (isinstance(collected_features,str) and
        os.path.exists(collected_features)):

        with open(collected_features,'rb') as infd:
            fdict = pickle.load(infd)

    elif isinstance(collected_features, dict):

        fdict = collected_features

    else:

        LOGERROR("can't figure out the input collected_features arg")
        return None

    tfeatures = fdict['features_array']
    tlabels = fdict['labels_array']
    tfeaturenames = fdict['availablefeatures']
    tmagcol = fdict['magcol']
    tobjectids = fdict['objectids']

    # split the training set into training/test samples using stratification
    # to keep the same fraction of variables/nonvariables in each split
    training_features, testing_features, training_labels, testing_labels = (
        train_test_split(
            tfeatures,
            tlabels,
            test_size=test_fraction,
            random_state=RANDSEED,
            stratify=tlabels
        )
    )

    # get a random forest classifier
    clf = RandomForestClassifier(n_jobs=nworkers,
                                 random_state=RANDSEED)

    # this is the grid def for hyperparam optimization
    rf_hyperparams = {
        "max_depth": [3, 4, 5, None],
        "n_estimators": sp_randint(100, 2000),
        "max_features": sp_randint(1, 5),
        "min_samples_split": sp_randint(2, 11),
        "min_samples_leaf": sp_randint(2, 11),
    }

    # run the stratified kfold cross-validation on training features using our
    # random forest classifier object
    cvsearch = RandomizedSearchCV(
        clf,
        param_distributions=rf_hyperparams,
        n_iter=n_crossval_iterations,
        scoring=crossval_scoring_metric,
        cv=StratifiedKFold(n_splits=n_kfolds,
                           shuffle=True,
                           random_state=RANDSEED),
        random_state=RANDSEED
    )

    LOGINFO('running randomized search CV to optimize RF hyperparameters...')
    cvsearch_classifiers = cvsearch.fit(training_features, training_labels)

    # report on the classifiers' performance
    _gridsearch_report(cvsearch_classifiers.cv_results_)

    # get the best classifier after CV is done
    bestclf = cvsearch_classifiers.best_estimator_
    bestclf_score = cvsearch_classifiers.best_score_
    bestclf_hyperparams = cvsearch_classifiers.best_params_

    # test this classifier on the testing set
    test_predicted_labels = bestclf.predict(testing_features)

    recscore = recall_score(testing_labels, test_predicted_labels)
    precscore = precision_score(testing_labels, test_predicted_labels)
    f1score = f1_score(testing_labels, test_predicted_labels)
    confmatrix = confusion_matrix(testing_labels, test_predicted_labels)

    # write the classifier, its training/testing set, and its stats to the
    # pickle if requested
    outdict = {'features':tfeatures,
               'labels':tlabels,
               'feature_names':tfeaturenames,
               'magcol':tmagcol,
               'objectids':tobjectids,
               'kwargs':{'test_fraction':test_fraction,
                         'n_crossval_iterations':n_crossval_iterations,
                         'n_kfolds':n_kfolds,
                         'crossval_scoring_metric':crossval_scoring_metric,
                         'nworkers':nworkers},
               'collect_kwargs':fdict['kwargs'],
               'testing_features':testing_features,
               'testing_labels':testing_labels,
               'training_features':training_features,
               'training_labels':training_labels,
               'best_classifier':bestclf,
               'best_score':bestclf_score,
               'best_hyperparams':bestclf_hyperparams,
               'best_recall':recscore,
               'best_precision':precscore,
               'best_f1':f1score,
               'best_confmatrix':confmatrix}

    if classifier_to_pickle:

        with open(classifier_to_pickle,'wb') as outfd:
            pickle.dump(outdict, outfd, pickle.HIGHEST_PROTOCOL)

    # return this classifier and accompanying info
    return outdict
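
# Example: training on the feature collection pickle written by
# collect_nonperiodic_features above (the filenames here are hypothetical).
#
# clfdict = train_rf_classifier('collected-features.pkl',
#                               test_fraction=0.25,
#                               classifier_to_pickle='rf-classifier.pkl')
#
# # the trained scikit-learn RandomForestClassifier itself
# bestclf = clfdict['best_classifier']
#
# # cross-validated score and held-out test-set metrics
# print(clfdict['best_score'], clfdict['best_precision'],
#       clfdict['best_recall'])
# print(clfdict['best_confmatrix'])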
def apply_rf_classifier(classifier,
                        varfeaturesdir,
                        outpickle,
                        maxobjects=None):
    '''This applies an RF classifier trained using `train_rf_classifier` to
    varfeatures pickles in `varfeaturesdir`.

    Parameters
    ----------

    classifier : dict or str
        This is the output dict or pickle created by `train_rf_classifier`.
        This will contain a `feature_names` key that will be used to collect
        the same features used to train the classifier from the varfeatures
        pickles in varfeaturesdir.

    varfeaturesdir : str
        The directory containing the varfeatures pickles for objects that will
        be classified by the trained `classifier`.

    outpickle : str
        This is a filename for the pickle that will be written containing the
        result dict from this function.

    maxobjects : int
        This sets the number of objects to process in `varfeaturesdir`.

    Returns
    -------

    dict
        The classification results after running the trained `classifier` are
        returned as a dict. This contains predicted labels and their
        prediction probabilities.

    '''

    if isinstance(classifier,str) and os.path.exists(classifier):

        with open(classifier,'rb') as infd:
            clfdict = pickle.load(infd)

    elif isinstance(classifier, dict):

        clfdict = classifier

    else:

        LOGERROR("can't figure out the input classifier arg")
        return None

    # get the features to extract from clfdict
    if 'feature_names' not in clfdict:

        LOGERROR("feature_names not present in classifier input, "
                 "can't figure out which ones to extract from "
                 "varfeature pickles in %s" % varfeaturesdir)
        return None

    # get the feature names and magcol from the classifier dict, and the
    # pklglob from its collect_kwargs elem
    featurestouse = clfdict['feature_names']
    pklglob = clfdict['collect_kwargs']['pklglob']
    magcol = clfdict['magcol']

    # extract the features used by the classifier from the varfeatures pickles
    # in varfeaturesdir using the pklglob provided
    featfile = os.path.join(
        os.path.dirname(outpickle),
        'actual-collected-features.pkl'
    )

    features = collect_nonperiodic_features(
        varfeaturesdir,
        magcol,
        featfile,
        pklglob=pklglob,
        featurestouse=featurestouse,
        maxobjects=maxobjects
    )

    # now use the trained classifier on these features
    bestclf = clfdict['best_classifier']

    predicted_labels = bestclf.predict(features['features_array'])

    # FIXME: do we need to use the probability calibration curves to fix these
    # probabilities? probably. figure out how to do this.
    predicted_label_probs = bestclf.predict_proba(
        features['features_array']
    )

    outdict = {
        'features':features,
        'featfile':featfile,
        'classifier':clfdict,
        'predicted_labels':predicted_labels,
        'predicted_label_probs':predicted_label_probs,
    }

    with open(outpickle,'wb') as outfd:
        pickle.dump(outdict, outfd, pickle.HIGHEST_PROTOCOL)

    return outdict
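
# Example: classifying new objects with a previously written classifier pickle
# (the paths here are hypothetical).
#
# results = apply_rf_classifier('rf-classifier.pkl',
#                               '/path/to/new-varfeatures',
#                               'classification-results.pkl')
#
# # predicted_label_probs has shape (n_objects, n_classes); for the binary
# # case, column 1 is the probability that an object is a variable
# variable_probs = results['predicted_label_probs'][:, 1]
# variable_objectids = results['features']['objectids'][
#     results['predicted_labels'] == 1
# ]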
######################
## PLOTTING RESULTS ##
######################
def plot_training_results(classifier, classlabels, outfile):
    '''This plots the training results from the classifier run on the
    training set.

    - plots the confusion matrix
    - plots the feature importances
    - FIXME: plot the learning curves too, see:
      http://scikit-learn.org/stable/modules/learning_curve.html

    Parameters
    ----------

    classifier : dict or str
        This is the output dict or pickle created by `train_rf_classifier`
        containing the trained classifier.

    classlabels : list of str
        This contains all of the class labels for the current classification
        problem.

    outfile : str
        This is the filename where the plots will be written.

    Returns
    -------

    str
        The path to the generated plot file.

    '''

    if isinstance(classifier,str) and os.path.exists(classifier):

        with open(classifier,'rb') as infd:
            clfdict = pickle.load(infd)

    elif isinstance(classifier, dict):

        clfdict = classifier

    else:

        LOGERROR("can't figure out the input classifier arg")
        return None

    confmatrix = clfdict['best_confmatrix']
    overall_feature_importances = clfdict[
        'best_classifier'
    ].feature_importances_

    feature_importances_per_tree = np.array([
        tree.feature_importances_
        for tree in clfdict['best_classifier'].estimators_
    ])
    stdev_feature_importances = np.std(feature_importances_per_tree, axis=0)

    feature_names = np.array(clfdict['feature_names'])

    plt.figure(figsize=(6.4*3.0, 4.8))

    # confusion matrix
    plt.subplot(121)

    classes = np.array(classlabels)
    plt.imshow(confmatrix, interpolation='nearest', cmap=plt.cm.Blues)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)
    plt.title('evaluation set confusion matrix')

    # sklearn's confusion_matrix puts the actual classes along the rows and
    # the predicted classes along the columns
    plt.ylabel('actual class')
    plt.xlabel('predicted class')

    thresh = confmatrix.max() / 2.
    for i, j in itertools.product(range(confmatrix.shape[0]),
                                  range(confmatrix.shape[1])):
        plt.text(j, i, confmatrix[i, j],
                 horizontalalignment="center",
                 color="white" if confmatrix[i, j] > thresh else "black")

    # feature importances
    plt.subplot(122)

    features = np.array(feature_names)
    sorted_ind = np.argsort(overall_feature_importances)[::-1]

    features = features[sorted_ind]
    overall_feature_importances = overall_feature_importances[sorted_ind]
    stdev_feature_importances = stdev_feature_importances[sorted_ind]

    plt.bar(np.arange(0, features.size),
            overall_feature_importances,
            yerr=stdev_feature_importances,
            width=0.8,
            color='grey')
    plt.xticks(np.arange(0, features.size), features, rotation=90)
    plt.yticks([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0])
    plt.xlim(-0.75, features.size - 1.0 + 0.75)
    plt.ylim(0.0, 0.9)
    plt.ylabel('relative importance')
    plt.title('relative importance of features')

    plt.subplots_adjust(wspace=0.1)

    plt.savefig(outfile, bbox_inches='tight', dpi=100)
    plt.close('all')

    return outfile
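
# Example: plotting the confusion matrix and feature importances for a trained
# binary classifier. The class labels are given in the order of their integer
# label values, i.e. 0 -> 'nonvariable', 1 -> 'variable' (the filenames here
# are hypothetical).
#
# plot_training_results('rf-classifier.pkl',
#                       ['nonvariable', 'variable'],
#                       'rf-classifier-results.png')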