Wrap a Sklearn-compatible Primitive¶
This tutorial walks through and explains selected parts of the SKRandomForestClassifier primitive code. For more information on DOs and DON'Ts, please visit Write a Good Primitive.
SKRandomForestClassifier Example¶
# Custom import commands if any
import os
from collections import OrderedDict
from typing import Any, Dict, List, Optional, Sequence, Union

import numpy
import pandas
import sklearn.tree
from numpy import ndarray
from scipy import sparse
from sklearn.ensemble.forest import RandomForestClassifier

from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
These are the necessary imports from the d3m core which are used in every primitive, along with our custom import of RandomForestClassifier from sklearn. Since we are wrapping a supervised classification primitive, we must also import the SupervisedLearnerPrimitiveBase base class from d3m.
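Later in the file the class declaration refers to Inputs and Outputs type aliases. These are the container types the primitive accepts and returns; in sklearn-wrap they are simply d3m DataFrames, along the lines of:

Inputs = d3m_dataframe
Outputs = d3m_dataframe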
class Params(params.Params):
    estimators_: Optional[List[sklearn.tree.DecisionTreeClassifier]]
    classes_: Optional[Union[ndarray, List[ndarray]]]
    n_classes_: Optional[Union[int, List[int]]]
    n_features_: Optional[int]
    n_outputs_: Optional[int]
    oob_score_: Optional[float]
    oob_decision_function_: Optional[ndarray]
    base_estimator_: Optional[object]
    estimator_params: Optional[tuple]
    base_estimator: Optional[object]
    input_column_names: Optional[pandas.core.indexes.base.Index]
    target_names_: Optional[Sequence[Any]]
    training_indices_: Optional[Sequence[int]]
    target_column_indices_: Optional[Sequence[int]]
    target_columns_metadata_: Optional[List[OrderedDict]]
Next we define our primitive's parameters as class attributes with type annotations. The majority of this list comprises the fitted attributes from the sklearn source code. In addition, we have added a few more parameters that hold information we would like to store during fitting, such as target_names_ and training_indices_. More information can be found in Parameters.
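If you are wrapping a simpler estimator, a much smaller Params class is usually enough. A minimal, hypothetical sketch (the attribute names are illustrative, not part of any API):

class MyParams(params.Params):
    # Hypothetical example: store just enough to restore a fitted model.
    coefficients_: Optional[ndarray]
    training_indices_: Optional[Sequence[int]]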
class Hyperparams(hyperparams.Hyperparams):
    n_estimators = hyperparams.Bounded[int](
        default=10,
        lower=1,
        upper=None,
        description='The number of trees in the forest.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    criterion = hyperparams.Enumeration[str](
        values=['gini', 'entropy'],
        default='gini',
        description='The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. Note: this parameter is tree-specific.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    # ...
    use_inputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
    )
    use_outputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
    )
    exclude_inputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
    )
    exclude_outputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
    )
    return_result = hyperparams.Enumeration(
        values=['append', 'replace', 'new'],
        default='new',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
    )
    use_semantic_types = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe.",
    )
    add_index_columns = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
    )
    error_on_no_input = hyperparams.UniformBool(
        default=True,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
    )
    return_semantic_type = hyperparams.Enumeration[str](
        values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
        default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
        description='Decides what semantic type to attach to generated output',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
    )
Above we added the Hyper-parameters from the sklearn source code (abridged in the example block above). Following those, we add the standard d3m Hyper-parameters to the Hyperparams class. These include return_result, which indicates whether the output should be appended to the original dataframe (append), replace the altered columns in the original dataframe (replace), or simply be returned as is (new). More information can be found in Hyper-parameters.
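As a quick illustration of how these Hyper-parameters are consumed, here is a hedged sketch of constructing the class with its defaults and overriding a few values (get_hyperparams() and defaults() come from the d3m metadata API; the override values are arbitrary):

hyperparams_class = SKRandomForestClassifier.metadata.get_hyperparams()
hp = hyperparams_class.defaults().replace({
    'n_estimators': 100,        # tuning parameter override
    'return_result': 'append',  # append predictions to the input dataframe
})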
class SKRandomForestClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams],
                               ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]):
    metadata = metadata_base.PrimitiveMetadata({
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.RANDOM_FOREST, ],
        "name": "sklearn.ensemble.forest.RandomForestClassifier",
        "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION,
        "python_path": "d3m.primitives.classification.random_forest.SKlearn",
        "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html']},
        "version": "2020.12.1",
        "id": "1dd82833-5692-39cb-84fb-2455683075f3",
        "hyperparams_to_tune": ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'],
        'installation': [
            {'type': metadata_base.PrimitiveInstallationType.PIP,
             'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
                 git_commit=utils.current_git_commit(os.path.dirname(__file__)),
             ),
            }]
    })
We then add the primitive metadata, which describes the algorithm type and family, name, id, etc. The id should be unique for every primitive, and we recommend using uuid4 to generate it. We also list our recommendations for which Hyper-parameters to tune in hyperparams_to_tune. More information can be found in Primitive metadata and Primitive family.
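For example, a fresh id can be generated once with the standard library and pasted into the metadata block; it must stay fixed afterwards so the primitive keeps a stable identity:

import uuid

# Run once and copy the output into the metadata; do not regenerate it on
# every import, or the primitive's identity would change between releases.
print(uuid.uuid4())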
    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None,
                 _verbose: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = RandomForestClassifier(
            n_estimators=self.hyperparams['n_estimators'],
            criterion=self.hyperparams['criterion'],
            # ...
            random_state=self.random_seed,
            verbose=_verbose
        )
        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False
        self._new_training_data = False
In our __init__ we initialize all of our parameters as well as the RandomForestClassifier. Note: you should use self.random_seed for random_state instead of adding it as a Hyper-parameter.
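A hedged sketch of instantiating the primitive the way a TA2 system would (the seed value is arbitrary):

hyperparams_class = SKRandomForestClassifier.metadata.get_hyperparams()
primitive = SKRandomForestClassifier(
    hyperparams=hyperparams_class.defaults(),
    random_seed=42,  # forwarded to sklearn's random_state inside __init__
)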
    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._inputs = inputs
        self._outputs = outputs
        self._fitted = False
        self._new_training_data = True
Next we add our set_training_data method, which will be used by TA2 systems to set the inputs and outputs. Any pre-processing or data selection should be done in the fit method instead of set_training_data. More information can be found in Input/Output types.
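A hedged sketch of the call sequence a TA2 system follows; train_inputs and train_outputs are hypothetical d3m DataFrames produced by earlier pipeline steps:

# train_inputs / train_outputs would come from earlier pipeline steps
# (e.g. dataset-to-dataframe conversion and column parsing).
primitive.set_training_data(inputs=train_inputs, outputs=train_outputs)
primitive.fit()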
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns.astype(str)

        if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
            self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
            sk_training_output = self._training_outputs.values

            shape = sk_training_output.shape
            if len(shape) == 2 and shape[1] == 1:
                sk_training_output = numpy.ravel(sk_training_output)

            self._clf.fit(self._training_inputs, sk_training_output)
            self._fitted = True

        return CallResult(None)
In the fit method we select the training inputs and outputs using self._get_columns_to_fit and self._get_targets. TA2 systems can choose to use semantic types for filtering columns in the input dataframe; this is controlled by the Hyper-parameter use_semantic_types. self._get_columns_to_fit and self._get_targets should check use_semantic_types and support both methods of selecting columns. We then fit the primitive on the selected training inputs and outputs.
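The helper methods themselves are not shown here, but the semantic-type branch of such a column filter might look roughly like the following sketch (_can_use_column is a hypothetical helper; the real _get_columns_to_fit in sklearn-wrap is more involved):

def _can_use_column(inputs_metadata, column_index: int) -> bool:
    # Query the per-column metadata and accept only columns marked as attributes.
    column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))
    semantic_types = column_metadata.get('semantic_types', ())
    return 'https://metadata.datadrivendiscovery.org/types/Attribute' in semantic_types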
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        # Check the fitted state up front, before attempting any prediction.
        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
        output = []
        if len(sk_inputs.columns):
            sk_output = self._clf.predict(sk_inputs)
            if sparse.issparse(sk_output):
                sk_output = pandas.DataFrame.sparse.from_spmatrix(sk_output)
            output = self._wrap_predictions(inputs, sk_output)
            output.columns = self._target_names
            output = [output]
        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._target_column_indices,
                                             columns_list=output)
        return CallResult(outputs)
In the produce method we use our fitted model to predict the outputs. We then use self._wrap_predictions to add metadata to the predicted output and attach the target column names. Finally, combine_columns returns the appropriate return_result and adds the d3mIndex column. Produce methods, along with some other methods, return their results wrapped in CallResult.
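A hedged usage sketch, assuming test_inputs is a d3m DataFrame prepared the same way as the training data (the name is hypothetical):

result = primitive.produce(inputs=test_inputs)
# result.value is a d3m DataFrame; with return_result='new' it contains
# the d3mIndex column plus the predicted target column.
predictions = result.value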
    def get_params(self) -> Params:
        if not self._fitted:
            return Params(
                estimators_=None,
                classes_=None,
                # ...
                target_columns_metadata_=self._target_columns_metadata
            )

        return Params(
            estimators_=getattr(self._clf, 'estimators_', None),
            classes_=getattr(self._clf, 'classes_', None),
            # ...
            target_columns_metadata_=self._target_columns_metadata
        )
    def set_params(self, *, params: Params) -> None:
        self._clf.estimators_ = params['estimators_']
        self._clf.classes_ = params['classes_']
        # ...
        self._target_columns_metadata = params['target_columns_metadata_']

        if params['estimators_'] is not None:
            self._fitted = True
        if params['classes_'] is not None:
            self._fitted = True
        # ...
An instance of the d3m.metadata.params subclass should be returned from the primitive's get_params method and accepted by its set_params method. All model attributes and custom parameters declared in the Params subclass should also be handled in both get_params and set_params.
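A hedged sketch of how a TA2 system might use this pair to save and restore a fitted primitive; pickling the Params instance is a common pattern, and the variable names are illustrative:

import pickle

params = primitive.get_params()   # snapshot of the fitted state
blob = pickle.dumps(params)       # Params behaves like a typed dict and pickles

restored = SKRandomForestClassifier(hyperparams=primitive.hyperparams)
restored.set_params(params=pickle.loads(blob))
# `restored` can now call produce() without being fit again.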