#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright 2019 The FATE Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from pipeline.param.base_param import BaseParam class DataSplitParam(BaseParam): """ Define data split param that used in data split. Parameters ---------- random_state : None or int, default: None Specify the random state for shuffle. test_size : float or int or None, default: 0.0 Specify test data set size. float value specifies fraction of input data set, int value specifies exact number of data instances train_size : float or int or None, default: 0.8 Specify train data set size. float value specifies fraction of input data set, int value specifies exact number of data instances validate_size : float or int or None, default: 0.2 Specify validate data set size. float value specifies fraction of input data set, int value specifies exact number of data instances stratified : bool, default: False Define whether sampling should be stratified, according to label value. shuffle : bool, default: True Define whether do shuffle before splitting or not. split_points : None or list, default : None Specify the point(s) by which continuous label values are bucketed into bins for stratified split. eg.[0.2] for two bins or [0.1, 1, 3] for 4 bins need_run: bool, default: True Specify whether to run data split """ def __init__(self, random_state=None, test_size=None, train_size=None, validate_size=None, stratified=False, shuffle=True, split_points=None, need_run=True): super(DataSplitParam, self).__init__() self.random_state = random_state self.test_size = test_size self.train_size = train_size self.validate_size = validate_size self.stratified = stratified self.shuffle = shuffle self.split_points = split_points self.need_run = need_run def check(self): model_param_descr = "data split param's " if self.random_state is not None: if not isinstance(self.random_state, int): raise ValueError(f"{model_param_descr} random state should be int type") BaseParam.check_nonnegative_number(self.random_state, f"{model_param_descr} random_state ") if self.test_size is not None: BaseParam.check_nonnegative_number(self.test_size, f"{model_param_descr} test_size ") if isinstance(self.test_size, float): BaseParam.check_decimal_float(self.test_size, f"{model_param_descr} test_size ") if self.train_size is not None: BaseParam.check_nonnegative_number(self.train_size, f"{model_param_descr} train_size ") if isinstance(self.train_size, float): BaseParam.check_decimal_float(self.train_size, f"{model_param_descr} train_size ") if self.validate_size is not None: BaseParam.check_nonnegative_number(self.validate_size, f"{model_param_descr} validate_size ") if isinstance(self.validate_size, float): BaseParam.check_decimal_float(self.validate_size, f"{model_param_descr} validate_size ") # use default size values if none given if self.test_size is None and self.train_size is None and self.validate_size is None: self.test_size = 0.0 self.train_size = 0.8 self.validate_size = 0.2 BaseParam.check_boolean(self.stratified, f"{model_param_descr} stratified ") BaseParam.check_boolean(self.shuffle, f"{model_param_descr} shuffle ") BaseParam.check_boolean(self.need_run, f"{model_param_descr} need run ") if self.split_points is not None: if not isinstance(self.split_points, list): raise ValueError(f"{model_param_descr} split_points should be list type") return True