- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- #
- # Copyright 2019 The FATE Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- import copy
- from pipeline.param.base_param import BaseParam
- from pipeline.param.encrypt_param import EncryptParam
- from pipeline.param import consts
class TransformParam(BaseParam):
    """
    Define how to transfer the cols

    Parameters
    ----------
    transform_cols : list of column index, default: -1
        Specify which columns need to be transform. If column index is None, None of columns will be transformed.
        If it is -1, it will use same columns as cols in binning module.
        Note that columns specified by `transform_cols` and `transform_names` will be combined.

    transform_names : list of string, default: None
        Specify which columns need to calculated. Each element in the list represent for a column name in header.
        Note that columns specified by `transform_cols` and `transform_names` will be combined.

    transform_type : str, 'bin_num' or 'woe' or None, default: 'bin_num'
        Specify which value these columns going to replace.
         1. bin_num: Transfer original feature value to bin index in which this value belongs to.
         2. woe: This is valid for guest party only. It will replace original value to its woe value
         3. None: nothing will be replaced.
    """

    def __init__(self, transform_cols=-1, transform_names=None, transform_type="bin_num"):
        super(TransformParam, self).__init__()
        self.transform_cols = transform_cols
        self.transform_names = transform_names
        self.transform_type = transform_type

    def check(self):
        """Validate transform settings; raises ValueError on invalid values."""
        descr = "Transform Param's "

        # -1 means "same columns as the binning module"; None means "no columns".
        if self.transform_cols is not None and self.transform_cols != -1:
            self.check_defined_type(self.transform_cols, descr, ['list'])

        self.check_defined_type(self.transform_names, descr, ['list', "NoneType"])
        if self.transform_names is not None:
            for name in self.transform_names:
                if not isinstance(name, str):
                    raise ValueError("Elements in transform_names should be string type")

        self.check_valid_value(self.transform_type, descr, ['bin_num', 'woe', None])
class OptimalBinningParam(BaseParam):
    """
    Indicate optimal binning params

    Parameters
    ----------
    metric_method : str, default: "iv"
        The algorithm metric method. Support iv, gini, ks, chi-square

    min_bin_pct : float, default: 0.05
        The minimum percentage of each bucket

    max_bin_pct : float, default: 1.0
        The maximum percentage of each bucket

    init_bin_nums : int, default: 1000
        Number of bins when initialize

    mixture : bool, default: True
        Whether each bucket need event and non-event records

    init_bucket_method : str, default: quantile
        Init bucket methods. Accept quantile and bucket.
    """

    def __init__(self, metric_method='iv', min_bin_pct=0.05, max_bin_pct=1.0,
                 init_bin_nums=1000, mixture=True, init_bucket_method='quantile'):
        super().__init__()
        self.init_bucket_method = init_bucket_method
        self.metric_method = metric_method
        # max_bin / adjustment_factor start as None; presumably populated by the
        # binning module rather than the user — TODO confirm against callers.
        self.max_bin = None
        self.mixture = mixture
        self.max_bin_pct = max_bin_pct
        self.min_bin_pct = min_bin_pct
        self.init_bin_nums = init_bin_nums
        self.adjustment_factor = None

    def check(self):
        """Validate optimal-binning parameters; raises ValueError on invalid values."""
        descr = "hetero binning's optimal binning param's"
        self.check_string(self.metric_method, descr)
        self.metric_method = self.metric_method.lower()

        # Normalize both accepted spellings of chi-square.
        if self.metric_method in ['chi_square', 'chi-square']:
            self.metric_method = 'chi_square'

        self.check_valid_value(self.metric_method, descr, ['iv', 'gini', 'chi_square', 'ks'])
        self.check_positive_integer(self.init_bin_nums, descr)

        self.init_bucket_method = self.init_bucket_method.lower()
        self.check_valid_value(self.init_bucket_method, descr, ['quantile', 'bucket'])

        # Exact 0 or 1 are allowed as-is; anything else must be a decimal in (0, 1).
        if self.max_bin_pct not in [1, 0]:
            self.check_decimal_float(self.max_bin_pct, descr)
        if self.min_bin_pct not in [1, 0]:
            self.check_decimal_float(self.min_bin_pct, descr)
        if self.min_bin_pct > self.max_bin_pct:
            raise ValueError("Optimal binning's min_bin_pct should less or equal than max_bin_pct")

        self.check_boolean(self.mixture, descr)
class FeatureBinningParam(BaseParam):
    """
    Define the feature binning method

    Parameters
    ----------
    method : str, 'quantile', 'bucket' or 'optimal', default: 'quantile'
        Binning method.

    compress_thres : int, default: 10000
        When the number of saved summaries exceed this threshold, it will call its compress function

    head_size : int, default: 10000
        The buffer size to store inserted observations. When head list reach this buffer size, the
        QuantileSummaries object start to generate summary(or stats) and insert into its sampled list.

    error : float, 0 <= error < 1, default: 0.001
        The error of tolerance of binning. The final split point comes from original data, and the rank
        of this value is close to the exact rank. More precisely,
        floor((p - 2 * error) * N) <= rank(x) <= ceil((p + 2 * error) * N)
        where p is the quantile in float, and N is total number of data.

    bin_num : int, bin_num > 0, default: 10
        The max bin number for binning

    bin_indexes : list of int or int, default: -1
        Specify which columns need to be binned. -1 represent for all columns. If you need to indicate specific
        cols, provide a list of header index instead of -1.
        Note that columns specified by `bin_indexes` and `bin_names` will be combined.

    bin_names : list of string, default: None
        Specify which columns need to calculated. Each element in the list represent for a column name in header.
        Note that columns specified by `bin_indexes` and `bin_names` will be combined.

    adjustment_factor : float, default: 0.5
        the adjustment factor when calculating WOE. This is useful when there is no event or non-event in
        a bin. Please note that this parameter will NOT take effect for setting in host.

    category_indexes : list of int or int, default: None
        Specify which columns are category features. -1 represent for all columns. List of int indicate a set of
        such features. For category features, bin_obj will take its original values as split_points and treat them
        as have been binned. If this is not what you expect, please do NOT put it into this parameters.
        The number of categories should not exceed bin_num set above.
        Note that columns specified by `category_indexes` and `category_names` will be combined.

    category_names : list of string, default: None
        Use column names to specify category features. Each element in the list represent for a column name in header.
        Note that columns specified by `category_indexes` and `category_names` will be combined.

    local_only : bool, default: False
        Whether just provide binning method to guest party. If true, host party will do nothing.
        Warnings: This parameter will be deprecated in future version.

    transform_param : TransformParam
        Define how to transfer the binned data.

    need_run : bool, default: True
        Indicate if this module needed to be run

    skip_static : bool, default: False
        If true, binning will not calculate iv, woe etc. In this case, optimal-binning
        will not be supported.
    """

    def __init__(self, method=consts.QUANTILE,
                 compress_thres=consts.DEFAULT_COMPRESS_THRESHOLD,
                 head_size=consts.DEFAULT_HEAD_SIZE,
                 error=consts.DEFAULT_RELATIVE_ERROR,
                 bin_num=consts.G_BIN_NUM, bin_indexes=-1, bin_names=None, adjustment_factor=0.5,
                 transform_param=TransformParam(),
                 local_only=False,
                 category_indexes=None, category_names=None,
                 need_run=True, skip_static=False):
        super(FeatureBinningParam, self).__init__()
        self.method = method
        self.compress_thres = compress_thres
        self.head_size = head_size
        self.error = error
        self.adjustment_factor = adjustment_factor
        self.bin_num = bin_num
        self.bin_indexes = bin_indexes
        self.bin_names = bin_names
        self.category_indexes = category_indexes
        self.category_names = category_names
        # Deep-copy so instances never share the mutable default TransformParam()
        # object created once at function-definition time.
        self.transform_param = copy.deepcopy(transform_param)
        self.need_run = need_run
        self.skip_static = skip_static
        self.local_only = local_only

    def check(self):
        """Validate common binning parameters; raises ValueError on invalid values."""
        descr = "Binning param's"
        self.check_string(self.method, descr)
        self.method = self.method.lower()
        self.check_positive_integer(self.compress_thres, descr)
        self.check_positive_integer(self.head_size, descr)
        self.check_decimal_float(self.error, descr)
        self.check_positive_integer(self.bin_num, descr)
        # -1 means "all columns"; otherwise a list (possibly a protobuf repeated field) or None.
        if self.bin_indexes != -1:
            self.check_defined_type(self.bin_indexes, descr, ['list', 'RepeatedScalarContainer', "NoneType"])
        self.check_defined_type(self.bin_names, descr, ['list', "NoneType"])
        self.check_defined_type(self.category_indexes, descr, ['list', "NoneType"])
        self.check_defined_type(self.category_names, descr, ['list', "NoneType"])
        self.check_open_unit_interval(self.adjustment_factor, descr)
        self.check_boolean(self.local_only, descr)
class HeteroFeatureBinningParam(FeatureBinningParam):
    """
    split_points_by_index : dict, default: None
        Manually specified split points for local features;
        key should be feature index, value should be split points in sorted list;
        along with `split_points_by_col_name`, keys should cover all local features, including categorical features;
        note that each split point list should have length equal to desired bin num(n),
        with first (n-1) entries equal to the maximum value(inclusive) of each first (n-1) bins,
        and nth value the max of current feature.

    split_points_by_col_name : dict, default: None
        Manually specified split points for local features;
        key should be feature name, value should be split points in sorted list;
        along with `split_points_by_index`, keys should cover all local features, including categorical features;
        note that each split point list should have length equal to desired bin num(n),
        with first (n-1) entries equal to the maximum value(inclusive) of each first (n-1) bins,
        and nth value the max of current feature.
    """

    def __init__(self, method=consts.QUANTILE, compress_thres=consts.DEFAULT_COMPRESS_THRESHOLD,
                 head_size=consts.DEFAULT_HEAD_SIZE,
                 error=consts.DEFAULT_RELATIVE_ERROR,
                 bin_num=consts.G_BIN_NUM, bin_indexes=-1, bin_names=None, adjustment_factor=0.5,
                 transform_param=TransformParam(), optimal_binning_param=OptimalBinningParam(),
                 local_only=False, category_indexes=None, category_names=None,
                 encrypt_param=EncryptParam(),
                 need_run=True, skip_static=False,
                 split_points_by_index=None, split_points_by_col_name=None):
        super(HeteroFeatureBinningParam, self).__init__(method=method, compress_thres=compress_thres,
                                                        head_size=head_size, error=error,
                                                        bin_num=bin_num, bin_indexes=bin_indexes,
                                                        bin_names=bin_names, adjustment_factor=adjustment_factor,
                                                        transform_param=transform_param,
                                                        category_indexes=category_indexes,
                                                        category_names=category_names,
                                                        need_run=need_run, local_only=local_only,
                                                        skip_static=skip_static)
        self.optimal_binning_param = copy.deepcopy(optimal_binning_param)
        # Deep-copy like the other param objects: the default EncryptParam() is a
        # single mutable object shared by every call that relies on the default;
        # storing it by reference would let one instance's mutations leak into others.
        self.encrypt_param = copy.deepcopy(encrypt_param)
        self.split_points_by_index = split_points_by_index
        self.split_points_by_col_name = split_points_by_col_name

    def check(self):
        """Validate hetero-binning parameters; raises ValueError on invalid values."""
        descr = "Hetero Binning param's"
        super(HeteroFeatureBinningParam, self).check()
        self.check_valid_value(self.method, descr, [consts.QUANTILE, consts.BUCKET, consts.OPTIMAL])
        self.optimal_binning_param.check()
        self.encrypt_param.check()
        if self.encrypt_param.method != consts.PAILLIER:
            raise ValueError("Feature Binning support Paillier encrypt method only.")
        if self.skip_static and self.method == consts.OPTIMAL:
            raise ValueError("When skip_static, optimal binning is not supported.")
        self.transform_param.check()
        if self.skip_static and self.transform_param.transform_type == 'woe':
            raise ValueError("To use woe transform, skip_static should set as False")
class HomoFeatureBinningParam(FeatureBinningParam):
    """Parameter holder for homogeneous (homo) feature binning.

    Extends FeatureBinningParam with two homo-specific options:
    ``sample_bins`` and ``max_iter`` (capped at 100 by ``check``).
    """

    def __init__(self, method=consts.VIRTUAL_SUMMARY,
                 compress_thres=consts.DEFAULT_COMPRESS_THRESHOLD,
                 head_size=consts.DEFAULT_HEAD_SIZE,
                 error=consts.DEFAULT_RELATIVE_ERROR,
                 sample_bins=100,
                 bin_num=consts.G_BIN_NUM, bin_indexes=-1, bin_names=None, adjustment_factor=0.5,
                 transform_param=TransformParam(),
                 category_indexes=None, category_names=None,
                 need_run=True, skip_static=False, max_iter=100):
        super().__init__(method=method,
                         compress_thres=compress_thres,
                         head_size=head_size,
                         error=error,
                         bin_num=bin_num,
                         bin_indexes=bin_indexes,
                         bin_names=bin_names,
                         adjustment_factor=adjustment_factor,
                         transform_param=transform_param,
                         category_indexes=category_indexes,
                         category_names=category_names,
                         need_run=need_run,
                         skip_static=skip_static)
        self.max_iter = max_iter
        self.sample_bins = sample_bins

    def check(self):
        """Validate homo-binning parameters; raises ValueError on invalid values."""
        param_label = "homo binning param's"
        super().check()
        self.check_string(self.method, param_label)
        self.method = self.method.lower()
        self.check_valid_value(self.method, param_label,
                               [consts.VIRTUAL_SUMMARY, consts.RECURSIVE_QUERY])
        self.check_positive_integer(self.max_iter, param_label)
        if self.max_iter > 100:
            raise ValueError("Max iter is not allowed exceed 100")
|