feature_binning_param.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright 2019 The FATE Authors. All Rights Reserved.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License");
  7. # you may not use this file except in compliance with the License.
  8. # You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. #
  18. import copy
  19. from federatedml.param.base_param import BaseParam
  20. from federatedml.param.encrypt_param import EncryptParam
  21. from federatedml.util import consts, LOGGER
  22. class TransformParam(BaseParam):
  23. """
  24. Define how to transfer the cols
  25. Parameters
  26. ----------
  27. transform_cols : list of column index, default: -1
  28. Specify which columns need to be transform. If column index is None, None of columns will be transformed.
  29. If it is -1, it will use same columns as cols in binning module.
  30. Note tha columns specified by `transform_cols` and `transform_names` will be combined.
  31. transform_names: list of string, default: []
  32. Specify which columns need to calculated. Each element in the list represent for a column name in header.
  33. Note tha columns specified by `transform_cols` and `transform_names` will be combined.
  34. transform_type: {'bin_num', 'woe', None}
  35. Specify which value these columns going to replace.
  36. 1. bin_num: Transfer original feature value to bin index in which this value belongs to.
  37. 2. woe: This is valid for guest party only. It will replace original value to its woe value
  38. 3. None: nothing will be replaced.
  39. """
  40. def __init__(self, transform_cols=-1, transform_names=None, transform_type="bin_num"):
  41. super(TransformParam, self).__init__()
  42. self.transform_cols = transform_cols
  43. self.transform_names = transform_names
  44. self.transform_type = transform_type
  45. def check(self):
  46. descr = "Transform Param's "
  47. if self.transform_cols is not None and self.transform_cols != -1:
  48. self.check_defined_type(self.transform_cols, descr, ['list'])
  49. self.check_defined_type(self.transform_names, descr, ['list', "NoneType"])
  50. if self.transform_names is not None:
  51. for name in self.transform_names:
  52. if not isinstance(name, str):
  53. raise ValueError("Elements in transform_names should be string type")
  54. self.check_valid_value(self.transform_type, descr, ['bin_num', 'woe', None])
  55. class OptimalBinningParam(BaseParam):
  56. """
  57. Indicate optimal binning params
  58. Parameters
  59. ----------
  60. metric_method: str, default: "iv"
  61. The algorithm metric method. Support iv, gini, ks, chi-square
  62. min_bin_pct: float, default: 0.05
  63. The minimum percentage of each bucket
  64. max_bin_pct: float, default: 1.0
  65. The maximum percentage of each bucket
  66. init_bin_nums: int, default 100
  67. Number of bins when initialize
  68. mixture: bool, default: True
  69. Whether each bucket need event and non-event records
  70. init_bucket_method: str default: quantile
  71. Init bucket methods. Accept quantile and bucket.
  72. """
  73. def __init__(self, metric_method='iv', min_bin_pct=0.05, max_bin_pct=1.0,
  74. init_bin_nums=1000, mixture=True, init_bucket_method='quantile'):
  75. super().__init__()
  76. self.init_bucket_method = init_bucket_method
  77. self.metric_method = metric_method
  78. self.max_bin = None
  79. self.mixture = mixture
  80. self.max_bin_pct = max_bin_pct
  81. self.min_bin_pct = min_bin_pct
  82. self.init_bin_nums = init_bin_nums
  83. self.adjustment_factor = None
  84. def check(self):
  85. descr = "hetero binning's optimal binning param's"
  86. self.check_string(self.metric_method, descr)
  87. self.metric_method = self.metric_method.lower()
  88. if self.metric_method in ['chi_square', 'chi-square']:
  89. self.metric_method = 'chi_square'
  90. self.check_valid_value(self.metric_method, descr, ['iv', 'gini', 'chi_square', 'ks'])
  91. self.check_positive_integer(self.init_bin_nums, descr)
  92. self.init_bucket_method = self.init_bucket_method.lower()
  93. self.check_valid_value(self.init_bucket_method, descr, ['quantile', 'bucket'])
  94. if self.max_bin_pct not in [1, 0]:
  95. self.check_decimal_float(self.max_bin_pct, descr)
  96. if self.min_bin_pct not in [1, 0]:
  97. self.check_decimal_float(self.min_bin_pct, descr)
  98. if self.min_bin_pct > self.max_bin_pct:
  99. raise ValueError("Optimal binning's min_bin_pct should less or equal than max_bin_pct")
  100. self.check_boolean(self.mixture, descr)
  101. self.check_positive_integer(self.init_bin_nums, descr)
  102. class FeatureBinningParam(BaseParam):
  103. """
  104. Define the feature binning method
  105. Parameters
  106. ----------
  107. method : str, 'quantile', 'bucket' or 'optimal', default: 'quantile'
  108. Binning method.
  109. compress_thres: int, default: 10000
  110. When the number of saved summaries exceed this threshold, it will call its compress function
  111. head_size: int, default: 10000
  112. The buffer size to store inserted observations. When head list reach this buffer size, the
  113. QuantileSummaries object start to generate summary(or stats) and insert into its sampled list.
  114. error: float, 0 <= error < 1 default: 0.001
  115. The error of tolerance of binning. The final split point comes from original data, and the rank
  116. of this value is close to the exact rank. More precisely,
  117. floor((p - 2 * error) * N) <= rank(x) <= ceil((p + 2 * error) * N)
  118. where p is the quantile in float, and N is total number of data.
  119. bin_num: int, bin_num > 0, default: 10
  120. The max bin number for binning
  121. bin_indexes : list of int or int, default: -1
  122. Specify which columns need to be binned. -1 represent for all columns. If you need to indicate specific
  123. cols, provide a list of header index instead of -1.
  124. Note tha columns specified by `bin_indexes` and `bin_names` will be combined.
  125. bin_names : list of string, default: []
  126. Specify which columns need to calculated. Each element in the list represent for a column name in header.
  127. Note tha columns specified by `bin_indexes` and `bin_names` will be combined.
  128. adjustment_factor : float, default: 0.5
  129. the adjustment factor when calculating WOE. This is useful when there is no event or non-event in
  130. a bin. Please note that this parameter will NOT take effect for setting in host.
  131. category_indexes : list of int or int, default: []
  132. Specify which columns are category features. -1 represent for all columns. List of int indicate a set of
  133. such features. For category features, bin_obj will take its original values as split_points and treat them
  134. as have been binned. If this is not what you expect, please do NOT put it into this parameters.
  135. The number of categories should not exceed bin_num set above.
  136. Note tha columns specified by `category_indexes` and `category_names` will be combined.
  137. category_names : list of string, default: []
  138. Use column names to specify category features. Each element in the list represent for a column name in header.
  139. Note tha columns specified by `category_indexes` and `category_names` will be combined.
  140. local_only : bool, default: False
  141. Whether just provide binning method to guest party. If true, host party will do nothing.
  142. Warnings: This parameter will be deprecated in future version.
  143. transform_param: TransformParam
  144. Define how to transfer the binned data.
  145. need_run: bool, default True
  146. Indicate if this module needed to be run
  147. skip_static: bool, default False
  148. If true, binning will not calculate iv, woe etc. In this case, optimal-binning
  149. will not be supported.
  150. """
  151. def __init__(self, method=consts.QUANTILE,
  152. compress_thres=consts.DEFAULT_COMPRESS_THRESHOLD,
  153. head_size=consts.DEFAULT_HEAD_SIZE,
  154. error=consts.DEFAULT_RELATIVE_ERROR,
  155. bin_num=consts.G_BIN_NUM, bin_indexes=-1, bin_names=None, adjustment_factor=0.5,
  156. transform_param=TransformParam(),
  157. local_only=False,
  158. category_indexes=None, category_names=None,
  159. need_run=True, skip_static=False):
  160. super(FeatureBinningParam, self).__init__()
  161. self.method = method
  162. self.compress_thres = compress_thres
  163. self.head_size = head_size
  164. self.error = error
  165. self.adjustment_factor = adjustment_factor
  166. self.bin_num = bin_num
  167. self.bin_indexes = bin_indexes
  168. self.bin_names = bin_names
  169. self.category_indexes = category_indexes
  170. self.category_names = category_names
  171. self.transform_param = copy.deepcopy(transform_param)
  172. self.need_run = need_run
  173. self.skip_static = skip_static
  174. self.local_only = local_only
  175. def check(self):
  176. descr = "Binning param's"
  177. self.check_string(self.method, descr)
  178. self.method = self.method.lower()
  179. self.check_positive_integer(self.compress_thres, descr)
  180. self.check_positive_integer(self.head_size, descr)
  181. self.check_decimal_float(self.error, descr)
  182. self.check_positive_integer(self.bin_num, descr)
  183. if self.bin_indexes != -1:
  184. self.check_defined_type(self.bin_indexes, descr, ['list', 'RepeatedScalarContainer', "NoneType"])
  185. self.check_defined_type(self.bin_names, descr, ['list', "NoneType"])
  186. self.check_defined_type(self.category_indexes, descr, ['list', "NoneType"])
  187. self.check_defined_type(self.category_names, descr, ['list', "NoneType"])
  188. self.check_open_unit_interval(self.adjustment_factor, descr)
  189. self.check_boolean(self.local_only, descr)
  190. class HeteroFeatureBinningParam(FeatureBinningParam):
  191. """
  192. split_points_by_index: dict, default None
  193. Manually specified split points for local features;
  194. key should be feature index, value should be split points in sorted list;
  195. along with `split_points_by_col_name`, keys should cover all local features, including categorical features;
  196. note that each split point list should have length equal to desired bin num(n),
  197. with first (n-1) entries equal to the maximum value(inclusive) of each first (n-1) bins,
  198. and nth value the max of current feature.
  199. split_points_by_col_name: dict, default None
  200. Manually specified split points for local features;
  201. key should be feature name, value should be split points in sorted list;
  202. along with `split_points_by_index`, keys should cover all local features, including categorical features;
  203. note that each split point list should have length equal to desired bin num(n),
  204. with first (n-1) entries equal to the maximum value(inclusive) of each first (n-1) bins,
  205. and nth value the max of current feature.
  206. """
  207. def __init__(self, method=consts.QUANTILE, compress_thres=consts.DEFAULT_COMPRESS_THRESHOLD,
  208. head_size=consts.DEFAULT_HEAD_SIZE,
  209. error=consts.DEFAULT_RELATIVE_ERROR,
  210. bin_num=consts.G_BIN_NUM, bin_indexes=-1, bin_names=None, adjustment_factor=0.5,
  211. transform_param=TransformParam(), optimal_binning_param=OptimalBinningParam(),
  212. local_only=False, category_indexes=None, category_names=None,
  213. encrypt_param=EncryptParam(),
  214. need_run=True, skip_static=False,
  215. split_points_by_index=None, split_points_by_col_name=None):
  216. super(HeteroFeatureBinningParam, self).__init__(method=method, compress_thres=compress_thres,
  217. head_size=head_size, error=error,
  218. bin_num=bin_num, bin_indexes=bin_indexes,
  219. bin_names=bin_names, adjustment_factor=adjustment_factor,
  220. transform_param=transform_param,
  221. category_indexes=category_indexes,
  222. category_names=category_names,
  223. need_run=need_run, local_only=local_only,
  224. skip_static=skip_static)
  225. self.optimal_binning_param = copy.deepcopy(optimal_binning_param)
  226. self.encrypt_param = encrypt_param
  227. self.split_points_by_index = split_points_by_index
  228. self.split_points_by_col_name = split_points_by_col_name
  229. def check(self):
  230. descr = "Hetero Binning param's"
  231. super(HeteroFeatureBinningParam, self).check()
  232. self.check_valid_value(self.method, descr, [consts.QUANTILE, consts.BUCKET, consts.OPTIMAL])
  233. self.optimal_binning_param.check()
  234. self.encrypt_param.check()
  235. if self.encrypt_param.method != consts.PAILLIER:
  236. raise ValueError("Feature Binning support Paillier encrypt method only.")
  237. if self.skip_static and self.method == consts.OPTIMAL:
  238. raise ValueError("When skip_static, optimal binning is not supported.")
  239. self.transform_param.check()
  240. if self.skip_static and self.transform_param.transform_type == 'woe':
  241. raise ValueError("To use woe transform, skip_static should set as False")
  242. if self.split_points_by_index is not None:
  243. LOGGER.warning(f"When manually setting binning split points, 'method' will be ignored.")
  244. if not isinstance(self.split_points_by_index, dict):
  245. raise ValueError(f"{descr} `split_points_by_index` should be a dict")
  246. for k, v in self.split_points_by_index.items():
  247. if not isinstance(k, str):
  248. raise ValueError(f"{descr} `split_points_by_index`'s keys should be str")
  249. if not isinstance(v, list):
  250. raise ValueError(f"{descr} `split_points_by_index`'s values should be given in list format")
  251. if sorted(v) != v:
  252. raise ValueError(f"{k}'s split points({v}) should be given in sorted order.")
  253. if self.split_points_by_col_name is not None:
  254. LOGGER.warning(f"When manually setting binning split points, 'method' will be ignored.")
  255. if not isinstance(self.split_points_by_col_name, dict):
  256. raise ValueError(f"{descr} `split_points_by_col_name` should be a dict")
  257. for k, v in self.split_points_by_col_name.items():
  258. if not isinstance(k, str):
  259. raise ValueError(f"{descr} `split_points_by_col_name`'s keys should be str")
  260. if not isinstance(v, list):
  261. raise ValueError(f"{descr} `split_points_by_col_name`'s values should be given in list format")
  262. if sorted(v) != v:
  263. raise ValueError(f"{k}'s split points({v}) should be given in sorted order.")
  264. class HomoFeatureBinningParam(FeatureBinningParam):
  265. def __init__(self, method=consts.VIRTUAL_SUMMARY,
  266. compress_thres=consts.DEFAULT_COMPRESS_THRESHOLD,
  267. head_size=consts.DEFAULT_HEAD_SIZE,
  268. error=consts.DEFAULT_RELATIVE_ERROR,
  269. sample_bins=100,
  270. bin_num=consts.G_BIN_NUM, bin_indexes=-1, bin_names=None, adjustment_factor=0.5,
  271. transform_param=TransformParam(),
  272. category_indexes=None, category_names=None,
  273. need_run=True, skip_static=False, max_iter=100):
  274. super(HomoFeatureBinningParam, self).__init__(method=method, compress_thres=compress_thres,
  275. head_size=head_size, error=error,
  276. bin_num=bin_num, bin_indexes=bin_indexes,
  277. bin_names=bin_names, adjustment_factor=adjustment_factor,
  278. transform_param=transform_param,
  279. category_indexes=category_indexes, category_names=category_names,
  280. need_run=need_run,
  281. skip_static=skip_static)
  282. self.sample_bins = sample_bins
  283. self.max_iter = max_iter
  284. def check(self):
  285. descr = "homo binning param's"
  286. super(HomoFeatureBinningParam, self).check()
  287. self.check_string(self.method, descr)
  288. self.method = self.method.lower()
  289. self.check_valid_value(self.method, descr, [consts.VIRTUAL_SUMMARY, consts.RECURSIVE_QUERY])
  290. self.check_positive_integer(self.max_iter, descr)
  291. if self.max_iter > 100:
  292. raise ValueError("Max iter is not allowed exceed 100")